In [15]:
# Import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import Imputer, LabelEncoder, MinMaxScaler, FunctionTransformer
from sklearn.cross_validation import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.tree import DecisionTreeRegressor



In [16]:
# Read in dataset
dataset = pd.read_csv('../data/train_dataset.csv')

In [17]:
# Select relevant columns
X = dataset[['icon', 'precipType', 'cloudCover', 'humidity', 'temperature', 'visibility', 'month', 'Hour']]
y = dataset[['Irradiance']]

In [18]:
# The precipType feature contains NAs which will throw up issues later so we need to think of a strategy to replace the NAs.
# For now I have just replaced them with 'rain'

X['precipType'] = X['precipType'].fillna('rain')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [19]:
# Splitting the dataset into the Training set and Test set.

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [20]:
# Custom Transform script for separating categorical and numerical variables for future use

def numFeat(data):
    return data[['cloudCover', 'humidity', 'temperature', 'visibility', 'month', 'Hour']]

def catFeat(data):
    return pd.get_dummies(data[['icon', 'precipType']])



In [21]:
# Transform features using the custom transformer. The numFeat simply returns the numerical variables (no transform).
# catFeat returns the encoded categorical variables. FeatureUnion cocatenates both datasets

features = FeatureUnion([('f1',FunctionTransformer(numFeat, validate=False)),
                         ('f2',FunctionTransformer(catFeat, validate=False))] )
encoder_pipeline = Pipeline( [('f', features)] )

encoder_pipeline.fit(X, y)
X_new = encoder_pipeline.transform(X)

In [22]:
# Write the transformed dataset into a csv

#np.savetxt("encoded.csv", X_new, delimiter=",")

In [23]:
# Build Pipeline

parameters = [];
parameters.append(('imp', Imputer(missing_values='NaN', strategy='mean', axis=0, copy=False)))
#parameters.append(('clf', BaggingRegressor(n_estimators=50, random_state = 0)))
parameters.append(('clf',  DecisionTreeRegressor(random_state = 0)))



In [24]:
# Model Pipeline
model_pipeline = Pipeline(parameters)

# Fit the pipeline
model_pipeline.fit(X_new, y)

Pipeline(memory=None,
     steps=[('imp', Imputer(axis=0, copy=False, missing_values='NaN', strategy='mean', verbose=0)), ('clf', DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=0, splitter='best'))])

In [25]:
# Test predictor Pipeline using test data 'temp'. Data in temp is from dataset

temp = [0.27,0.82,49.49,9.49,12,11,0,0,0,0,1,0,0,0,0,1,0]
#temp2 = [0.27, 0.82, 49.49, 9.49, 12, 11]
#temp2 = np.array(temp2).reshape((1, -1))
temp = np.array(temp).reshape((1, -1))

In [26]:
# Compare the result to original value (from data) Irradiance = 162.73
y_pred = model_pipeline.predict(temp)
print(y_pred)

[ 162.73040833]
