In [1]:
import pandas as pd
import numpy as np
import datetime
import sklearn
from sklearn.cross_validation import train_test_split 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.grid_search import GridSearchCV
from sklearn import linear_model
from sklearn.metrics import *
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

def mean_absolute_percentage_error(y_true, y_pred): 
        y_true, y_pred = np.array(y_true), np.array(y_pred)
        return np.mean(np.abs((y_true - y_pred) / y_true)) * 100



In [2]:
data = pd.read_csv('energydata_Transformation.csv', parse_dates=['date'])

In [3]:
weekType = pd.get_dummies(data['weekType'], prefix = 'weekType')
day_of_week = pd.get_dummies(data['day_of_week'], prefix = 'day_of_week')
#['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
#['Weekend', 'Weekday']

# Concat above dummies variable dataframe to the main dataframe
data = pd.concat((data,weekType),axis=1)
data = pd.concat((data,day_of_week),axis=1)

# Drop the WeekStatus and Day_of_week column
data = data.drop(['weekType','day_of_week','date', 'time'],axis=1)

In [4]:
data.head()

Unnamed: 0,Appliances,temp_kitchen,hum_kitchen,temp_living,hum_living,temp_laundry,hum_laundry,temp_office,hum_office,temp_bathroom,...,Numerical_Week,weekType_Weekday,weekType_Weekend,day_of_week_Friday,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday
0,90,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,17.166667,...,0,1,0,0,1,0,0,0,0,0
1,90,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,45.9925,17.166667,...,0,1,0,0,1,0,0,0,0,0
2,80,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,45.89,17.166667,...,0,1,0,0,1,0,0,0,0,0
3,90,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,45.723333,17.166667,...,0,1,0,0,1,0,0,0,0,0
4,100,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,45.53,17.2,...,0,1,0,0,1,0,0,0,0,0


In [5]:
data.describe()

Unnamed: 0,Appliances,temp_kitchen,hum_kitchen,temp_living,hum_living,temp_laundry,hum_laundry,temp_office,hum_office,temp_bathroom,...,Numerical_Week,weekType_Weekday,weekType_Weekend,day_of_week_Friday,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday
count,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,...,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0
mean,101.496833,21.686571,40.259739,20.341219,40.42042,22.267611,39.2425,20.855335,39.026904,19.592106,...,2.977249,0.722726,0.277274,0.14416,0.140765,0.138637,0.138637,0.145934,0.145934,0.145934
std,104.380829,1.606066,3.979299,2.192974,4.069813,2.006111,3.254576,2.042884,4.341321,1.844623,...,1.985617,0.447664,0.447664,0.351261,0.347788,0.345576,0.345576,0.353049,0.353049,0.353049
min,10.0,16.79,27.023333,16.1,20.463333,17.2,28.766667,15.1,27.66,15.33,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,50.0,20.76,37.333333,18.79,37.9,20.79,36.9,19.53,35.53,18.2775,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,60.0,21.6,39.656667,20.0,40.5,22.1,38.53,20.666667,38.4,19.39,...,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,100.0,22.6,43.066667,21.5,43.26,23.29,41.76,22.1,42.156667,20.619643,...,5.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1110.0,26.26,63.36,29.856667,56.026667,29.236,50.163333,26.2,51.09,25.795,...,6.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
data_train,data_test = train_test_split(data,train_size=0.7,random_state=42)
x_train=data_train.iloc[:,1:]
y_train=data_train['Appliances']
scaler.fit(x_train)
x_train_sc=scaler.transform(x_train)
x_test=data_test.iloc[:,1:]
y_test=data_test['Appliances']
x_test_sc=scaler.transform(x_test)

In [7]:
lm=linear_model.LinearRegression()
lm.fit(x_train_sc,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [8]:
data.columns

Index(['Appliances', 'temp_kitchen', 'hum_kitchen', 'temp_living',
       'hum_living', 'temp_laundry', 'hum_laundry', 'temp_office',
       'hum_office', 'temp_bathroom', 'hum_bathroom', 'temp_building_out',
       'hum_building_out', 'temp_ironing', 'hum_ironing', 'temp_teenRoom',
       'hum_teenRoom', 'hum_parentRoom', 'Pressure', 'hum_out', 'Windspeed',
       'Visibility', 'Tdewpoint', 'rv1', 'month', 'hour', 'day',
       'Numerical_Week', 'weekType_Weekday', 'weekType_Weekend',
       'day_of_week_Friday', 'day_of_week_Monday', 'day_of_week_Saturday',
       'day_of_week_Sunday', 'day_of_week_Thursday', 'day_of_week_Tuesday',
       'day_of_week_Wednesday'],
      dtype='object')

In [9]:
y_train_pred=lm.predict(x_train_sc)
print('Results Before Removing Outliers')
print("R2   :",r2_score(y_train,y_train_pred))
print("MAE  :",mean_absolute_error(y_train,y_train_pred))
print("RMSE :",np.sqrt(mean_squared_error(y_train,y_train_pred)))
print("MAPE :",mean_absolute_percentage_error(y_train,y_train_pred))

Results Before Removing Outliers
R2   : 0.163478284779894
MAE  : 55.1548681580834
RMSE : 95.8702506077817
MAPE : 63.63715418864386


In [10]:
y_test_pred=lm.predict(x_test_sc)
print("R2   :",r2_score(y_test,y_test_pred))
print("MAE  :",mean_absolute_error(y_test,y_test_pred))
print("RMSE :",np.sqrt(mean_squared_error(y_test,y_test_pred)))
print("MAPE :",mean_absolute_percentage_error(y_test,y_test_pred))

R2   : 0.16030624503859658
MAE  : 55.18527090544636
RMSE : 94.69484191578468
MAPE : 65.2855213226856


In [11]:
check_outliers = ['Appliances', 'temp_kitchen', 'hum_kitchen', 'temp_living',
       'hum_living', 'temp_laundry', 'hum_laundry', 'temp_office',
       'hum_office', 'temp_bathroom', 'hum_bathroom', 'temp_building_out',
       'hum_building_out', 'temp_ironing', 'hum_ironing', 'temp_teenRoom',
       'hum_teenRoom', 'hum_parentRoom', 'Pressure', 'hum_out', 'Windspeed',
       'Visibility', 'Tdewpoint', 'rv1']

for x in data[check_outliers]:
    data = data[np.abs(data[x]-data[x].mean()) <= (3*data[x].std())]

In [12]:
data_train,data_test = train_test_split(data,train_size=0.7,random_state=42)
x_train=data_train.iloc[:,1:]
y_train=data_train['Appliances']
scaler.fit(x_train)
x_train_sc=scaler.transform(x_train)
x_test=data_test.iloc[:,1:]
y_test=data_test['Appliances']
x_test_sc=scaler.transform(x_test)

In [13]:
lm=linear_model.LinearRegression()
lm.fit(x_train_sc,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [14]:
y_train_pred=lm.predict(x_train_sc)
print('Results After Removing Outliers')
print("R2   :",r2_score(y_train,y_train_pred))
print("MAE  :",mean_absolute_error(y_train,y_train_pred))
print("RMSE :",np.sqrt(mean_squared_error(y_train,y_train_pred)))
print("MAPE :",mean_absolute_percentage_error(y_train,y_train_pred))

Results After Removing Outliers
R2   : 0.20240041240037587
MAE  : 37.44305018302269
RMSE : 61.084103906660495
MAPE : 48.79207936771256


In [15]:
y_test_pred=lm.predict(x_test_sc)
print("R2   :",r2_score(y_test,y_test_pred))
print("MAE  :",mean_absolute_error(y_test,y_test_pred))
print("RMSE :",np.sqrt(mean_squared_error(y_test,y_test_pred)))
print("MAPE :",mean_absolute_percentage_error(y_test,y_test_pred))

R2   : 0.18439732423351962
MAE  : 39.546224360147626
RMSE : 65.34756813104556
MAPE : 49.83422396156097


As we can see the error is reduced when removing outliers, we will continue with outliers removed 

In [16]:
rf=RandomForestRegressor()
rf.fit(x_train_sc, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [17]:
y_train_pred=rf.predict(x_train_sc)
print("R2   :",r2_score(y_train,y_train_pred))
print("MAE  :",mean_absolute_error(y_train,y_train_pred))
print("RMSE :",np.sqrt(mean_squared_error(y_train,y_train_pred)))
print("MAPE :",mean_absolute_percentage_error(y_train,y_train_pred))

R2   : 0.9102802495175761
MAE  : 9.77177152984774
RMSE : 20.487066418534514
MAPE : 11.171528615375328


In [18]:
y_test_pred=rf.predict(x_test_sc)
print("R2   :",r2_score(y_test,y_test_pred))
print("MAE  :",mean_absolute_error(y_test,y_test_pred))
print("RMSE :",np.sqrt(mean_squared_error(y_test,y_test_pred)))
print("MAPE :",mean_absolute_percentage_error(y_test,y_test_pred))

R2   : 0.512892800989398
MAE  : 25.955271565495206
RMSE : 50.501279706758886
MAPE : 28.762757601563


In [19]:
mlp = MLPRegressor(hidden_layer_sizes=(155),max_iter=500,alpha=1.00000000e-06,random_state=42)
mlp.fit(x_train_sc,y_train)

MLPRegressor(activation='relu', alpha=1e-06, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=155, learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=42, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [20]:
y_train_pred=mlp.predict(x_train_sc)
print("R2   :",r2_score(y_train,y_train_pred))
print("MAE  :",mean_absolute_error(y_train,y_train_pred))
print("RMSE :",np.sqrt(mean_squared_error(y_train,y_train_pred)))
print("MAPE :",mean_absolute_percentage_error(y_train,y_train_pred))

R2   : 0.47227016978667913
MAE  : 29.41322138035721
RMSE : 49.686861902156664
MAPE : 36.67145149855105


In [21]:
y_test_pred=mlp.predict(x_test_sc)
print("R2   :",r2_score(y_test,y_test_pred))
print("MAE  :",mean_absolute_error(y_test,y_test_pred))
print("RMSE :",np.sqrt(mean_squared_error(y_test,y_test_pred)))
print("MAPE :",mean_absolute_percentage_error(y_test,y_test_pred))

R2   : 0.3923425324631702
MAE  : 32.40734537499993
RMSE : 56.405250063096744
MAPE : 38.84803738852857
