In [5]:
import os
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error

In [8]:
# Read the pre-split training and testing tables from disk.
train = pd.read_csv('files/training.csv')
test = pd.read_csv('files/testing.csv')

# Report the (rows, columns) shape of each split, one per line.
print(train.shape, test.shape, sep='\n')



(14803, 32)
(4932, 32)


In [9]:
# Preview the first five rows of the training split.
train.head(n=5)

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2,NSM,WeekStatus,Day_of_week
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433,61200,Weekday,Monday
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195,61800,Weekday,Monday
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668,62400,Weekday,Monday
3,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097,63600,Weekday,Monday
4,2016-01-11 17:50:00,50,40,19.89,46.026667,19.2,44.5,19.79,44.933333,18.89,...,734.0,92.0,5.333333,43.833333,4.8,44.919484,44.919484,64200,Weekday,Monday


In [11]:
# Show the column labels of the training frame.
train.columns

Index(['date', 'Appliances', 'lights', 'T1', 'RH_1', 'T2', 'RH_2', 'T3',
       'RH_3', 'T4', 'RH_4', 'T5', 'RH_5', 'T6', 'RH_6', 'T7', 'RH_7', 'T8',
       'RH_8', 'T9', 'RH_9', 'T_out', 'Press_mm_hg', 'RH_out', 'Windspeed',
       'Visibility', 'Tdewpoint', 'rv1', 'rv2', 'NSM', 'WeekStatus',
       'Day_of_week'],
      dtype='object')

In [12]:
# Total number of rows across both splits. Derived from the loaded frames
# instead of re-typing the row counts printed by an earlier cell, so the
# figure stays correct if the input files change.
len(train) + len(test)

19735

In [13]:
# Fraction of all rows that fall in the testing split. Computed from the
# loaded frames rather than from hand-copied row counts.
len(test) / (len(train) + len(test))

0.2499113250570053

In [15]:
# Feature matrices: every column except the timestamp, the target and the two
# categorical calendar columns. The target is the 'Appliances' column.
x_train = train.drop(columns=['date', 'Appliances', 'WeekStatus', 'Day_of_week'])
y_train = train['Appliances']
x_test = test.drop(columns=['date', 'Appliances', 'WeekStatus', 'Day_of_week'])
y_test = test['Appliances']

# Shapes of the four objects, one per line.
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape, sep='\n')




(14803, 28)
(14803,)
(4932, 28)
(4932,)


In [19]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Computes ``mean(|y_true - y_pred| / |y_true|) * 100`` over all elements.

    Parameters
    ----------
    y_true : array-like
        Observed target values. Must be non-empty and must not contain zeros,
        because each error is divided by the corresponding true value.
    y_pred : array-like
        Predicted values. Must hold the same number of elements as ``y_true``
        (a single value is broadcast against every element of ``y_true``).

    Returns
    -------
    float
        The MAPE expressed as a percentage (e.g. ``10.0`` means 10 %).

    Raises
    ------
    ValueError
        If ``y_true`` is empty, contains a zero, or if the two inputs have
        incompatible sizes (the latter is raised by NumPy).
    """
    # Flatten both inputs so that a column vector of shape (n, 1) paired with
    # a flat vector of shape (n,) is compared element-wise instead of being
    # silently broadcast to an (n, n) matrix, which would give a wrong score.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()

    if y_true.size == 0:
        raise ValueError("MAPE is undefined for empty input.")
    if np.any(y_true == 0):
        # Division by a zero target would yield inf/nan and poison the mean.
        raise ValueError("MAPE is undefined when y_true contains zeros.")

    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [26]:
# Linear Regression baseline on the 28 feature columns of x_train
# (timestamp, target and the categorical calendar columns excluded).

# instantiate and fit the model on the training split
model1 = LinearRegression()
model1.fit(x_train, y_train)

# predict on both the training and the testing split
y_train_pred = model1.predict(x_train)
y_test_pred = model1.predict(x_test)

# metric scores on the split the model was fitted on
print("Metric scores on training set: ")
print("RMSE: ", np.sqrt(mean_squared_error(y_train, y_train_pred)))
print("R2-score: ", r2_score(y_train, y_train_pred))
print("MAE: ", mean_absolute_error(y_train, y_train_pred))
print("MAPE: ", mean_absolute_percentage_error(y_train, y_train_pred))
print()
# metric scores on the held-out split; the header previously said
# "training set" here even though y_test / y_test_pred are being scored
print("Metric scores on testing set: ")
print("RMSE: ", np.sqrt(mean_squared_error(y_test, y_test_pred)))
print("R2-score: ", r2_score(y_test, y_test_pred))
print("MAE: ", mean_absolute_error(y_test, y_test_pred))
print("MAPE: ", mean_absolute_percentage_error(y_test, y_test_pred))


Metric scores on training set: 
RMSE:  93.571434685038
R2-score:  0.17188056846730504
MAE:  53.25503129295651
MAPE:  61.494124418441665

Metric scores on training set: 
RMSE:  93.56425120887648
R2-score:  0.15199183390308701
MAE:  52.07501531659513
MAPE:  60.10475096677042


In [27]:
# Display the WeekStatus column of the training split.
train.loc[:, 'WeekStatus']

0        Weekday
1        Weekday
2        Weekday
3        Weekday
4        Weekday
          ...   
14798    Weekday
14799    Weekday
14800    Weekday
14801    Weekday
14802    Weekday
Name: WeekStatus, Length: 14803, dtype: object

In [28]:
# Distinct values that occur in the WeekStatus column.
{status for status in train['WeekStatus']}

{'Weekday', 'Weekend'}

In [30]:
# One-hot encode WeekStatus for the training split.
train_status_df = pd.get_dummies(train['WeekStatus'])

# Encode the testing split and align it to the training columns. The two
# splits are encoded independently, so a category missing from (or only
# present in) the test data would otherwise yield a different set or order of
# columns than the one the model is fitted on. Categories absent from the
# test split are filled with 0; categories unseen in training are dropped.
test_status_df = pd.get_dummies(test['WeekStatus']).reindex(
    columns=train_status_df.columns, fill_value=0
)

train_status_df.head()

Unnamed: 0,Weekday,Weekend
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0


In [31]:
# Append the WeekStatus indicator columns to each split (index-aligned left join).
train_status = train.join(train_status_df, how='left')
test_status = test.join(test_status_df, how='left')

# Shapes after adding the indicator columns, one per line.
print(train_status.shape, test_status.shape, sep='\n')


(14803, 34)
(4932, 34)


In [32]:
# One-hot encode Day_of_week for the training split.
train_day_df = pd.get_dummies(train['Day_of_week'])

# Encode the testing split and align it to the training columns. The two
# splits are encoded independently, so a day missing from (or only present in)
# the test data would otherwise yield a different set or order of columns than
# the one the model is fitted on. Days absent from the test split are filled
# with 0; days unseen in training are dropped.
test_day_df = pd.get_dummies(test['Day_of_week']).reindex(
    columns=train_day_df.columns, fill_value=0
)

train_day_df.head()


Unnamed: 0,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
0,0,1,0,0,0,0,0
1,0,1,0,0,0,0,0
2,0,1,0,0,0,0,0
3,0,1,0,0,0,0,0
4,0,1,0,0,0,0,0


In [33]:
# Append the Day_of_week indicator columns to each split (index-aligned left join).
train_full = train_status.join(train_day_df, how='left')
test_full = test_status.join(test_day_df, how='left')

# Shapes of the fully encoded frames, one per line.
print(train_full.shape, test_full.shape, sep='\n')

(14803, 41)
(4932, 41)


In [34]:
# Preview the first five rows of the fully encoded training frame.
train_full.head(n=5)


Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,Day_of_week,Weekday,Weekend,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,Monday,1,0,0,1,0,0,0,0,0
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,Monday,1,0,0,1,0,0,0,0,0
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,Monday,1,0,0,1,0,0,0,0,0
3,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,Monday,1,0,0,1,0,0,0,0,0
4,2016-01-11 17:50:00,50,40,19.89,46.026667,19.2,44.5,19.79,44.933333,18.89,...,Monday,1,0,0,1,0,0,0,0,0


In [35]:
# Rebuild the feature matrices from the encoded frames: the original text
# columns, the timestamp and the target are removed, the indicator columns
# stay. The target is again the 'Appliances' column.
x_train = train_full.drop(columns=['date', 'Appliances', 'WeekStatus', 'Day_of_week'])
y_train = train_full['Appliances']
x_test = test_full.drop(columns=['date', 'Appliances', 'WeekStatus', 'Day_of_week'])
y_test = test_full['Appliances']

# Shapes of the four objects, one per line.
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape, sep='\n')


(14803, 37)
(14803,)
(4932, 37)
(4932,)


In [36]:
# Linear Regression on all features: the numeric columns plus the
# one-hot encoded WeekStatus and Day_of_week indicator columns.

# instantiate and fit the model on the training split
model2 = LinearRegression()
model2.fit(x_train, y_train)

# predict on both the training and the testing split
y_train_pred = model2.predict(x_train)
y_test_pred = model2.predict(x_test)

# metric scores on the split the model was fitted on
print("Metric scores on training set: ")
print("RMSE: ", np.sqrt(mean_squared_error(y_train, y_train_pred)))
print("R2-score: ", r2_score(y_train, y_train_pred))
print("MAE: ", mean_absolute_error(y_train, y_train_pred))
print("MAPE: ", mean_absolute_percentage_error(y_train, y_train_pred))
print()
# metric scores on the held-out split; the header previously said
# "training set" here even though y_test / y_test_pred are being scored
print("Metric scores on testing set: ")
print("RMSE: ", np.sqrt(mean_squared_error(y_test, y_test_pred)))
print("R2-score: ", r2_score(y_test, y_test_pred))
print("MAE: ", mean_absolute_error(y_test, y_test_pred))
print("MAPE: ", mean_absolute_percentage_error(y_test, y_test_pred))

Metric scores on training set: 
RMSE:  93.20557248110671
R2-score:  0.17834376492372517
MAE:  53.13891233505815
MAPE:  61.33117488093768

Metric scores on training set: 
RMSE:  93.17643426587738
R2-score:  0.1590071277867552
MAE:  51.98492960938633
MAPE:  59.95512561148597


TODO: add the plots and the RFE (recursive feature elimination) parts.