#### Data Ingestion

In [1]:
import pandas as pd
import numpy as np
import datetime
import sklearn
import tsfresh
from tsfresh import extract_features
from tsfresh.utilities.dataframe_functions import make_forecasting_frame
from sklearn.ensemble import AdaBoostRegressor
from tsfresh.utilities.dataframe_functions import impute
import warnings
warnings.filterwarnings('ignore')
import datetime
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import *
from sklearn.neural_network import MLPRegressor
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

In [2]:
data1 = pd.read_csv("energydata_complete.csv", parse_dates=['date'])

In [3]:
def featureengineering(data):
    data.rename(columns={'T1': 'temp_kitchen', 'RH_1': 'hum_kitchen', 'T2': 'temp_living', 'RH_2': 'hum_living','T3': 'temp_laundry', 'RH_3': 'hum_laundry','T4': 'temp_office', 'RH_4': 'hum_office','T5': 'temp_bathroom', 'RH_5': 'hum_bathroom','T6': 'temp_building_out', 'RH_6': 'hum_building_out','T7': 'temp_ironing', 'RH_7': 'hum_ironing','T8': 'temp_teenRoom', 'RH_8': 'hum_teenRoom','T9': 'temp_parentRoom', 'RH_9': 'hum_parentRoom', 'T_out': 'temp_out', 'RH_out': 'hum_out', 'Press_mm_hg': 'Pressure' }, inplace=True)
    data['date'] = pd.to_datetime(data['date'], errors='coerce')
    data['month'] = data['date'].dt.month
    data['time'] = data['date'].dt.time
    data['hour'] = data['date'].dt.hour
    data['day'] =  data['date'].dt.day
    data['seconds'] = data['hour']*3600 + data['date'].dt.minute*60 + data['date'].dt.second
    data['day_of_week'] = data['date'].dt.weekday_name
    data['Numerical_Week'] = data['date'].dt.weekday
    data['calweek'] = data['date'].dt.dayofweek
    data['weekType'] = np.where(data['calweek'] < 5, 'Weekday', 'Weekend')
    
    
    data = data.drop(['rv2', 'seconds', 'temp_out', 'temp_parentRoom'], axis=1)
    
    data = data.drop(['calweek'], axis=1)
    
    data['Appliances'] = data['Appliances'] + data['lights']
    data = data.drop(['lights'], axis = 1)
    
    return data

In [4]:
def getvariables(data):
    weekType = pd.get_dummies(data['weekType'], prefix = 'weekType')
    day_of_week = pd.get_dummies(data['day_of_week'], prefix = 'day_of_week')
    #['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    #['Weekend', 'Weekday']

    # Concat above dummies variable dataframe to the main dataframe
    data = pd.concat((data,weekType),axis=1)
    data = pd.concat((data,day_of_week),axis=1)

    # Drop the WeekStatus and Day_of_week column
    data = data.drop(['weekType','day_of_week','date', 'time'],axis=1)
    
    check_outliers = ['Appliances', 'temp_kitchen', 'hum_kitchen', 'temp_living',
       'hum_living', 'temp_laundry', 'hum_laundry', 'temp_office',
       'hum_office', 'temp_bathroom', 'hum_bathroom', 'temp_building_out',
       'hum_building_out', 'temp_ironing', 'hum_ironing', 'temp_teenRoom',
       'hum_teenRoom', 'hum_parentRoom', 'Pressure', 'hum_out', 'Windspeed',
       'Visibility', 'Tdewpoint', 'rv1']

    for x in data[check_outliers]:
        data = data[np.abs(data[x]-data[x].mean()) <= (3*data[x].std())]
    
    data_train,data_test = train_test_split(data,train_size=0.7,random_state=42)
    x_train=data_train.iloc[:,1:]
    y_train=data_train['Appliances']
    scaler.fit(x_train)
    x_train_sc=scaler.transform(x_train)
    x_test=data_test.iloc[:,1:]
    y_test=data_test['Appliances']
    x_test_sc=scaler.transform(x_test)
    
    return x_train_sc,y_train,x_test,y_test

In [5]:
def featureselection(x_train_sc,y_train,x_test,y_test):
    from boruta import BorutaPy
    X = x_train_sc
    y = y_train
    rf = RandomForestRegressor(n_jobs=-1, max_depth=25)
    feature_selector = BorutaPy(rf, n_estimators='auto', verbose=2)
    feature_selector.fit(X, y)
    
    feature_selector.support_
    feature_selector.ranking_
    
    X_filtered = feature_selector.transform(X)
    
    selected_fea = []
    
    for i in range(len(x_train_sc.columns)):
        print(str(x_train_sc.columns[i])+" : "+str(feature_selector.support_[i])+", rank: "+str(feature_selector.ranking_[i]))
        if feature_selector.ranking_[i] == 1:
            selected_fea.append(str(X_trn.columns[i]))
            
    x_train=data_train.iloc[:,1:]
    x_train= x_train[selected_fea]
    print(x_train.shape)
    y_train=data_train['Appliances']
    scaler.fit(x_train)
    x_train_sc=scaler.transform(x_train)
    x_test=data_test.iloc[:,1:]
    x_test = x_test[selected_fea]
    print(x_test.shape)
    y_test=data_test['Appliances']
    x_test_sc=scaler.transform(x_test)
            
    return x_train_sc,y_train,x_test,y_test

In [6]:
def modelimplementation(x_train_sc,y_train,x_test,y_test):
    
    models = [LinearRegression(),
              Ridge(random_state=20),
              Lasso(random_state=20),
              ElasticNet(random_state=20),
              RandomForestRegressor(random_state=20),
              MLPRegressor(random_state=20)
              ]

    TestModels = pd.DataFrame()
    tmp = {}

    for model in models:
        # get model name
        m = str(model)
        tmp['Model'] = m[:m.index('(')]
        # fit model on training dataset
        model.fit(x_train_sc, y_train)
        # predict consumption
        predictions = model.predict(x_test)
        #Evaluation for Testing set
        #R2 score
        tmp['R2_Test'] = r2_score(y_test,predictions)
        #Mean Absolute Error(MAE)
        tmp['MAE_Test']= mean_absolute_error(y_test,predictions)
        #Mean Squared Error(MSE)
        tmp['MSE_Test']= mean_squared_error(y_test,predictions)
        #Root Mean Squared Error (RMSE)
        tmp['RMSE_Test'] = np.sqrt(mean_squared_error(y_test,predictions))
        #Evaluation for Training test
        predictions_trn = model.predict(x_train_sc)
        #R2_Score
        tmp['R2_Train'] = r2_score(y_train,predictions_trn)
        #Mean Absolute Error(MAE)
        tmp['MAE_Train']= mean_absolute_error(y_train,predictions_trn)
    #Mean Squared Error(MSE)
        tmp['MSE_Train']= mean_squared_error(y_train,predictions_trn)
    #Root Mean Squared Error (RMSE)
        tmp['RMSE_Train'] = np.sqrt(mean_squared_error(y_train,predictions_trn))
    
        tmp['Training Score(%)'] = round(model.score(x_train_sc, y_train) * 100,3)
        tmp['Testing Score(%)'] = round(model.score(x_test, y_test) * 100,3)
    
        tmp['MAPE_Test'] =  round(mean_absolute_percentage_error(y_test,predictions))
        tmp['MAPE_Train'] =  round(mean_absolute_percentage_error(y_train,predictions_trn))
        # write obtained data
        TestModels = TestModels.append([tmp])

    TestModels.set_index('Model', inplace=True)
    TestModels
    return TestModels

In [7]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mae = mean_absolute_error(test_labels,predictions)
    mape = 100 * np.mean(errors / test_labels)
    rmse = np.sqrt(mean_squared_error(test_labels,predictions))
    accuracy = model.score(test_features, test_labels)
    print('Model Performance')
    print('R2 : {:0.3f}'.format(accuracy))
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('RMSE : {:0.4f}'.format(rmse)) 
    print('MAPE : {:0.4f}'.format(mae*100))
    print('MAE: {:0.4f}'.format(mae))
    return accuracy

In [8]:
def hypertuningvariables(x_train_sc,y_train,x_test,y_test):

    # Initialize the model based on best performance from above, We got ExtraTreesRegressor 
    sel_model = RandomForestRegressor(random_state=42)

    # Define the parameter subset

    param_grid = {
        "n_estimators": [10, 50, 100, 200, 250, 300, 500, 800],
        "max_features": ["auto", "sqrt", "log2"],
        "max_depth": [None, 10, 50, 100, 200, 500]
    }

    # Use Randomized search to try 20 subsets from parameter space with 5-fold cross validation
    random_search = RandomizedSearchCV(sel_model, param_grid, n_iter=20, scoring="r2", cv=5, n_jobs=-1, verbose=2, random_state=42)
    random_search.fit(x_train_sc, y_train)
    base_model = RandomForestRegressor(random_state = 42)
    base_model.fit(x_train_sc, y_train)
    base_accuracy = evaluate(base_model, x_test, y_test)
    base_accuracy = evaluate(base_model, x_train_sc, y_train)
    best_model = random_search.best_estimator_
    best_accuracy = evaluate(best_model, x_train_sc, y_train)
    print('Improvement of {:0.2f}%. for RandomForestRegressor'.format( 100 * (best_accuracy - base_accuracy) / base_accuracy))

In [11]:
def exec_pipeline(data1):
    print('Data Ingestion')
    data = featureengineering(data1)
    print('Feature Engineering')
    x_train_sc,y_train,x_test,y_test = getvariables(data)
    print('Feature Selection')
    x_train_sc,y_train,x_test,y_test = featureselection(x_train_sc,y_train,x_test,y_test)
    print('Model Training')
    results = modelimplementation(x_train_sc,y_train,x_test,y_test)
    print('Hypertuning variables')
    hypertuning(x_train_sc,y_train,x_test,y_test)
    print('Done')
    return results

In [12]:
results = exec_pipeline(data1)

Data Ingestion
Feature Engineering
Feature Selection
Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	36
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	36
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	36
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	36
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	36
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	36
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	36
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	18
Tentative: 	5
Rejected: 	13
Iteration: 	9 / 100
Confirmed: 	18
Tentative: 	5
Rejected: 	13
Iteration: 	10 / 100
Confirmed: 	18
Tentative: 	5
Rejected: 	13
Iteration: 	11 / 100
Confirmed: 	18
Tentative: 	5
Rejected: 	13
Iteration: 	12 / 100
Confirmed: 	20
Tentative: 	3
Rejected: 	13
Iteration: 	13 / 100
Confirmed: 	20
Tentative: 	3
Rejected: 	13
Iteration: 	14 / 100
Confirmed: 	20
Tentative: 	3
Rejected: 	13
Iteration: 	15 / 100
Confirmed: 	20
Tentative: 	3
Rejected: 	13
Ite

NameError: name 'feat_selector' is not defined

In [None]:
results