In [6]:
import pandas as pd
import numpy as np
import os
import ast
import sys
import pickle
import warnings
warnings.filterwarnings('ignore')




This is a second set of imports: project-specific helpers from the `CbModelBuilder` package, loaded from a local checkout of the project code.

In [None]:
# Directory of the project checkout that provides the CbModelBuilder package.
# Set the CB_CODE_PATH environment variable to use a different checkout; the
# original author's location is kept as the default so existing setups still work.
additional_code_path = os.environ.get(
    'CB_CODE_PATH',
    '/Users/briandalessandro/Documents/CrossBoundary/code/rf2019-crossboundary',
)

# Guard the append so re-running this cell does not pile duplicate entries onto sys.path.
if additional_code_path not in sys.path:
    sys.path.append(additional_code_path)

# Project-local helpers (resolved through the sys.path entry added above).
# NOTE: train_test_split here is the project's own helper, not scikit-learn's.
from CbModelBuilder.CbModelBuilderUtils import (
    non_model_features,
    evaluate_regression,
    evaluate_classification,
    cap_outlier,
    train_test_split,
    plot_confusion_matrix,
)
from CbModelBuilder.FeatureImportanceSummary import FeatureImportanceSummary
from CbModelBuilder.ModelCVWrapper import ModelCVWrapper


## Example Pipelines

Set up file paths (change these to match your local file system).

In [None]:
# Input-data and model-output directories.
# Override with the CB_DATA_DIR / CB_MODEL_DIR environment variables; the original
# author's locations remain the defaults.
# Later cells build file names by plain string concatenation (dir + 'file.ext'),
# so os.path.join(path, '') is used to guarantee a trailing path separator even
# when an override is supplied without one.
data_dir_june = os.path.join(
    os.environ.get(
        'CB_DATA_DIR',
        '/Users/briandalessandro/Documents/CrossBoundary/E4I-Datasets/June_2019_DataShare/',
    ),
    '',
)
modeldir = os.path.join(
    os.environ.get(
        'CB_MODEL_DIR',
        '/Users/briandalessandro/Documents/CrossBoundary/models/',
    ),
    '',
)

Read in data

In [19]:
# Load the training table and pass it straight through the project's outlier-capping
# helper for the target column.
# NOTE(review): the meaning of the 0.5 argument is defined by cap_outlier — confirm there.
training_csv = data_dir_june + 'training_all_in.csv'
model_df = pd.read_csv(training_csv).pipe(cap_outlier, 'avg_consumption', 0.5)

### Feature Selection and Ranking

Get a list of features ranked by their feature importance, using the `FeatureImportanceSummary` convenience class created for this purpose.

In [20]:
# Feature-importance helper for the regression target 'avg_consumption'.
# non_model_features comes from CbModelBuilderUtils (presumably the columns to
# leave out of modelling — confirm against the package).
fi = FeatureImportanceSummary(
    model_df,
    non_model_features,
    'avg_consumption',
)

# Later cells read fi.feat_imp_df and fi.feat_imp_index, presumably populated by this call.
fi.get_feature_importance_summary()

Take a quick look at the top features (at most the first 20 are shown) that together account for 80% of the normalized information gain (the importance metric).

In [34]:
# Row label stored under the 0.8 key of fi.feat_imp_index.
cutoff_label = fi.feat_imp_index[0.8]

# .loc slices by label and includes both end points.
leading_rows = fi.feat_imp_df.loc[0:cutoff_label]

# Show at most the first 20 feature names (column 'x') as an array.
leading_rows.head(20)['x'].values

array(['tariff', 'non_self_generated_electricity_monthly_consumption_kwh',
       'energy', 'uses_of_non_self_generated_electricity_has_fan',
       'years_in_community', 'hh_expenses', 'hh_size_between_5_18',
       'cooking_tech', 'rent', 'age',
       'non_self_generated_electricity_monthly_expenses', 'sleep_time',
       'hours_of_use_on_days_used_of_lighting_items',
       'fetching_time_minutes',
       'hours_of_use_on_days_used_of_lighting_items_by_interviewee',
       'rank_appliances_to_buy__3rd_choice', 'phone_charger_type_of_use',
       'hh_income', 'achieved_important_things_i_want',
       'non_self_generated_electricity_unit_price_per_kw'], dtype=object)

Split the data for modeling, using the splitting function we created

In [22]:
# train_test_split is the project helper imported from CbModelBuilderUtils,
# not scikit-learn's function of the same name.
# NOTE(review): 0.1 is presumably the held-out share — confirm against the helper.
split_result = train_test_split(model_df, 0.1)
train_df, test_df = split_result

### Regression

Get a best performing regression model using grid search and cross-validation (this can take some time)

In [23]:
import re

import sklearn
from sklearn.ensemble import RandomForestRegressor

# scikit-learn 1.0 renamed the 'mse' split criterion to 'squared_error' and
# release 1.2 removed the old spelling, so a hard-coded 'mse' makes the grid
# search fail on current installs. Pick the spelling the installed version accepts.
_sk_major, _sk_minor = (
    int(part) for part in re.match(r'(\d+)\.(\d+)', sklearn.__version__).groups()
)
mse_criterion = 'squared_error' if (_sk_major, _sk_minor) >= (1, 0) else 'mse'

# NOTE(review): the 5 is presumably the number of cross-validation folds — confirm in ModelCVWrapper.
reg_cv = ModelCVWrapper(5)

# Hyper-parameter grid for the random forest, keyed by an estimator instance.
rf_grid = {'n_estimators': [200, 500], 'max_depth': [5, 10], 'criterion': [mse_criterion]}
grid_dict = {RandomForestRegressor(): rf_grid}

# [0.8] is the list of feature-importance thresholds to try (the call logs one
# "Running Treshold ..." line per entry); fi supplies the ranked features.
reg_cv.get_best_model(train_df, test_df, 'avg_consumption', fi, [0.8], grid_dict)

Running Treshold 0.8


Look at the cross-validation and test-set scores of the selected model

In [24]:
# Display the wrapper's cv_summary_df table; per the recorded output it lists the
# algorithm, feature-subset index, CV score, chosen parameters and test score.
reg_cv.cv_summary_df

Unnamed: 0,algo_string,subset_index,cv_score,params,test_score
0,RandomForestRegressor,29,-0.008713,"{'criterion': 'mse', 'max_depth': 10, 'n_estim...",0.006506


Do an evaluation summary of the best performing model

In [25]:
# Restrict the held-out frame to the feature subset chosen by the wrapper,
# then score it with the selected model.
holdout_features = test_df[reg_cv.best_subset]
preds = reg_cv.best_model.predict(holdout_features)

# Compare predictions with the true target via the project's evaluation helper
# (the recorded output shows MAE, MSE, R2 and an average error in kwh).
holdout_target = test_df['avg_consumption']
evaluate_regression(preds, holdout_target)

MAE 0.05241449059195152
MSE 0.006331462308712751
R2 0.4572775800306279
Average Error:  0.052414490591951496 kwh


Save the regression model for other uses

In [26]:
# The whole ModelCVWrapper object (reg_cv) is pickled, not only its best_model.
regression_pickle_name = 'random_forest_regression.pickle'
modelfile = modeldir + regression_pickle_name

with open(modelfile, mode='wb') as sink:
    pickle.dump(reg_cv, sink)

### Classification

Define this as a classification problem and get a best performing classification model using grid search and cross-validation (this can take some time)

First create a binary label that flags the top 10% of consumers: rows whose `avg_consumption` exceeds the 90th percentile of the training set.

In [28]:
# Column names: continuous regression target and the derived binary label.
Y_r = 'avg_consumption'
Y_c = 'Y_cat'

# The cut-off is taken from the training rows only and then applied to both
# frames, so the test set does not influence the label definition.
label_cut = np.percentile(train_df[Y_r], 90)

# 1 when the target is strictly above the cut-off, 0 otherwise; the new column
# is written onto each existing frame.
for frame in (train_df, test_df):
    frame[Y_c] = frame[Y_r].gt(label_cut) * 1

Train the classifier using the wrapper class we created

In [29]:
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameter grid for the random-forest classifier, keyed by an estimator instance.
rfc_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10],
    'criterion': ['entropy'],
}
grid_dict = {RandomForestClassifier(): rfc_grid}

# Same wrapper class as the regression run, now targeting the binary label Y_c.
# Two feature-importance thresholds are tried (one "Running Treshold ..." log line each).
class_cv = ModelCVWrapper(5)
class_cv.get_best_model(
    train_df,
    test_df,
    Y_c,
    fi,
    [0.8, 0.9],
    grid_dict,
)

Running Treshold 0.8
Running Treshold 0.9


Evaluate the classifier

In [30]:
# Score the held-out rows on the feature subset chosen by the wrapper and keep
# column 1 of predict_proba's output (the probability of label 1).
holdout_features = test_df[class_cv.best_subset]
class_probabilities = class_cv.best_model.predict_proba(holdout_features)
preds_prob = class_probabilities[:, 1]

# Project evaluation helper; the recorded output lists AUC, accuracy, precision and recall.
evaluate_classification(preds_prob, test_df[Y_c])

AUC 0.9034468438538206
Accuracy 0.06451612903225812
Precision 0.625
Recall:  0.35714285714285715


In [None]:
# Hard class predictions (not probabilities) for the held-out rows.
holdout_features = test_df[class_cv.best_subset]
y_pred = class_cv.best_model.predict(holdout_features)

# Plot normalized confusion matrix
plot_confusion_matrix(
    test_df[Y_c],
    y_pred,
    classes=['low', 'hi'],
    normalize=True,
    title='Normalized confusion matrix',
)

In [31]:
# The whole ModelCVWrapper object (class_cv) is pickled, not only its best_model.
classifier_pickle_name = 'random_forest_classifier.pickle'
modelfile = modeldir + classifier_pickle_name

with open(modelfile, mode='wb') as sink:
    pickle.dump(class_cv, sink)