In [10]:
# import both main files for house 1 and combine them into 1 dataframe

import numpy as np
import pandas as pd

# Read the two mains files for house 1. Every line is split on whitespace; the
# first field becomes the index and the remaining field(s) become data columns.
# The files are read inside `with` blocks so the handles are closed as soon as
# the rows have been collected (the original left both files open).
with open('low_freq/house_1/channel_1.dat') as main1:
    channel1 = [line.split() for line in main1]
with open('low_freq/house_1/channel_2.dat') as main2:
    channel2 = [line.split() for line in main2]

df1 = pd.DataFrame(channel1).set_index(0, drop=True)
df2 = pd.DataFrame(channel2).set_index(0, drop=True)

# Inner join on the index: only index values present in both files are kept.
df_main = df1.merge(df2, how='inner', left_index=True, right_index=True)
# Assumes each file has exactly two fields per line, so the joined frame has
# exactly two data columns -- TODO confirm against the data files.
df_main.columns = ['main1', 'main2']

# import the rest of the channels for house 1 (ie refrigerator, oven, etc) and
# combine into 1 dataframe to later merge with the main data frame
#
# Channel 3 is the base frame; channels 4..channel_no are left-joined onto it by
# index, so only index values present in channel 3 are kept.

channel_no = 20 #3-20
df_channel = None
for i in range(3, channel_no+1):
    # `with` closes each file once its rows are read (the original leaked
    # every handle opened in this loop).
    with open('low_freq/house_1/channel_' + str(i) + '.dat') as file:
        channel = [line.split() for line in file]
    temp = pd.DataFrame(channel).set_index(0, drop=True)
    # Every per-channel frame carries the same data column label (1). Merging
    # them repeatedly makes pandas generate clashing `_x`/`_y` suffixes, which
    # recent pandas versions reject with a MergeError. Prefixing the label with
    # the channel number keeps every column unique during the merges; the final
    # names are assigned after the loop.
    temp = temp.add_prefix('channel_' + str(i) + '_')
    if df_channel is None:
        df_channel = temp
    else:
        df_channel = df_channel.merge(temp, how='left', right_index=True, left_index=True)

# One name per channel, in channel order 3..20.
# NOTE(review): the channel-to-appliance mapping is not visible in this
# notebook -- confirm it against the dataset's label file.
df_channel.columns = ['oven1','oven2','refrigerator','dishwasher','kitchen_outlets1','kitchen_outlets2',
                      'lighting1','washer_dryer1','microwave','bathroom_gfi','electric_heat','stove',
                      'kitchen_outlets3','kitchen_outlets4','lighting2','lighting3','washer_dryer2','washer_dryer3']

# merge both main and channel dataframes and re-order columns

# Target column order: the two mains first, then the appliance channels.
ordered = [
    'main1', 'main2',
    'oven1', 'oven2', 'refrigerator', 'dishwasher',
    'kitchen_outlets1', 'kitchen_outlets2',
    'lighting1', 'washer_dryer1', 'microwave', 'bathroom_gfi',
    'electric_heat', 'stove',
    'kitchen_outlets3', 'kitchen_outlets4',
    'lighting2', 'lighting3', 'washer_dryer2', 'washer_dryer3',
]

# Left join on the index: every row of df_channel is kept, and mains values are
# missing wherever df_main has no matching index value.
df_merged = df_channel.merge(df_main, how='left', right_index=True, left_index=True)
df_merged = df_merged.loc[:, ordered]

# Parse every column into a numeric dtype (the values were read from text),
# then keep only the rows that have no missing value in any column.
df_merged = df_merged.apply(pd.to_numeric)

df_dropNAN = df_merged.dropna()
print('done')

done


In [61]:
#convert all channel values to binary
channels = [
    'oven1', 'oven2', 'refrigerator', 'dishwasher',
    'kitchen_outlets1', 'kitchen_outlets2',
    'lighting1', 'washer_dryer1', 'microwave', 'bathroom_gfi',
    'electric_heat', 'stove',
    'kitchen_outlets3', 'kitchen_outlets4',
    'lighting2', 'lighting3', 'washer_dryer2', 'washer_dryer3',
]

# New frame built from df_dropNAN: each channel column becomes 1 where the
# reading is strictly greater than zero and 0 otherwise. The mains columns are
# carried over unchanged, and df_dropNAN itself is not modified.
df_binary = df_dropNAN.assign(
    **{name: (df_dropNAN[name] > 0).astype(int) for name in channels}
)

#have a non-binary copy of df_dropNAN
df = df_dropNAN.copy()

In [65]:
# Random forest classifier with binary dependent variables
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Features: the two mains readings. Targets: every other column of df_binary
# (the binarised channel columns), so this is a multi-output fit.
X = df_binary[['main1','main2']]
Y = df_binary.drop(['main1','main2'], axis=1)

# 75% / 25% train/test split; the fixed random_state makes it repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=100
)

rf = RandomForestClassifier(max_depth=10, random_state=100).fit(x_train, y_train)

# NOTE(review): with several binary target columns, scikit-learn documents
# score() as subset accuracy (a row counts only if every label is predicted
# correctly) -- confirm this is the metric intended here.
train_acc = rf.score(x_train, y_train)
test_acc = rf.score(x_test, y_test)
print('Training accuracy: ',train_acc)
print('Test accuracy: ',test_acc)

Training accuracy:  0.88234812054
Test accuracy:  0.881253257545


In [58]:
# Random forest regressor with non-binary dependent variables (original values)
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Features: the two mains readings. Targets: every other column of df (the
# original, non-binarised channel values), so this is a multi-output regression.
X = df[['main1','main2']]
Y = df.drop(['main1','main2'], axis=1)

# 75% / 25% train/test split; the fixed random_state makes it repeatable.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=100)

rf = RandomForestRegressor(max_depth=10, random_state=100)
rf.fit(x_train, y_train)

# For a regressor, score() returns the coefficient of determination (R^2), not a
# classification accuracy, so the printed labels say R^2. The variable names
# train_acc / test_acc are kept so any other cell that reads them still works.
train_acc = rf.score(x_train,y_train)
test_acc = rf.score(x_test,y_test)
print('Training R^2: ',train_acc)
print('Test R^2: ',test_acc)

Training accuracy:  0.957214690092
Test accuracy:  0.93802660889


In [62]:
# Decision tree classifier with binary dependent variables
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Features: the two mains readings. Targets: every other column of df_binary
# (the binarised channel columns), so this is a multi-output fit.
X = df_binary[['main1','main2']]
Y = df_binary.drop(['main1','main2'], axis=1)

# 75% / 25% train/test split; the fixed random_state makes it repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=100
)

# The model is a single decision tree; the variable keeps the name `rf` because
# other cells may refer to it.
rf = DecisionTreeClassifier(max_depth=10, random_state=100).fit(x_train, y_train)

# NOTE(review): with several binary target columns, scikit-learn documents
# score() as subset accuracy (a row counts only if every label is predicted
# correctly) -- confirm this is the metric intended here.
train_acc = rf.score(x_train, y_train)
test_acc = rf.score(x_test, y_test)
print('Training accuracy: ',train_acc)
print('Test accuracy: ',test_acc)

Training accuracy:  0.890264570037
Test accuracy:  0.887960112895


In [63]:
# Decision tree regressor with non-binary dependent variables (original values)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# Features: the two mains readings. Targets: every other column of df (the
# original, non-binarised channel values), so this is a multi-output regression.
X = df[['main1','main2']]
Y = df.drop(['main1','main2'], axis=1)

# 75% / 25% train/test split; the fixed random_state makes it repeatable.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=100)

# The model is a single decision tree; the variable keeps the name `rf` because
# other cells may refer to it.
rf = DecisionTreeRegressor(max_depth=10, random_state=100)
rf.fit(x_train, y_train)

# For a regressor, score() returns the coefficient of determination (R^2), not a
# classification accuracy, so the printed labels say R^2. The variable names
# train_acc / test_acc are kept so any other cell that reads them still works.
train_acc = rf.score(x_train,y_train)
test_acc = rf.score(x_test,y_test)
print('Training R^2: ',train_acc)
print('Test R^2: ',test_acc)

Training accuracy:  0.954449700245
Test accuracy:  0.934059903421
