In [1]:
# Importing Standard Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
import re
from random import randint
%matplotlib inline

In [2]:
# Importing train, test and sample_sub
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
sample_submission = pd.read_csv('data/sample_submission.csv')

In [3]:
alldata = pd.concat([train,test])

In [4]:
alldata.head()

Unnamed: 0,Energy,Observation,Press_mm_hg,RH_1,RH_2,RH_3,RH_4,RH_5,RH_6,RH_7,...,T4,T5,T6,T7,T8,T9,T_out,Tdewpoint,Visibility,Windspeed
0,70.0,1111,760.05,37.2,38.0,37.29,34.942857,42.826667,9.633333,35.754,...,22.832857,20.5,12.533333,22.06,22.7,20.2,12.8,5.65,28.0,3.5
1,210.0,1112,764.166667,41.045,39.133333,39.526667,34.126667,44.663333,11.926667,25.133333,...,21.7,18.633333,10.19,20.79,22.926667,19.73,8.47,-1.92,26.5,8.0
2,50.0,1113,757.6,38.0,35.7,36.59,35.53,46.2,21.533333,33.663333,...,22.0,19.0,10.36,19.6,21.89,19.79,10.6,2.4,27.0,2.0
3,50.0,1114,760.6,38.53,38.0,36.2,35.09,43.625714,9.59,32.79,...,23.1,22.042857,10.39,23.39,24.89,22.6,11.6,4.67,40.0,1.0
4,250.0,1115,756.983333,42.56,34.356,40.333333,40.9,49.09,1.0,38.20875,...,24.5,21.5,20.29,23.365,23.39,22.6,17.7,10.4,21.5,1.0


In [5]:
rename_dict = {'Observation': 'ID',
 'Pressure': 'Outside pressure',
 'RH_1': 'Humid in kitchen area',
 'RH_2': 'Humid in living',
 'RH_3': 'Humid in laundry',
 'RH_4': 'Humid in office',
 'RH_5': 'Humidin bath',
 'RH_6': 'Humid outside the building (north side)',
 'RH_7': 'Humid in ironing',
 'RH_8': 'Humid in teenager room',
 'RH_9': 'Humid in parents',
 'RH_out': 'Humid outside',
 'T1': 'Temp in kitchen',
 'T2': 'Temp in living room',
 'T3': 'Temp in laundry',
 'T4': 'Temp in office',
 'T5': 'Temp in bath',
 'T6': 'Temp outside the building (north side)',
 'T7': 'Temp in ironing',
 'T8': 'Temp in teenager',
 'T9': 'Temp in parents',
 'T_out': 'Temp outside'}
alldata = alldata.rename(columns=rename_dict)

In [6]:
 alldata.drop('ID', axis=1, inplace=True)

In [7]:
# Inspect alldata set
print "Length of alldata rows:", len(alldata)
inspect_alldata = pd.DataFrame({'Dtype': alldata.dtypes, 'Unique values': alldata.nunique() ,
             'Number of Missing values': alldata.isnull().sum() ,
              'Percentage Missing': (alldata.isnull().sum() / len(alldata)) * 100
             }).sort_values(by='Number of Missing values',ascending = False)
inspect_alldata

Length of alldata rows: 19725


Unnamed: 0,Dtype,Number of Missing values,Percentage Missing,Unique values
Energy,float64,3945,20.0,82
Temp in living room,float64,0,0.0,1649
Visibility,float64,0,0.0,412
Tdewpoint,float64,0,0.0,1118
Temp outside,float64,0,0.0,1055
Temp in parents,float64,0,0.0,924
Temp in teenager,float64,0,0.0,2228
Temp in ironing,float64,0,0.0,1954
Temp outside the building (north side),float64,0,0.0,4445
Temp in bath,float64,0,0.0,2263


## Feature Engineering

In [8]:
# alldata['Saturation'] = (alldata['RH_out'] == 100)

In [9]:
Humidity_cols = [c for c in alldata.columns if c.startswith("Humid")] 
Temp_cols = [c for c in alldata.columns if c.startswith("Temp")]

In [10]:
Humidity_cols

['Humid in kitchen area',
 'Humid in living',
 'Humid in laundry',
 'Humid in office',
 'Humidin bath',
 'Humid outside the building (north side)',
 'Humid in ironing',
 'Humid in teenager room',
 'Humid in parents',
 'Humid outside']

In [11]:
alldata['Mean Temperature'] = alldata[['Temp in kitchen',
 'Temp in living room',
 'Temp in laundry',
 'Temp in office',
 'Temp in bath',
 'Temp in ironing',
 'Temp in teenager',
 'Temp in parents']].sum(axis=1)/8.0

In [12]:
alldata['Mean Humidity'] = alldata[['Humid in kitchen area',
 'Humid in living',
 'Humid in laundry',
 'Humid in office',
 'Humidin bath',
 'Humid in ironing',
 'Humid in teenager room',
 'Humid in parents']].sum(axis=1)/8.0

In [13]:
alldata['Temp Total Out'] = (alldata['Temp outside the building (north side)'] + alldata['Temp outside'])
alldata['Humid Total Out'] = alldata['Humid outside the building (north side)'] + alldata['Humid outside']

In [14]:
# split the merged data file into train and test respectively
train_feats = alldata[~pd.isnull(alldata.Energy)]
test_feats = alldata[pd.isnull(alldata.Energy)]

In [15]:
X = train_feats.drop('Energy', axis=1)
y = train_feats.Energy

In [16]:
#X.drop(['Humid in laundry'], axis=1, inplace=True)

In [17]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [18]:
X.shape

(15780, 28)

In [19]:
n = 5000
X_train = X[:n]
y_train = y[:n]
X_test = X[n:]
y_test = y[n:]

In [20]:
model = ExtraTreesRegressor(n_estimators=700)
model.fit(X_train,y_train)
preds = model.predict(X_test)
print mean_squared_error(y_test, preds)**.5

77.4806750934


In [21]:
# Feature Importance
Imp_df = pd.DataFrame({'Features': X.columns, 'Importance': model.feature_importances_}).sort_values(
                                                                            by='Importance',
                                                                            ascending= False)
Imp_df

Unnamed: 0,Features,Importance
1,Humid in kitchen area,0.057481
3,Humid in laundry,0.048198
13,Temp in laundry,0.044592
8,Humid in teenager room,0.043621
2,Humid in living,0.041658
23,Windspeed,0.039092
10,Humid outside,0.038803
0,Press_mm_hg,0.038728
12,Temp in living room,0.038525
9,Humid in parents,0.038437


# Submission Zone

In [36]:
model.fit(X,y)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features='auto', max_leaf_nodes=None,
          min_impurity_split=1e-07, min_samples_leaf=1,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=700, n_jobs=1, oob_score=False, random_state=None,
          verbose=0, warm_start=False)

In [37]:
FINALPREDS = model.predict(test_feats[X.columns])

In [38]:
FINALPREDS

array([  49.91428571,   91.28571429,   54.65714286, ...,   88.57142857,
         68.35714286,  143.71428571])

In [39]:
sample_submission['Energy'] = FINALPREDS

In [40]:
sample_submission.to_csv('HJ.csv', index = None)

In [41]:
sample_submission.head()

Unnamed: 0,Observation,Energy
0,50001,49.914286
1,50002,91.285714
2,50003,54.657143
3,50004,74.842857
4,50005,46.157143
