In [185]:
import pandas as pd
import numpy as np
import requests
import json
from pandas.io.json import json_normalize
import matplotlib.pyplot as plt
%matplotlib inline
from patsy import dmatrices
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
from sklearn.tree import export_graphviz
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
pd.set_option('display.max_columns', None)  

In [186]:
#read in databases
chart1 = requests.get("http://localhost:5000/chartData/1")
chart2 = requests.get("http://localhost:5000/pastWeather")

In [187]:
#load as json
mainJson = json.loads(chart1.text)
weatherJson=json.loads(chart2.text)


In [188]:
#flatten the json
available_bikes=[]
mainDayofyear=[]
weekday=[]
hour=[]
for i in mainJson['available_bikes']:
    available_bikes.append(mainJson['available_bikes'][i])

for i in mainJson['weekday']:
    weekday.append(mainJson['weekday'][i])    

for i in mainJson['dayofyear']:
    mainDayofyear.append(mainJson['dayofyear'][i])
for i in mainJson['hour']:
    hour.append(mainJson['hour'][i])

In [189]:
#append to dict, for making dataframe
mainDict={}
mainDict['availableBikes']=available_bikes
mainDict['Dayofyear']= mainDayofyear
mainDict['weekday']=weekday
mainDict['hour']=hour

In [190]:
#flatten the json
dayofyear=[]
description=[]
weekday=[]
hour=[]
for i in weatherJson['dayofyear']:
    dayofyear.append(weatherJson['dayofyear'][i])

for i in weatherJson['description']:
    description.append(weatherJson['description'][i])

for i in weatherJson['weekday']:
    weekday.append(weatherJson['weekday'][i])
    
for i in weatherJson['hour']:
    hour.append(weatherJson['hour'][i])
    

In [191]:
#append to dict, for making dataframe
weatherDict={}
weatherDict['Dayofyear']= dayofyear
weatherDict['description'] = description
weatherDict['hour']=hour

    

In [192]:
#change types for join
mainDf = pd.DataFrame.from_dict(mainDict)
mainDf['Dayofyear']=mainDf['Dayofyear'].astype(float)
mainDf.drop_duplicates()
mainDf.reset_index(drop=True)
mainDf.dtypes

Dayofyear         float64
availableBikes      int64
hour                int64
weekday            object
dtype: object

In [193]:
# drop duplicates that share an hour and day 
weatherDf=pd.DataFrame.from_dict(weatherDict)
weatherDf.drop_duplicates()
weatherDf= weatherDf.drop_duplicates(['hour','Dayofyear'])
weatherDf.head()

Unnamed: 0,Dayofyear,description,hour
0,72.0,broken clouds,14.0
3,74.0,shower rain,17.0
5,74.0,light intensity shower rain,19.0
11,74.0,broken clouds,20.0
13,73.0,light rain,16.0


In [194]:
#change types for merging the two tables
weatherDf['Dayofyear']=weatherDf['Dayofyear'].astype(str)
weatherDf['hour']=weatherDf['hour'].astype(str)
weatherDf['period'] = weatherDf[['Dayofyear', 'hour']].apply(lambda x: ''.join(x), axis=1)
weatherDf.head()

Unnamed: 0,Dayofyear,description,hour,period
0,72.0,broken clouds,14.0,72.014.0
3,74.0,shower rain,17.0,74.017.0
5,74.0,light intensity shower rain,19.0,74.019.0
11,74.0,broken clouds,20.0,74.020.0
13,73.0,light rain,16.0,73.016.0


In [195]:
#change data types to allow merging tables
mainDf['hour']=mainDf['hour'].astype(float)
mainDf.head()
mainDf['Dayofyear']=mainDf['Dayofyear'].astype(str)
mainDf['hour']=mainDf['hour'].astype(str)
mainDf['period'] = mainDf[['Dayofyear', 'hour']].apply(lambda x: ''.join(x), axis=1)
mainDf.head()

Unnamed: 0,Dayofyear,availableBikes,hour,weekday,period
0,71.0,30,14.0,Monday,71.014.0
1,71.0,30,14.0,Monday,71.014.0
2,74.0,18,17.0,Thursday,74.017.0
3,74.0,20,18.0,Thursday,74.018.0
4,74.0,21,18.0,Thursday,74.018.0


In [196]:
# merge tables
newdf= pd.merge(mainDf, weatherDf, how='right', left_on='period', right_on = 'period')

In [197]:
newdf.shape

(9082, 8)

In [198]:
# drop duplicate columns and period column
newdf = newdf.drop('hour_y', 1)
newdf = newdf.drop('period', 1)
newdf = newdf.drop('Dayofyear_y', 1)
newdf.head()

Unnamed: 0,Dayofyear_x,availableBikes,hour_x,weekday,description
0,74.0,18.0,17.0,Thursday,shower rain
1,74.0,17.0,17.0,Thursday,shower rain
2,74.0,19.0,17.0,Thursday,shower rain
3,74.0,17.0,17.0,Thursday,shower rain
4,74.0,19.0,17.0,Thursday,shower rain


In [199]:
# tidy up data types and names
newdf['dayOfYear']=newdf['Dayofyear_x'].astype(float)
newdf['availableBikes']=newdf['availableBikes'].astype(float)
newdf['hour']=newdf['hour_x'].astype(float)
newdf = newdf.drop('hour_x', 1)
newdf = newdf.drop('Dayofyear_x', 1)
newdf.head()

Unnamed: 0,availableBikes,weekday,description,dayOfYear,hour
0,18.0,Thursday,shower rain,74.0,17.0
1,17.0,Thursday,shower rain,74.0,17.0
2,19.0,Thursday,shower rain,74.0,17.0
3,17.0,Thursday,shower rain,74.0,17.0
4,19.0,Thursday,shower rain,74.0,17.0


In [200]:
# check correlation between cont features
newdf[['dayOfYear', 'availableBikes', 'hour']].corr()

Unnamed: 0,dayOfYear,availableBikes,hour
dayOfYear,1.0,0.125873,-0.047626
availableBikes,0.125873,1.0,0.096743
hour,-0.047626,0.096743,1.0


In [201]:
# remove all NaN columsn from target feature
newdf = newdf[np.isfinite(newdf['availableBikes'])]

In [202]:
# set categorical features
newdf['description'].astype('category')
newdf.dtypes
newdf = newdf[newdf.description !='light shower snow']
newdf = newdf[newdf.description !='description_light snow']


# start training model

In [203]:
# Prepare the data, turn categorical feature EnergyRating into dummies.
# decided not to use dayofyear as a continuous feature
df_cont_feat = training[['hour']]
df_dummies_weekday = pd.get_dummies(training[['weekday']])
df_dummies_weather=pd.get_dummies(training[['description']])

In [204]:
df_dummies_weather.shape

(6203, 14)

In [205]:
# Add dummies to the other continuous features
X = pd.concat([df_cont_feat, df_dummies_weekday[['weekday_Monday', 'weekday_Saturday', 'weekday_Sunday', 'weekday_Thursday', 'weekday_Tuesday','weekday_Wednesday']], df_dummies_weather[['description_broken clouds', 'description_clear sky', 'description_few clouds', 'description_fog', 'description_light intensity drizzle','description_light intensity drizzle rain', 'description_light intensity shower rain','description_light rain',  'description_mist', 'description_moderate rain','description_overcast clouds' , 'description_scattered clouds', 'description_shower rain', 'description_shower sleet']]], axis =1)
y = training[['availableBikes']]
print(y)

      availableBikes
2853             0.0
2854             0.0
2855             0.0
2856             0.0
2857             0.0
2858             0.0
2859             0.0
2860             0.0
2861             0.0
2862             0.0
2863             0.0
2864             0.0
2865             0.0
2866             0.0
2867             0.0
2868             0.0
2869             0.0
2870             0.0
2871             0.0
2872             0.0
2873             0.0
2874             0.0
2875             0.0
2876             0.0
2877             0.0
2878             0.0
2879             0.0
2880             0.0
2881             1.0
2882             1.0
2883             0.0
2884             1.0
2885             1.0
2886             1.0
2887             1.0
2888             1.0
2889             1.0
2890             1.0
2891             1.0
2892             2.0
2893             2.0
2894             2.0
2895             3.0
2896             3.0
2897             4.0
2898             4.0
2899         

In [206]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

print("Training data:\n", pd.concat([X_train, y_train], axis=1))
print("\nTest data:\n", pd.concat([X_test, y_test], axis=1))


Training data:
       hour  weekday_Monday  weekday_Saturday  weekday_Sunday  \
5389  10.0               0                 0               1   
5273   0.0               0                 0               1   
4315  11.0               0                 0               0   
3369   3.0               0                 0               1   
7365  12.0               0                 0               1   
8855  20.0               0                 0               0   
4386  16.0               0                 0               0   
8265  18.0               0                 0               0   
5624   6.0               1                 0               0   
7072  12.0               0                 1               0   
7288   6.0               0                 0               1   
7257   3.0               0                 0               1   
4802   4.0               0                 0               0   
4032  12.0               0                 0               0   
8069   2.0              

       hour  weekday_Monday  weekday_Saturday  weekday_Sunday  \
4883  11.0               0                 0               0   
7325   9.0               0                 0               1   
3458  11.0               0                 0               1   
6222   8.0               0                 0               0   
7149  18.0               0                 1               0   
3483  13.0               0                 0               1   
7989  16.0               0                 0               0   
7038   9.0               0                 1               0   
4275   8.0               0                 0               0   
5170  11.0               0                 1               0   
5444  15.0               0                 0               1   
6559  12.0               0                 0               0   
7316   8.0               0                 0               1   
6161   3.0               0                 0               0   
8933   2.0               0             

In [207]:
# Train RF with 100 trees
rfc = RandomForestClassifier(n_estimators=1000, max_features='auto', oob_score=True, random_state=1)
# Fit model on full dataset
rfc.fit(X_train, y_train)


  after removing the cwd from sys.path.


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=True, random_state=1, verbose=0, warm_start=False)

In [208]:
# Don't know why it does this (a, v, a, i, l, a, b, l, e, B, i, k, e, s) thing. seems to be making good predictions though
rfc_predictions_test = rfc.predict(X_test)
df_true_vs_rfc_predicted_test = pd.DataFrame({'ActualClass': y_test, 'PredictedClass': rfc_predictions_test})
df_true_vs_rfc_predicted_test

Unnamed: 0,ActualClass,PredictedClass
0,"(a, v, a, i, l, a, b, l, e, B, i, k, e, s)",27.0
1,"(a, v, a, i, l, a, b, l, e, B, i, k, e, s)",31.0
2,"(a, v, a, i, l, a, b, l, e, B, i, k, e, s)",30.0
3,"(a, v, a, i, l, a, b, l, e, B, i, k, e, s)",8.0
4,"(a, v, a, i, l, a, b, l, e, B, i, k, e, s)",27.0
5,"(a, v, a, i, l, a, b, l, e, B, i, k, e, s)",20.0
6,"(a, v, a, i, l, a, b, l, e, B, i, k, e, s)",2.0
7,"(a, v, a, i, l, a, b, l, e, B, i, k, e, s)",31.0
8,"(a, v, a, i, l, a, b, l, e, B, i, k, e, s)",28.0
9,"(a, v, a, i, l, a, b, l, e, B, i, k, e, s)",31.0


In [177]:
print("Accuracy: ", metrics.accuracy_score(y_test, rfc_predictions_test))
print("Confusion matrix: \n", metrics.confusion_matrix(y_test, rfc_predictions_test))
print("Classification report:\n ", metrics.classification_report(y_test, rfc_predictions_test))

Accuracy:  0.411606663084
Confusion matrix: 
 [[198   9  15 ...,   0   0   0]
 [ 25  16   3 ...,   0   0   0]
 [ 18   4  16 ...,   0   0   0]
 ..., 
 [  0   0   0 ...,  10  10  15]
 [  0   0   0 ...,   9  22  47]
 [  0   0   0 ...,   8  19 118]]
Classification report:
               precision    recall  f1-score   support

        0.0       0.76      0.77      0.76       256
        1.0       0.42      0.26      0.32        61
        2.0       0.34      0.37      0.36        43
        3.0       0.60      0.67      0.63        39
        4.0       0.40      0.41      0.40        46
        5.0       0.06      0.04      0.05        26
        6.0       0.38      0.38      0.38        50
        7.0       0.31      0.37      0.34        38
        8.0       0.33      0.28      0.31        32
        9.0       0.28      0.32      0.30        53
       10.0       0.43      0.33      0.38        54
       11.0       0.08      0.12      0.10        25
       12.0       0.26      0.31      0

In [178]:
pd.DataFrame({'feature': X.columns, 'importance':rfc.feature_importances_})

Unnamed: 0,feature,importance
0,hour,0.598582
1,weekday_Monday,0.029484
2,weekday_Saturday,0.031735
3,weekday_Sunday,0.030957
4,weekday_Thursday,0.036383
5,weekday_Tuesday,0.027934
6,weekday_Wednesday,0.028525
7,description_broken clouds,0.033291
8,description_clear sky,0.012177
9,description_few clouds,0.01724


In [184]:
# Evaluate the model using 3-fold cross-validation.
# Uses 2 parts of data for training and the last part for testing. 
# This process is repeated 3 times. More details about cross-validation here: http://www-bcf.usc.edu/~gareth/ISL/
scores = cross_val_score(RandomForestClassifier(n_estimators=100, max_features='auto', oob_score=True, random_state=1), X, y, cv=10)
print(scores)
print(scores.mean())

IndexError: too many indices for array

In [179]:
rfc.oob_score_

0.40879778903731001