## Test a random forest model on full dataset

In [1]:
## Python packages - you may have to pip install sqlalchemy, sqlalchemy_utils, and psycopg2.

import pandas as pd
# Make the graphs a bit prettier, and bigger
# NOTE(review): the 'display.mpl_style' option is not available in newer pandas
# releases — confirm the pandas version this notebook is meant to run on.
pd.set_option('display.mpl_style', 'default')

# This is necessary to show lots of columns in pandas 0.12. 
# Not necessary in pandas 0.13.
# Display limits: 5000 characters wide, at most 60 columns and 20 rows per frame.
pd.set_option('display.width', 5000) 
pd.set_option('display.max_columns', 60)
pd.set_option('display.max_rows', 20)

import numpy as np
import math
# The usual preamble
%matplotlib inline00
%pylab inline
import matplotlib.pyplot as plt
plt.rcParams['axes.color_cycle'] = ['r', 'g', 'b', 'c']
plt.rcParams['lines.color'] = 'r'
plt.rcParams['figure.figsize'] = (15, 5)

import process
from modeling import categorize

KeyError: u'inline00'

Load in dataset

In [None]:
# Display the officer demographics data.
# NOTE(review): `officer_demographics` is not assigned in any cell shown above, so
# this cell depends on hidden kernel state and fails on a fresh kernel.
officer_demographics

In [None]:
# Derive per-row totals and variants of the officer demographics data.
# NOTE(review): `officer_demographics` and `correct` are not defined in any cell
# shown in this notebook — both must exist in the kernel before this cell runs.

# Row totals, and each row expressed as a share of its own total.
total_officers = officer_demographics.sum(axis=1)
officer_demographics_norm = officer_demographics.div(total_officers, axis=0)

# Every count reduced by one, then passed column by column through `correct`.
officer_demographics_minus_one = officer_demographics - 1
for column_name in officer_demographics_minus_one.columns.tolist():
    corrected_column = officer_demographics_minus_one[column_name].map(correct)
    officer_demographics_minus_one[column_name] = corrected_column

# Row totals reduced by one; this is the value the cell displays.
total_officers_minus_one = total_officers - 1
total_officers_minus_one

In [None]:
# Unpack the two data sets returned by process.get_split_add_data() (named test
# and validation data here) and display the validation set.
test_data,val_data = process.get_split_add_data()
val_data

In [None]:
# Write val_data to the 'val_data' SQL table through process.engine, replacing
# the table if it already exists.
val_data.to_sql('val_data',process.engine,if_exists='replace')



In [None]:
# Display val_data again (the same variable shown when it was loaded).
val_data

# Build Model

In [None]:
# Build a Processor from the validation data and use that same processor to get
# the scaled X / y pairs for both the validation and the test data.
# NOTE(review): presumably categorize=False keeps rpsi as a continuous target —
# confirm against process.Processor.
processor = process.Processor(val_data,categorize=False)
X_val,y_val = processor.get_scaled_Xy(val_data)
X_test,y_test = processor.get_scaled_Xy(test_data)

In [None]:
# Number of entries in the validation set.
len(val_data)

In [None]:
# Reassemble the scaled features and target into a frame that shares val_data's
# index, with 'rpsi' as the last column, then store it as the 'val_data_scaled'
# SQL table (replacing any existing table).
scaled_columns = val_data.drop('rpsi',1).columns.tolist()+['rpsi']
scaled_values = np.c_[X_val,y_val]
val_data_scaled = pd.DataFrame(scaled_values,index=val_data.index,columns=scaled_columns)
val_data_scaled.to_sql('val_data_scaled',process.engine,if_exists='replace')

In [None]:
# Histogram (40 bins) of the rpsi values taken from val_data.
plt.hist(val_data['rpsi'].tolist(),bins=40)

In [None]:
# Display the rpsi column of the scaled frame.
val_data_scaled['rpsi']

In [None]:
for cat in sort(val_data_scaled['rpsi'].map(categorize).unique().tolist()):
    
    print cat, shape(val_data_scaled[val_data_scaled['rpsi'].map(categorize)==cat])[0]

In [None]:
# Size of X_val's second axis (used as the feature count in the next cell).
np.shape(X_val)[1]

In [None]:
from sklearn import linear_model,cross_validation,metrics,grid_search
from sklearn.ensemble import RandomForestRegressor

# Candidate forest sizes: 10, 20, ... in steps of 10, stopping below n_features + 10.
n_features = np.shape(X_val)[1]
values = range(10,n_features+10,10)
# One search iteration per candidate value.
n_iter = len(values)
param_grid = {'n_estimators': values}

# Every tree may consider all features at each split (max_features=n_features).
clf = RandomForestRegressor(n_jobs=4,max_features=n_features)

# 5-fold cross-validated randomized search over the forest size.
search_options = dict(n_iter=n_iter, cv=5, verbose=1, random_state=2)
clfmodel = grid_search.RandomizedSearchCV(clf, param_grid, **search_options)
clfmodel.fit(X_val,y_val)

In [None]:
print clfmodel.best_params_
print clfmodel.best_score_
est = clfmodel.best_estimator_

features = []
importances = []
for feature,importance in zip(val_data.drop('rpsi',1).columns.tolist(),clfmodel.best_estimator_.feature_importances_):
    #print feature,importance
    features.append(feature)
    importances.append(importance)
indices = [i[0] for i in sorted(enumerate(importances), key=lambda x:x[1])]
best_feature_names = []
for index in reversed(indices):
    print features[index],importances[index]
    if importances[index] < 0.005:
        continue
    best_feature_names.append(features[index])
print len(best_feature_names)

In [None]:
import pickle
# Save the selected feature names. The `with` block guarantees the file is
# flushed and closed; the previous bare open() never closed its handle.
with open('best_features_names_reg.p','wb') as feature_file:
    pickle.dump(best_feature_names,feature_file)


In [None]:
import pickle
best_feature_names = pickle.load(open('best_features_names_reg.p','rb'))
print best_feature_names

In [None]:
import pickle
# Save the fitted estimator. The `with` block guarantees the file is flushed and
# closed; the previous bare open() never closed its handle.
with open('randomforestclassifier_fixed_reg.p','wb') as model_file:
    pickle.dump(est,model_file)

In [None]:
est = pickle.load(open('randomforestclassifier_fixed_reg.p','rb'))
print est

In [None]:
# Score of the estimator on the validation data (the data used for the search).
# NOTE(review): for a scikit-learn regressor, score() is presumably R^2 — confirm.
est.score(X_val,y_val)

In [None]:
# Score of the same estimator on the test data.
est.score(X_test,y_test)

In [None]:
def categorize(rpsi):
    """Map an rpsi value to a category code.

    Returns 0 when ``0 <= rpsi <= 1.6``, 1 for any other value below 2.2,
    and 2 for everything else.

    NOTE(review): negative values fail the first range check and therefore
    come back as category 1, not 0 — confirm this is intended. This
    definition also replaces the `categorize` imported from `modeling`.
    """
    if 0 <= rpsi <= 1.6:
        return 0
    return 1 if rpsi < 2.2 else 2


In [None]:
#classification
nmatch = 0
total = 0
y_predict=[]
for y in est.predict(X_test).tolist():
    y_predict.append(categorize(y))
y_actual=[]
for y in y_test.tolist():
    y_actual.append(categorize(y))

from collections import defaultdict
cattotal = defaultdict(int)
for prediction, actual in zip(y_predict,y_actual):
    ismatch = prediction==actual
    cattotal[actual]+=1
    if ismatch:
        nmatch+=1
    total+=1

print "test accuracy =",float(nmatch)/float(total)
print "total",total
for cat in cattotal:
    print cat,float(cattotal[cat])/float(total)

In [None]:


# Predicted (x) and actual (y) targets for the validation data.
# NOTE(review): the plotting cell that follows assigns x and y itself, so this
# cell looks redundant.
x = est.predict(X_val)
y = y_val


In [None]:
# Predicted vs. actual targets: validation points in the default colour, test
# points in red.
val_predictions = est.predict(X_val)
test_predictions = est.predict(X_test)
plt.scatter(val_predictions,y_val)
plt.scatter(test_predictions,y_test,color='r')

# Diagonal reference line (predicted == actual) across the plotted range.
diagonal = np.linspace(0, 12, 1000)
plt.plot(diagonal,diagonal)

# Straight-line (degree-1) fit of actual on predicted, validation first, then test.
for x, y in ((val_predictions, y_val), (test_predictions, y_test)):
    plt.plot(x, np.poly1d(np.polyfit(x, y, 1))(x))

plt.ylim([0,12])
plt.xlim([0,12])

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the categorised test targets (actual first, predicted
# second), drawn as a heat map and then shown as raw counts.
matrix = confusion_matrix(y_actual,y_predict)
plt.pcolor(matrix)
plt.colorbar()
matrix

In [None]:
# Combine the descriptive columns (agency, city, state, zipcode) from the SQL
# table with the selected feature columns, aligned on surveyid.
sql_query = """
    SELECT  * FROM traffic_joined_with_features;
    """
# Call the helpers through the `process` module as the other cells do; the bare
# names `add_features` / `get_data` are never imported in this notebook.
data = process.add_features(process.get_data())

descriptors = ['agency','city','state','zipcode']
# NOTE(review): this used to read from `con`, which no cell defines. It now uses
# `process.engine`, the connection the to_sql cells write through — confirm it
# points at the database holding traffic_joined_with_features.
test = pd.read_sql_query(sql_query,process.engine).drop('index',axis=1)
# Keep only rows whose surveyid appears in data's index (presumably data is
# indexed by surveyid — verify), then index and order them by surveyid.
test = test[test['surveyid'].isin(data.index.tolist())]
test = test.sort(columns='surveyid').set_index('surveyid',drop=True)
test = test[descriptors]

sorted_data = data.sort_index()
sorted_data = sorted_data[best_feature_names]
# Column-wise concat aligns the two frames on their index.
output = pd.concat([test,sorted_data],axis=1)
output

In [None]:
import pickle
# Reload feature names, closing the file as soon as it has been read.
# NOTE(review): this reads 'best_features_names.p', not the
# 'best_features_names_reg.p' file written earlier in this notebook, and it
# overwrites best_feature_names — confirm which file is intended.
# pickle.load can execute arbitrary code, so only load trusted files.
with open('best_features_names.p','rb') as feature_file:
    best_feature_names = pickle.load(feature_file)

In [None]:
# Heat map of the absolute pairwise correlations between the columns of X_val.
# X_val is transposed because np.corrcoef treats each row as one variable.
plt.pcolor(np.abs(np.corrcoef((X_val.T))))
plt.colorbar()

In [None]:
# Display plot_data.
# NOTE(review): plot_data is only assigned in the cell that follows, so this cell
# fails on a fresh top-to-bottom run.
plot_data

In [None]:
plot_data = process.add_features(process.get_data())
processor = process.Processor(val_data,categorize=False)
X,y = processor.get_scaled_Xy(plot_data)
Xy = np.c_[y,X]

feature_names = ['rpsi']+plot_data.drop(['rpsi'],1).columns.tolist()
correlations = np.corrcoef(Xy)[0].tolist()
indices = np.argsort(correlations).tolist()
indices.reverse()
for index in indices:
    print feature_names[index],correlations[index]