In [41]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score,make_scorer,f1_score,classification_report,average_precision_score
from sklearn.preprocessing import Normalizer,MinMaxScaler,StandardScaler,normalize
from sklearn.cross_validation import train_test_split
import multiprocessing
from grid_search_funs import *

### Read and clean data

In [42]:
# Load the feature frame (df_x) and the label frame (df_y) through the
# project helper imported from grid_search_funs.
(df_x, df_y) = get_data()

In [43]:
# Row filter first: drops rows with codej1=codej2 or codej2=nan, keeping the
# features and labels aligned.
df_x, df_y = remove_bad_rows(df_x, df_y)

# Column filters, applied inner-to-outer: unneeded columns first, then the
# dissent / concur columns.
df_x = drop_dissent(drop_unneeded_cols(df_x))

In [44]:
#sanity check
print df_x.shape
print df_y.shape

(111538, 708)
(111538, 1)


In [45]:
# Expand categorical columns into dummy columns with the project helper.
# The helper reports the columns it drops and the resulting shapes itself.
df_x = dummify(df_x)

dropped:  Ads3
dropped:  F1Ads3
dropped:  F2Ads3
dropped:  L1Ads3
dropped:  L2Ads3
dropped:  L3Ads3
dropped:  L4Ads3
dropped:  L5Ads3
dropped:  Unnamed: 0.1
dropped:  appel1
dropped:  appel2
dropped:  citevol
dropped:  codej3
dropped:  id
dropped:  usc2sect
dropped:  usc1sect
dropped:  age2
dropped:  distjudg
dropped:  respond1
dropped:  respond2
dropped:  yearb
dropped:  pred
dropped:  csb
# of dummy columns:  4342
(111538, 685)
['___char', 'amicusapp', 'amicusresp', 'casetyp1', 'casetyp2', 'city', 'codej1', 'codej2', 'congresi', 'endyear', 'ls', 'pos2', 'pos3', 'president', 'president_f1', 'president_f2', 'seatno2', 'seatno3', 'senate', 'senate_f1', 'senate_f2', 'sseatno2', 'totalcites']
(111538, 5013)


In [46]:
#GET X, Y AS NUMPY ARRAYS

# Feature matrix: one row per sample, one column per (dummified) feature.
X = df_x.values

# Take the first column of the label frame by position. .iloc is always
# positional, whereas the deprecated .ix switches to label-based lookup when
# the column labels are integers, so .ix[:, 0] could pick a different column
# or raise.
y = df_y.iloc[:, 0].values

In [47]:
#MAKE SURE Y LOOKS LIKE [1 1 1 ... 1 1] (SOMETIMES IT CAN STORE INDICES)

print X.shape
print y.shape

print X[:10]
print y[:10]

(111538, 5013)
(111538,)
[[  1.  11.  11. ...,   0.   0.   0.]
 [  1.  11.  11. ...,   0.   0.   0.]
 [  1.  11.  11. ...,   0.   0.   0.]
 ..., 
 [  1.   5.   5. ...,   0.   0.   0.]
 [  1.   5.   5. ...,   0.   0.   0.]
 [  1.   5.   5. ...,   0.   0.   0.]]
[1 1 1 1 1 1 1 1 1 1]


### Split data

In [48]:
#############################################
# Split into training and test set
#############################################

# 30% of the rows are held out for testing; the fixed seed keeps the split
# reproducible between runs.
split_opts = dict(test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_opts)


In [51]:
#look at size of df_x and X to make sure you have enough RAM

print df_x.info()

print "Size of X in GB: ",X.nbytes/1024/1024/1024 #size of X in GB


<class 'pandas.core.frame.DataFrame'>
Int64Index: 111538 entries, 0 to 111537
Columns: 5013 entries, ElecYear_AndPrior to totalcites_nan
dtypes: float64(5012), int64(1)
memory usage: 591.4 MB
None
Size of X in GB:  4


In [52]:
#check sizes match

print X_train.shape
print y_train.shape
print X_test.shape
print y_test.shape


(78076, 5013)
(78076,)
(33462, 5013)
(33462,)


### Scale features (Do MinMaxScaler)

In [None]:
#DONT DO FOR RANDOM FOREST

# #############################################
# # Standard scale
# #############################################

# NOTE: the scaler is fitted on the training split only, and BOTH splits must
# then be transformed with it. Transforming only X_test would leave the model
# training on unscaled features and predicting on scaled ones.

# scaler = StandardScaler()
# scaler.fit(X_train)

# X_train = scaler.transform(X_train)
# X_test = scaler.transform(X_test)



In [None]:
#DONT DO FOR RANDOM FOREST

# #############################################
# # Min-Max scale
# #############################################

# NOTE: the scaler is fitted on the training split only, and BOTH splits must
# then be transformed with it. Transforming only X_test would leave the model
# training on unscaled features and predicting on scaled ones.

# scaler = MinMaxScaler()
# scaler.fit(X_train)

# X_train = scaler.transform(X_train)
# X_test = scaler.transform(X_test)


### Option 1: Do grid search

In [None]:

#############################################
# [OPTIONAL]
# Random Forest Grid Search
#############################################

num_cores = multiprocessing.cpu_count()

print "numcores = ", num_cores

#modify/add params here you want to search over
paramgrid = {'n_estimators': [10, 50, 100, 150, 200], 'max_depth': [1, 5, 10, 15, 20, 25]}

rf_clf = RandomForestClassifier(random_state=42)

gridclf = grid_search(X_train, y_train, rf_clf, paramgrid, n_jobs=num_cores)

print gridclf.best_params_
print gridclf.best_score_


In [None]:

#############################################
# [OPTIONAL] Random Forest (RUN OVER BEST MODEL FROM GRID SEARCH)
#############################################

# Replace labels (in case SVM was run)
# y_train[y_train == 0.] = -1.
# y_test[y_test == 0.] = -1.

# Rebuild the forest from the parameters the grid search selected, with the
# same seed as the search itself.
tuned_params = gridclf.best_params_
rf_clf = RandomForestClassifier(random_state=42, **tuned_params)
#                                 class_weight={1.0: 1, -1.0: 150})

# Fit on the training split and predict the held-out split in one chained
# call (fit returns the fitted estimator).
y_pred = rf_clf.fit(X_train, y_train).predict(X_test)

print_report(y_test, y_pred)

#############################################
# [OPTIONAL]
# Feature importance analysis
#############################################

top_n = get_top_n_feats(25, rf_clf.feature_importances_, df_x.columns)

for t in top_n:
    print t

### Option 2: Run individual model (for testing)

In [None]:
#sample random forest run below

In [37]:
# The class weights below are keyed on the labels -1 / 1. Nothing earlier in
# the notebook visibly produces -1 labels (the only relabelling code is
# commented out), so this cell used to depend on leftover kernel state.
# NOTE(review): assumes the raw negative label is 0 -- confirm against
# get_data(). The relabel is idempotent: arrays already using -1 / 1 pass
# through unchanged.
y_train = np.where(y_train == 0, -1, y_train)
y_test = np.where(y_test == 0, -1, y_test)

# Sample forest: 100 trees, depth capped at 15, with class -1 weighted 35x
# relative to class 1.
rf_clf = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
    max_depth=15,
    class_weight={1.0: 1, -1.0: 35},
)

In [38]:
%%time
rf_clf.fit(X_train, y_train)

y_pred = rf_clf.predict(X_test)

print_report(y_test, y_pred)

             precision    recall  f1-score   support

         -1       0.18      0.66      0.28      1370
          1       0.98      0.87      0.92     32092

avg / total       0.95      0.86      0.90     33462

CPU times: user 59.2 s, sys: 420 ms, total: 59.7 s
Wall time: 59.7 s


In [39]:
##prints top 25 features
top_n = get_top_n_feats(25, rf_clf.feature_importances_, df_x.columns)

for t in top_n:
    print t

['Wopinionlenght', '0.0238311762751']
['Wlengthopin', '0.0229291335576']
['lengthopin', '0.0220870675947']
['votingvalence', '0.0181971102661']
['Wtotalcites', '0.0134327669155']
['opinstat', '0.0108831510041']
['propneg', '0.010627325893']
['decade2', '0.00923690684669']
['d13', '0.00923270714389']
['signed', '0.0076670435348']
['negativecites', '0.00755880357634']
['j3score', '0.00753907933609']
['fartherd', '0.00729415435453']
['treat', '0.00718588724809']
['distance', '0.007172403572']
['decade', '0.00698642288304']
['pagelgth', '0.00696001031277']
['day', '0.00684200675229']
['state', '0.00679113479606']
['sat_together_count', '0.0067014658775']
['j1score', '0.00653692884376']
['liberalvote', '0.00641137353297']
['j2score', '0.00620303280375']
['d12', '0.00602330797119']
['d23', '0.00599882287965']
