In [13]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score,make_scorer,f1_score,classification_report,average_precision_score
from sklearn.preprocessing import Normalizer,MinMaxScaler,StandardScaler,normalize
from sklearn.cross_validation import train_test_split
import multiprocessing
import datetime
from time import gmtime, strftime
from grid_search_funs import *

In [30]:
def print_log(log_str, log_file_name="jvap_log.txt"):
    """Append a timestamped entry to a log file and echo it to stdout.

    Each entry is the current time formatted as ``%Y-%m-%d %H:%M:%S``,
    a tab character, ``str(log_str)`` and a trailing newline.

    Parameters
    ----------
    log_str : object
        Value to record; it is converted with ``str()``.
    log_file_name : str, optional
        File to append to. Defaults to ``"jvap_log.txt"``, the path that
        used to be hard-coded, so existing one-argument calls are unchanged.

    Returns
    -------
    None
    """
    # Build the entry first so the file is only opened when there is something to write.
    entry = strftime("%Y-%m-%d %H:%M:%S") + '\t' + str(log_str) + '\n'

    with open(log_file_name, "a") as f:
        f.write(entry)

    # Echo without the trailing newline, because print appends its own.
    print(entry[:-1])

### Read and clean data

In [16]:
# Load the feature frame and the label frame.
# NOTE(review): get_data is not defined in this notebook (presumably from grid_search_funs);
# the argument 1000 looks like a row limit (the logged shapes have 1000 rows) - confirm.
df_x, df_y = get_data(1000)

In [17]:
# Row filter: drops rows with codej1=codej2, codej2=nan.
df_x, df_y = remove_bad_rows(df_x, df_y)

# Column pruning, in the same order as before: unneeded columns first,
# then the dissent / concur columns.
df_x = drop_dissent(drop_unneeded_cols(df_x))

In [20]:
# Sanity check: log the shapes of the feature frame and the label frame as one tuple.

print_log(tuple(frame.shape for frame in (df_x, df_y)))

2016-05-11 00:10:54	((1000, 653), (1000, 1))



In [21]:
# Dummify the feature frame. The helper prints the columns it drops and the
# frame shape before and after expansion (see the cell output).
df_x = dummify(df_x)

dropped:  Ads3
dropped:  F1Ads3
dropped:  F2Ads3
dropped:  L1Ads3
dropped:  L2Ads3
dropped:  L3Ads3
dropped:  L4Ads3
dropped:  L5Ads3
dropped:  Unnamed: 0.1
dropped:  appel1
dropped:  appel2
dropped:  citevol
dropped:  codej3
dropped:  id
dropped:  usc2sect
dropped:  usc1sect
dropped:  age2
dropped:  distjudg
dropped:  respond1
dropped:  respond2
dropped:  yearb
dropped:  pred
dropped:  csb
# of dummy columns:  997
(1000, 630)
['___char', 'city', 'codej1', 'codej2', 'congresi', 'endyear', 'president', 'president_f1', 'president_f2', 'seatno2', 'senate', 'senate_f1', 'senate_f2', 'sseatno2']
(1000, 1621)


In [22]:
#GET X, Y AS NUMPY ARRAYS

X = df_x.values
# Take the first label column strictly by position. The previous .ix indexer is
# deprecated (removed in pandas 1.0) and guessed between label- and position-based
# lookup depending on the column index type; .iloc is always positional.
y = df_y.iloc[:, 0].values

In [23]:
#MAKE SURE Y LOOKS LIKE [1 1 1 ... 1 1] (SOMETIMES IT CAN STORE INDICES)

# Shapes of both arrays first, as a single tuple ...
print_log(tuple(arr.shape for arr in (X, y)))

# ... then the first ten rows of X and the first ten labels of y.
print_log(X[:10])
print_log(y[:10])

2016-05-11 00:12:53	((1000, 1621), (1000,))

2016-05-11 00:12:53	[[  1.  11.  11. ...,   0.   0.   1.]
 [  1.  11.  11. ...,   0.   0.   1.]
 [  1.  11.  11. ...,   0.   0.   1.]
 ..., 
 [  1.   5.   5. ...,   0.   0.   1.]
 [  1.   5.   5. ...,   0.   0.   1.]
 [  1.   5.   5. ...,   0.   0.   1.]]

2016-05-11 00:12:53	[1 1 1 1 1 1 1 1 1 1]



### Split data

In [24]:
#############################################
# Hold out 30% of the rows as a test set
# (random_state is fixed so the split is repeatable)
#############################################

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [25]:
#look at size of df_x and X to make sure you have enough RAM

# DataFrame.info() prints its summary itself and returns None, so wrapping it in
# print_log only wrote the string "None" to the log. Call it directly for the
# on-screen summary; pass a text buffer via info(buf=...) if the summary should
# also be captured for the log file.
df_x.info()
print_log(("Size of X in GB: ", (X.nbytes * 1.0)/(1024 * 1024 *1024))) #size of X in GB


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Columns: 1621 entries, ElecYear_AndPrior to sseatno2_nan
dtypes: float64(1620), int64(1)
memory usage: 4.9 MB
2016-05-11 00:14:07	None

2016-05-11 00:14:07	('Size of X in GB: ', 0.012077391147613525)



In [27]:
#check sizes match

# One log entry per split: (feature-matrix shape, label-vector shape).
print_log(tuple(part.shape for part in (X_train, y_train)))
print_log(tuple(part.shape for part in (X_test, y_test)))


2016-05-11 00:15:18	((700, 1621), (700,))

2016-05-11 00:15:18	((300, 1621), (300,))



### Scale features (Do MinMaxScaler)

In [11]:
#DONT DO FOR RANDOM FOREST

# #############################################
# # Standard scale
# #############################################

# scaler = StandardScaler()
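# scaler.fit(X_train)

# # Apply the scaler fitted on the training set to BOTH splits
# X_train = scaler.transform(X_train)
# X_test = scaler.transform(X_test)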



In [12]:
#DONT DO FOR RANDOM FOREST

# #############################################
# # Min-Max scale
# #############################################

# scaler = MinMaxScaler()
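# scaler.fit(X_train)

# # Apply the scaler fitted on the training set to BOTH splits
# X_train = scaler.transform(X_train)
# X_test = scaler.transform(X_test)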


### Option 1: Do grid search

In [28]:

#############################################
# [OPTIONAL]
# Random Forest Grid Search
#############################################

# Use every core reported by the OS for the parallel search.
num_cores = multiprocessing.cpu_count()

# A single pre-formatted string prints the same text on Python 2 and Python 3;
# the old `print "...", x` statement is a SyntaxError on Python 3.
print("numcores =  {}".format(num_cores))

#modify/add params here you want to search over
# 5 values of n_estimators x 6 values of max_depth = 30 candidate settings
param_grid = {'n_estimators': [10, 50, 100, 150, 200], 'max_depth': [1, 5, 10, 15, 20, 25]}


rf_clf = RandomForestClassifier(random_state=42)

# NOTE(review): grid_search is not defined in this notebook (presumably from
# grid_search_funs); the result is used like a fitted GridSearchCV - confirm.
gridclf = grid_search(X=X_train, y=y_train, clf=rf_clf, param_grid=param_grid, n_jobs=num_cores)

print_log(gridclf.best_params_)
print_log(gridclf.best_score_)


numcores =  8
Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    3.7s
[Parallel(n_jobs=8)]: Done  90 out of  90 | elapsed:    9.1s finished


2016-05-11 00:18:07	{'n_estimators': 10, 'max_depth': 15}

2016-05-11 00:18:07	0.988286745008



In [31]:

#############################################
# [OPTIONAL] Random Forest (RUN OVER BEST MODEL FROM GRID SEARCH)
#############################################

# Label replacement, kept disabled (in case SVM was run):
# y_train[y_train == 0.] = -1.
# y_test[y_test == 0.] = -1.

# Rebuild the forest with the hyper-parameters picked by the grid search.
# Disabled option: class_weight={1.0: 1, -1.0: 150}
rf_clf = RandomForestClassifier(random_state=42, **gridclf.best_params_)

# fit() returns the estimator itself, so training and test-set prediction chain.
y_pred = rf_clf.fit(X_train, y_train).predict(X_test)

print_log(classification_report(y_test, y_pred))

#############################################
# [OPTIONAL]
# Feature importance analysis
#############################################

# Log the 25 top entries returned by the helper, one log line each.
top_n = get_top_n_feats(
    25,
    rf_clf.feature_importances_,
    df_x.columns,
)

for ranked_feat in top_n:
    print_log(ranked_feat)

2016-05-11 00:21:09	             precision    recall  f1-score   support

         -1       0.40      0.50      0.44         4
          1       0.99      0.99      0.99       296

avg / total       0.99      0.98      0.98       300

2016-05-11 00:21:09	['oththres', '0.0412653287558']
2016-05-11 00:21:09	['sat_together_count', '0.0330381025049']
2016-05-11 00:21:09	['prejud', '0.029617707081']
2016-05-11 00:21:09	['totalcites', '0.028409232365']
2016-05-11 00:21:09	['juryinst', '0.0254775419026']
2016-05-11 00:21:09	['genapel1', '0.0232928033918']
2016-05-11 00:21:09	['codej2_20402.0', '0.0221868345444']
2016-05-11 00:21:09	['decade2', '0.0211969284101']
2016-05-11 00:21:09	['codej2_328.0', '0.0208969753576']
2016-05-11 00:21:09	['state', '0.0204939247085']
2016-05-11 00:21:09	['codej2_10352.0', '0.019636016877']
2016-05-11 00:21:09	['codej1_929.0', '0.0194634204728']
2016-05-11 00:21:09	['caseload', '0.0189764681383']
2016-05-11 00:21:09	['Wtotalcites', '0.0188525087581']
2016-05-11 

### Option 2: Run individual model (for testing)

In [15]:
#sample random forest run below

In [16]:
# Forest with hand-picked hyper-parameters; class_weight gives label -1.0 a
# weight of 35 against 1 for label 1.0.
rf_clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=15,
    class_weight={1.0: 1, -1.0: 35},
    random_state=42,
)

In [17]:
%%time
rf_clf.fit(X_train, y_train)

y_pred = rf_clf.predict(X_test)

print_log(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

         -1       0.40      0.50      0.44         4
          1       0.99      0.99      0.99       296

avg / total       0.99      0.98      0.98       300

CPU times: user 262 ms, sys: 4 ms, total: 266 ms
Wall time: 264 ms


In [18]:
# Log the top 25 features of the forest fitted above, one log line per entry.
# NOTE(review): per the cell output each entry looks like [feature_name, importance_as_string];
# confirm against get_top_n_feats.
top_n = get_top_n_feats(
    25,
    rf_clf.feature_importances_,
    df_x.columns,
)

for ranked_feat in top_n:
    print_log(ranked_feat)

['totalcites', '0.030548071939']
['prejud', '0.0263864284649']
['Wtotalcites', '0.0241080182892']
['j2score', '0.0204876453575']
['genapel2', '0.0186087358415']
['bank_ap2', '0.017230026499']
['popularpct', '0.0172065013808']
['day', '0.0169206167912']
['state', '0.0153712760024']
['district', '0.0152810958649']
['electoralpct', '0.0136923519334']
['_Iquarter_2', '0.0132653628724']
['applfrom', '0.0123476434609']
['casetyp1', '0.0122440947296']
['genresp2', '0.012081591366']
['decade', '0.0112540088449']
['othadmis', '0.0109070951253']
['distance', '0.0101321616979']
['propneg', '0.0101160234641']
['repres', '0.00966612809742']
['month', '0.00927770277668']
['treat', '0.00927029888949']
['multdoc', '0.00887887791631']
['negativecites', '0.00850748368554']
['semiannumtoelect', '0.00845184709227']
