In [11]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score,make_scorer,f1_score,classification_report,average_precision_score
from sklearn.preprocessing import Normalizer,MinMaxScaler,StandardScaler,normalize
from sklearn.cross_validation import train_test_split
import multiprocessing

In [12]:
def get_data(n_rows=None):
    
    if n_rows is not None:
        df = pd.read_csv('final_feats_without_dummies_3.csv', low_memory=False, nrows=n_rows)
        df_y = pd.read_csv('final_outs_3.csv', low_memory=False, nrows=n_rows)
    else:
        df = pd.read_csv('final_feats_without_dummies_3.csv', low_memory=False)
        df_y = pd.read_csv('final_outs_3.csv', low_memory=False)
    
    
    # Drop labels and a redundant column
    remove_columns_from_data_frame(['Unnamed: 0', 'Unnamed: 0.1' 'dissent', 'dissentdummy'], df)
#     df,df_y=remove_bad_rows(df,df_y)
#     df=drop_unneeded_cols(df)
#     df=drop_dissent(df)
#     df=dummify(df)
    
    # Extras -- for analysis
    # CASE 1: REMOVE TOP 2
    # CASE 2: REMOVE ALL 'DISS'
    
#     remove_columns_from_data_frame(['type', 'turnonthresh'], df)
#     remove_columns_from_data_frame(['type1', 'last3'], df)
#     remove_columns_like('diss', df)
    if ('Unnamed: 0' or 'Unnamed: 0.1') in df_y.columns:
        df_y.drop(labels=['Unnamed: 0','Unnamed: 0.1'],axis=1,inplace=True)
    
    return df, df_y


def get_x_y(n_rows=None):
    
    df, df_y = get_data(n_rows)

    #fill_nas(0, df)
    for y in df_y.columns:
        if len(pd.unique(df_y.ix[:,y]))==2:
		y=df_y.ix[:,y].values
		break
    return df.values, y

In [13]:

#############################################
# PANDAS HELPERS
#############################################


def remove_column_from_data_frame(col_to_remove, data_frame):

    if col_to_remove in list(data_frame.columns):
        data_frame.drop(col_to_remove, axis=1, inplace=True)

        
def remove_columns_from_data_frame(cols_to_remove, data_frame):

    column_dict = {x: None for x in list(data_frame.columns)}

    cols_to_remove = [x for x in cols_to_remove if x in column_dict]

    data_frame.drop(labels=cols_to_remove, axis=1, inplace=True)
    

def remove_columns_like(column_pattern, data_frame):
    
    for column in list(data_frame.columns):
        if column_pattern in column:
            data_frame.drop(column, axis=1, inplace=True)


def fill_nas(value, data_frame):
    
    data_frame.fillna(0, inplace=True)



#############################################
# DATA RETRIEVAL HELPERS
#############################################


def get_columns():
    
    df, df_y = get_data(1000) 
    return list(df.columns)


def print_report(y, y_pred):

    print classification_report(y, y_pred)
    

#############################################
# MODEL HELPERS
#############################################

def grid_search(X, y, clf, param_grid, n_jobs=1):
    
#     param_dict={'average': 'weighted'}
    scorer = make_scorer(average_precision_score)


    gridclf = GridSearchCV(clf, paramgrid, scoring=scorer, cv=3, verbose=1, n_jobs=n_jobs)

    gridclf.fit(X, y)

    print gridclf.best_params_
    print gridclf.best_estimator_

    print_report(y_test, gridclf.predict(X_test))
    
    return gridclf


def get_top_n_feats(n, feat_arr, cols):
    args=np.argsort(feat_arr)
    assert len(feat_arr)==len(cols)
    col_scores=col_scores=np.array(zip(cols,feat_arr))
    return col_scores[args[-n:]].tolist()[::-1]


# def get_top_n(n, arr, col_names, prev_list=[]):
    
#     if n <= 0:
#         return []
    
#     most_imp = -1
#     most_imp_index = -1

#     for i in range(len(arr)):

#         if i in prev_list:
#             continue

#         if arr[i] > most_imp:
#             most_imp = arr[i]
#             most_imp_index = i

#     prev_list.append(most_imp_index)

#     return [ (col_names[most_imp_index], most_imp) ] + get_top_n(n - 1, arr, col_names, prev_list)


# In[3]:

def drop_unneeded_cols(df):
    del_cols = ['fileid','cite','vol','beginpg','endopin','endpage','docnum','priorpub','_merge','year',
            'circuit','pseatno','decision_date','aatty_first_name','aatty_last_name','afirm_name',
            'ratty_first_name','ratty_last_name','rname_of_first_listed_amicus_gro','rfirm_namew','decisiondatenew2',
           'j1name','j2name','j3name','quartertoelect','pname','seatno','success','lsuc','ls1','ls2','ls3','lp',
            'lp2','lp3','sseatno','congress','congreso','afirst_listed_amicus_group','yearquarter','name','Name','State','j',
            'codej4','j4vote1','j4vote2','j4maj1','j4maj2','codej5','j5vote1','j5vote2','j5maj1','j5maj2',
            'codej6','j6vote1','j6vote2','j6maj1','j6maj2','codej7','j7vote1','j7vote2','j7maj1','j7maj2',
            'codej8','j8vote1','j8vote2','j8maj1','j8maj2','codej9','j9vote1','j9vote2','j9maj1','j9maj2',
            'codej10','j10vote1','j10vote2','j10maj1','j10maj2','codej11','j11vote1','j11vote2','j11maj1','j11maj2',
            'codej12','j12vote1','j12vote2','j12maj1','j12maj2','codej13','j13vote1','j13vote2','j13maj1','j13maj2',
            'codej14','j14vote1','j14vote2','j14maj1','j14maj2','codej15','j15vote1','j15vote2','j15maj1','j15maj2','j16maj1','j16vote1']
    df.drop(labels=del_cols,axis=1,inplace=True)
    moredropcolumns=df.columns.tolist() # .tolist?
    for i in moredropcolumns:
        if len(pd.unique(df[i]))==1:
            df.drop(labels=i,axis=1,inplace=True)
    df.drop(labels=['casenum','j2vote1','j2vote2','j2maj1','direct1',
                          'j2maj2','j3vote1','j3vote2','j3maj1','j3maj2','majvotes','ids'],axis=1,inplace=True)
    return df
    
def dummify(df):
    new_cols=df.columns
    new_cols=new_cols.tolist()
#     keep_cols=['j1score','j2score','j3score','popularpct','electoralpct','closerd','fartherd','dAds3','dF2Ads3',
#            'dF1Ads3','dL1Ads3','dL2Ads3','dL3Ads3','dL4Ads3','dL5Ads3','logAds3','logL1Ads3','logL2Ads3','logF1Ads3',
#           'logF2Ads3','decade2','propneg','likely_elev2','score','d12','d13','d23','sat_together_count']

    float_cols=['j1score','j2score','j3score','popularpct','electoralpct','closerd','fartherd','dAds3','dF2Ads3',
           'dF1Ads3','dL1Ads3','dL2Ads3','dL3Ads3','dL4Ads3','dL5Ads3','logAds3','logL1Ads3','logL2Ads3','logF1Ads3',
          'logF2Ads3','decade2','propneg','likely_elev2','score','d12','d13','d23',
           'judgecitations','experience','experiencetrun','age2trun','agego','assets','ba','liable',
            'networth','totalcities','sat_together_count','keytotal','lengthopin','Wopinionlenght','Wtotalcites','age']

    remove_for_now=['Ads3','F1Ads3','F2Ads3','L1Ads3','L2Ads3','L3Ads3','L4Ads3','L5Ads3','Unnamed: 0.1','appel1','appel2',
               'citevol','codej3','id','usc2sect','usc1sect','age2','distjudg','respond1','respond2','yearb','pred','csb']

#    df.drop(labels=remove_for_now,inplace=True,axis=1)
 
    for x in remove_for_now:
    	if x in df.columns:
        	print "dropped: ",x
        	df.drop(labels=[x],inplace=True,axis=1)
    
    sum1=0
    
    dummy_cols=[]
    for col in df.columns:
        if col not in float_cols:
            if len(pd.unique(df.ix[:,col]))>100 or (df.ix[:,col].dtype!='float64' and df.ix[:,col].dtype!='int64'): 
                sum1+= len(pd.unique(df.ix[:,col]))
                dummy_cols.append(col)
    print "# of dummy columns: ",sum1
    print df.shape
    print dummy_cols
    df2=pd.get_dummies(df,columns=dummy_cols,dummy_na=True,sparse=True)
    print df2.shape
    df2.fillna(value=0,inplace=True)
    return df2


def remove_bad_rows(df_x,df_y):
    
    #remove rows where codej1==codej2
#     df[df.codej1==df.codej2].index
    same_cols = df_x[df_x.codej1==df_x.codej2].index
    df_x=df_x.drop(same_cols).reset_index(drop=True)
    df_y=df_y.drop(same_cols).reset_index(drop=True)
    #remove rows where >3 judges occur
#     pp = pd.read_csv('../raw/Votelevel_stuffjan2013.csv')
#     qq=pp.groupby(by=['casenum']).count()
#     pd.unique(qq.month)
#     rr=qq[qq.month==6].reset_index()
#     rr.shape
    
    #remove rows where codej2==null
    #df[map(lambda x: not(x),pd.notnull(df.ix[:]["codej2"]).tolist())]
    nan_cols=df_x[map(lambda x: not(x),pd.notnull(df_x.ix[:]["codej2"]).tolist())].index
    nan_cols.append(df_x[map(lambda x: not(x),pd.notnull(df_x.ix[:]["codej1"]).tolist())].index)
    df_x=df_x.drop(nan_cols).reset_index(drop=True)
    df_y=df_y.drop(nan_cols).reset_index(drop=True)
    
    return df_x,df_y

def drop_dissent(df,drop=['diss','concur','unan']):
    
    def func(a, b):
        return not set(a).isdisjoint(set(b))
    
    diss_list=[]
    for col in df.columns:
        for x in drop:
            if x in col:
                diss_list.append(col)
    diss_list=list(set(diss_list))
    df.drop(labels=diss_list,axis=1,inplace=True)
    return df


In [None]:
    df,df_y=remove_bad_rows(df,df_y)
    df=drop_unneeded_cols(df)
    df=drop_dissent(df)
    df=dummify(df)
    

In [14]:
df_x,df_y = get_data()

In [15]:
df_x,df_y=remove_bad_rows(df_x,df_y)
df_x=drop_unneeded_cols(df_x)
df_x=drop_dissent(df_x)

In [17]:
print df_x.shape
print df_y.shape

(111538, 708)
(111538, 1)


In [19]:
df_x=dummify(df_x)

dropped:  Ads3
dropped:  F1Ads3
dropped:  F2Ads3
dropped:  L1Ads3
dropped:  L2Ads3
dropped:  L3Ads3
dropped:  L4Ads3
dropped:  L5Ads3
dropped:  Unnamed: 0.1
dropped:  appel1
dropped:  appel2
dropped:  citevol
dropped:  codej3
dropped:  id
dropped:  usc2sect
dropped:  usc1sect
dropped:  age2
dropped:  distjudg
dropped:  respond1
dropped:  respond2
dropped:  yearb
dropped:  pred
dropped:  csb
# of dummy columns:  4342
(111538, 685)
['___char', 'amicusapp', 'amicusresp', 'casetyp1', 'casetyp2', 'city', 'codej1', 'codej2', 'congresi', 'endyear', 'ls', 'pos2', 'pos3', 'president', 'president_f1', 'president_f2', 'seatno2', 'seatno3', 'senate', 'senate_f1', 'senate_f2', 'sseatno2', 'totalcites']
(111538, 5013)


In [20]:
X=df_x.values
y=df_y.ix[:,0].values

In [21]:
print X.shape
print y.shape
#sanity check
print X[:10]
print y[:10]

(111538, 5013)
(111538,)
[[  1.  11.  11. ...,   0.   0.   0.]
 [  1.  11.  11. ...,   0.   0.   0.]
 [  1.  11.  11. ...,   0.   0.   0.]
 ..., 
 [  1.   5.   5. ...,   0.   0.   0.]
 [  1.   5.   5. ...,   0.   0.   0.]
 [  1.   5.   5. ...,   0.   0.   0.]]
[1 1 1 1 1 1 1 1 1 1]


In [22]:

#############################################
# Split into training and test set
#############################################

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)



print X.nbytes/1024/1024/1024 #size of X in GB


4


In [23]:

#df_x,df_y = get_data(1000) 




#print df_x.info()


#check sizes match

print X_train.shape
print y_train.shape
print X_test.shape
print y_test.shape


(78076, 5013)
(78076,)
(33462, 5013)
(33462,)


In [None]:


#DONT DO FOR RANDOM FOREST

# #############################################
# # Standard scale
# #############################################

# scaler = StandardScaler()
# scaler.fit(X_train)

# X_test = scaler.transform(X_test)


# In[ ]:

#############################################
# [OPTIONAL]
# Random Forest Grid Search
#############################################
num_cores = multiprocessing.cpu_count()

print "numcores = ",num_cores
paramgrid = {'n_estimators': [10, 50, 100, 150, 200], 'max_depth': [1, 5, 10, 15, 20, 25]}

rf_clf = RandomForestClassifier(random_state=42)

gridclf = grid_search(X_train, y_train, rf_clf, paramgrid, n_jobs=num_cores)

print gridclf.best_params_
print gridclf.best_score_
#############################################
# Random Forest
#############################################

# Replace labels (in case SVM was run)
# y_train[y_train == 0.] = -1.
# y_test[y_test == 0.] = -1.



#rf_clf = RandomForestClassifier(random_state=42, n_estimators=gridclf.best_params_['n_estimators'], max_depth=[gridclf.best_params_['max_depth'])
rf_clf = RandomForestClassifier(random_state=42, **gridclf.best_params_)
#                                 class_weight={1.0: 1, -1.0: 150})

rf_clf.fit(X_train, y_train)

y_pred = rf_clf.predict(X_test)

print_report(y_test, y_pred)

#############################################
# [OPTIONAL]
# Feature importance analysis
#############################################

top_n = get_top_n_feats(25, rf_clf.feature_importances_, get_columns())

for t in top_n:
    print t

In [37]:
rf_clf = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=15,class_weight={1.0: 1, -1.0: 35})

In [38]:
%%time
rf_clf.fit(X_train, y_train)

y_pred = rf_clf.predict(X_test)

print_report(y_test, y_pred)

             precision    recall  f1-score   support

         -1       0.18      0.66      0.28      1370
          1       0.98      0.87      0.92     32092

avg / total       0.95      0.86      0.90     33462

CPU times: user 59.2 s, sys: 420 ms, total: 59.7 s
Wall time: 59.7 s


In [35]:
%%time
##class weight: 1 and 70
rf_clf.fit(X_train, y_train)

y_pred = rf_clf.predict(X_test)

print_report(y_test, y_pred)

             precision    recall  f1-score   support

         -1       0.07      0.91      0.13      1370
          1       0.99      0.48      0.65     32092

avg / total       0.95      0.50      0.63     33462

CPU times: user 59.7 s, sys: 400 ms, total: 1min
Wall time: 1min


In [39]:
##class weight: 1 and 35
top_n = get_top_n_feats(25, rf_clf.feature_importances_, df_x.columns)

for t in top_n:
    print t

['Wopinionlenght', '0.0238311762751']
['Wlengthopin', '0.0229291335576']
['lengthopin', '0.0220870675947']
['votingvalence', '0.0181971102661']
['Wtotalcites', '0.0134327669155']
['opinstat', '0.0108831510041']
['propneg', '0.010627325893']
['decade2', '0.00923690684669']
['d13', '0.00923270714389']
['signed', '0.0076670435348']
['negativecites', '0.00755880357634']
['j3score', '0.00753907933609']
['fartherd', '0.00729415435453']
['treat', '0.00718588724809']
['distance', '0.007172403572']
['decade', '0.00698642288304']
['pagelgth', '0.00696001031277']
['day', '0.00684200675229']
['state', '0.00679113479606']
['sat_together_count', '0.0067014658775']
['j1score', '0.00653692884376']
['liberalvote', '0.00641137353297']
['j2score', '0.00620303280375']
['d12', '0.00602330797119']
['d23', '0.00599882287965']


Exogenous:

Citation:
['Wtotalcites', '0.0134327669155']
['negativecites', '0.00755880357634']

Endogenous:
['Wopinionlenght', '0.0238311762751']
['Wlengthopin', '0.0229291335576']
['lengthopin', '0.0220870675947']
['votingvalence', '0.0181971102661']
['opinstat', '0.0108831510041']

Seating:
['sat_together_count', '0.0067014658775']


Unclassif:
['propneg', '0.010627325893']
['decade2', '0.00923690684669']
['d13', '0.00923270714389']
['signed', '0.0076670435348']
['fartherd', '0.00729415435453']

dxy - dist. b/w judges - spatial metric, estimate of ideology (lib/cons)

In [36]:
##class weight: 1 and 70
top_n = get_top_n_feats(25, rf_clf.feature_importances_, df_x.columns)

for t in top_n:
    print t

['votingvalence', '0.0261547496099']
['lengthopin', '0.0230530517623']
['Wopinionlenght', '0.0225879213156']
['Wlengthopin', '0.0224313759724']
['opinstat', '0.0141275262243']
['Wtotalcites', '0.011530246961']
['decade2', '0.0096729089471']
['propneg', '0.00823910550041']
['negativecites', '0.00796099978763']
['signed', '0.00756256306951']
['decade', '0.00755522639548']
['j3score', '0.00742370899574']
['d13', '0.00731213796115']
['distance', '0.0069955304234']
['sat_together_count', '0.006940075614']
['day', '0.00675750405747']
['state', '0.00663529136889']
['liberalvote', '0.00645739940594']
['treat', '0.00643418098166']
['d23', '0.00591955394343']
['weeks', '0.00582960418287']
['j2score', '0.00582324043509']
['month', '0.00573972915478']
['fartherd', '0.00567211057412']
['caseload', '0.00566699254826']


In [33]:
##class weight: 1 and 150
top_n = get_top_n_feats(25, rf_clf.feature_importances_, df_x.columns)

for t in top_n:
    print t

['votingvalence', '0.031820253526']
['Wopinionlenght', '0.0307333470406']
['lengthopin', '0.0259333289836']
['Wlengthopin', '0.0194242829752']
['opinstat', '0.0114878351882']
['Wtotalcites', '0.0111129239341']
['decade2', '0.0103847135557']
['decade', '0.00872973047309']
['signed', '0.0082874547415']
['distance', '0.00711918735182']
['day', '0.0068660132445']
['negativecites', '0.00674048858062']
['propneg', '0.00657687569546']
['state', '0.00615852611257']
['sat_together_count', '0.00610524337817']
['j3score', '0.00599288617793']
['d13', '0.00595437915445']
['popularpct', '0.00579284283268']
['totalcites_0.0', '0.00561865680217']
['procedur', '0.00555179653346']
['caseload', '0.00552298275005']
['weeks', '0.00540221329263']
['d23', '0.00520571465573']
['d12', '0.00520386912836']
['month', '0.00513730043263']
