In [40]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

In [41]:
df_train = pd.read_csv('after1900_train.csv', encoding='ISO 8859-1', dtype='object', index_col=0)

In [42]:
df_train.columns

Index(['adminAction', 'adminActionState', 'authorityDecision1',
       'authorityDecision2', 'caseDisposition', 'caseDispositionUnusual',
       'caseId', 'caseIssuesId', 'caseName', 'caseOrigin', 'caseOriginState',
       'caseSource', 'caseSourceState', 'certReason', 'chief', 'dateArgument',
       'dateDecision', 'dateRearg', 'decisionDirection',
       'decisionDirectionDissent', 'decisionType', 'declarationUncon',
       'docket', 'docketId', 'issue', 'issueArea', 'jurisdiction', 'lawMinor',
       'lawSupp', 'lawType', 'lcDisagreement', 'lcDisposition',
       'lcDispositionDirection', 'ledCite', 'lexisCite', 'majOpinAssigner',
       'majOpinWriter', 'majVotes', 'minVotes', 'naturalCourt', 'partyWinning',
       'petitioner', 'petitionerState', 'precedentAlteration', 'respondent',
       'respondentState', 'sctCite', 'splitVote', 'term', 'threeJudgeFdc',
       'usCite', 'voteId', 'voteUnclear'],
      dtype='object')

In [43]:
dropcols = ['caseId', 'docketId', 'caseIssuesId', 'voteId', 'dateDecision',
       'decisionType', 'usCite', 'sctCite', 'ledCite', 'lexisCite', 'term',
            'chief', 'docket', 'caseName', 'dateArgument',
       'dateRearg', 'petitionerState',
       'respondentState', 'jurisdiction', 'adminAction', 'adminActionState',
       'threeJudgeFdc', 'caseOriginState',
       'caseSourceState', 'lcDisagreement', 'certReason', 'lcDisposition',
        'declarationUncon', 'caseDisposition',
       'caseDispositionUnusual', 'partyWinning', 'precedentAlteration',
       'voteUnclear', 'issue',
       'decisionDirectionDissent', 'authorityDecision1', 'authorityDecision2',
       'lawType', 'lawSupp', 'lawMinor', 'majOpinWriter', 'majOpinAssigner',
       'splitVote', 'majVotes', 'minVotes']

for col in dropcols:
    df_train.drop(col, axis=1, inplace=True)




In [44]:
df_train = df_train[df_train.decisionDirection != '3']
df_train.dropna(axis=0, inplace=True)


In [45]:
df_train.describe()

Unnamed: 0,caseOrigin,caseSource,decisionDirection,issueArea,lcDispositionDirection,naturalCourt,petitioner,respondent
count,10506,10506,10506,10506,10506,10506,10506,10506
unique,133,115,2,13,3,48,264,249
top,302,300,2,8,1,1103,27,27
freq,2314,2075,5617,3011,5638,812,986,1354


In [46]:
target = 'decisionDirection'
y_train = df_train[target]
X_train = df_train[[col for col in df_train.columns if col != target]]

In [47]:
parties = pd.read_csv("codelist.csv", index_col=None)
def party_categorizer(val):
    if val in parties.State.values:
        return '1'
    elif val in parties.Federal.values:
        return '2'
    elif val in parties.Criminal.values:
        return '3'
    elif val in parties.Liberal.values:
        return '4'
    elif val in parties.RTY.values:
        return '5'
    elif val in parties.Finance.values:
        return '6'
    elif val in parties.Business.values:
        return '7'
    elif val in parties.Political.values:
        return '8'
    elif val in parties.famend.values:
        return '9'
    else:
        return '10'
 

def issue_consolidate(num):
    if num == "None":
        return '13'
    if int(num) in [0, 3, 5, 6, 7, 11, 14]:
        return '13'
    else:
        return str(num)

In [48]:
X_train.issueArea = X_train.issueArea.apply(lambda x: issue_consolidate(x))
X_train.petitioner = X_train.petitioner.apply(lambda x: party_categorizer(int(x)))                                  
X_train.respondent = X_train.respondent.apply(lambda x: party_categorizer(int(x)))
X_train.caseSource = X_train.caseSource.apply(lambda x :'1' if x in ['300', '301', '301'] else '0')
X_train.caseOrigin = X_train.caseOrigin.apply(lambda x :'1' if x in ['300', '301', '301'] else '0')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [49]:
df_test = pd.read_csv('xcase_level_test_set_mod.csv', encoding='ISO 8859-1', dtype='object', index_col=0)
for col in dropcols:
    df_test.drop(col, axis=1, inplace=True) 
df_test = df_test[df_test.decisionDirection != '3']
df_test.dropna(axis=0, inplace=True)
y_test = df_test[target]
X_test = df_test[[col for col in X_train.columns]]
X_test.issueArea = X_test.issueArea.apply(lambda x: issue_consolidate(x))
X_test.petitioner = X_test.petitioner.apply(lambda x: party_categorizer(int(x)))                                  
X_test.respondent = X_test.respondent.apply(lambda x: party_categorizer(int(x)))
X_test.caseSource = X_test.caseSource.apply(lambda x :'1' if x in ['300', '301', '301'] else '0')
X_test.caseOrigin = X_test.caseOrigin.apply(lambda x :'1' if x in ['300', '301', '301'] else '0')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [50]:
X_train.head()

Unnamed: 0,caseOrigin,caseSource,issueArea,lcDispositionDirection,naturalCourt,petitioner,respondent
15569,0,1,8,2,1001,5,1
15570,0,0,8,2,1001,7,7
15571,0,0,1,2,1001,2,3
15572,0,0,1,2,1001,2,3
15573,0,0,10,2,1001,5,2


In [51]:
judges = pd.read_csv('xjustice_level_train_set.csv', encoding='ISO 8859-1', dtype='object', index_col=0)
judges = judges[['naturalCourt', 'justice']]
courtdict = {}
for nc in judges['naturalCourt'].unique():
    courtdict[nc] = [x for x in (judges.loc[judges['naturalCourt'] == nc, :]['justice'].unique())]

def justices(key):
    return courtdict[key]

# X_train['judgeList'] = X_train.naturalCourt.apply(justices)
# X_test['judgeList'] = X_test.naturalCourt.apply(justices)

# justice_list = judges.justice.unique()

# for item in justice_list:
#     X_train[item] = X_train.judgeList.apply(lambda x: '1' if item in x else '0')
#     X_test[item] = X_test.judgeList.apply(lambda x: '1' if item in x else '0')
    
# X_train.drop(['naturalCourt', 'judgeList'], axis=1, inplace=True)
# X_test.drop(['naturalCourt', 'judgeList'], axis=1, inplace=True)

In [52]:
X_train['judgeList'] = X_train.naturalCourt.apply(justices)

In [53]:
X_test['judgeList'] = X_test.naturalCourt.apply(justices)

In [54]:
justice_list = judges.justice.unique()

In [55]:
for item in justice_list:
    X_train[item] = X_train.judgeList.apply(lambda x: '1' if item in x else '0')
    X_test[item] = X_test.judgeList.apply(lambda x: '1' if item in x else '0')

In [56]:
    
X_train.drop(['naturalCourt', 'judgeList'], axis=1, inplace=True)
X_test.drop(['naturalCourt', 'judgeList'], axis=1, inplace=True)

In [57]:
goodcols = [x for x in X_train.describe().iloc[1, :][X_train.describe().iloc[1, :] != 1].index]

ohc = OneHotEncoder(sparse=False)
ohc.fit(X_train[goodcols])
X_train = ohc.transform(X_train[goodcols])
X_test = ohc.transform(X_test[goodcols])

In [58]:
X_train.size

1502358

In [60]:
gbcl = GradientBoostingClassifier() ## using max featrues can limit features sort of like a Forest
gbcl_params = {
    "n_estimators": [125],
    "max_depth": range(4, 9),
    'max_features': [None]
}
gbcl_model = GridSearchCV(gbcl, param_grid=gbcl_params)
gbcl_model.fit(X_train, y_train)
print("best score ", gbcl_model.best_score_)
print("best params ", gbcl_model.best_params_)
print("test score ", gbcl_model.score(X_test, y_test))

best score  0.475442604226
best params  {'max_depth': 6, 'max_features': None, 'n_estimators': 125}
test score  0.641055718475


In [578]:
gsrfe = GridSearchCV(RandomForestClassifier(random_state=2017), param_grid=rfc_params, verbose=1, n_jobs=2)

In [579]:
gsrfe.fit(X_train, y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   30.1s
[Parallel(n_jobs=2)]: Done  81 out of  81 | elapsed:   49.5s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=2017,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=2,
       param_grid={'n_estimators': [100, 125, 150], 'criterion': ['gini'], 'min_samples_split': [2, 3, 4], 'min_samples_leaf': [1, 2, 3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

In [580]:
gsrfe.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=125, n_jobs=1, oob_score=False, random_state=2017,
            verbose=0, warm_start=False)

In [581]:
gsrfe.best_score_

0.43423423423423424

In [582]:
gsrfe.score(X_test, y_test)

0.62287390029325518

In [583]:
gsrfe.best_params_

{'criterion': 'gini',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 125}

In [584]:
Model = RandomForestClassifier(n_estimators=150, criterion='gini', min_samples_leaf=2, min_samples_split=3,
                              random_state=2017)

In [585]:
Model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=2,
            min_samples_split=3, min_weight_fraction_leaf=0.0,
            n_estimators=150, n_jobs=1, oob_score=False, random_state=2017,
            verbose=0, warm_start=False)

In [586]:
Model.score(X_train, y_train)

0.77759009009009006

In [587]:
Model.score(X_test, y_test)

0.64868035190615836