This notebook shows my attempts to use KNN and Logistic Regression as well and why I chose to use a Random Forest

In [120]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [121]:
df = pd.read_csv('legacy_case.csv', encoding='ISO 8859-1', dtype=object)

In [122]:
df = df[df.decisionDirection != '3'] ## eliminate cases with no (political) direction

In [123]:
print(df.columns)

Index(['caseId', 'docketId', 'caseIssuesId', 'voteId', 'dateDecision',
       'decisionType', 'usCite', 'sctCite', 'ledCite', 'lexisCite', 'term',
       'naturalCourt', 'chief', 'docket', 'caseName', 'dateArgument',
       'dateRearg', 'petitioner', 'petitionerState', 'respondent',
       'respondentState', 'jurisdiction', 'adminAction', 'adminActionState',
       'threeJudgeFdc', 'caseOrigin', 'caseOriginState', 'caseSource',
       'caseSourceState', 'lcDisagreement', 'certReason', 'lcDisposition',
       'lcDispositionDirection', 'declarationUncon', 'caseDisposition',
       'caseDispositionUnusual', 'partyWinning', 'precedentAlteration',
       'voteUnclear', 'issue', 'issueArea', 'decisionDirection',
       'decisionDirectionDissent', 'authorityDecision1', 'authorityDecision2',
       'lawType', 'lawSupp', 'lawMinor', 'majOpinWriter', 'majOpinAssigner',
       'splitVote', 'majVotes', 'minVotes'],
      dtype='object')


In [124]:
must_drop = ['caseId', 'docketId', 'caseIssuesId', 'voteId', 'dateDecision',
       'decisionType', 'usCite', 'sctCite', 'ledCite', 'lexisCite',  'docket', 'caseName', 'dateArgument',
       'dateRearg', 'declarationUncon', 'caseDisposition',
       'caseDispositionUnusual', 'partyWinning', 'precedentAlteration',
       'voteUnclear', 'issue', 'issueArea',
       'decisionDirectionDissent', 'authorityDecision1', 'authorityDecision2',
       'lawType', 'lawSupp', 'lawMinor', 'majOpinWriter', 'majOpinAssigner',
       'splitVote', 'majVotes', 'minVotes']

optional_drop = ['term', 'chief', 'petitionerState',
       'respondentState', 'adminAction', 'adminActionState',
       'threeJudgeFdc', 'caseOrigin', 'caseOriginState', 'caseSource',
       'caseSourceState', 'lcDisagreement', 'certReason', 'lcDisposition']


drop_columns = must_drop + optional_drop

In [125]:
df.drop(drop_columns, axis=1, inplace=True)

In [126]:
df.isnull().sum()

naturalCourt                0
petitioner                  2
respondent                  3
jurisdiction                1
lcDispositionDirection    716
decisionDirection          55
dtype: int64

In [127]:
df.dropna(axis=0, how='any', inplace=True)

In [128]:
target = 'decisionDirection'
y = df[target]
X = df[[col for col in df.columns if col != target]]

In [129]:
parties = set(X.petitioner.value_counts().index[:24].append(X.respondent.value_counts().index[:24]))

X.petitioner = X.copy().petitioner.apply(lambda x: x if x in parties else 0)
X.respondent = X.copy().respondent.apply(lambda x: x if x in parties else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [130]:
X_dummies = pd.get_dummies(X, drop_first=True)

In [131]:
X_train, X_test, y_train, y_test = train_test_split(X_dummies, y)

In [132]:
lr_params = {
    'C': np.linspace(.25, 2.75, 11),
    'penalty': ['l1', 'l2'],
    'tol': np.logspace(-5, -3, 10)
}

In [133]:
GS_LR = GridSearchCV(LogisticRegression(), param_grid=lr_params, n_jobs=2)

In [134]:
GS_LR.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=2,
       param_grid={'C': array([ 0.25,  0.5 ,  0.75,  1.  ,  1.25,  1.5 ,  1.75,  2.  ,  2.25,
        2.5 ,  2.75]), 'penalty': ['l1', 'l2'], 'tol': array([  1.00000e-05,   1.66810e-05,   2.78256e-05,   4.64159e-05,
         7.74264e-05,   1.29155e-04,   2.15443e-04,   3.59381e-04,
         5.99484e-04,   1.00000e-03])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [135]:
cross_val_score(GS_LR, X_train, y_train)

array([ 0.66830403,  0.67088935,  0.67106284])

In [136]:
GS_LR.score(X_test, y_test)

0.68019648397104449

In [137]:
GS_LR.best_params_

{'C': 0.5, 'penalty': 'l2', 'tol': 1.0000000000000001e-05}

In [139]:
knn_params = {
    'n_neighbors': [4, 5, 6, 10, 15, 20], 
    'weights': ['uniform', 'distance'], 
    'leaf_size': [20, 30, 40],
    'p': [1, 2], 
    'metric': ['minkowski']
}

In [140]:
GS_KNN = GridSearchCV(KNeighborsClassifier(), param_grid=knn_params, n_jobs=2, verbose=2)

In [141]:
GS_KNN.fit(X_train, y_train)

Fitting 3 folds for each of 72 candidates, totalling 216 fits
[CV] leaf_size=20, metric=minkowski, n_neighbors=4, p=1, weights=uniform 
[CV] leaf_size=20, metric=minkowski, n_neighbors=4, p=1, weights=uniform 
[CV]  leaf_size=20, metric=minkowski, n_neighbors=4, p=1, weights=uniform, total=   8.1s
[CV] leaf_size=20, metric=minkowski, n_neighbors=4, p=1, weights=uniform 
[CV]  leaf_size=20, metric=minkowski, n_neighbors=4, p=1, weights=uniform, total=   8.1s
[CV] leaf_size=20, metric=minkowski, n_neighbors=4, p=1, weights=distance 
[CV]  leaf_size=20, metric=minkowski, n_neighbors=4, p=1, weights=distance, total=   7.8s
[CV] leaf_size=20, metric=minkowski, n_neighbors=4, p=1, weights=distance 
[CV]  leaf_size=20, metric=minkowski, n_neighbors=4, p=1, weights=uniform, total=   7.9s
[CV] leaf_size=20, metric=minkowski, n_neighbors=4, p=1, weights=distance 
[CV]  leaf_size=20, metric=minkowski, n_neighbors=4, p=1, weights=distance, total=   8.4s
[CV] leaf_size=20, metric=minkowski, n_neigh

[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:  8.0min


[CV]  leaf_size=20, metric=minkowski, n_neighbors=10, p=1, weights=uniform, total=   8.3s
[CV] leaf_size=20, metric=minkowski, n_neighbors=10, p=1, weights=distance 
[CV]  leaf_size=20, metric=minkowski, n_neighbors=10, p=1, weights=distance, total=   8.3s
[CV] leaf_size=20, metric=minkowski, n_neighbors=10, p=1, weights=distance 
[CV]  leaf_size=20, metric=minkowski, n_neighbors=10, p=1, weights=distance, total=   8.2s
[CV] leaf_size=20, metric=minkowski, n_neighbors=10, p=2, weights=uniform 
[CV]  leaf_size=20, metric=minkowski, n_neighbors=10, p=1, weights=distance, total=   8.2s
[CV] leaf_size=20, metric=minkowski, n_neighbors=10, p=2, weights=uniform 
[CV]  leaf_size=20, metric=minkowski, n_neighbors=10, p=2, weights=uniform, total=   9.0s
[CV] leaf_size=20, metric=minkowski, n_neighbors=10, p=2, weights=uniform 
[CV]  leaf_size=20, metric=minkowski, n_neighbors=10, p=2, weights=uniform, total=   9.0s
[CV] leaf_size=20, metric=minkowski, n_neighbors=10, p=2, weights=distance 
[CV]

[CV]  leaf_size=30, metric=minkowski, n_neighbors=5, p=1, weights=distance, total=   8.1s
[CV] leaf_size=30, metric=minkowski, n_neighbors=5, p=2, weights=uniform 
[CV]  leaf_size=30, metric=minkowski, n_neighbors=5, p=1, weights=distance, total=   8.0s
[CV] leaf_size=30, metric=minkowski, n_neighbors=5, p=2, weights=uniform 
[CV]  leaf_size=30, metric=minkowski, n_neighbors=5, p=2, weights=uniform, total=   8.7s
[CV] leaf_size=30, metric=minkowski, n_neighbors=5, p=2, weights=uniform 
[CV]  leaf_size=30, metric=minkowski, n_neighbors=5, p=2, weights=uniform, total=   8.8s
[CV] leaf_size=30, metric=minkowski, n_neighbors=5, p=2, weights=distance 
[CV]  leaf_size=30, metric=minkowski, n_neighbors=5, p=2, weights=uniform, total=   9.2s
[CV] leaf_size=30, metric=minkowski, n_neighbors=5, p=2, weights=distance 
[CV]  leaf_size=30, metric=minkowski, n_neighbors=5, p=2, weights=distance, total=   9.2s
[CV] leaf_size=30, metric=minkowski, n_neighbors=5, p=2, weights=distance 
[CV]  leaf_size=

[CV]  leaf_size=30, metric=minkowski, n_neighbors=20, p=2, weights=uniform, total=   9.5s
[CV] leaf_size=30, metric=minkowski, n_neighbors=20, p=2, weights=uniform 
[CV]  leaf_size=30, metric=minkowski, n_neighbors=20, p=2, weights=uniform, total=   9.5s
[CV] leaf_size=30, metric=minkowski, n_neighbors=20, p=2, weights=distance 
[CV]  leaf_size=30, metric=minkowski, n_neighbors=20, p=2, weights=uniform, total=   9.5s
[CV] leaf_size=30, metric=minkowski, n_neighbors=20, p=2, weights=distance 
[CV]  leaf_size=30, metric=minkowski, n_neighbors=20, p=2, weights=distance, total=   9.5s
[CV] leaf_size=30, metric=minkowski, n_neighbors=20, p=2, weights=distance 
[CV]  leaf_size=30, metric=minkowski, n_neighbors=20, p=2, weights=distance, total=   9.2s
[CV] leaf_size=40, metric=minkowski, n_neighbors=4, p=1, weights=uniform 
[CV]  leaf_size=30, metric=minkowski, n_neighbors=20, p=2, weights=distance, total=   9.2s
[CV] leaf_size=40, metric=minkowski, n_neighbors=4, p=1, weights=uniform 
[CV]  

[Parallel(n_jobs=2)]: Done 158 tasks      | elapsed: 33.5min


[CV]  leaf_size=40, metric=minkowski, n_neighbors=5, p=1, weights=uniform, total=   7.4s
[CV] leaf_size=40, metric=minkowski, n_neighbors=5, p=1, weights=distance 
[CV]  leaf_size=40, metric=minkowski, n_neighbors=5, p=1, weights=distance, total=   7.4s
[CV] leaf_size=40, metric=minkowski, n_neighbors=5, p=1, weights=distance 
[CV]  leaf_size=40, metric=minkowski, n_neighbors=5, p=1, weights=distance, total=   7.1s
[CV] leaf_size=40, metric=minkowski, n_neighbors=5, p=2, weights=uniform 
[CV]  leaf_size=40, metric=minkowski, n_neighbors=5, p=1, weights=distance, total=   7.0s
[CV] leaf_size=40, metric=minkowski, n_neighbors=5, p=2, weights=uniform 
[CV]  leaf_size=40, metric=minkowski, n_neighbors=5, p=2, weights=uniform, total=   7.5s
[CV] leaf_size=40, metric=minkowski, n_neighbors=5, p=2, weights=uniform 
[CV]  leaf_size=40, metric=minkowski, n_neighbors=5, p=2, weights=uniform, total=   7.5s
[CV] leaf_size=40, metric=minkowski, n_neighbors=5, p=2, weights=distance 
[CV]  leaf_size=

[CV]  leaf_size=40, metric=minkowski, n_neighbors=20, p=1, weights=distance, total=   7.3s
[CV] leaf_size=40, metric=minkowski, n_neighbors=20, p=2, weights=uniform 
[CV]  leaf_size=40, metric=minkowski, n_neighbors=20, p=1, weights=distance, total=   7.2s
[CV] leaf_size=40, metric=minkowski, n_neighbors=20, p=2, weights=uniform 
[CV]  leaf_size=40, metric=minkowski, n_neighbors=20, p=2, weights=uniform, total=   7.7s
[CV] leaf_size=40, metric=minkowski, n_neighbors=20, p=2, weights=uniform 
[CV]  leaf_size=40, metric=minkowski, n_neighbors=20, p=2, weights=uniform, total=   7.6s
[CV] leaf_size=40, metric=minkowski, n_neighbors=20, p=2, weights=distance 
[CV]  leaf_size=40, metric=minkowski, n_neighbors=20, p=2, weights=uniform, total=   7.9s
[CV] leaf_size=40, metric=minkowski, n_neighbors=20, p=2, weights=distance 
[CV]  leaf_size=40, metric=minkowski, n_neighbors=20, p=2, weights=distance, total=   7.8s
[CV] leaf_size=40, metric=minkowski, n_neighbors=20, p=2, weights=distance 
[CV]

[Parallel(n_jobs=2)]: Done 216 out of 216 | elapsed: 44.1min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=2,
       param_grid={'n_neighbors': [4, 5, 6, 10, 15, 20], 'weights': ['uniform', 'distance'], 'leaf_size': [20, 30, 40], 'p': [1, 2], 'metric': ['minkowski']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=2)

In [145]:
GS_KNN.score(X_train, y_train)

0.70369731965870896

In [143]:
GS_KNN.score(X_test, y_test)

0.67269906928645296

In [146]:
GS_KNN.best_params_

{'leaf_size': 40,
 'metric': 'minkowski',
 'n_neighbors': 20,
 'p': 1,
 'weights': 'uniform'}

In [150]:
model = KNeighborsClassifier(n_neighbors=20, leaf_size=40, p=1)

In [151]:
model.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=40, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=20, p=1,
           weights='uniform')

In [152]:
model.score(X_train, y_train)

0.70369731965870896

In [153]:
model.score(X_test, y_test)

0.67269906928645296

In [154]:
rfc_params = {
        'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120],
        'max_depth': [12, 13, 14, 15, 16],
        'criterion': ['gini'],
        'min_samples_split': [2, 3],
        'min_samples_leaf': [1, 2, 3]
}

In [155]:
GS_RFC = GridSearchCV(RandomForestClassifier(), param_grid=rfc_params, n_jobs=2, verbose=1)

In [156]:
GS_RFC.fit(X_train, y_train)

Fitting 3 folds for each of 360 candidates, totalling 1080 fits


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   11.9s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:   52.8s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:  2.1min
[Parallel(n_jobs=2)]: Done 796 tasks      | elapsed:  3.9min
[Parallel(n_jobs=2)]: Done 1080 out of 1080 | elapsed:  5.5min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=2,
       param_grid={'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120], 'max_depth': [12, 13, 14, 15, 16], 'criterion': ['gini'], 'min_samples_split': [2, 3], 'min_samples_leaf': [1, 2, 3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

In [157]:
GS_RFC.score(X_train, y_train)

0.71119538050504183

In [158]:
GS_RFC.score(X_test, y_test) 

0.68872802481902795

In [159]:
GS_RFC.best_params_

{'criterion': 'gini',
 'max_depth': 16,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 100}

These models each had similar accuracy, but I had more consistency with the Random Forest
                             