In [282]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv2D, MaxPool2D, Flatten, Input
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import to_categorical
import tensorflow as tf

In [283]:
df_train = pd.read_csv('case_level_train_set.csv', encoding='ISO 8859-1', dtype='object', index_col=0)

In [284]:
df_train.columns

Index(['caseId', 'docketId', 'caseIssuesId', 'voteId', 'dateDecision',
       'decisionType', 'usCite', 'sctCite', 'ledCite', 'lexisCite', 'term',
       'naturalCourt', 'chief', 'docket', 'caseName', 'dateArgument',
       'dateRearg', 'petitioner', 'petitionerState', 'respondent',
       'respondentState', 'jurisdiction', 'adminAction', 'adminActionState',
       'threeJudgeFdc', 'caseOrigin', 'caseOriginState', 'caseSource',
       'caseSourceState', 'lcDisagreement', 'certReason', 'lcDisposition',
       'lcDispositionDirection', 'declarationUncon', 'caseDisposition',
       'caseDispositionUnusual', 'partyWinning', 'precedentAlteration',
       'voteUnclear', 'issue', 'issueArea', 'decisionDirection',
       'decisionDirectionDissent', 'authorityDecision1', 'authorityDecision2',
       'lawType', 'lawSupp', 'lawMinor', 'majOpinWriter', 'majOpinAssigner',
       'splitVote', 'majVotes', 'minVotes'],
      dtype='object')

In [285]:
dropcols = ['docketId', 'caseIssuesId', 'voteId', 'dateDecision',
       'decisionType', 'usCite', 'sctCite', 'ledCite', 'lexisCite', 'term', 'chief', 'docket', 'caseName', 'dateArgument',
       'dateRearg', 'petitionerState', 
       'respondentState', 'adminAction', 'adminActionState', 'caseOriginState', 'threeJudgeFdc',
       'caseSourceState', 'declarationUncon', 'caseDisposition',
       'caseDispositionUnusual', 'partyWinning', 'precedentAlteration',
       'voteUnclear', 'issue', 'lcDisposition',
       'decisionDirectionDissent', 'authorityDecision1', 'authorityDecision2',
       'lawType', 'lawSupp', 'lawMinor', 'majOpinWriter', 'majOpinAssigner',
       'splitVote', 'majVotes', 'minVotes']

for col in dropcols:
    df_train.drop(col, axis=1, inplace=True)

df_train = df_train[df_train.decisionDirection != '3']


In [286]:
df_train.dropna(axis=0, inplace=True)

In [287]:
target = 'decisionDirection'
y_train = df_train[target]
X_train = df_train[[col for col in df_train.columns if col != target]]

In [288]:
X_train.describe()

Unnamed: 0,caseId,naturalCourt,petitioner,respondent,jurisdiction,caseOrigin,caseSource,lcDisagreement,certReason,lcDispositionDirection,issueArea
count,18904,18904,18904,18904,18904,18904,18904,18904,18904,18904,18904
unique,18904,105,267,254,10,202,185,2,13,3,14
top,1987-035,706,27,27,1,302,300,0,1,1,8
freq,1,910,1750,2247,7605,4213,3830,17029,11327,9970,6413


In [289]:
parties = pd.read_csv("codelist.csv", index_col=None)
parties = parties.applymap(lambda x: int(x))
def party_categorizer(val):
    if val in parties.State.values:
        return '1'
    elif val in parties.Federal.values:
        return '2'
    elif val in parties.Criminal.values:
        return '3'
    elif val in parties.Liberal.values:
        return '4'
    elif val in parties.RTY.values:
        return '5'
    elif val in parties.Finance.values:
        return '6'
    elif val in parties.Business.values:
        return '7'
    elif val in parties.Political.values:
        return '8'
    elif val in parties.famend.values:
        return '9'
    else:
        return '10'
 

def issue_consolidate(num):
    if num == "None":
        return '13'
    if int(num) in [0, 3, 5, 6, 7, 11, 14]:
        return '13'
    else:
        return str(num)

In [290]:
X_train.issueArea = X_train.issueArea.apply(lambda x: issue_consolidate(x))
X_train.petitioner = X_train.petitioner.apply(lambda x: party_categorizer(int(x)))                                  
X_train.respondent = X_train.respondent.apply(lambda x: party_categorizer(int(x)))
X_train.jurisdiction = X_train.jurisdiction.apply(lambda x: '1' if x=='1' else '0')
X_train.certReason = X_train.certReason.apply(lambda x: '1' if x == '1' or x == '12' else '0')
X_train.lcDisagreement = X_train.lcDisagreement.apply(lambda x: '1' if x == '1' else '0')
X_train.caseSource = X_train.caseSource.apply(lambda x :1 if x in ['300', '301', '301'] else '0')
X_train.caseOrigin = X_train.caseOrigin.apply(lambda x :1 if x in ['300', '301', '301'] else '0')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [291]:
X_train_full = X_train
X_train = X_train_full.iloc[:, 2:]

In [292]:
ohc = OneHotEncoder(sparse=False)
ohc.fit(X_train)
X_train = ohc.transform(X_train)


In [293]:
df_test = pd.read_csv('case_level_test_set.csv', encoding='ISO 8859-1', dtype='object', index_col=0)
for col in dropcols:
    df_test.drop(col, axis=1, inplace=True)
df_test = df_test[df_test.decisionDirection != '3']
df_test.dropna(axis=0, inplace=True)
y_test = df_test[target]
X_test = df_test[[col for col in df_test.columns if col != target]]
X_test.issueArea = X_test.issueArea.apply(lambda x: issue_consolidate(x))
X_test.petitioner = X_test.petitioner.apply(lambda x: party_categorizer(int(x)))                                  
X_test.respondent = X_test.respondent.apply(lambda x: party_categorizer(int(x)))
X_test.jurisdiction = X_test.jurisdiction.apply(lambda x: '1' if x=='1' else '0')
X_test.certReason = X_test.certReason.apply(lambda x: '1' if x == '1' or x == '12' else '0')
X_test.lcDisagreement = X_test.lcDisagreement.apply(lambda x: '1' if x == '1' else '0')
X_test.caseSource = X_test.caseSource.apply(lambda x :1 if x in ['300', '301', '301'] else '0')
X_test.caseOrigin = X_test.caseOrigin.apply(lambda x :1 if x in ['300', '301', '301'] else '0')
X_test_full = X_test
X_test = X_test_full.iloc[:, 2:]
X_test = ohc.transform(X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [294]:
y_train = y_train.apply(lambda x: 1 if x == '2' else 0)
y_test = y_test.apply(lambda x: 1 if x == '2' else 0)
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)
y_test.shape[1]

2

In [295]:
model = Sequential()
model.add(Dense(X_train.shape[1], input_dim=X_train.shape[1], activation='relu')) ## start with neurons=features
model.add(Dense(20, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(2, activation='softmax'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['binary_accuracy'])#

In [296]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=20, batch_size=100)

Train on 18904 samples, validate on 4658 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x11c298780>

In [297]:
model.evaluate(X_test, y_test)



[0.61883426116944995, 0.66552168307462034]

In [298]:
model.metrics_names

['loss', 'binary_accuracy']

In [244]:
y_hat = [np.round(x, 0) for x in model.predict(X_test)]

In [245]:
y_hat

[array([ 1.,  0.], dtype=float32),
 array([ 1.,  0.], dtype=float32),
 array([ 0.,  1.], dtype=float32),
 array([ 0.,  1.], dtype=float32),
 array([ 0.,  1.], dtype=float32),
 array([ 1.,  0.], dtype=float32),
 array([ 1.,  0.], dtype=float32),
 array([ 1.,  0.], dtype=float32),
 array([ 0.,  1.], dtype=float32),
 array([ 0.,  1.], dtype=float32),
 array([ 1.,  0.], dtype=float32),
 array([ 1.,  0.], dtype=float32),
 array([ 1.,  0.], dtype=float32),
 array([ 1.,  0.], dtype=float32),
 array([ 1.,  0.], dtype=float32),
 array([ 0.,  1.], dtype=float32),
 array([ 1.,  0.], dtype=float32),
 array([ 0.,  1.], dtype=float32),
 array([ 1.,  0.], dtype=float32),
 array([ 0.,  1.], dtype=float32),
 array([ 1.,  0.], dtype=float32),
 array([ 1.,  0.], dtype=float32),
 array([ 0.,  1.], dtype=float32),
 array([ 1.,  0.], dtype=float32),
 array([ 1.,  0.], dtype=float32),
 array([ 0.,  1.], dtype=float32),
 array([ 1.,  0.], dtype=float32),
 array([ 0.,  1.], dtype=float32),
 array([ 1.,  0.], d

In [246]:
model.predict(X_test)

array([[ 0.7156027 ,  0.2843973 ],
       [ 0.68066657,  0.31933343],
       [ 0.28839904,  0.71160096],
       ..., 
       [ 0.33292359,  0.66707641],
       [ 0.35954827,  0.64045179],
       [ 0.57928586,  0.42071411]], dtype=float32)

In [247]:
y_test

array([[ 0.,  1.],
       [ 0.,  1.],
       [ 0.,  1.],
       ..., 
       [ 0.,  1.],
       [ 0.,  1.],
       [ 0.,  1.]])