# Imports

In [43]:
import pandas as pd 
import json
import os
import numpy as np
import re
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import punkt
import nltk
from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import recall_score, accuracy_score, confusion_matrix, f1_score
from sklearn.linear_model import LogisticRegression
import string
from nltk.probability import FreqDist
import seaborn as sns
pd.options.display.max_rows = 999
pd.options.display.max_columns = 30
import lexnlp as lnlp
import src
import importlib
import unidecode as unidecode
importlib.reload(src)
%matplotlib inline


# Train Test Split

In [116]:
df = pd.read_csv('data/SCDB_2020_01_caseCentered_Citation.csv',encoding='cp1252' ) #importing data
df.head() #checking the DataFrame

Unnamed: 0,caseId,docketId,caseIssuesId,voteId,dateDecision,decisionType,usCite,sctCite,ledCite,lexisCite,term,naturalCourt,chief,docket,caseName,...,voteUnclear,issue,issueArea,decisionDirection,decisionDirectionDissent,authorityDecision1,authorityDecision2,lawType,lawSupp,lawMinor,majOpinWriter,majOpinAssigner,splitVote,majVotes,minVotes
0,1946-001,1946-001-01,1946-001-01-01,1946-001-01-01-01,11/18/1946,1,329 U.S. 1,67 S. Ct. 6,91 L. Ed. 3,1946 U.S. LEXIS 1724,1946,1301,Vinson,24,HALLIBURTON OIL WELL CEMENTING CO. v. WALKER e...,...,0.0,80180.0,8.0,2.0,0.0,4.0,,6.0,600.0,35 U.S.C. § 33,78.0,78.0,1,8,1
1,1946-002,1946-002-01,1946-002-01-01,1946-002-01-01-01,11/18/1946,1,329 U.S. 14,67 S. Ct. 13,91 L. Ed. 12,1946 U.S. LEXIS 1725,1946,1301,Vinson,12,CLEVELAND v. UNITED STATES,...,0.0,10500.0,1.0,1.0,0.0,4.0,,6.0,600.0,18 U.S.C. § 398,81.0,87.0,1,6,3
2,1946-003,1946-003-01,1946-003-01-01,1946-003-01-01-01,11/18/1946,1,329 U.S. 29,67 S. Ct. 1,91 L. Ed. 22,1946 U.S. LEXIS 3037,1946,1301,Vinson,21,CHAMPLIN REFINING CO. v. UNITED STATES ET AL.,...,0.0,80250.0,8.0,2.0,0.0,1.0,,2.0,207.0,,84.0,78.0,1,5,4
3,1946-004,1946-004-01,1946-004-01-01,1946-004-01-01-01,11/25/1946,7,329 U.S. 40,67 S. Ct. 167,91 L. Ed. 29,1946 U.S. LEXIS 1696,1946,1301,Vinson,26,UNITED STATES v. ALCEA BAND OF TILLAMOOKS ET AL.,...,0.0,20150.0,2.0,2.0,0.0,4.0,,6.0,600.0,49 Stat. 801,87.0,87.0,1,5,3
4,1946-005,1946-005-01,1946-005-01-01,1946-005-01-01-01,11/25/1946,1,329 U.S. 64,67 S. Ct. 154,91 L. Ed. 44,1946 U.S. LEXIS 2997,1946,1301,Vinson,50,"UNITED STATES v. HOWARD P. FOLEY CO., INC.",...,0.0,80060.0,8.0,2.0,0.0,7.0,,,,,78.0,87.0,1,6,3


In [63]:
#Filtering down to close votes and removing cases where who 
#the outcome was favorable for was unclear.
df = df[df.majVotes < 7]
df = df[df.partyWinning != 2]

### Extracting Important Features Based on my EDA

Since this data is almost all categorical data there are many Nans, they also decided to use floats for some reason. I'm going to extract my features then drop nans to preserve more rows.

In [64]:
#going down to just the factors I want to use based on EDA and intuition
#ie not including variable only known after the opinion is released etc...
df = df[['usCite','lcDispositionDirection', 'issue', 'naturalCourt', 'jurisdiction', 'caseSource', 'petitioner','partyWinning']]

In [19]:
#dropping nans
df.dropna(inplace= True)

In [21]:
#checking for floats
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3518 entries, 1 to 9026
Data columns (total 7 columns):
lcDispositionDirection    3518 non-null float64
issue                     3518 non-null float64
naturalCourt              3518 non-null int64
jurisdiction              3518 non-null float64
caseSource                3518 non-null float64
petitioner                3518 non-null int64
partyWinning              3518 non-null float64
dtypes: float64(5), int64(2)
memory usage: 219.9 KB


In [28]:
#turning floats into ints 
for column in df.columns:
    df[column] = df[column].apply(lambda x: int(x))

In [29]:
#making sure it worked 
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3518 entries, 1 to 9026
Data columns (total 7 columns):
lcDispositionDirection    3518 non-null int64
issue                     3518 non-null int64
naturalCourt              3518 non-null int64
jurisdiction              3518 non-null int64
caseSource                3518 non-null int64
petitioner                3518 non-null int64
partyWinning              3518 non-null int64
dtypes: int64(7)
memory usage: 219.9 KB


In [31]:
#assigning my target to y and the predictive variables to y 
X = df.drop('partyWinning', axis = 1)
Y = df['partyWinning']

In [32]:
#train-test split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=34)

# Logistic Regression Baseline

In [53]:
#instancing a logistic regression model 
lgr = LogisticRegression(penalty = 'l2', class_weight= 'balanced')

In [54]:
#fitting 
lgr.fit(X_train, y_train)

LogisticRegression(class_weight='balanced')

In [55]:

preds = lgr.predict(X_test)
accuracy_score(preds, y_test)

0.4928977272727273

In [56]:
lgr.coef_

array([[ 1.68268731e-05, -2.04735637e-06,  1.30133603e-04,
        -9.29141660e-06, -3.31223364e-04, -4.88717479e-04]])

# RFC

In [37]:
rfc = RandomForestClassifier(n_estimators=100, random_state=0, class_weight= 'balanced')

In [38]:
rfc.fit(X_train, y_train)

RandomForestClassifier(class_weight='balanced', random_state=0)

In [39]:
rfc_preds = rfc.predict(X_test)

In [40]:
accuracy_score(rfc_preds, y_test)

0.6335227272727273

We are officially better than the legal experts!

In [41]:
confusion = confusion_matrix(y_test, rfc_preds)
TP = confusion[1, 1]
TN = confusion[0, 0]
FP = confusion[0, 1]
FN = confusion[1, 0]
print( f'True Positives: {TP}\n'
       f'True Negatives: {TN}\n', 
      f'False Positives: {FP}\n',
      f'False Negatives: {FN}\n')

True Positives: 329
True Negatives: 117
 False Positives: 139
 False Negatives: 119



In [44]:
f1_score(rfc_preds, y_test)

0.7183406113537117

In [59]:
df2 = pd.read_csv("probs_case.csv")

In [60]:
df2.head()

Unnamed: 0,case,pwin,ploss
0,352us282,3.101469e-13,1.0
1,353us586,3.409636e-05,0.9999659
2,352us249,1.0,2.622615e-13
3,354us147,1.0,1.790286e-14
4,352us407,1.686313e-11,1.0


In [71]:
df.rename(columns = {'usCite': 'case'}, inplace = True)

In [74]:
df.case = df['case'].apply(lambda x: str(x).replace('.', '').lower().replace(' ', ''))

In [75]:
df.head()

Unnamed: 0,case,lcDispositionDirection,issue,naturalCourt,jurisdiction,caseSource,petitioner,partyWinning
1,329us14,1.0,10500.0,1301,1.0,30.0,100,0.0
2,329us29,2.0,80250.0,1301,2.0,107.0,209,0.0
3,329us40,2.0,20150.0,1301,1.0,3.0,27,0.0
4,329us64,2.0,80060.0,1301,1.0,3.0,27,1.0
6,329us90,2.0,80120.0,1301,1.0,21.0,148,0.0


In [79]:
df2 = df2.merge(df, on = 'case')

In [91]:
df2 = df2.drop('case', axis = 1)

In [94]:
df2.dropna(inplace= True)

In [95]:
for column in df2.columns:
    df2[column] = df2[column].apply(lambda x: int(x))

In [97]:
X = df2.drop(['partyWinning'], axis = 1)
Y = df2['partyWinning']

In [98]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=34)

In [99]:
rfc.fit(X_train, y_train)

RandomForestClassifier(class_weight='balanced', random_state=0)

In [100]:
rfc_preds = rfc.predict(X_test)

In [101]:
accuracy_score(rfc_preds, y_test)

0.7659137577002053

In [102]:
from sklearn.model_selection import GridSearchCV

In [110]:
param_grid = { 
    'n_estimators': [100, 150, 200, 250],
    'criterion': ['gini'],
    'max_depth': range(9,15),
    'max_features': ['auto']
}

In [111]:
grid_tree =GridSearchCV(RandomForestClassifier(), param_grid, cv=10, scoring='accuracy', verbose=1, n_jobs=-1)

In [112]:
grid_tree.fit(X_train, y_train)

Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   22.6s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  1.3min finished


GridSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini'], 'max_depth': range(9, 15),
                         'max_features': ['auto'],
                         'n_estimators': [100, 150, 200, 250]},
             scoring='accuracy', verbose=1)

In [113]:
rfc_preds = grid_tree.predict(X_test)

In [114]:
accuracy_score(rfc_preds, y_test)

0.7679671457905544

In [115]:
grid_tree.best_params_

{'criterion': 'gini',
 'max_depth': 9,
 'max_features': 'auto',
 'n_estimators': 100}

In [None]:
grid_tree