In [1]:
import pandas as pd 
import json
import os
import numpy as np
import re
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import punkt
import nltk
from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import recall_score, accuracy_score, confusion_matrix
import string
from nltk.probability import FreqDist
import seaborn as sns
pd.options.display.max_rows = 999
pd.options.display.max_columns = 30
import lexnlp as lnlp
import src
from src import *
import importlib
import unidecode as unidecode
importlib.reload(src)
%matplotlib inline

# NLP Feature Engineering 

In [2]:
final_df = pd.read_csv("../data/Final_Merge.csv")#importing the text data I cleaned
final_df.head()#checking it out

Unnamed: 0,case,text,target,lib_or_con,majVotes
0,352us282,may it please the court this case be here on a...,1,2.0,6
1,353us586,mr chief justice if the court please when the ...,1,2.0,4
2,352us599,mr chief justice and associate justice of the ...,0,1.0,8
3,352us82,may it please the court mr williams this matte...,0,2.0,8
4,352us220,may it please the court mr plauche you may pro...,0,1.0,8


## Tokenizing all that sweet sweet text


In [3]:
tokenizer = RegexpTokenizer(r'[a-zA-Z0-9!]+')#instaniating a reg ex tokenizer

final_df.text = final_df.text.apply(lambda x: tokenizer.tokenize(x)) #applying it to the text column

## Lemmatizing

In [4]:
#using a custom function to lemmatize my text data
final_df.text = final_df.text.apply(lambda x: src.lemm_text(x))

In [5]:
#checking my work
final_df.head()

Unnamed: 0,case,text,target,lib_or_con,majVotes
0,352us282,"[may, it, please, the, court, this, case, be, ...",1,2.0,6
1,353us586,"[mr, chief, justice, if, the, court, please, w...",1,2.0,4
2,352us599,"[mr, chief, justice, and, associate, justice, ...",0,1.0,8
3,352us82,"[may, it, please, the, court, mr, williams, th...",0,2.0,8
4,352us220,"[may, it, please, the, court, mr, plauche, you...",0,1.0,8


# Modeling 


I will be using accuracy as my metric as this model will have little actionable impact, its not important to maximise other metrics. The more accurate the model the better a data point it will be for legal prognosticators.

## Train Test Split and vectorizing with TFIDF scores

I used sklearn to get the tfidf scores for my data. I tried a couple of different ngram ranges and 1-3 seemed to be the best.

In [6]:
#to use the tfidf model as I know it, I rejoined the lemmatized words into one string
final_df.text = final_df.text.apply(lambda x: ','.join(x)) 
final_df.text =final_df.text.apply(lambda x: x.replace(',',' '))

In [7]:
X = final_df.text#assigning my features
Y = final_df.target#and my targets
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


In [18]:
#creating stopwords, I included some legal ones, I could have done this in my src file 
#and imported them, but it was easier to expirement with different options by generating
#them within the notebook
sw_list = stopwords.words('english')
sw_list += list(string.punctuation)
sw_list += ["''", '""', '...', '``', '’', '“', '’', '”', '‘', '‘', '©',
        'said', 'one', 'com','-', '–', '—', 'co', 'wa', 'ha', '1', 'amp',
       'said', 'one', 'com', 'http', '-', '–', '—', 'co', 'wa', 'ha', '1', 'amp','court', 'would', 'case', 'say', 'think', 'state', 'well', 'make','right', 'question', 'mr', 'go', 'could', 'statute', 'yes','honor', 'fact', 'justice', 'law', 'time', 'may','whether', 'take', 'get', 'act', 'know', 'point', 'issue', 'first', 'rule', 'give', 'government', 'federal', 'two', 'congress', 'judge','appeal', 'district','mean','use' 'may', 
        'it', 'please', 'the', 'court', 'justice', 'thank', 'you', 'mrs']  
sw_set = set(sw_list)

In [19]:
#train test split
X_train_lem, X_test_lem, y_train_lem, y_test_lem = train_test_split(X, Y, test_size=0.6, random_state=34)
#vectorizing with TFIDF scores
tfidf = TfidfVectorizer(ngram_range= (1,3), stop_words= sw_set)#I tried different ranges of ngrams and 1-3 was the best
#transforming the data 
tfidf_data_train_lem = tfidf.fit_transform(X_train_lem)
tfidf_data_test_lem = tfidf.transform(X_test_lem)

### top ten TFIDF scores

In [20]:
indices = np.argsort(tfidf.idf_)[::-1]#grabbing the top ten tfidf score indices
features = tfidf.get_feature_names() #getting the feature names
top_n = 10
top_features = [features[i] for i in indices[:top_n]] # matching them up 
print (top_features)

['zywicki grace several', 'finding adopt proper', 'finding administrator industry', 'finding admit necessary', 'finding admit trial', 'advertiser spend extra', 'finding adopt gordon', 'finding adopt ninth', 'finding adopt trial', 'finding administrative procedure']


The main interesting word here is forbid, it's featured in nine of the top ten tri grams, bigrams or words. Furthermore, almost every high tfidf word is a trigram with only three bigrams in the top ten. Obscenity and free speech show up which are interrelated issues and were also more likely to be winning issues for the petitioners. 

## Untuned RFC (baseline)

I will use an RFC as my baseline model. It's generally good for NLP classification tasks and not overly difficult to implement untuned.

In [21]:
#instantiating the model
rf_classifier_lem = RandomForestClassifier(n_estimators=10, random_state=0, class_weight= 'balanced')


In [22]:
#fitting
rf_classifier_lem.fit(tfidf_data_train_lem, y_train_lem)
#generating test predictions
rf_test_preds_lem = rf_classifier_lem.predict(tfidf_data_test_lem)
#generating train predictions
rf_train_preds_lem = rf_classifier_lem.predict(tfidf_data_train_lem)

### Evaulating test predictions

In [23]:
from sklearn.metrics import f1_score
rf_acc_score_lem = accuracy_score(y_test_lem, rf_test_preds_lem)#accuracy score
rf_f1_score_lem = f1_score(y_test_lem, rf_test_preds_lem)#F1 score
print('Random Forest with Lemmatization Features')
print("Testing Accuracy: {:.4}".format(rf_acc_score_lem))
print()
print("F1 Score: {:.4}".format(rf_f1_score_lem))

Random Forest with Lemmatization Features
Testing Accuracy: 0.5914

F1 Score: 0.7004


In [24]:
confusion = confusion_matrix(y_test_lem, rf_test_preds_lem)#generating a confusion matrix
TP = confusion[1, 1]
TN = confusion[0, 0]
FP = confusion[0, 1]
FN = confusion[1, 0]
#printing it in the format which I find most readable, rather than the graphic version
print( f'True Positives: {TP}\n'
       f'True Negatives: {TN}\n', 
      f'False Positives: {FP}\n',
      f'False Negatives: {FN}\n')

True Positives: 1698
True Negatives: 405
 False Positives: 879
 False Negatives: 574



### Evaulating training predictions

In [15]:
rf_acc_score_lem_t = accuracy_score(y_train_lem, rf_train_preds_lem)#accuracy score
rf_f1_score_lem_t = f1_score(y_train_lem, rf_train_preds_lem)# F1 score
print('Random Forest with Lemmatization Features')
print("Testing Accuracy: {:.4}".format(rf_acc_score_lem_t))
print()
print("F1 Score: {:.4}".format(rf_f1_score_lem_t))

Random Forest with Lemmatization Features
Testing Accuracy: 0.9899

F1 Score: 0.992


In [16]:
confusion = confusion_matrix(y_train_lem, rf_train_preds_lem) #generating a confusion matrix
TP = confusion[1, 1]
TN = confusion[0, 0]
FP = confusion[0, 1]
FN = confusion[1, 0]
#printing it in the format which I find most readable, rather than the graphic version
print( f'True Positives: {TP}\n'
       f'True Negatives: {TN}\n', 
      f'False Positives: {FP}\n',
      f'False Negatives: {FN}\n')

True Positives: 1485
True Negatives: 861
 False Positives: 15
 False Negatives: 9



### Analysis

Seems like the baseline model is way way way overfit, I will try some different models and see if they have similar problems, then try a gridsearch cv to try to adress this problem.

## Multinomial Naive Bayes Classifier

In [25]:
from sklearn.naive_bayes import MultinomialNB 

In [26]:
nb_classifier = MultinomialNB(alpha = .1)#instantiating a multinomial naive bayes model 

In [27]:
nb_classifier.fit(tfidf_data_train_lem, y_train_lem) #fitting


MultinomialNB(alpha=0.1)

In [28]:
nb_test_preds_lem = nb_classifier.predict(tfidf_data_test_lem)
nb_train_preds_lem = nb_classifier.predict(tfidf_data_train_lem)


In [29]:
nb_acc_score_lem = accuracy_score(y_test_lem, nb_test_preds_lem)
nb_f1_score_lem = f1_score(y_test_lem, nb_test_preds_lem)

In [30]:
print('Naive Bayes with Lemmatization Features')
print("Testing Accuracy: {:.4}".format(nb_acc_score_lem))
print()
print("F1 Score: {:.4}".format(nb_f1_score_lem))

Naive Bayes with Lemmatization Features
Testing Accuracy: 0.6378

F1 Score: 0.7787


The Naive Bayes model did a little better on the testing data but not a by a huge margin. It is about the same as predicting the dominant class wil always win (petitioner. 

In [31]:
confusion = confusion_matrix(y_test_lem, rf_test_preds_lem)
TP = confusion[1, 1]
TN = confusion[0, 0]
FP = confusion[0, 1]
FN = confusion[1, 0]
print( f'True Positives: {TP}\n'
       f'True Negatives: {TN}\n', 
      f'False Positives: {FP}\n',
      f'False Negatives: {FN}\n')

True Positives: 1698
True Negatives: 405
 False Positives: 879
 False Negatives: 574



In [32]:
nb_acc_score_lem_t = accuracy_score(y_train_lem, nb_train_preds_lem)
nb_f1_score_lem_t = f1_score(y_train_lem, nb_train_preds_lem)

In [33]:
print('Naive Bayes with Lemmatization Features')
print("Testing Accuracy: {:.4}".format(nb_acc_score_lem_t))
print()
print("F1 Score: {:.4}".format(nb_f1_score_lem))

Naive Bayes with Lemmatization Features
Testing Accuracy: 0.9975

F1 Score: 0.7787


In [34]:
confusion = confusion_matrix(y_train_lem, nb_train_preds_lem)
TP = confusion[1, 1]
TN = confusion[0, 0]
FP = confusion[0, 1]
FN = confusion[1, 0]
print( f'True Positives: {TP}\n'
       f'True Negatives: {TN}\n', 
      f'False Positives: {FP}\n',
      f'False Negatives: {FN}\n')

True Positives: 1494
True Negatives: 870
 False Positives: 6
 False Negatives: 0



Still having the same drastic overfitting issues with this model 

### Top Ten Most Imporant Features

In [35]:
feature_names = tfidf.get_feature_names()
top10 = np.argsort(nb_classifier.coef_[0])[-10:]

print(" ".join(feature_names[j] for j in top10))

commission decision evidence defendant union section claim trial tax jury


These are largely the same as the most common words I found in my EDA. I thought that with TFIDF scores and modeling there would be more differences.

## Tuned RFC

I dont have high expectations but lets see if a grid search RCF can do any better before we fully decide on using the NB classifier. 

In [36]:
from sklearn.model_selection import GridSearchCV

In [37]:
param_grid = { 
    'n_estimators': [10, 20, 50, 100],
    'criterion': ['gini'],
    'max_depth': range(2,10),
    'max_features': ['auto']
}

In [38]:
grid_tree =GridSearchCV(RandomForestClassifier(), param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

In [39]:
grid_tree.fit(tfidf_data_train_lem, y_train_lem)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:  5.7min finished


GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini'], 'max_depth': range(2, 10),
                         'max_features': ['auto'],
                         'n_estimators': [10, 20, 50, 100]},
             scoring='accuracy', verbose=1)

In [40]:
grfc_preds_test =grid_tree.predict(tfidf_data_test_lem)
grfc_preds_train = grid_tree.predict(tfidf_data_train_lem)


In [42]:
accuracy_score_test = accuracy_score(grfc_preds_test, y_test_lem)
f1_score_test = f1_score(grfc_preds_test, y_test_lem)

In [43]:
print('Grid searched Random Forest Classifier with Lemmatization Features')
print("Testing Accuracy: {:.4}".format(accuracy_score_test))
print()
print("F1 Score: {:.4}".format(f1_score_test))

Grid searched Random Forest Classifier with Lemmatization Features
Testing Accuracy: 0.64

F1 Score: 0.7795


In [44]:
confusion = confusion_matrix(y_test_lem, grfc_preds_test)
TP = confusion[1, 1]
TN = confusion[0, 0]
FP = confusion[0, 1]
FN = confusion[1, 0]
print( f'True Positives: {TP}\n'
       f'True Negatives: {TN}\n', 
      f'False Positives: {FP}\n',
      f'False Negatives: {FN}\n')

True Positives: 2263
True Negatives: 13
 False Positives: 1271
 False Negatives: 9



Unfortunately this model is only predicting the dominant class 

In [45]:
accuracy_score_train = accuracy_score(grfc_preds_train, y_train_lem)
f1_score_train = f1_score(grfc_preds_train, y_train_lem)

In [46]:
print('Grid searched Random Forest Classifier with Lemmatization Features')
print("Training Accuracy: {:.4}".format(accuracy_score_train))
print()
print("F1 Score: {:.4}".format(f1_score_train))

Grid searched Random Forest Classifier with Lemmatization Features
Testing Accuracy: 0.6401

F1 Score: 0.7779


In [47]:
confusion = confusion_matrix(y_train_lem, grfc_preds_train)
TP = confusion[1, 1]
TN = confusion[0, 0]
FP = confusion[0, 1]
FN = confusion[1, 0]
print( f'True Positives: {TP}\n'
       f'True Negatives: {TN}\n', 
      f'False Positives: {FP}\n',
      f'False Negatives: {FN}\n')

True Positives: 1494
True Negatives: 23
 False Positives: 853
 False Negatives: 0



It does the same on the training data.

In [80]:
grid_tree.best_params_

{'criterion': 'gini',
 'max_depth': 3,
 'max_features': 'auto',
 'n_estimators': 20}

It looks like the grid search proccess adressed the overfitting by increasing the bias by lowering the max depth and estimators. This eventually must have led to just predicting that the petitioner would win, This indicates to me that modeling based on text data may be pretty much impossible without a sequential nueral net. I will try a few more models with the straight up TFIDF score but I do not think they will be any better. 

## Support Vector Classifier

In [41]:
from sklearn.svm import SVC

In [42]:
svc = SVC(class_weight= 'balanced')

In [43]:
svc.fit(X= tfidf_data_train_lem, y= y_train_lem)

SVC(class_weight='balanced')

In [44]:
SVC_preds_test = svc.predict(tfidf_data_test_lem)
SVC_preds_train = svc.predict(tfidf_data_train_lem)

In [49]:
svc_acc_test = accuracy_score(SVC_preds_test, y_test_lem)
svc_f1_test = f1_score(SVC_preds_test, y_test_lem)

In [50]:
print('Grid searched Random Forest Classifier with Lemmatization Features')
print("Testing Accuracy: {:.4}".format(svc_acc_test))
print()
print("F1 Score: {:.4}".format(svc_f1_test))

Grid searched Random Forest Classifier with Lemmatization Features
Testing Accuracy: 0.5672

F1 Score: 0.6577


In [52]:
confusion = confusion_matrix(y_test_lem, SVC_preds_test)
TP = confusion[1, 1]
TN = confusion[0, 0]
FP = confusion[0, 1]
FN = confusion[1, 0]
print( f'True Positives: {TP}\n'
       f'True Negatives: {TN}\n', 
      f'False Positives: {FP}\n',
      f'False Negatives: {FN}\n')

True Positives: 613
True Negatives: 223
 False Positives: 379
 False Negatives: 259



In [None]:
print('Grid searched Random Forest Classifier with Lemmatization Features')
print("Testing Accuracy: {:.4}".format(accuracy_score_test))
print()
print("F1 Score: {:.4}".format(f1_score_test))

Many roads lead to the same place. Every model is fairly overfit and the optimized random forest classifier which I gridsearched finds it best to just guess that the petitioner will win 100% of the cases. I'm unsure exactly why the overfitting is happenening, it might be due to the curse of dimensionality and the multitude of words within the dataset. XG boost models are supposed to be good at adressing overfitting so I will give that a shot then give up on these more straight forward TFIDF models.

# W2V

Lets see if using W2V and getting the mean vector for each argument leads to a better model.

In [49]:
vocab = final_df.text.map(word_tokenize) # generating all my tokens in a format good for Word2Vec

In [None]:
from gensim.models import Word2Vec

model = Word2Vec(vocab, size=100, window=5, min_count=1, workers=4) #generating the model 

model.train(vocab, total_examples=model.corpus_count, epochs= 10) # training

In [None]:
wtv = dict(zip(model.wv.index2word, model.wv.syn0)) #making a dictionary for each words vector

In [None]:
# I will use a use a custom word2vecVectorizer class than can get the mean vector for each case then implement that in a pipeline 
#three different models, an RFC, a SVM and a simple Logistic Regression, then display the mean cross val score for
#each model over 5 folds

from sklearn.model_selection import cross_val_score

rf =  Pipeline([('Word2Vec Vectorizer', src.W2vVectorizer(wtv)),
              ('Random Forest', RandomForestClassifier(n_estimators=100, verbose=True))])
svc = Pipeline([('Word2Vec Vectorizer', src.W2vVectorizer(wtv)),
                ('Support Vector Machine', SVC())])
lr = Pipeline([('Word2Vec Vectorizer', srr.W2vVectorizer(wtv)),
              ('Logistic Regression', LogisticRegression(max_iter= 1000))])


In [None]:
models = [('Random Forest', rf),
          ('Support Vector Machine', svc),
          ('Logistic Regression', lr)]

In [None]:
scores = [(name, cross_val_score(model, final_df.text, final_df['target'], cv=2).mean()) for name, model, in models]

In [None]:
scores

I would do more to cross validate these models, but I am starting to get deja vu from these scores, they all hit a hard limit right around where always guessing the dominant class would get you in terms of accuracy. The SVM with the W2V mean vectors for each case does the "best" but is still not good or worth pursuing from an analysis perspective.

# Conclusions

All these models trained on my language data worked rather poorly. While this is a difficult thing to predict, there was no improvement over simply guessing the dominant class everytime. I would not reccomend using NLP to predict Supreme Court Cases. Other researchers have had far more success using other factors of the case in machine learning algorithims. 