In [1]:
import pandas as pd 
import json
import os
import numpy as np
import re
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import punkt
import nltk
from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import recall_score, accuracy_score, confusion_matrix
import string
from nltk.probability import FreqDist
import seaborn as sns
pd.options.display.max_rows = 999
pd.options.display.max_columns = 30
import lexnlp as lnlp
import src
import importlib
import unidecode as unidecode
importlib.reload(src)
%matplotlib inline

# NLP Feature Engineering and EDA

In [5]:
final_df = pd.read_csv("data/Final_Merge.csv")

## Tokenizing all that sweet sweet text


In [6]:
tokenizer = RegexpTokenizer(r'[a-zA-Z0-9!]+')

final_df.text = final_df.text.apply(lambda x: tokenizer.tokenize(x))

In [7]:
lemmatizer = WordNetLemmatizer() 

def lemm_text(words):
    lem = []
    for word in words:
        lem.append(lemmatizer.lemmatize(word))
    return lem  

In [8]:
final_df.text = final_df.text.apply(lambda x: lemm_text(x))

In [9]:
final_df.head()

Unnamed: 0.1,Unnamed: 0,case,text,target,lib_or_con,majVotes
0,0,352us282,"[may, it, please, the, court, this, case, be, ...",1,2.0,6
1,1,353us586,"[mr, chief, justice, if, the, court, please, w...",1,2.0,4
2,5,352us249,"[if, the, court, please, you, might, wait, jus...",0,2.0,5
3,9,354us147,"[mr, chief, justice, if, the, court, please, t...",0,2.0,5
4,10,352us407,"[mr, chief, justice, may, it, please, the, cou...",1,1.0,6


# Modeling 


I will be using accuracy as my metric as this model will have little actionable impact, its not important to maximise other metrics. The more accurate the model the better a data point it will be for legal prognosticators.

## Train Test Split

I used sklearn to get the tfidf scores for my data. I tried a couple of different ngram ranges and 1-3 seemed to be the best.

In [10]:
final_df.text = final_df.text.apply(lambda x: ','.join(x))
final_df.text =final_df.text.apply(lambda x: x.replace(',',' '))

In [11]:
X = final_df.text
Y = final_df.target
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


In [12]:
sw_list = stopwords.words('english')
sw_list += list(string.punctuation)
sw_list += ["''", '""', '...', '``', '’', '“', '’', '”', '‘', '‘', '©',
            'said', 'one', 'com','-', '–', '—', 'co', 'wa', 'ha', '1', 'amp']
sw_set = set(sw_list)

In [13]:
X_train_lem, X_test_lem, y_train_lem, y_test_lem = train_test_split(X, Y, test_size=0.20, random_state=34)
tfidf = TfidfVectorizer(ngram_range= (1,3), stop_words= sw_set)

tfidf_data_train_lem = tfidf.fit_transform(X_train_lem)
tfidf_data_test_lem = tfidf.transform(X_test_lem)

## Untuned RFC (baseline)

I will use an RFC as my baseline model. It's generally good for NLP classification tasks and not overly difficult to implement untuned.

In [14]:
rf_classifier_lem = RandomForestClassifier(n_estimators=100, random_state=0, class_weight= 'balanced')

In [15]:
rf_classifier_lem.fit(tfidf_data_train_lem, y_train_lem)

rf_test_preds_lem = rf_classifier_lem.predict(tfidf_data_test_lem)

In [16]:
from sklearn.metrics import f1_score
rf_acc_score_lem = accuracy_score(y_test_lem, rf_test_preds_lem)
rf_f1_score_lem = f1_score(y_test_lem, rf_test_preds_lem)
print('Random Forest with Lemmatization Features')
print("Testing Accuracy: {:.4}".format(rf_acc_score_lem))
print()
print("F1 Score: {:.4}".format(rf_f1_score_lem))

Random Forest with Lemmatization Features
Testing Accuracy: 0.5793

F1 Score: 0.7105


In [29]:
confusion = confusion_matrix(y_test_lem, rf_test_preds_lem)
TP = confusion[1, 1]
TN = confusion[0, 0]
FP = confusion[0, 1]
FN = confusion[1, 0]
print( f'True Positives: {TP}\n'
       f'True Negatives: {TN}\n', 
      f'False Positives: {FP}\n',
      f'False Negatives: {FN}\n')

True Positives: 254
True Negatives: 31
 False Positives: 175
 False Negatives: 32



Based on the confusion matrix it seems like the RFC is way over predicting that the petitioner will win. It's interesting that the model is less accurate than predicting the petitioner will always win, yet it also has a decent F1 score. This and the confusion matrix make me think there might be a way to make an NLP model work, though I am a long way off.

## Multinomial Naive Bayes Classifier

In [26]:
from sklearn.naive_bayes import MultinomialNB

In [27]:
nb_classifier = MultinomialNB(alpha = .01)

In [28]:
nb_classifier.fit(tfidf_data_train_lem, y_train_lem)
nb_test_preds_lem = nb_classifier.predict(tfidf_data_test_lem)


In [21]:
nb_acc_score_lem = accuracy_score(y_test_lem, nb_test_preds_lem)
nb_f1_score_lem = f1_score(y_test_lem, nb_test_preds_lem)

In [22]:
print('Naive Bayes with Lemmatization Features')
print("Testing Accuracy: {:.4}".format(nb_acc_score_lem))
print()
print("F1 Score: {:.4}".format(nb_f1_score_lem))

Naive Bayes with Lemmatization Features
Testing Accuracy: 0.5813

F1 Score: 0.7289


The Naive Bayes model did a little better but not a by a huge margin. It is about the same as predicting the dominant class wil always win (petitioner). Since this model is faster and does better, it is now the front runner.

In [23]:
confusion = confusion_matrix(y_test_lem, rf_test_preds_lem)
TP = confusion[1, 1]
TN = confusion[0, 0]
FP = confusion[0, 1]
FN = confusion[1, 0]
print( f'True Positives: {TP}\n'
       f'True Negatives: {TN}\n', 
      f'False Positives: {FP}\n',
      f'False Negatives: {FN}\n')

True Positives: 254
True Negatives: 31
 False Positives: 175
 False Negatives: 32



In [24]:
indices = np.argsort(tfidf.idf_)[::-1]
features = tfidf.get_feature_names()
top_n = 10
top_features = [features[i] for i in indices[:top_n]]
print (top_features)

['zwiener question obscenity', 'footnote actually bolster', 'footnote agency', 'footnote advertisement thereby', 'footnote advertisement', 'footnote adopt effect', 'footnote adopt', 'footnote admits take', 'footnote admits', 'footnote admissibility disclosure']


In [25]:
feature_names = tfidf.get_feature_names()
top10 = np.argsort(nb_classifier.coef_[0])[-10:]

print(" ".join(feature_names[j] for j in top10))

honor make right well state think say would case court


## Tuned RFC

I dont have high expectations but lets see if a grid search RCF can do any better before we fully decide on using the NB classifier. 

In [30]:
from sklearn.model_selection import GridSearchCV

In [36]:
param_grid = { 
    'n_estimators': [10, 20, 50, 100],
    'criterion': ['gini'],
    'max_depth': range(2,10),
    'max_features': ['auto']
}

In [37]:
grid_tree =GridSearchCV(RandomForestClassifier(), param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

In [39]:
grid_tree.fit(tfidf_data_train_lem, y_train_lem)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:  7.3min finished


GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini'], 'max_depth': range(2, 10),
                         'max_features': ['auto'],
                         'n_estimators': [10, 20, 50, 100]},
             scoring='accuracy', verbose=1)

In [42]:
grfc_preds =grid_tree.predict(tfidf_data_test_lem)
accuracy_score(grfc_preds, y_test_lem)

0.5813008130081301

In [43]:
grid_tree.best_params_

{'criterion': 'gini',
 'max_depth': 7,
 'max_features': 'auto',
 'n_estimators': 20}

This model is doing about exactly the same as the NB classifier, and about the same as just guessing the petitioner will win, none of these are proving to be accurate. 

## SVC

In [47]:
from sklearn.svm import SVC

In [48]:
svc = SVC(class_weight= 'balanced')

In [49]:
svc.fit(X= tfidf_data_train_lem, y= y_train_lem)

SVC(class_weight='balanced')

In [50]:
preds = svc.predict(tfidf_data_test_lem)

In [51]:
accuracy_score(preds, y_test_lem)

0.5914634146341463

This model is the best, in fact its 1% more accurate then just guessing that the petitioner won! Not great but since it seems the most promising I will try tuning its hyperparameters and see if I can squeeze some more juice out of it and cross validate my results.

In [52]:
param_gridsvc = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid']}

In [53]:
grid_svc = GridSearchCV(SVC(), param_gridsvc, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)

In [55]:
grid_svc.fit(tfidf_data_train_lem, y_train_lem)

Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 18.5min
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed: 62.0min finished


GridSearchCV(cv=3, estimator=SVC(), n_jobs=-1,
             param_grid={'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001],
                         'kernel': ['rbf', 'poly', 'sigmoid']},
             scoring='accuracy', verbose=1)

In [57]:
predsvc = grid_svc.predict(tfidf_data_test_lem)

In [58]:
accuracy_score(predsvc, y_test_lem)

0.5813008130081301

Many roads lead to the same place. I think I will stick with the nb classifier at this point as it is fast and competitive, but none of the models are amazing.

# W2V

Lets see if using W2V and getting the mean vector for each argument leads to a better model.

In [486]:
vocab = df.text.map(word_tokenize)

In [487]:
from gensim.models import Word2Vec

model = Word2Vec(vocab, size=100, window=5, min_count=1, workers=4)

model.train(vocab, total_examples=model.corpus_count, epochs= 10)

(237773039, 795415180)

In [490]:
wtv = dict(zip(model.wv.index2word, model.wv.syn0))

  """Entry point for launching an IPython kernel.


In [491]:
class W2vVectorizer(object):
    
    def __init__(self, w2v):
        # Takes in a dictionary of words and vectors as input
        self.w2v = w2v
        if len(w2v) == 0:
            self.dimensions = 0
        else:
            self.dimensions = len(w2v[next(iter(wtv))])
    
    # Note: Even though it doesn't do anything, it's required that this object implement a fit method or else
    # it can't be used in a scikit-learn pipeline  
    def fit(self, X, y):
        return self
            
    def transform(self, X):
        return np.array([
            np.mean([self.w2v[w] for w in words if w in self.w2v]
                   or [np.zeros(self.dimensions)], axis=0) for words in X])

In [493]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

rf =  Pipeline([('Word2Vec Vectorizer', W2vVectorizer(wtv)),
              ('Random Forest', RandomForestClassifier(n_estimators=100, verbose=True))])
svc = Pipeline([('Word2Vec Vectorizer', W2vVectorizer(wtv)),
                ('Support Vector Machine', SVC())])
lr = Pipeline([('Word2Vec Vectorizer', W2vVectorizer(wtv)),
              ('Logistic Regression', LogisticRegression(max_iter= 1000))])


In [494]:
models = [('Random Forest', rf),
          ('Support Vector Machine', svc),
          ('Logistic Regression', lr)]

In [500]:
scores = [(name, cross_val_score(model, final_df.text, final_df['target'], cv=2).mean()) for name, model, in models]

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    1.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    1.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


In [501]:
scores

[('Random Forest', 0.5565960912052117),
 ('Support Vector Machine', 0.5842833876221498),
 ('Logistic Regression', 0.5842833876221498)]

In [10]:
lgr = LogisticRegression(class_weight= 'balanced')

W2V vectors are not any better than using TFIDF scores and involve a more complicated pipeline proccess so I would still recommend using the NB classifier model.

# Conclusions

All these models trained on my language data worked rather poorly. While this is a difficult thing to predict, there was no improvement over simply guessing the dominant class everytime. I would not reccomend using NLP to predict Supreme Court Cases. Other researchers have had far more success using other factors of the case in machine learning algorithims. 

# Using These Predictions to Supplement Another Model

Since I had little success with just NLP data, I have decided to use my predictions as a feature in a model based on categorical variables associated with the case.

In [61]:
X_all = tfidf.transform(X)

In [64]:
probs = nb_classifier.predict_proba(X_all)

In [66]:
probs = list(probs)

In [88]:
df_probs = pd.DataFrame(probs)
df_probs.columns = ['pwin', 'ploss']

In [89]:
final_df.head()

Unnamed: 0.1,Unnamed: 0,case,text,target,lib_or_con,majVotes
0,0,352us282,may it please the court this case be here on a...,1,2.0,6
1,1,353us586,mr chief justice if the court please when the ...,1,2.0,4
2,5,352us249,if the court please you might wait just a mome...,0,2.0,5
3,9,354us147,mr chief justice if the court please this be a...,0,2.0,5
4,10,352us407,mr chief justice may it please the court this ...,1,1.0,6


In [90]:
df_probs = df_probs.merge(final_df, left_index = True, right_index= True)

In [93]:
df_probs =df_probs[['case', 'pwin', 'ploss']]

In [94]:
df_probs.to_csv('probs_case.csv', index = False)