In [3]:
import pandas as pd 
import json
import os
import numpy as np
import re
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import punkt
import nltk
from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import recall_score, accuracy_score, confusion_matrix
import string
from nltk.probability import FreqDist
import seaborn as sns
pd.options.display.max_rows = 999
pd.options.display.max_columns = 30
import lexnlp as lnlp
import src
from src import *
import importlib
import unidecode as unidecode
importlib.reload(src)
%matplotlib inline

# NLP Feature Engineering 

In [4]:
final_df = pd.read_csv("../data/Final_Merge.csv")
final_df.head()

Unnamed: 0,case,text,target,lib_or_con,majVotes
0,352us282,may it please the court this case be here on a...,1,2.0,6
1,353us586,mr chief justice if the court please when the ...,1,2.0,4
2,352us249,if the court please you might wait just a mome...,0,2.0,5
3,354us147,mr chief justice if the court please this be a...,0,2.0,5
4,352us407,mr chief justice may it please the court this ...,1,1.0,6


## Tokenizing all that sweet sweet text


In [5]:
tokenizer = RegexpTokenizer(r'[a-zA-Z0-9!]+')

final_df.text = final_df.text.apply(lambda x: tokenizer.tokenize(x))

## Lemmatizing

In [6]:
#using my custom function to lemmatize my text data
final_df.text = final_df.text.apply(lambda x: src.lemm_text(x))

In [7]:
#checking my work
final_df.head()

Unnamed: 0,case,text,target,lib_or_con,majVotes
0,352us282,"[may, it, please, the, court, this, case, be, ...",1,2.0,6
1,353us586,"[mr, chief, justice, if, the, court, please, w...",1,2.0,4
2,352us249,"[if, the, court, please, you, might, wait, jus...",0,2.0,5
3,354us147,"[mr, chief, justice, if, the, court, please, t...",0,2.0,5
4,352us407,"[mr, chief, justice, may, it, please, the, cou...",1,1.0,6


In [8]:
def fn_tdm_tfidf(docs, xColNames = None, **kwargs):
    ''' create a term document matrix as pandas DataFrame
    with **kwargs you can pass arguments of CountVectorizer
    if xColNames is given the dataframe gets columns Names'''

    #initialize the  vectorizer
    tf = TfidfVectorizer(**kwargs)
    x1 = tf.fit_transform(docs)
    #create dataFrame
    df = pd.DataFrame(x1.toarray().transpose(), index = tf.get_feature_names())

    if xColNames is not None:
        df.columns = xColNames

    return df

In [11]:
df = fn_tdm_tfidf(final_df.text).transpose()

In [12]:
df.head()

Unnamed: 0,aa,aaa,aacp,aadc,aagency,aah,aals,aamc,aamr,aar,aaron,aaronson,aarp,aaup,ab,...,zoom,zorach,zosianne,zschernig,zucca,zuckerman,zuckman,zur,zurcher,zurich,zweibaum,zweig,zwickler,zwiener,zwiezen
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
df2 = df.drop('text', axis = 1)
df3 = df2.merge(df)

KeyboardInterrupt: 

In [None]:
X = df.drop('target', axis = 1)
Y = df.target

In [None]:
X_train_lem, X_test_lem, y_train_lem, y_test_lem = train_test_split(X, Y, test_size=0.6, random_state=34)

# Modeling 


I will be using accuracy as my metric as this model will have little actionable impact, its not important to maximise other metrics. The more accurate the model the better a data point it will be for legal prognosticators.

## Train Test Split and vectorizing with TFIDF scores

I used sklearn to get the tfidf scores for my data. I tried a couple of different ngram ranges and 1-3 seemed to be the best.

In [10]:
#to use the tfidf model as I know it, I rejoined the lemmatized words into one string
final_df.text = final_df.text.apply(lambda x: ','.join(x)) 
final_df.text =final_df.text.apply(lambda x: x.replace(',',' '))

In [27]:
X = final_df.text#assigning my features
Y = final_df.target#and my targets
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


In [44]:
#creating stopwords, I included some legal ones, I could have done this in my src file 
#and imported them, but it was easier to expirement with different options by generating
#them within the notebook
sw_list = stopwords.words('english')
sw_list += list(string.punctuation)
sw_list += ["''", '""', '...', '``', '’', '“', '’', '”', '‘', '‘', '©',
        'said', 'one', 'com','-', '–', '—', 'co', 'wa', 'ha', '1', 'amp',
        'court', 'would', 'case', 'say', 'think']
sw_set = set(sw_list)

In [104]:
#train test split
X_train_lem, X_test_lem, y_train_lem, y_test_lem = train_test_split(X, Y, test_size=0.6, random_state=34)
#vectorizing with TFIDF scores
tfidf = TfidfVectorizer(ngram_range= (1,3), stop_words= sw_set)
#transforming the data 
tfidf_data_train_lem = tfidf.fit_transform(X_train_lem)
tfidf_data_test_lem = tfidf.transform(X_test_lem)

### top ten TFIDF scores

In [105]:
indices = np.argsort(tfidf.idf_)[::-1]
features = tfidf.get_feature_names()
top_n = 10
top_features = [features[i] for i in indices[:top_n]]
print (top_features)

['zwiener question obscenity', 'forbid kind evidence', 'forbid free', 'forbid form desecration', 'forbid form', 'forbid forbid special', 'forbid forbid district', 'forbid first amendment', 'forbid first', 'forbid federal constitution']


## Untuned RFC (baseline)

I will use an RFC as my baseline model. It's generally good for NLP classification tasks and not overly difficult to implement untuned.

In [112]:
#instantiating the model
rf_classifier_lem = RandomForestClassifier(n_estimators=10, random_state=0, class_weight= 'balanced')


In [113]:
#fitting
rf_classifier_lem.fit(tfidf_data_train_lem, y_train_lem)
#generating test predictions
rf_test_preds_lem = rf_classifier_lem.predict(tfidf_data_test_lem)
#generating train predictions
rf_train_preds_lem = rf_classifier_lem.predict(tfidf_data_train_lem)

### Evaulating test predictions

In [114]:
from sklearn.metrics import f1_score
rf_acc_score_lem = accuracy_score(y_test_lem, rf_test_preds_lem)#accuracy score
rf_f1_score_lem = f1_score(y_test_lem, rf_test_preds_lem)#F1 score
print('Random Forest with Lemmatization Features')
print("Testing Accuracy: {:.4}".format(rf_acc_score_lem))
print()
print("F1 Score: {:.4}".format(rf_f1_score_lem))

Random Forest with Lemmatization Features
Testing Accuracy: 0.5448

F1 Score: 0.6074


In [115]:
confusion = confusion_matrix(y_test_lem, rf_test_preds_lem)#generating a confusion matrix
TP = confusion[1, 1]
TN = confusion[0, 0]
FP = confusion[0, 1]
FN = confusion[1, 0]
#printing it in the format which I find most readable, rather than the graphic version
print( f'True Positives: {TP}\n'
       f'True Negatives: {TN}\n', 
      f'False Positives: {FP}\n',
      f'False Negatives: {FN}\n')

True Positives: 519
True Negatives: 284
 False Positives: 318
 False Negatives: 353



### Evaulating training predictions

In [116]:
rf_acc_score_lem_t = accuracy_score(y_train_lem, rf_train_preds_lem)#accuracy score
rf_f1_score_lem_t = f1_score(y_train_lem, rf_train_preds_lem)# F1 score
print('Random Forest with Lemmatization Features')
print("Testing Accuracy: {:.4}".format(rf_acc_score_lem_t))
print()
print("F1 Score: {:.4}".format(rf_f1_score_lem_t))

Random Forest with Lemmatization Features
Testing Accuracy: 0.9868

F1 Score: 0.9885


In [117]:
confusion = confusion_matrix(y_train_lem, rf_train_preds_lem) #generating a confusion matrix
TP = confusion[1, 1]
TN = confusion[0, 0]
FP = confusion[0, 1]
FN = confusion[1, 0]
#printing it in the format which I find most readable, rather than the graphic version
print( f'True Positives: {TP}\n'
       f'True Negatives: {TN}\n', 
      f'False Positives: {FP}\n',
      f'False Negatives: {FN}\n')

True Positives: 557
True Negatives: 412
 False Positives: 7
 False Negatives: 6



### Analysis

Seems like the baseline model is way way way overfit, I will try some different models and see if they have similar problems, then try a gridsearch cv to try to adress this problem.

## Multinomial Naive Bayes Classifier

In [56]:
from sklearn.naive_bayes import MultinomialNB 

In [118]:
nb_classifier = MultinomialNB(alpha = .1)#instantiating a multinomial naive bayes model 

In [119]:
nb_classifier.fit(tfidf_data_train_lem, y_train_lem) #fitting


MultinomialNB(alpha=0.1)

In [120]:
nb_test_preds_lem = nb_classifier.predict(tfidf_data_test_lem)
nb_train_preds_lem = nb_classifier.predict(tfidf_data_train_lem)


In [121]:
nb_acc_score_lem = accuracy_score(y_test_lem, nb_test_preds_lem)
nb_f1_score_lem = f1_score(y_test_lem, nb_test_preds_lem)

In [122]:
print('Naive Bayes with Lemmatization Features')
print("Testing Accuracy: {:.4}".format(nb_acc_score_lem))
print()
print("F1 Score: {:.4}".format(nb_f1_score_lem))

Naive Bayes with Lemmatization Features
Testing Accuracy: 0.5909

F1 Score: 0.7429


The Naive Bayes model did a little better on the testing data but not a by a huge margin. It is about the same as predicting the dominant class wil always win (petitioner. 

In [123]:
confusion = confusion_matrix(y_test_lem, rf_test_preds_lem)
TP = confusion[1, 1]
TN = confusion[0, 0]
FP = confusion[0, 1]
FN = confusion[1, 0]
print( f'True Positives: {TP}\n'
       f'True Negatives: {TN}\n', 
      f'False Positives: {FP}\n',
      f'False Negatives: {FN}\n')

True Positives: 519
True Negatives: 284
 False Positives: 318
 False Negatives: 353



In [124]:
nb_acc_score_lem_t = accuracy_score(y_train_lem, nb_train_preds_lem)
nb_f1_score_lem_t = f1_score(y_train_lem, nb_train_preds_lem)

In [125]:
print('Naive Bayes with Lemmatization Features')
print("Testing Accuracy: {:.4}".format(nb_acc_score_lem_t))
print()
print("F1 Score: {:.4}".format(nb_f1_score_lem))

Naive Bayes with Lemmatization Features
Testing Accuracy: 1.0

F1 Score: 0.7429


In [126]:
confusion = confusion_matrix(y_train_lem, nb_train_preds_lem)
TP = confusion[1, 1]
TN = confusion[0, 0]
FP = confusion[0, 1]
FN = confusion[1, 0]
print( f'True Positives: {TP}\n'
       f'True Negatives: {TN}\n', 
      f'False Positives: {FP}\n',
      f'False Negatives: {FN}\n')

True Positives: 563
True Negatives: 419
 False Positives: 0
 False Negatives: 0



Still having the same drastic overfitting issues with this model 

### Top Ten Most Imporant Features

In [25]:
feature_names = tfidf.get_feature_names()
top10 = np.argsort(nb_classifier.coef_[0])[-10:]

print(" ".join(feature_names[j] for j in top10))

honor make right well state think say would case court


These are largely the same as the most common words I found in my EDA. I thought that with TFIDF scores and modeling there would be more differences.

## Tuned RFC

I dont have high expectations but lets see if a grid search RCF can do any better before we fully decide on using the NB classifier. 

In [75]:
from sklearn.model_selection import GridSearchCV

In [76]:
param_grid = { 
    'n_estimators': [10, 20, 50, 100],
    'criterion': ['gini'],
    'max_depth': range(2,10),
    'max_features': ['auto']
}

In [77]:
grid_tree =GridSearchCV(RandomForestClassifier(), param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

In [78]:
grid_tree.fit(tfidf_data_train_lem, y_train_lem)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:  8.8min finished


GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini'], 'max_depth': range(2, 10),
                         'max_features': ['auto'],
                         'n_estimators': [10, 20, 50, 100]},
             scoring='accuracy', verbose=1)

In [85]:
grfc_preds_test =grid_tree.predict(tfidf_data_test_lem)
grfc_preds_train = grid_tree.predict(tfidf_data_train_lem)


In [81]:
accuracy_score_test = accuracy_score(grfc_preds, y_test_lem)
f1_score_test = f1_score(grfc_preds, y_test_lem)

In [83]:
print('Grid searched Random Forest Classifier with Lemmatization Features')
print("Testing Accuracy: {:.4}".format(accuracy_score_test))
print()
print("F1 Score: {:.4}".format(f1_score_test))

Grid searched Random Forest Classifier with Lemmatization Features
Testing Accuracy: 0.5813

F1 Score: 0.7352


In [88]:
confusion = confusion_matrix(y_test_lem, grfc_preds_test)
TP = confusion[1, 1]
TN = confusion[0, 0]
FP = confusion[0, 1]
FN = confusion[1, 0]
print( f'True Positives: {TP}\n'
       f'True Negatives: {TN}\n', 
      f'False Positives: {FP}\n',
      f'False Negatives: {FN}\n')

True Positives: 286
True Negatives: 0
 False Positives: 206
 False Negatives: 0



Unfortunately this model is only predicting the dominant class 

In [86]:
accuracy_score_train = accuracy_score(grfc_preds_train, y_train_lem)
f1_score_train = f1_score(grfc_preds_train, y_train_lem)

In [87]:
print('Grid searched Random Forest Classifier with Lemmatization Features')
print("Testing Accuracy: {:.4}".format(accuracy_score_train))
print()
print("F1 Score: {:.4}".format(f1_score_train))

Grid searched Random Forest Classifier with Lemmatization Features
Testing Accuracy: 0.585

F1 Score: 0.7382


In [89]:
confusion = confusion_matrix(y_train_lem, grfc_preds_train)
TP = confusion[1, 1]
TN = confusion[0, 0]
FP = confusion[0, 1]
FN = confusion[1, 0]
print( f'True Positives: {TP}\n'
       f'True Negatives: {TN}\n', 
      f'False Positives: {FP}\n',
      f'False Negatives: {FN}\n')

True Positives: 1149
True Negatives: 0
 False Positives: 815
 False Negatives: 0



It does the same on the training data.

In [80]:
grid_tree.best_params_

{'criterion': 'gini',
 'max_depth': 3,
 'max_features': 'auto',
 'n_estimators': 20}

It looks like the grid search proccess adressed the overfitting by increasing the bias by lowering the max depth and estimators. This eventually must have led to just predicting that the petitioner would win

## SVC

In [127]:
from sklearn.svm import SVC

In [128]:
svc = SVC(class_weight= 'balanced')

In [129]:
svc.fit(X= tfidf_data_train_lem, y= y_train_lem)

SVC(class_weight='balanced')

In [130]:
SVC_preds = svc.predict(tfidf_data_test_lem)
SVC_preds_train = svc.predict(tfidf_data_train_lem)

In [140]:
accuracy_score(SVC_preds, y_test_lem)

0.5630936227951153

In [141]:
accuracy_score(SVC_preds_train, y_train_lem)

1.0

This model is the best, in fact its 1% more accurate then just guessing that the petitioner won! Not great but since it seems the most promising I will try tuning its hyperparameters and see if I can squeeze some more juice out of it and cross validate my results.

In [52]:
param_gridsvc = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid']}

In [53]:
grid_svc = GridSearchCV(SVC(), param_gridsvc, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)

In [55]:
grid_svc.fit(tfidf_data_train_lem, y_train_lem)

Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 18.5min
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed: 62.0min finished


GridSearchCV(cv=3, estimator=SVC(), n_jobs=-1,
             param_grid={'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001],
                         'kernel': ['rbf', 'poly', 'sigmoid']},
             scoring='accuracy', verbose=1)

In [57]:
predsvc = grid_svc.predict(tfidf_data_test_lem)

In [58]:
accuracy_score(predsvc, y_test_lem)

0.5813008130081301

Many roads lead to the same place. I think I will stick with the nb classifier at this point as it is fast and competitive, but none of the models are amazing.

In [90]:
import xgboost as xgb

In [100]:
xg_clf = xgb.XGBClassifier(objective ='binary:logistic', 
                           colsample_bytree = 0.5, 
                           subsample = 0.5,
                           learning_rate = 0.1,
                           max_depth = 2, 
                           alpha = 1, 
                           n_estimators = 10)

In [101]:
xg_clf.fit(tfidf_data_train_lem , y_train_lem)


XGBClassifier(alpha=1, colsample_bytree=0.5, max_depth=2, n_estimators=10,
              subsample=0.5)

In [95]:
xg_test = xg_clf.predict(tfidf_data_test_lem)
xg_train = xg_clf.predict(tfidf_data_train_lem)

In [102]:
xg_accuracy = accuracy_score(y_train_lem, xg_train)
xg_f1 = f1_score(y_test_lem, xg_test)

In [103]:
print('Naive Bayes with Lemmatization Features')
print("Testing Accuracy: {:.4}".format(xg_accuracy))
print()
print("F1 Score: {:.4}".format(xg_f1))

Naive Bayes with Lemmatization Features
Testing Accuracy: 1.0

F1 Score: 0.6591


In [None]:
xg_accuracy_

# W2V

Lets see if using W2V and getting the mean vector for each argument leads to a better model.

In [486]:
vocab = df.text.map(word_tokenize)

In [487]:
from gensim.models import Word2Vec

model = Word2Vec(vocab, size=100, window=5, min_count=1, workers=4)

model.train(vocab, total_examples=model.corpus_count, epochs= 10)

(237773039, 795415180)

In [490]:
wtv = dict(zip(model.wv.index2word, model.wv.syn0))

  """Entry point for launching an IPython kernel.


In [491]:
class W2vVectorizer(object):
    
    def __init__(self, w2v):
        # Takes in a dictionary of words and vectors as input
        self.w2v = w2v
        if len(w2v) == 0:
            self.dimensions = 0
        else:
            self.dimensions = len(w2v[next(iter(wtv))])
    
    # Note: Even though it doesn't do anything, it's required that this object implement a fit method or else
    # it can't be used in a scikit-learn pipeline  
    def fit(self, X, y):
        return self
            
    def transform(self, X):
        return np.array([
            np.mean([self.w2v[w] for w in words if w in self.w2v]
                   or [np.zeros(self.dimensions)], axis=0) for words in X])

In [493]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

rf =  Pipeline([('Word2Vec Vectorizer', W2vVectorizer(wtv)),
              ('Random Forest', RandomForestClassifier(n_estimators=100, verbose=True))])
svc = Pipeline([('Word2Vec Vectorizer', W2vVectorizer(wtv)),
                ('Support Vector Machine', SVC())])
lr = Pipeline([('Word2Vec Vectorizer', W2vVectorizer(wtv)),
              ('Logistic Regression', LogisticRegression(max_iter= 1000))])


In [494]:
models = [('Random Forest', rf),
          ('Support Vector Machine', svc),
          ('Logistic Regression', lr)]

In [500]:
scores = [(name, cross_val_score(model, final_df.text, final_df['target'], cv=2).mean()) for name, model, in models]

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    1.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    1.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


In [501]:
scores

[('Random Forest', 0.5565960912052117),
 ('Support Vector Machine', 0.5842833876221498),
 ('Logistic Regression', 0.5842833876221498)]

In [10]:
lgr = LogisticRegression(class_weight= 'balanced')

W2V vectors are not any better than using TFIDF scores and involve a more complicated pipeline proccess so I would still recommend using the NB classifier model.

# Conclusions

All these models trained on my language data worked rather poorly. While this is a difficult thing to predict, there was no improvement over simply guessing the dominant class everytime. I would not reccomend using NLP to predict Supreme Court Cases. Other researchers have had far more success using other factors of the case in machine learning algorithims. 

# Making the Prediction From my NLP Model into a DataFrame to Use With Non NLP Data

In [133]:
X_all = tfidf.transform(X)#trasforming all the data with tfidf 

In [134]:
probs = nb_classifier.predict_proba(X_all)#predicting with the classifier and calling the probality

In [135]:
probs = list(probs) #turnin them into a list

In [136]:
df_probs = pd.DataFrame(probs)#turning them into a DataFrame
df_probs.columns = ['pwin', 'ploss'] #column names

In [137]:
final_df.head()

Unnamed: 0,case,text,target,lib_or_con,majVotes
0,352us282,may it please the court this case be here on a...,1,2.0,6
1,353us586,mr chief justice if the court please when the ...,1,2.0,4
2,352us249,if the court please you might wait just a mome...,0,2.0,5
3,354us147,mr chief justice if the court please this be a...,0,2.0,5
4,352us407,mr chief justice may it please the court this ...,1,1.0,6


In [138]:
df_probs = df_probs.merge(final_df, left_index = True, right_index= True) #merging with the prior DF, the order is consistent

In [139]:
df_probs.head()

Unnamed: 0,pwin,ploss,case,text,target,lib_or_con,majVotes
0,0.023292,0.976708,352us282,may it please the court this case be here on a...,1,2.0,6
1,0.008857,0.991143,353us586,mr chief justice if the court please when the ...,1,2.0,4
2,0.9911,0.0089,352us249,if the court please you might wait just a mome...,0,2.0,5
3,0.940557,0.059443,354us147,mr chief justice if the court please this be a...,0,2.0,5
4,0.110519,0.889481,352us407,mr chief justice may it please the court this ...,1,1.0,6


In [93]:
df_probs =df_probs[['case', 'pwin', 'ploss']] #getting only the columns I want

In [94]:
df_probs.to_csv('probs_case.csv', index = False)#exporting as a csv