# Imports

In [83]:
import pandas as pd 
import json
import os
import numpy as np
import re
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import punkt
import nltk
from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import recall_score, accuracy_score, confusion_matrix
import string
from nltk.probability import FreqDist
import seaborn as sns
pd.options.display.max_rows = 999
pd.options.display.max_columns = 30
import lexnlp as lnlp
import importlib
import src
from src import *
importlib.reload(src)
%matplotlib inline

# Going from JSON Files to a Useable DataFrame

Here I'm just checking out the case to see what it looks like and making sure I have a good grasp of the file path.

In [2]:
# Load a single case to look at its JSON structure and confirm the path layout:
# data / repo / cases / <year> / <case name>.js
# The `with` block closes the file handle as soon as the JSON has been parsed.
with open('../data/supreme-court-cases/cases/1956/Achilli_v._United_States.js') as f:
    data = json.load(f)

## Making a List of the File Paths for each Case

Now I'm going to loop through all the cases and append each one's name to the file path that will give me access to it.

In [3]:
# The years form the second level of the folder layout: one folder per year
# from 1956 up to and including 2017.
ls_years = list(range(1956, 2018))
# Folder path for each year, in the same order as ls_years.
file_paths = [f'../data/supreme-court-cases/cases/{year}' for year in ls_years]

In [4]:
# Build {year: [case file names]} by listing each year's folder and keeping
# only the entries whose name ends in '.js'.
cases = {}
for year, year_dir in zip(ls_years, file_paths):
    cases[year] = [name for name in os.listdir(year_dir) if name.endswith('.js')]


In [5]:
# Parse every case file found above and collect the resulting JSON objects in one list.
# Each file is opened in a `with` block so its handle is closed right after parsing;
# the previous version left one open handle behind per case.
case_list = []
for year in ls_years:
    for case in cases[year]:
        with open(f'../data/supreme-court-cases/cases/{year}/{case}') as f:
            data = json.load(f)
        case_list.append(data)


## Grabbing Just the Transcripts

Nested further within the list is the case transcript which is what we need for this project.

In [6]:
# Dictionary with the citation of the case as the key and, as the value, the
# 'transcript' field of the first entry in that case's 'caseTranscripts'.
transcript_dict = {}
for case_json in case_list:
    try:
        transcript_dict[case_json['citation']] = case_json['caseTranscripts'][0]['transcript']
    except (LookupError, TypeError):
        # Best effort: a case with a missing key, an empty transcript list or a
        # null field is skipped. Only these lookup failures are swallowed, so
        # interrupts and unrelated bugs are no longer hidden.
        continue

## Further Organizing the Transcripts

Here I made a dictionary keyed by case citation. Each value is a list with one entry per speaking turn, and each entry pairs the speaker's name with the text of what they said.

In [7]:
# Segment each transcript by speaker: for every case, keep a list of
# [lower-cased speaker name, list of that turn's texts] entries.
# 'cleanText' is the field in the JSON that holds the words in a cleaner format.
speaker_cleanText = {}
for citation, turns in transcript_dict.items():
    speaker_cleanText[citation] = [
        [turn['speakerName'].lower(), [text_obj['cleanText'] for text_obj in turn['textObjs']]]
        for turn in turns
    ]

In [8]:
# Normalise every speaker name in place so names can be compared easily:
# lower-case it, then strip spaces, periods, commas and the substring 'jr'
# (in that order).
for turns in speaker_cleanText.values():
    for turn in turns:
        name = turn[0].lower()
        for token in (" ", ".", ",", "jr"):
            name = name.replace(token, "")
        turn[0] = name

In [75]:
# Spot-check one case to inspect the list structure stored under a citation.
speaker_cleanText['353 US 373 (1957)']

[['charleskrice',
  [[['I',
     'be',
     'include',
     'my',
     'remark',
     'with',
     'respect',
     'to',
     'the',
     'inaudible',
     'the',
     'application',
     'of']],
   [['i',
     'think',
     'it',
     'be',
     'fair',
     'to',
     'say',
     'that',
     'the',
     'purpose',
     'apply',
     'that',
     'particular',
     'section',
     'we',
     'be',
     'to',
     'to',
     'meet',
     'the',
     'protest',
     'of',
     'the',
     'district',
     'court',
     'the',
     'various',
     'district',
     'court',
     'and',
     'the',
     'united',
     'state',
     'attorney',
     'in',
     'the',
     'various',
     'district']],
   [['and',
     'to',
     'ameliorate',
     'a',
     'lot',
     'of',
     'the',
     'small',
     'taxpayer',
     'who',
     'be',
     'be',
     'subject',
     'to',
     'a',
     'felony',
     'prosecution',
     'and',
     'summons',
     'where',
     'the',
     'tax',
   

## Filtering down to just the petitioner words

I need just the words said by the petitioner for my question. Here I'm using a custom function I made that gets only the words said by the petitioner, or by a justice in response to the petitioner.

In [34]:
# Import the list of justices used by the filtering steps below.
# header=None: the file is read without a header row, so its columns are numbered from 0.
justices = pd.read_csv('../data/justices.csv', header = None)

In [35]:
# Keep only the first column of the justices table, as a plain Python list.
justices = justices[0].tolist()

In [36]:
# Clean the justice names with the same steps applied to the transcript speaker
# names: lower-case, then strip spaces, periods, commas and the substring 'jr'.
_normalised_justices = []
for justice_name in justices:
    justice_name = justice_name.lower()
    for token in (" ", ".", ",", "jr"):
        justice_name = justice_name.replace(token, "")
    _normalised_justices.append(justice_name)
justices = _normalised_justices

In [37]:
# Try the custom helper on a single case. It receives that case's speaker
# entries and the normalised justice names; presumably it returns the speakers
# who are not justices (verify against src).
src.get_lawyers(speaker_cleanText["353 US 373 (1957)"], justices)

['charleskrice', 'peterbatwood']

In [52]:
# Try the companion helper on the same case. It takes the same arguments and,
# per its name, returns the justices who spoke.
src.get_justices(speaker_cleanText["353 US 373 (1957)"], justices)

['earlwarren', 'unknown', 'hugolblack', 'felixfrankfurter', 'williamjbrennan']

In [14]:
# For every case, store the result of src.get_lawyers under the case citation.
case_lawyers_dict = {
    citation: src.get_lawyers(turns, justices)
    for citation, turns in speaker_cleanText.items()
}

In [57]:
# For every case, store the result of src.get_justices under the case citation.
justice_lawyers_dict = {
    citation: src.get_justices(turns, justices)
    for citation, turns in speaker_cleanText.items()
}

Quickly scanning the names manually to double-check that no justices snuck through. Making a dataframe was unnecessary, but it made the names much easier to examine visually.

In [55]:
# Confirm that the dictionary is keyed by the citation string, since later merges rely on that key.
case_lawyers_dict['353 US 373 (1957)']

['charleskrice', 'peterbatwood']

In [60]:
# Same check for the justices dictionary. The result looks good; the 'unknown'
# justice entry still has to be dealt with later.
justice_lawyers_dict['353 US 373 (1957)']

['earlwarren', 'unknown', 'hugolblack', 'felixfrankfurter', 'williamjbrennan']

## Getting the Words Said By the Petitioners and Justices in Each Case

For my analysis I want only the words said by the petitioners or the justices, so I'm going to use my custom function to make a dictionary that has this data. I searched long and hard for a way to scrape the lawyers and advocates for each side, but there was no way to do so for a large percentage of my data. While the petitioner always speaks first, the function is imperfect, as it just takes the first lawyer's words and the justices' responses to those words. The petitioner could have another speaker that this function would not capture, but for the purposes of NLP we can still evaluate whether the words have any predictive power.

In [21]:
# Build {citation: petitioner-side words} with the custom helper. According to
# the author's notes, get_petitioner_words returns a list with the words said
# only by the petitioners or by the justices in response.
petitioner_words_dict = {}
for citation in speaker_cleanText:
    try:
        petitioner_words_dict[citation] = src.get_petitioner_words(
            citation, justices, case_lawyers_dict, speaker_cleanText
        )
    except Exception:
        # Some of the cases do not have transcripts, which breaks the helper, so
        # those cases are skipped. `except Exception` keeps that best-effort
        # behaviour but no longer swallows KeyboardInterrupt or SystemExit, so the
        # loop can still be interrupted.
        continue

In [22]:
# Number of cases for which the helper produced a result (cases that raised were skipped).
len(petitioner_words_dict)

6005

In [23]:
# Build {citation: justice words} with the companion helper. According to the
# author's notes, it does the same as get_petitioner_words but keeps only the
# words the justices said to, or in response to, the petitioner.
justice_words_dict = {}
for citation in speaker_cleanText:
    try:
        justice_words_dict[citation] = src.get_justice_words(
            citation, justices, case_lawyers_dict, speaker_cleanText
        )
    except Exception:
        # Cases the helper cannot process are skipped. `except Exception` keeps
        # that best-effort behaviour without swallowing KeyboardInterrupt or
        # SystemExit.
        continue

In [84]:
# List the citations src.get_cases returns for one normalised name.
# NOTE(review): the recorded output includes citations from the 1970s, which
# predate Ginsburg's time on the bench. Presumably the helper matches any
# appearance of the name, including as counsel; confirm its matching logic.
src.get_cases("ruthbaderginsburg", speaker_cleanText, justice_lawyers_dict)

['406\xa0US\xa0320\xa0(1972)',
 '411\xa0US\xa0677\xa0(1973)',
 '416\xa0US\xa0351\xa0(1974)',
 '421\xa0US\xa0772\xa0(1975)',
 '420\xa0US\xa0636\xa0(1975)',
 '430\xa0US\xa0199\xa0(1977)',
 '439\xa0US\xa0357\xa0(1979)',
 '512\xa0US\xa0374\xa0(1993)',
 '511\xa0US\xa0462\xa0(1994)',
 '510\xa0US\xa0163\xa0(1994)',
 '511\xa0US\xa0350\xa0(1994)',
 '510\xa0US\xa0332\xa0(1994)',
 '511\xa0US\xa01\xa0(1994)',
 '512\xa0US\xa0821\xa0(1994)',
 '511\xa0US\xa0658\xa0(1994)',
 '511\xa0US\xa0738\xa0(1994)',
 '512\xa0US\xa0477\xa0(1994)',
 '512\xa0US\xa061\xa0(1994)',
 '510\xa0US\xa027\xa0(1993)',
 '510\xa0US\xa0487\xa0(1994)',
 '510\xa0US\xa0517\xa0(1994)',
 '511\xa0US\xa0244\xa0(1994)',
 '512\xa0US\xa0339\xa0(1994)',
 '510\xa0US\xa0510\xa0(1994)',
 '510\xa0US\xa07\xa0(1993)',
 '510\xa0US\xa042\xa0(1993)',
 '512\xa0US\xa0622\xa0(1994)',
 '512\xa0US\xa0186\xa0(1994)',
 '510\xa0US\xa017\xa0(1993)',
 '510\xa0US\xa0249\xa0(1994)',
 '511\xa0US\xa0431\xa0(1994)',
 '511\xa0US\xa0127\xa0(1994)',
 '511\xa0US\xa04

In [85]:
# Call the per-justice helper with the same arguments as src.get_cases above.
# NOTE(review): the recorded output is an empty list; confirm whether that is expected.
src.get_specific_justice("ruthbaderginsburg", speaker_cleanText, justice_lawyers_dict)

[]

In [None]:
# Inspect one specific case. The key is written with non-breaking spaces (\xa0)
# between the parts of the citation.
# NOTE(review): this cell has no recorded execution count or output.
speaker_cleanText['544\xa0US\xa093\xa0(2005)']

## Dictionary to DataFrame and cleaning up the column titles and value

In [52]:
# Turn the justice dict into a two-column DataFrame, one row per (citation, words)
# pair. The columns are numbered 0 and 1 until they are renamed.
dfj = pd.DataFrame(list(justice_words_dict.items()))

In [53]:
# Turn the petitioner dict into a two-column DataFrame, one row per (citation, words)
# pair. The columns are numbered 0 and 1 until they are renamed.
df = pd.DataFrame(list(petitioner_words_dict.items()))

In [54]:
# Give both frames the same column names: the citation and the spoken text.
for frame in (df, dfj):
    frame.columns = ['case', 'text']

In [55]:
# Look up one known citation to make sure the rows are what I think they are.
df[df.case =="353 US 373 (1957)"]

Unnamed: 0,case,text
45,353 US 373 (1957),"[[[['I', 'be', 'include', 'my', 'remark', 'wit..."


Getting rid of a bunch of punctuation

In [56]:
# Remove the remnants of the nested list structure: each cell is converted to
# its string form and every '[' and ']' is dropped.
def _drop_brackets(tokens):
    return str(tokens).replace('[', '').replace(']', '')

for frame in (df, dfj):
    frame['text'] = frame['text'].map(_drop_brackets)


In [57]:
# Take out the commas that separated the list items.
for frame in (df, dfj):
    frame['text'] = frame['text'].map(lambda text: text.replace(",", ''))

In [58]:
# Preview the petitioner frame after removing brackets and commas.
df.head()

Unnamed: 0,case,text
0,352 US 282 (1957),'may' 'it' 'please' 'the' 'court' 'this' 'case...
1,353 US 586 (1957),'mr' 'chief' 'justice' 'if' 'the' 'court' 'ple...
2,352 US 599 (1957),'mr' 'chief' 'justice' 'and' 'associate' 'just...
3,352 US 82 (1956),'may' 'it' 'please' 'the' 'court' 'mr' 'willia...
4,352 US 220 (1957),'may' 'it' 'please' 'the' 'court' 'mr' 'plauch...


In [59]:
# Remove the year from the citation, as it is not present in the other dataset.
# The old pattern '(\d\d\d\d)' was a capture group, not literal parentheses, so it
# deleted ANY four-digit run, including a four-digit page number. This pattern
# only removes four digits that sit directly between '(' and ')'. The empty
# parentheses are left behind exactly as before, so the later cleaning steps
# receive the same input for ordinary citations. A raw string is used so that
# `\d` is not treated as an invalid string escape.
_year_in_parens = re.compile(r'(?<=\()\d{4}(?=\))')
df.case = df.case.apply(lambda citation: _year_in_parens.sub('', citation))
dfj.case = dfj.case.apply(lambda citation: _year_in_parens.sub('', citation))


In [60]:
# Lower-case the citation in both frames.
for frame in (df, dfj):
    frame['case'] = frame['case'].map(str.lower)

### Removing non ascii characters

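Non-ASCII characters (such as the non-breaking spaces `\xa0` that appear in the citations) make otherwise identical-looking strings compare as unequal, so the column is hard to merge on until they are removed.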

In [61]:
# Run the custom cleaning function over the "case" column of both frames to
# remove non-ASCII characters.
for frame in (df, dfj):
    frame['case'] = frame['case'].map(src.remove_non_ascii_chars)

In [62]:
# Sanity check: this lookup only finds a row if the cleaning function produced
# the compact citation form.
df[df.case == '352us282']

Unnamed: 0,case,text
0,352us282,'may' 'it' 'please' 'the' 'court' 'this' 'case...


In [63]:
# Do the same for the "text" column, using the slightly different text variant
# of the cleaning function.
for frame in (df, dfj):
    frame['text'] = frame['text'].map(src.remove_non_ascii_chars_t)

In [64]:
# Further clean-up after the ASCII conversion: drop the single quotes left
# around each word.
for frame in (df, dfj):
    frame['text'] = frame['text'].map(lambda text: text.replace("'", ""))

In [65]:
# Check the result: preview the cleaned petitioner frame.
df.head()

Unnamed: 0,case,text
0,352us282,may it please the court this case be here on a...
1,353us586,mr chief justice if the court please when the ...
2,352us599,mr chief justice and associate justice of the ...
3,352us82,may it please the court mr williams this matte...
4,352us220,may it please the court mr plauche you may pro...


In [69]:
# Same check for the justices frame.
dfj.head()

Unnamed: 0,case,text
0,352us282,that instruction i take it in effect import in...
1,353us586,may i at this point ask to be declare inaudibl...
2,352us599,well of course of course you may yes now mr mr...
3,352us82,mr williams that do not mean that it be only a...
4,352us220,mr plauche you may proceed deny the united sta...


## Importing CSV with Further Case Info and Target Variable

This is the SCDB CSV, which has all my categorical data, including the target variable of whether or not the petitioner won.

In [70]:
# Import the case-centered SCDB CSV. The file is read with the cp1252 encoding.
df2 = pd.read_csv("../data/SCDB_2020_01_caseCentered_Citation.csv", encoding='cp1252')

In [71]:
# Preview the first rows of the SCDB data.
df2.head()

Unnamed: 0,caseId,docketId,caseIssuesId,voteId,dateDecision,decisionType,usCite,sctCite,ledCite,lexisCite,term,naturalCourt,chief,docket,caseName,...,voteUnclear,issue,issueArea,decisionDirection,decisionDirectionDissent,authorityDecision1,authorityDecision2,lawType,lawSupp,lawMinor,majOpinWriter,majOpinAssigner,splitVote,majVotes,minVotes
0,1946-001,1946-001-01,1946-001-01-01,1946-001-01-01-01,11/18/1946,1,329 U.S. 1,67 S. Ct. 6,91 L. Ed. 3,1946 U.S. LEXIS 1724,1946,1301,Vinson,24,HALLIBURTON OIL WELL CEMENTING CO. v. WALKER e...,...,0.0,80180.0,8.0,2.0,0.0,4.0,,6.0,600.0,35 U.S.C. § 33,78.0,78.0,1,8,1
1,1946-002,1946-002-01,1946-002-01-01,1946-002-01-01-01,11/18/1946,1,329 U.S. 14,67 S. Ct. 13,91 L. Ed. 12,1946 U.S. LEXIS 1725,1946,1301,Vinson,12,CLEVELAND v. UNITED STATES,...,0.0,10500.0,1.0,1.0,0.0,4.0,,6.0,600.0,18 U.S.C. § 398,81.0,87.0,1,6,3
2,1946-003,1946-003-01,1946-003-01-01,1946-003-01-01-01,11/18/1946,1,329 U.S. 29,67 S. Ct. 1,91 L. Ed. 22,1946 U.S. LEXIS 3037,1946,1301,Vinson,21,CHAMPLIN REFINING CO. v. UNITED STATES ET AL.,...,0.0,80250.0,8.0,2.0,0.0,1.0,,2.0,207.0,,84.0,78.0,1,5,4
3,1946-004,1946-004-01,1946-004-01-01,1946-004-01-01-01,11/25/1946,7,329 U.S. 40,67 S. Ct. 167,91 L. Ed. 29,1946 U.S. LEXIS 1696,1946,1301,Vinson,26,UNITED STATES v. ALCEA BAND OF TILLAMOOKS ET AL.,...,0.0,20150.0,2.0,2.0,0.0,4.0,,6.0,600.0,49 Stat. 801,87.0,87.0,1,5,3
4,1946-005,1946-005-01,1946-005-01-01,1946-005-01-01-01,11/25/1946,1,329 U.S. 64,67 S. Ct. 154,91 L. Ed. 44,1946 U.S. LEXIS 2997,1946,1301,Vinson,50,"UNITED STATES v. HOWARD P. FOLEY CO., INC.",...,0.0,80060.0,8.0,2.0,0.0,7.0,,,,,78.0,87.0,1,6,3


## Merging with Text DF

In [72]:
# Keep only the SCDB columns I want to explore alongside the text data.
# .copy() makes this an independent frame rather than a view of df2.
scdb_columns = ['partyWinning', 'usCite', 'decisionDirection', 'majVotes']
target_name_df = df2.loc[:, scdb_columns].copy()

In [73]:
# Rename the SCDB columns to the names used in the rest of the notebook
# ('majVotes' keeps its name).
target_name_df = target_name_df.rename(
    columns={
        'partyWinning': 'target',
        'usCite': 'case',
        'decisionDirection': 'lib_or_con',
    }
)

In [74]:
# Put the SCDB citation into the same compact form as the text frames:
# string form, periods removed, lower-cased, spaces removed.
def _compact_citation(cite):
    return str(cite).replace('.', '').lower().replace(' ', '')

target_name_df['case'] = target_name_df['case'].map(_compact_citation)

In [75]:
# Preview the renamed and normalised SCDB columns.
target_name_df.head()

Unnamed: 0,target,case,lib_or_con,majVotes
0,1.0,329us1,2.0,8
1,0.0,329us14,1.0,6
2,0.0,329us29,2.0,5
3,0.0,329us40,2.0,5
4,1.0,329us64,2.0,6


In [76]:
# Look up a citation that also exists in the text data to make sure the two
# frames can be merged on "case".
target_name_df[target_name_df.case == '352us282' ]

Unnamed: 0,target,case,lib_or_con,majVotes
1129,1.0,352us282,2.0,6


### Merging with the Text Data

In [77]:
# Join each text frame to the SCDB columns on the citation column. This is the
# default (inner) merge, so only citations present in both frames are kept.
final_df = pd.merge(df, target_name_df, on='case')
final_justice_df = pd.merge(dfj, target_name_df, on='case')

In [78]:
# Optional filter, currently disabled: restrict both frames to "close votes"
# (fewer than 7 majority votes). Uncomment the two lines below to enable it.
#final_df = final_df[final_df.majVotes < 7] # narrowing down to "close votes"
#final_justice_df = final_justice_df[final_justice_df.majVotes < 7]

In [79]:
# Preview the merged petitioner frame.
final_df.head()

Unnamed: 0,case,text,target,lib_or_con,majVotes
0,352us282,may it please the court this case be here on a...,1.0,2.0,6
1,353us586,mr chief justice if the court please when the ...,1.0,2.0,4
2,352us599,mr chief justice and associate justice of the ...,0.0,1.0,8
3,352us82,may it please the court mr williams this matte...,0.0,2.0,8
4,352us220,may it please the court mr plauche you may pro...,0.0,1.0,8


In [80]:
# Count the rows per target value to make sure there are enough of each class.
final_df.target.value_counts()

1.0    3766
0.0    2162
2.0       1
Name: target, dtype: int64

In [81]:
# Drop the rows whose target is 2.0 (the case with an undefined result) from both frames.
final_df = final_df.loc[final_df['target'] != 2.0]
final_justice_df = final_justice_df.loc[final_justice_df['target'] != 2.0]

In [82]:
# Drop every row that has a NaN in any column; the NaNs came in with the categorical data.
final_df = final_df.dropna(axis=0, how='any')
final_justice_df = final_justice_df.dropna(axis=0, how='any')

In [83]:
import math
# Turn the float targets into integers by truncating each value.
for frame in (final_df, final_justice_df):
    frame['target'] = frame['target'].map(math.trunc)

In [84]:
# Check that the conversion worked: the targets should now be counted as integers.
final_df.target.value_counts()

1    3766
0    2160
Name: target, dtype: int64

In [85]:
# The justice frame should show the same class counts as the petitioner frame,
# which makes sense because both were merged with the same SCDB columns.
final_justice_df.target.value_counts()

1    3766
0    2160
Name: target, dtype: int64

In [86]:
# The justice frame has had less attention so far; preview it to confirm it
# went through all the same steps.
final_justice_df.head()

Unnamed: 0,case,text,target,lib_or_con,majVotes
0,352us282,that instruction i take it in effect import in...,1,2.0,6
1,353us586,may i at this point ask to be declare inaudibl...,1,2.0,4
2,352us599,well of course of course you may yes now mr mr...,0,1.0,8
3,352us82,mr williams that do not mean that it be only a...,0,2.0,8
4,352us220,mr plauche you may proceed deny the united sta...,0,1.0,8


In [87]:
# Export both merged frames to CSV (without the index) for EDA and modeling.
for frame, csv_path in (
    (final_df, '../data/Final_Merge.csv'),
    (final_justice_df, '../data/final_justice.csv'),
):
    frame.to_csv(csv_path, index=False)