In [1]:
import pandas as pd 
import json
import os
import numpy as np
import re
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import punkt
import nltk
from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import recall_score, accuracy_score, confusion_matrix
import string
from nltk.probability import FreqDist
import seaborn as sns
pd.options.display.max_rows = 999
pd.options.display.max_columns = 30
import lexnlp as lnlp
import src
import importlib
import unidecode as unidecode
importlib.reload(src)
%matplotlib inline

# Going from JSON Files to a Useable DataFrame

In [2]:
f = open('supreme-court-cases/cases/1956/Achilli_v._United_States.js')

data = json.load(f)

## Making a List of the File Paths for each Case

In [3]:
path_to_json = 'supreme-court-cases/cases/'
ls_years = []
for x in range(1956, 2018):
    ls_years.append(x)
file_paths = []
for x in ls_years:
    file_paths.append(f'supreme-court-cases/cases/{x}')

In [4]:
cases = {}
for x in range(0, len(ls_years)):
    json_files = [pos_json for pos_json in os.listdir(file_paths[x]) if pos_json.endswith('.js')]
    cases.update({ls_years[x]: json_files})


In [5]:
case_list = []
for x in ls_years:
    for case in cases[x]: 
        f = open(f'supreme-court-cases/cases/{x}/{case}') 
        data = json.load(f)
        case_list.append(data)


## Grabbing Just the Transcripts

In [6]:
transcript_dict = {}
for x in range(0, len(case_list)):
    try:
        transcript_dict.update({case_list[x]['citation']: case_list[x]['caseTranscripts'][0]['transcript']})
    except:
        pass

## Further Organazing the Transcripts so the Speaker Can Be Easily Identified

In [7]:
speaker_cleanText = {}
for case in transcript_dict.keys():
    ls= []
    for x in transcript_dict[case]:
        ls.append([x['speakerName'].lower(), [x['textObjs'][y]['cleanText'] for y in range(0, len(x['textObjs']))]])
    speaker_cleanText.update({case: ls})

In [8]:
for case in speaker_cleanText:
    for x in range(0,len(speaker_cleanText[case])):
        speaker_cleanText[case][x][0] = speaker_cleanText[case][x][0].lower().replace(" ", "").replace('.', '').replace(',','').replace('jr','')

In [9]:
justices = pd.read_csv('justices.csv', header = None)

In [10]:
justices = list(justices[0])

In [11]:
speaker_cleanText['353 US 373 (1957)'][0][1][0][0][2]

'include'

## Grabbing the Lawyers from Each Case to Acquire Just the Words Said by the Petitioner in Each Case

In [12]:
justices = [x.lower().replace(" ", "").replace('.', '').replace(',','').replace('jr', '') for x in justices]
    

In [13]:
def get_lawyers (case):
    lawyers = []
    for x in case: 
        if x[0] in justices or x[0] in lawyers:
            continue
        else:
            lawyers.append(x[0])
    return lawyers

In [14]:
src.get_lawyers(speaker_cleanText["353 US 373 (1957)"], justices)

['charleskrice', 'peterbatwood']

In [15]:
case_lawyers_dict = {}
for case in speaker_cleanText:
    case_lawyers_dict.update({case:src.get_lawyers(speaker_cleanText[case], justices)})

### It's Easier to Check the Names in a DataFrame

In [16]:

df = pd.DataFrame.from_dict(case_lawyers_dict.items())

In [17]:
case_lawyers_dict['353 US 373 (1957)']

['charleskrice', 'peterbatwood']

## Getting the Words Said By the Petitioners and Justices in Each Case

In [18]:
def get_justice_words (case):
    justice_words = " "
    words = speaker_cleanText[case]
    previous_speaker = words[0][0]
    for speaker in words:
        if speaker[0] in justices and case_lawyers_dict[case][0] == previous_speaker:
            for x in range(0, (len(speaker[0][1]))):
                justice_words.join(speaker[1][0][0][x])  
        previous_speaker = speaker[0]
    return justice_words

In [19]:
get_justice_words("353 US 373 (1957)")

' '

In [20]:
petitioner_words_dict = {}
for x in speaker_cleanText:
    try:
        petitioner_words_dict.update({x: src.get_petitioner_words(x, justices, case_lawyers_dict, speaker_cleanText)})
    except:
        pass

In [21]:
len(petitioner_words_dict)

6005

In [22]:
src.get_justice_words("353 US 373 (1957)", justices, case_lawyers_dict, speaker_cleanText)

[[[['you',
    'i',
    'suppose',
    'have',
    'put',
    'in',
    'your',
    'brief',
    'the',
    'name',
    'of',
    'those',
    'case',
    'and',
    'the',
    'amount',
    'involve',
    'in',
    'them',
    'substantially',
    'to',
    'the',
    'statement']]],
 [[['if',
    'this',
    'be',
    'material',
    'i',
    'should',
    'think',
    'might',
    'be',
    'wise',
    'to',
    'have',
    'all',
    'record',
    'of',
    'all',
    'those',
    'case',
    'during',
    'that',
    'period',
    'of',
    'time',
    'with',
    'the',
    'name',
    'where',
    'they',
    'we',
    'be',
    'try']]],
 [[['if',
    'i',
    'presume',
    'it',
    'would',
    'be',
    'very',
    'easy',
    'because',
    'you',
    'know']]],
 [[['i', 'think', 'you', 'say', 'less', 'than', 'hundred']]],
 [[['tax']]],
 [[['that', 'be', 'all', 'of', 'them']]],
 [[['but',
    'if',
    'you',
    'if',
    'you',
    'be',
    'right',
    'in',
    'your'

In [23]:
justice_words_dict = {}
for x in speaker_cleanText:
    try:
        justice_words_dict.update({x: src.get_justice_words(x, justices, case_lawyers_dict, speaker_cleanText)})
    except:
        pass

In [24]:
dfj = pd.DataFrame.from_dict(justice_words_dict.items())

In [57]:
df = pd.DataFrame.from_dict(petitioner_words_dict.items())

In [58]:
df.columns = ['case', 'text']
dfj.columns = ['case', 'text']

In [59]:
df[df.case =="353 US 373 (1957)"]

Unnamed: 0,case,text
45,353 US 373 (1957),"[[[['I', 'be', 'include', 'my', 'remark', 'wit..."


In [60]:
df.text = df.text.apply(lambda x: str(x).replace('[','').replace(']', ''))
dfj.text = dfj.text.apply(lambda x: str(x).replace('[','').replace(']', ''))


In [61]:
df.text = df.text.apply(lambda x: x.replace(",", ''))

In [62]:
df.head()

Unnamed: 0,case,text
0,352 US 282 (1957),'may' 'it' 'please' 'the' 'court' 'this' 'case...
1,353 US 586 (1957),'mr' 'chief' 'justice' 'if' 'the' 'court' 'ple...
2,352 US 599 (1957),'mr' 'chief' 'justice' 'and' 'associate' 'just...
3,352 US 82 (1956),'may' 'it' 'please' 'the' 'court' 'mr' 'willia...
4,352 US 220 (1957),'may' 'it' 'please' 'the' 'court' 'mr' 'plauch...


In [63]:
df.case = df.case.apply(lambda x: re.sub('(\d\d\d\d)', '', x))
dfj.case = dfj.case.apply(lambda x: re.sub('(\d\d\d\d)', '', x))


In [64]:
df.case = df.case.apply(lambda x: x.lower())
dfj.case = dfj.case.apply(lambda x: x.lower())

In [65]:
from unidecode import unidecode
def remove_non_ascii_chars(title):
    return "".join([unidecode(char).rstrip('()').rstrip(' ') for char in title])      

In [66]:
df.case = df.case.apply(lambda x: remove_non_ascii_chars(x))
dfj.case = dfj.case.apply(lambda x: remove_non_ascii_chars(x))

In [67]:
df[df.case == '352us282']

Unnamed: 0,case,text
0,352us282,'may' 'it' 'please' 'the' 'court' 'this' 'case...


In [68]:
def remove_non_ascii_chars(title):
    return "".join([unidecode(char) for char in title])      

In [69]:
df.text = df.text.apply(lambda x: remove_non_ascii_chars(x))
dfj.text = dfj.text.apply(lambda x: remove_non_ascii_chars(x))

In [82]:
df.text = df.text.apply(lambda x: x.replace("'", "" ))
dfj.text = dfj.text.apply(lambda x: x.replace("'", "" ))

In [83]:
df.head()

Unnamed: 0,case,text
0,352us282,may it please the court this case be here on a...
1,353us586,mr chief justice if the court please when the ...
2,352us599,mr chief justice and associate justice of the ...
3,352us82,may it please the court mr williams this matte...
4,352us220,may it please the court mr plauche you may pro...


In [None]:
dfj.head()

## Importing CSV with Further Case Info and Target Variable

In [37]:
df2 = pd.read_csv("data/SCDB_2020_01_caseCentered_Citation.csv", encoding='cp1252')
                 

In [38]:
df2.head()

Unnamed: 0,caseId,docketId,caseIssuesId,voteId,dateDecision,decisionType,usCite,sctCite,ledCite,lexisCite,term,naturalCourt,chief,docket,caseName,...,voteUnclear,issue,issueArea,decisionDirection,decisionDirectionDissent,authorityDecision1,authorityDecision2,lawType,lawSupp,lawMinor,majOpinWriter,majOpinAssigner,splitVote,majVotes,minVotes
0,1946-001,1946-001-01,1946-001-01-01,1946-001-01-01-01,11/18/1946,1,329 U.S. 1,67 S. Ct. 6,91 L. Ed. 3,1946 U.S. LEXIS 1724,1946,1301,Vinson,24,HALLIBURTON OIL WELL CEMENTING CO. v. WALKER e...,...,0.0,80180.0,8.0,2.0,0.0,4.0,,6.0,600.0,35 U.S.C. § 33,78.0,78.0,1,8,1
1,1946-002,1946-002-01,1946-002-01-01,1946-002-01-01-01,11/18/1946,1,329 U.S. 14,67 S. Ct. 13,91 L. Ed. 12,1946 U.S. LEXIS 1725,1946,1301,Vinson,12,CLEVELAND v. UNITED STATES,...,0.0,10500.0,1.0,1.0,0.0,4.0,,6.0,600.0,18 U.S.C. § 398,81.0,87.0,1,6,3
2,1946-003,1946-003-01,1946-003-01-01,1946-003-01-01-01,11/18/1946,1,329 U.S. 29,67 S. Ct. 1,91 L. Ed. 22,1946 U.S. LEXIS 3037,1946,1301,Vinson,21,CHAMPLIN REFINING CO. v. UNITED STATES ET AL.,...,0.0,80250.0,8.0,2.0,0.0,1.0,,2.0,207.0,,84.0,78.0,1,5,4
3,1946-004,1946-004-01,1946-004-01-01,1946-004-01-01-01,11/25/1946,7,329 U.S. 40,67 S. Ct. 167,91 L. Ed. 29,1946 U.S. LEXIS 1696,1946,1301,Vinson,26,UNITED STATES v. ALCEA BAND OF TILLAMOOKS ET AL.,...,0.0,20150.0,2.0,2.0,0.0,4.0,,6.0,600.0,49 Stat. 801,87.0,87.0,1,5,3
4,1946-005,1946-005-01,1946-005-01-01,1946-005-01-01-01,11/25/1946,1,329 U.S. 64,67 S. Ct. 154,91 L. Ed. 44,1946 U.S. LEXIS 2997,1946,1301,Vinson,50,"UNITED STATES v. HOWARD P. FOLEY CO., INC.",...,0.0,80060.0,8.0,2.0,0.0,7.0,,,,,78.0,87.0,1,6,3


## Merging with Text DF

In [84]:
target_name_df = df2[['partyWinning', 'usCite', 'decisionDirection', 'majVotes']].copy()

In [85]:
target_name_df.columns = ['target', 'case', 'lib_or_con', 'majVotes']

In [86]:
target_name_df['case'] = target_name_df['case'].apply(lambda x: str(x).replace('.', '').lower().replace(' ', ''))

In [87]:
target_name_df.head()

Unnamed: 0,target,case,lib_or_con,majVotes
0,1.0,329us1,2.0,8
1,0.0,329us14,1.0,6
2,0.0,329us29,2.0,5
3,0.0,329us40,2.0,5
4,1.0,329us64,2.0,6


In [88]:
target_name_df[target_name_df.case == '352us282' ]

Unnamed: 0,target,case,lib_or_con,majVotes
1129,1.0,352us282,2.0,6


In [89]:
final_df = df.merge(target_name_df, on ='case')
final_justice_df= dfj.merge(target_name_df, on ='case')

Only 1400 rows this is gonna be a process. Lets see if we can levarage some of the citation data. OK I went through and changed everything and now im up to 5700 cases, thats plenty for me.

In [90]:
final_df = final_df[final_df.majVotes < 7]

In [91]:
final_df.head()

Unnamed: 0,case,text,target,lib_or_con,majVotes
0,352us282,may it please the court this case be here on a...,1.0,2.0,6
1,353us586,mr chief justice if the court please when the ...,1.0,2.0,4
5,352us249,if the court please you might wait just a mome...,0.0,2.0,5
9,354us147,mr chief justice if the court please this be a...,0.0,2.0,5
10,352us407,mr chief justice may it please the court this ...,1.0,1.0,6


In [92]:
final_df.target.value_counts()

1.0    1435
0.0    1022
Name: target, dtype: int64

In [93]:
final_df = final_df[final_df.target!= 2.0]

In [94]:
final_df = final_df.dropna()

In [95]:
import math
final_df.target = final_df.target.apply(lambda x: math.trunc(x))


In [96]:
final_df.target.value_counts()

1    1435
0    1021
Name: target, dtype: int64

In [97]:
final_df.to_csv('Final_Merge.csv')
final_justice_df.to_csv("Final_justice.csv")