In [1]:
import requests
import os
import pandas as pd

# Downloading all case files

Using the bulk API from https://case.law/ (https://case.law/bulk/) to download the files and extract them directly.

In [1]:
# API token for case.law. Supply it through the CASELAW_API_TOKEN environment
# variable so the secret never has to be saved inside the notebook; the literal
# placeholder below is only used when that variable is not set.
ACCESS_TOKEN = os.environ.get('CASELAW_API_TOKEN', '<API_KEY>')
# Bulk listing endpoint, queried with body_format=text and filter_type=jurisdiction.
URL_BULK = 'https://api.case.law/v1/bulk/?body_format=text&filter_type=jurisdiction'

In [3]:
def processBulk(url_download, filename, access_token = ACCESS_TOKEN):
    """Download one bulk archive with wget, unzip it and delete the archive.

    Parameters
    ----------
    url_download : str
        URL the archive is fetched from.
    filename : str
        Name under which the archive is stored in ``../data/01_download/``.
    access_token : str
        Token sent in the ``Authorization`` header (defaults to ACCESS_TOKEN).

    Returns
    -------
    None
        The function only has side effects: it prints the commands, creates
        the download directory if needed and runs wget / unzip / rm.
    """
    import shlex  # local import: only this function needs it

    download_dir = '../data/01_download/'
    # wget -O does not create missing parent directories
    os.makedirs(download_dir, exist_ok=True)
    # Quote everything that is interpolated into a shell command line; the
    # file name and URL come from the API response.
    archive = shlex.quote(download_dir + filename)

    def wget_command(token):
        # Built from a token argument so a redacted copy can be printed
        return 'wget --header=' + shlex.quote('Authorization: Token ' + token) \
            + ' -O ' + archive + ' ' + shlex.quote(url_download)

    # Using unzip to extract; the archive is removed only if unzip succeeded
    unzip_cmd = 'unzip ' + archive + ' -d ' + download_dir + ' && rm ' + archive

    # Never echo the real token into the notebook output
    print(wget_command('<redacted>'))
    print(unzip_cmd)

    # Use the token that was passed in (the original ignored the parameter)
    if os.system(wget_command(access_token)) != 0:
        # No point in unzipping a failed or partial download
        print('download failed, skipping extraction of ' + filename)
        return
    os.system(unzip_cmd)

In [4]:
# Authorization header sent with the listing request
header = {'Authorization': 'Token ' + ACCESS_TOKEN}
# The original cell bound both names in one chained assignment; keep the alias
headers = header

In [5]:
# First request receives an array with download URLs
bulk_listing = requests.get(URL_BULK, headers=header)
# Stop here with a clear HTTP error (e.g. a rejected token) instead of parsing
# an error payload and failing later on a missing 'results' key.
bulk_listing.raise_for_status()
response = bulk_listing.json()

In [6]:
# Fetch and unpack every archive listed in the bulk response
for item in response['results']:
    filename, url_download = item['file_name'], item['download_url']
    processBulk(url_download, filename)

# Process data

After all files are downloaded and extracted, only the relevant fields of each case are kept and written to one CSV file per extracted folder.

In [7]:
from pandas.io.json import json_normalize
import json
import re
import os
import pandas as pd
from glob import glob

In [12]:
# Every entry found in the download directory (one per extracted archive)
all_files = glob("../data/01_download/*")

# Direct fields: top-level record fields copied to the output unchanged
fields_direct = [
    'decision_date',
    'docket_number',
    'first_page',
    'id',
    'last_page',
    'name',
    'name_abbreviation',
]

# Keywords, which are extracted from the casebody opinion field.
# Strings are kept exactly as they are (including the trailing blank of
# "sexual harassment ") because they also become output column names.
keywords = [
    "dog bite",
    "slip",
    "medical malpractice",
    "drug recall",
    "m&a",
    "antitrust prosecution",
    "asylum",
    "murder",
    "arson",
    "sexual harassment ",
    "divorce",
    "real estate",
    "intellectual property",
    "insurance claims",
    "internet",
    "free speech",
    "capital murder",
    "dog",
    "fall",
    "antitrust",
    "sexual",
    "patent",
    "bit",
    "prosecution",
]

In [9]:
def no_of_occurence(s, s2):
    """Count non-overlapping occurrences of ``s2`` in a normalised copy of ``s``.

    ``s`` is lower-cased and every run of characters other than ASCII
    letters, digits and ``&`` is replaced by a single blank before counting.
    ``s2`` is used as given.
    """
    normalised = re.sub('[^0-9a-zA-Z&]+', ' ', s.lower())
    normalised = normalised.replace('  ', ' ')
    return normalised.count(s2)

def no_of_occurence2(s, s2):
    """Count non-overlapping occurrences of ``s2`` in ``s`` without any normalisation."""
    hits = s.count(s2)
    return hits

def format_text(s):
    """Return ``s`` lower-cased, with each run of characters other than ASCII
    letters, digits and ``&`` replaced by a single blank."""
    # Formatting the text
    non_word_run = re.compile('[^0-9a-zA-Z&]+')
    cleaned = non_word_run.sub(' ', s.lower())
    return cleaned.replace('  ', ' ')

def _nth_or_blank(items, position):
    """Return ``items[position]``, or ``''`` when ``items`` has no such element."""
    return items[position] if len(items) > position else ''


def processDF(df, n_opinions=3):
    """Flatten one frame of raw case records into a single wide table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per case. The columns named in ``fields_direct`` plus
        ``volume``, ``reporter``, ``jurisdiction``, ``court``, ``citations``
        and ``casebody`` are read.
    n_opinions : int, default 3
        How many leading opinions per case are scanned for the keywords.

    Returns
    -------
    pandas.DataFrame
        Same number of rows as ``df`` (index reset), with, in this order:
        the direct fields; the flattened ``volume_*``, ``reporter_*``,
        ``jurisdiction_*`` and ``court_*`` columns; the citation count
        (column ``citations``) and the flattened first citation
        (``citation1_*``); the casebody status; the number of attorneys,
        judges, parties and opinions; the first three attorneys, judges and
        parties; and one ``cb_data_text_<i>_<keyword>`` count column per
        scanned opinion and keyword.
    """
    # `pandas.io.json.json_normalize` is deprecated since pandas 1.0; prefer the
    # top-level function and only fall back to the notebook's import on old pandas.
    normalize = getattr(pd, 'json_normalize', None)
    if normalize is None:
        normalize = json_normalize

    # Extract relevant fields from each jsonLine
    df = df.reset_index(drop=True)
    cb = normalize(df['casebody'])

    # Cases without citations get an empty mapping so json_normalize only ever
    # sees dict-like records; such rows end up with NaN in every citation1_* column.
    first_citation = df['citations'].map(
        lambda cites: cites[0] if len(cites) > 0 else {})

    # Every piece shares the same 0..n-1 index, so they can all be collected
    # and concatenated side by side once instead of joined one at a time.
    pieces = [
        df[fields_direct],
        normalize(df['volume']).add_prefix('volume_'),
        normalize(df['reporter']).add_prefix('reporter_'),
        normalize(df['jurisdiction']).add_prefix('jurisdiction_'),
        normalize(df['court']).add_prefix('court_'),
        # Keeps the Series name 'citations', so that column holds the count
        df['citations'].apply(len),
        normalize(first_citation).add_prefix('citation1_'),
        cb['status'].rename('cb_status'),
    ]

    # Number of entries in each casebody list
    for field in ('attorneys', 'judges', 'parties', 'opinions'):
        pieces.append(cb['data.' + field].apply(len).rename('cb_no_' + field))

    # First three entries of each list ('' when the list is shorter).
    # Column names are part of the output schema and are kept verbatim.
    leading_entries = [
        ('attorneys', ['cb_data_attr1_first_attorneys',
                       'cb_data_attr2_sec_attorneys',
                       'cb_data_attr3_sec_attorneys']),
        ('judges', ['cb_data_judge1', 'cb_data_judge2', 'cb_data_judge3']),
        ('parties', ['cb_data_party1', 'cb_data_party2', 'cb_data_party3']),
    ]
    for field, column_names in leading_entries:
        for position, column_name in enumerate(column_names):
            pieces.append(
                cb['data.' + field]
                .map(lambda items, position=position: _nth_or_blank(items, position))
                .rename(column_name))

    # Extract the categories from the opinion field
    for i in range(n_opinions):
        # Normalised text of the i-th opinion ('' when the case has fewer opinions)
        opinion_text = cb['data.opinions'].map(
            lambda opinions, i=i: format_text(opinions[i]['text']) if len(opinions) > i else '')
        for keyword in keywords:
            pieces.append(
                opinion_text
                .map(lambda text, keyword=keyword: no_of_occurence2(text, keyword))
                .rename('cb_data_text_' + str(i) + '_' + keyword))

    # Merge different sub dataframes
    df_output = pd.concat(pieces, axis=1)
    return(df_output)

In [None]:
# Iterating over all extracted folders
output_dir = '../data/02_processed_csvs/'
# to_csv does not create missing directories; make sure the target exists
# before any expensive processing starts.
os.makedirs(output_dir, exist_ok=True)

for folder in all_files:
    print(folder)
    if not os.path.isdir(folder):
        # e.g. an archive left behind by a failed download or unzip
        print('not an extracted folder - skipped')
        continue

    # Folder name without its parent path; used as the CSV file name
    filename = os.path.basename(os.path.normpath(folder))
    csv_path = os.path.join(output_dir, filename + '.csv')
    if os.path.isfile(csv_path):
        print('already processed')
        continue

    ## Each file is a jsonl file - each line is one json object
    df = pd.read_json(os.path.join(folder, 'data', 'data.jsonl.xz'), lines=True, compression='xz')
    df_output = processDF(df)
    df_output.to_csv(csv_path, sep = ',')