In [2]:
import numpy as np
import pandas as pd
import os
import json
from tqdm.notebook import tqdm
import tiktoken

In [None]:
# Directory whose entries are the case files read by the cells below.
root = "../data/cases_json/"

# Every entry in the directory counts as one case (no filtering by name or type).
case_dir_entries = os.listdir(root)
n_cases = len(case_dir_entries)
n_cases

471

In [4]:
# Three independent object-dtype arrays with one slot per case
# (id, file name, outcome label); every slot starts out as 0.
case_ids, file_names, outcomes = (
    np.zeros(n_cases, dtype=object) for _ in range(3)
)

In [5]:
# Visit the directory entries in sorted name order and fill slot i of each
# pre-allocated array from the i-th entry.
sorted_file_names = sorted(os.listdir(root))
for i, file_name in enumerate(tqdm(sorted_file_names)):
    # The first four characters of the file name are stored as the case id.
    case_ids[i], file_names[i] = file_name[:4], file_name

    # Each entry is parsed as UTF-8 JSON; its 'class' field is stored as the outcome.
    with open(root + file_name, encoding='utf-8') as case_fh:
        case_file = json.load(case_fh)
    outcomes[i] = case_file['class']

  0%|          | 0/471 [00:00<?, ?it/s]

In [6]:
# Combine the three per-case arrays into a single table, one row per case.
metadata_columns = {
    'case_id': case_ids,
    'file_name': file_names,
    'outcome': outcomes,
}
metadata_df = pd.DataFrame(metadata_columns)
metadata_df.head()

Unnamed: 0,case_id,file_name,outcome
0,1,0001_Caselaw_AZ_1490855-Sasha.json,reverse
1,2,0002_Caselaw_Alaska_11695007-Sasha.json,affirm
2,29,0029-nm_119_609-Torres v State-Sasha.json,reverse
3,32,0032-nm_118_685-Baer v Regents of the Universi...,mixed
4,33,0033-nm_118_385-Diaz ex rel Diaz v Feil-Sasha....,mixed


In [7]:
# Write the metadata table to CSV (index=False leaves the row index out of the file).
# DataFrame.to_csv does not create missing parent directories, so make sure the
# target folder exists first; otherwise this cell fails with OSError on a checkout
# where ../data/csvs has not been created yet.
metadata_csv_path = "../data/csvs/metadata.csv"
os.makedirs(os.path.dirname(metadata_csv_path), exist_ok=True)
metadata_df.to_csv(metadata_csv_path, index=False)

In [8]:
### Case Lengths
# Token counts, using the tokenizer tiktoken associates with 'gpt-4o', for each
# case's full text and for each of its annotation sections.
encoding = tiktoken.encoding_for_model('gpt-4o')

# Output column name -> key under case_file['annotations'].
# The dict order is the column order of the result, so the mapping between
# array position and column label lives in exactly one place.
annotation_sections = {
    'facts': 'Facts',
    'procedural_history': 'Procedural History',
    'relevant_precedents': 'Relevant Precedents',
    'application_of_law_to_facts': 'Application of Law to Facts',
    'outcome': 'Outcome',
}
token_length_columns = ['text', *annotation_sections]

# One row per row of metadata_df, one column per entry in token_length_columns.
token_lengths = np.zeros((len(metadata_df), len(token_length_columns)))
for i, case_file_name in enumerate(tqdm(metadata_df['file_name'])):
    file_path = root + case_file_name
    with open(file_path, encoding='utf-8') as f:
        case_file = json.load(f)

    # Column 0: the whole case text.
    token_lengths[i, 0] = len(encoding.encode(case_file['text']))

    # Columns 1..: each annotation section, with its entries joined by single
    # spaces before tokenizing.
    annotations = case_file['annotations']
    for j, section_key in enumerate(annotation_sections.values(), start=1):
        token_lengths[i, j] = len(encoding.encode(" ".join(annotations[section_key])))

token_lengths_df = pd.DataFrame(token_lengths,
                                index=metadata_df['file_name'],
                                columns=token_length_columns)

# Only the last expression of a cell is displayed, so show the summary statistics here.
token_lengths_df.describe()
# token_lengths_df.reset_index().to_csv("../data/token_lengths.csv", index=False)

  0%|          | 0/471 [00:00<?, ?it/s]

Unnamed: 0,text,facts,procedural_history,relevant_precedents,application_of_law_to_facts,outcome
count,471.0,471.0,471.0,471.0,471.0,471.0
mean,5035.373673,620.14862,134.138004,164.280255,340.254777,29.968153
std,3104.940872,603.597346,125.452305,229.445785,235.630288,31.904129
min,802.0,0.0,0.0,0.0,20.0,0.0
25%,2935.0,274.0,41.5,29.5,175.5,10.0
50%,4433.0,461.0,102.0,83.0,281.0,21.0
75%,6318.0,743.0,193.0,208.0,438.0,37.0
max,23125.0,5150.0,761.0,2057.0,1726.0,242.0
