## Environment Setup


In [13]:
# Use the %pip magic rather than !pip: %pip installs into the environment of
# the running kernel, whereas !pip runs whichever pip is first on the shell PATH.
# Installing all packages in one command lets pip resolve their dependencies together.
# TODO: pin versions (pkg==X.Y.Z) once a known-good set has been recorded.
%pip install -q openai datasets evaluate



In [3]:
import json
from getpass import getpass

from datasets import load_dataset
from IPython.utils.capture import capture_output
from openai import OpenAI

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
# Mount Google Drive so the shared project folder is reachable from the Colab VM.
from google.colab import drive
drive.mount('/content/drive')


# Make the project folder the working directory (presumably where the local
# factsheet_utils / metrics_utils modules imported below live — confirm).
%cd /content/drive/MyDrive/CS 159 Project

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/.shortcut-targets-by-id/1LeLbrNZayyv5i4FVOL44lq8EKgwzSuZl/CS 159 Project


In [16]:
from factsheet_utils import *
from metrics_utils import *

In [17]:
# Read the key with getpass instead of input(): input() echoes what is typed
# into the cell output, which is then saved with the notebook and leaks the
# secret to anyone the file is shared with.
YOUR_KEY = getpass("Enter your OpenAI API Key: ")
client = OpenAI(api_key=YOUR_KEY)

## Load the Multi-LexSum Dataset

In [18]:
# Load the allenai/multi_lexsum dataset; `name` selects its "v20220616"
# configuration so every run uses the same dated release.
multi_lexsum = load_dataset("allenai/multi_lexsum", name="v20220616")

In [19]:
# Display the available splits with their feature names and row counts.
multi_lexsum

DatasetDict({
    train: Dataset({
        features: ['id', 'sources', 'summary/long', 'summary/short', 'summary/tiny'],
        num_rows: 3177
    })
    validation: Dataset({
        features: ['id', 'sources', 'summary/long', 'summary/short', 'summary/tiny'],
        num_rows: 454
    })
    test: Dataset({
        features: ['id', 'sources', 'summary/long', 'summary/short', 'summary/tiny'],
        num_rows: 908
    })
})

In [20]:
# NOTE: despite the variable name, `train` holds the full *test* split
# (all of its rows), not the training or validation data.
train = multi_lexsum["test"]

## Methods to Generate, Combine, Save, and Compute Accuracy

`process_source` processes one case, given its index in the `sources` and `ids` lists:
1. Generates a factsheet for each document in the source.
2. Combines these factsheets into a single source factsheet.
3. Generates a factsheet for the summary (we use the long summary for now).
4. Compares the source and summary factsheets using cosine similarity.

In [21]:
# Pull the 'sources' and 'id' columns out of `train`; process_source and the
# save cells below look cases up in these by integer position.
sources = train['sources']
ids = train['id']

def process_source(idx):
  """Build and compare fact sheets for one case, selected by position.

  Steps:
    1. Generate a fact sheet for every document in ``sources[idx]``.
    2. Combine those per-document sheets into a single source sheet.
    3. Generate a fact sheet for the case's long summary.
    4. Score the source sheet against the summary sheet.

  Relies on the notebook-level names ``sources``, ``ids``, ``train`` and
  ``client``, and on the helpers ``generate_fact_sheet``,
  ``combine_fact_sheets`` and ``compute_cosine_similarity``.
  Progress messages and both sheets are printed as a side effect.

  Args:
    idx: Integer position of the case in ``sources`` / ``ids`` / ``train``.

  Returns:
    Tuple ``(source_sheet, summary_sheet, cos_sim)``.
  """
  source_docs = sources[idx]
  case_id = ids[idx]  # deliberately not `id`, which would shadow the builtin
  print(f"Analyzing case {case_id}")

  # One fact sheet per document of the case (documents are used whole; no
  # further splitting happens here).
  print('Generating fact sheet for each document in source...')
  source_responses = [generate_fact_sheet(doc, client) for doc in source_docs]

  # Merge the per-document sheets into one sheet for the whole case.
  print('Combining source fact sheets...')
  source_sheet = combine_fact_sheets(source_responses, client, verbose=False)
  print(source_sheet)

  # Index the row first and the column second: column-first indexing
  # (train['summary/long'][idx]) fetches the entire column just to read one value.
  print('Generating summary sheet...')
  long_summary = train[idx]['summary/long']
  summary_sheet = generate_fact_sheet(long_summary, client, verbose=False)
  print(summary_sheet)

  # Cosine similarity between the two sheets serves as the accuracy measure.
  cos_sim = compute_cosine_similarity(source_sheet, summary_sheet, verbose=False)
  print(f"Cosine Similarity: {cos_sim}")

  return source_sheet, summary_sheet, cos_sim

## Generate Factsheets and Metrics for the First Two Sources

In [22]:
# Output prefix for saved fact sheets. The trailing slash matters: the cells
# below build file names by plain string concatenation onto this value.
save_path = '/content/drive/MyDrive/CS 159 Project/factsheet/'

In [23]:
# Position of the case to process (index into `sources` / `ids`).
idx = 0
source0, sum0, sim0 = process_source(idx)
# Save the source sheet, then the summary sheet, as "<save_path><case id>_<kind>".
for sheet, kind in ((source0, 'source'), (sum0, 'summary')):
  save_factsheet(sheet, f'{save_path}{ids[idx]}_{kind}')

Analyzing case CJ-AL-0007
Generating fact sheet for each document in source...
{
    "case_info": "Case 2:14-cv-00186-MEF-CSC Document 1 Filed 03/18/14 Page 1 of 24",
    "parties": "Sharnalle Mitchell, Lorenzo Brown, Debra P. Hackett, Courtney Tubbs, Tito Williams, City of Montgomery",
    "legal_basis": "Civil rights action under 42 U.S.C. § 1983, 18 U.S.C. § 1595, and 28 U.S.C. § 2201, et seq., and Fourth, Sixth, Thirteenth, and Fourteenth Amendments to the United States Constitution",
    "case_background": "Plaintiffs jailed by City of Montgomery for inability to pay traffic ticket debts. City policy of jailing indigent people without considering ability to pay violates constitutional rights.",
    "court_proceedings": "Plaintiffs seek declaratory, injunctive, and compensatory relief. The Court has jurisdiction pursuant to 28 U.S. S.. § 1331 and 1343. Plaintiffs experienced systemic illegality in debt collection practices by the City.",
    "settlement_and_agreements": "Plaintiffs

In [24]:
# Position of the case to process (index into `sources` / `ids`).
idx = 1
source1, sum1, sim1 = process_source(idx)
# Save the source sheet, then the summary sheet, as "<save_path><case id>_<kind>".
for sheet, kind in ((source1, 'source'), (sum1, 'summary')):
  save_factsheet(sheet, f'{save_path}{ids[idx]}_{kind}')

Analyzing case CJ-AL-0020
Generating fact sheet for each document in source...
{
    "case_info": "Case: 2:13-cv-732 As of: 01/18/2019 10:59 AM CST. U.S. District Court Alabama Middle District (Montgomery). Cause: 42:1983 Civil Rights Act.",
    "parties": "Plaintiff: Harriet Delores Cleveland, V. Consol Plaintiff: Markis Antwuan Watts. Defendants: City of Montgomery, Milton J. Westry, Les Hayes, III.",
    "legal_basis": "Civil Rights Act of 1964, Title VII (42 U.S.C. § 2000e).",
    "case_background": "Filed on 10/04/2013, terminated on 11/17/2014. No jury demand. Nature of Suit: 440 Civil Rights: Other Jurisdiction: Federal Question.",
    "court_proceedings": "Notice of Removal filed on 10/09/2013. Various motions, petitions, and orders followed. Consolidation motion granted on 11/14/2013. Scheduling orders, conferences, and status reviews occurred throughout 2014.",
    "settlement_and_agreements": "Multiple motions for protective orders and continuances filed. Agreed settlement o