## Environment Setup


In [13]:
!pip install openai
!pip install datasets
!pip install evaluate

In [14]:
import json
import os
from getpass import getpass

from datasets import load_dataset
from IPython.utils.capture import capture_output
from openai import OpenAI

In [15]:
# Mount Google Drive inside the Colab VM so the project folder is reachable.
from google.colab import drive
drive.mount('/content/drive')


# Switch the working directory to the project folder (presumably where the
# factsheet_utils / metrics_utils modules imported below live -- confirm).
# The path contains spaces; the %cd magic takes the rest of the line unquoted.
%cd /content/drive/MyDrive/CS 159 Project

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/.shortcut-targets-by-id/1LeLbrNZayyv5i4FVOL44lq8EKgwzSuZl/CS 159 Project


In [16]:
from factsheet_utils import *
from metrics_utils import *

In [17]:
# SECURITY: never commit an API key in a notebook. The key that used to be
# hard-coded on this line has been exposed and must be revoked/rotated.
# The key is read from the OPENAI_API_KEY environment variable; if that is not
# set (e.g. a fresh Colab session) the user is prompted without echoing input.
YOUR_KEY = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

# Shared OpenAI client passed to the fact-sheet helpers in later cells.
client = OpenAI(api_key=YOUR_KEY)

## Load the Multi-LexSum Dataset

In [18]:
# Load the "allenai/multi_lexsum" dataset, configuration "v20220616", via the
# `datasets` library. The result is a DatasetDict keyed by split name.
multi_lexsum = load_dataset("allenai/multi_lexsum", name="v20220616")

In [19]:
# Show the available splits with their feature names and row counts.
multi_lexsum

DatasetDict({
    train: Dataset({
        features: ['id', 'sources', 'summary/long', 'summary/short', 'summary/tiny'],
        num_rows: 3177
    })
    validation: Dataset({
        features: ['id', 'sources', 'summary/long', 'summary/short', 'summary/tiny'],
        num_rows: 454
    })
    test: Dataset({
        features: ['id', 'sources', 'summary/long', 'summary/short', 'summary/tiny'],
        num_rows: 908
    })
})

In [20]:
# NOTE: despite its name, `train` holds the entire *test* split (908 rows per the
# DatasetDict printed above), not a single instance and not the dev/validation set.
# The name is kept because later cells refer to it.
train = multi_lexsum["test"]

## Methods to Generate, Combine, Save, and Compute Accuracy

`process_source` processes one case, given its index into the `sources` and `ids` lists:
1. Generates a fact sheet for each document in the source.
2. Combines these sheets into a single source fact sheet.
3. Generates a fact sheet for the summary (we use the long summary for now).
4. Compares the source and summary fact sheets using cosine similarity.

In [21]:
# Columns of the `train` split used by process_source(); both are indexed by the
# same integer position, so sources[i] and ids[i] describe the same case.
sources = train['sources']
ids = train['id']

def process_source(idx):
  """Build and compare fact sheets for the case at position ``idx``.

  Reads the module-level ``sources``, ``ids``, ``train`` and ``client`` objects.
  Progress messages, both fact sheets and the similarity score are printed.

  Args:
    idx: Integer position of the case in ``sources`` / ``ids`` / ``train``.

  Returns:
    A tuple ``(source_sheet, summary_sheet, cos_sim)``: the combined fact sheet
    of the source documents, the fact sheet of the long summary, and the value
    returned by ``compute_cosine_similarity`` for those two sheets.
  """
  documents = sources[idx]
  case_id = ids[idx]
  print(f"Analyzing case {case_id}")

  # One fact sheet per source document, in document order.
  # NOTE(review): unlike the calls below, this one does not pass verbose=False --
  # confirm whether the per-document output is wanted.
  print('Generating fact sheet for each document in source...')
  per_document_sheets = [generate_fact_sheet(doc, client) for doc in documents]

  # Merge the per-document sheets into a single sheet for the whole case.
  print('Combining source fact sheets...')
  source_sheet = combine_fact_sheets(per_document_sheets, client, verbose=False)
  print(source_sheet)

  # Fact sheet for the reference (long) summary of the same case.
  print('Generating summary sheet...')
  long_summary = train['summary/long'][idx]
  summary_sheet = generate_fact_sheet(long_summary, client, verbose=False)
  print(summary_sheet)

  # Cosine similarity between the two sheets serves as the accuracy measure.
  cos_sim = compute_cosine_similarity(source_sheet, summary_sheet, verbose=False)
  print(f"Cosine Similarity: {cos_sim}")

  return source_sheet, summary_sheet, cos_sim

## Generate Fact Sheets and Metrics for the First Two Sources

In [22]:
# Directory prefix for saved fact sheets. The trailing slash matters: later cells
# build file paths by plain string concatenation (save_path + file name).
save_path = '/content/drive/MyDrive/CS 159 Project/factsheet/'

In [23]:
# specify indices in source to process
idx = 0
source0, sum0, sim0 = process_source(idx)
save_factsheet(source0, save_path + f'{ids[idx]}_source')
save_factsheet(sum0, save_path + f'{ids[idx]}_summary')

Analyzing case CJ-AL-0007
Generating fact sheet for each document in source...
```json
{
    "case_info": "Case 2:14-cv-00186-MEF-CSC Document 1 Filed 03/18/14 Page 1 of 24",
    "parties": "Sharnalle Mitchell, Lorenzo Brown, Deborah P. Hooten, Courtney Tubbs, Tito Williams, The City of Montgomery",
    "legal_basis": "42 U.S.C. § 1983, 18 U.S.C. § 1595, 28 U.S.C. § 1331, Fourth, Sixth, Thirteenth, and Fourteenth Amendments to the United States Constitution",
    "case_background": "Plaintiffs jailed by City of Montgomery for inability to pay debts from traffic tickets, City's policy to jail indigent individuals without proper inquiry or counsel",
    "court_proceedings": "Plaintiffs seek vindication of their rights, compensation, injunctive relief, and declaration of City's conduct as unlawful",
    "settlement_and_agreements": "Plaintiffs' emergency petitions for release led to agreements for release from jail, reduced debts, payment plans, ongoing fear of re-imprisonment",
    "outc

In [24]:
# specify indices in source to process
idx = 1
source1, sum1, sim1 = process_source(idx)
save_factsheet(source1, save_path + f'{ids[idx]}_source')
save_factsheet(sum1, save_path + f'{ids[idx]}_summary')

Analyzing case CJ-AL-0020
Generating fact sheet for each document in source...
{
    "case_info": "Case: 2:13-cv-732\nAs of: 01/18/2019 10:59 AM CST\nCLOSED,JOINT_ASSIGN,LEAD\nU.S. District Court Alabama Middle District (Montgomery) CIVIL DOCKET FOR CASE #: 2:13−cv−00732−MHT−TFM",
    "parties": "Harriet Delores Cleveland, Markis Antwuan Watts, City of Montgomery, Milton J. Westry, Les Hayes III",
    "legal_basis": "42:1983 Civil Rights Act",
    "case_background": "Plaintiff Harriet Delores Cleveland filed the case on 10/04/2013, related to Civil Rights. The case involved multiple defendants from the City of Montgomery.",
    "court_proceedings": "Various motions and orders were filed and issued, including petitions, answers, motions to consolidate cases, discovery plans, scheduling orders, and motions to compel.",
    "settlement_and_agreements": "There were motions for protective orders, extensions of deadlines, and joint motions for settlement agreements filed by both the plaintif