## Environment Setup


In [178]:
!pip install openai
!pip install datasets



In [179]:
import os
import json
import openai
from openai import OpenAI
import pandas

import re

from IPython import get_ipython
from IPython.utils.capture import capture_output

In [180]:
# Mount Google Drive so the notebook can read/write files under /content/drive.
from google.colab import drive
drive.mount('/content/drive')


# Work from the shared project folder; later cells save fact sheets under this Drive folder.
%cd /content/drive/MyDrive/CS 159 Project

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/.shortcut-targets-by-id/1LeLbrNZayyv5i4FVOL44lq8EKgwzSuZl/CS 159 Project


In [181]:
# SECURITY: never hardcode API keys in a notebook -- they leak through version
# control, shared Drive folders and exported copies. Any key that was previously
# committed here must be treated as compromised and revoked.
# The key is read from the OPENAI_API_KEY environment variable, falling back to
# an interactive prompt whose input is not echoed or stored in the notebook.
from getpass import getpass

YOUR_KEY = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
client = OpenAI(api_key=YOUR_KEY)

In [182]:
def get_openai_response(prompt, model="gpt-3.5-turbo", system_prompt=None):
    """Send a single-turn chat request and return the text of the first choice.

    Args:
        prompt: Text sent as the user message.
        model: Chat model name passed to the API. Defaults to the model this
            notebook originally hard-coded.
        system_prompt: Optional override for the system message. When None,
            the notebook's original Legal-AI system message is used.

    Returns:
        The `content` of the first choice's message in the API response.
    """
    if system_prompt is None:
        system_prompt = (
            "You are a highly intelligent Legal AI trained to solve reasoning "
            "problems and learn iteratively from feedback."
        )

    # Uses the module-level `client` created in the API-key cell.
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
    )
    return response.choices[0].message.content

## Load Multi_Lexsum Dataset

In [183]:
from datasets import load_dataset

# Load the allenai/multi_lexsum dataset using the "v20220616" configuration name.
multi_lexsum = load_dataset("allenai/multi_lexsum", name="v20220616")

In [184]:
multi_lexsum

DatasetDict({
    train: Dataset({
        features: ['id', 'sources', 'summary/long', 'summary/short', 'summary/tiny'],
        num_rows: 3177
    })
    validation: Dataset({
        features: ['id', 'sources', 'summary/long', 'summary/short', 'summary/tiny'],
        num_rows: 454
    })
    test: Dataset({
        features: ['id', 'sources', 'summary/long', 'summary/short', 'summary/tiny'],
        num_rows: 908
    })
})

In [185]:
# NOTE: despite the variable name, this selects the *test* split (the whole
# split, not a single instance). The name `train` is used by later cells.
train = multi_lexsum["test"]

## Methods to Generate, Combine, Save, and Compute Accuracy

In [186]:
def generate_fact_sheet(summary, verbose=True):
    """Request a structured fact sheet for one piece of legal text.

    The text is embedded in a fixed prompt that asks for a JSON object with
    the keys case_info, parties, legal_basis, case_background,
    court_proceedings, settlement_and_agreements, outcome_and_impact and
    miscellaneous, and the prompt is sent through `get_openai_response`.

    Args:
        summary: Text to build the fact sheet from.
        verbose: When true, print the raw model response.

    Returns:
        The raw response returned by `get_openai_response` (not parsed here).
    """
    fact_sheet_prompt = f"""
    Given the following legal case summary, create a detailed fact sheet covering the following categories:
    1. Case Information
    2. Parties Involved
    3. Legal Basis
    4. Case Background
    5. Court Proceedings
    6. Settlement and Agreements
    7. Outcome and Impact
    8. Miscellaneous

    Summary: {summary}

    Please provide the fact sheet in a structured format. Output type should
    be a JSON object with the following keys: case_info, parties, legal_basis,
    case_background, court_proceedings, settlement_and_agreements, outcome_and_impact,
    miscellaneous.
    Do not have sub categories in the values of the JSON object.
    Make the values into string types.

    """
    fact_sheet = get_openai_response(fact_sheet_prompt)

    if verbose:
        print(fact_sheet)
    return fact_sheet


In [187]:
def combine_fact_sheets(responses, verbose=True):
    """Ask the model to merge several fact sheets into a single one.

    The individual responses are joined with newlines and embedded in a fixed
    prompt that requests one JSON object with the same eight keys used for
    the per-document fact sheets.

    Args:
        responses: Iterable of fact-sheet strings to merge.
        verbose: When true, print the raw model response.

    Returns:
        The raw response returned by `get_openai_response` (not parsed here).
    """
    joined_sheets = '\n'.join(responses)
    merge_prompt = f"""
    Given the following JSON objects appended together, merge these factsheets
    together to create a detailed fact sheet covering the following categories:
    1. Case Information
    2. Parties Involved
    3. Legal Basis
    4. Case Background
    5. Court Proceedings
    6. Settlement and Agreements
    7. Outcome and Impact
    8. Miscellaneous

    Summary: {joined_sheets}

    Please provide the fact sheet in a structured format. Output type should
    be a JSON object with the following keys: case_info, parties, legal_basis,
    case_background, court_proceedings, settlement_and_agreements, outcome_and_impact,
    miscellaneous.
    Do not have sub categories in the values of the JSON object.
    Make the values into string types.

    """
    merged_sheet = get_openai_response(merge_prompt)
    if verbose:
        print(merged_sheet)
    return merged_sheet

In [188]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def compute_cosine_similarity(sheet1, sheet2, verbose=True):
    """Return the TF-IDF cosine similarity between two texts.

    A fresh TfidfVectorizer is fitted on just these two texts, so the score
    depends only on the vocabulary the pair shares.

    Args:
        sheet1: First text (e.g. the combined source fact sheet).
        sheet2: Second text (e.g. the summary fact sheet).
        verbose: When true, print the similarity score.

    Returns:
        The single similarity value for the pair.
    """
    tfidf = TfidfVectorizer().fit_transform([sheet1, sheet2])

    # Row 0 is sheet1 and row 1 is sheet2; the result is a 1x1 matrix.
    similarity = cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0]
    if verbose:
        print(f"Cosine Similarity: {similarity}")
    return similarity

In [189]:
import csv
from google.colab import files
import os

def save_factsheet(response, filename, verbose=True):
    """Parse a model-generated JSON fact sheet and save it as a two-column CSV.

    The model sometimes wraps its JSON in Markdown code fences (```json ... ```)
    or surrounds it with whitespace, so the text between the first "{" and the
    last "}" is extracted before parsing.

    Args:
        response: Raw model output expected to contain one JSON object.
        filename: Output path without extension; ".csv" is appended.
        verbose: When true, print the parsed data and each key/value pair
            (the original behaviour).

    Raises:
        json.JSONDecodeError: If the response does not contain valid JSON,
            e.g. because the model output was truncated.
    """
    text = response.strip()
    start, end = text.find("{"), text.rfind("}")
    if start != -1 and end > start:
        # Drop code fences or any other text around the JSON object.
        text = text[start:end + 1]
    data = json.loads(text)
    if verbose:
        print(data)

    out_path = f"{filename}.csv"
    parent_dir = os.path.dirname(out_path)
    if parent_dir:
        # Create the output folder on first use instead of failing in open().
        os.makedirs(parent_dir, exist_ok=True)

    # Explicit UTF-8 so legal symbols such as the section sign survive on any platform.
    with open(out_path, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        # Write the header
        writer.writerow(['Category', 'Details'])
        # Write the data
        for key, value in data.items():
            if verbose:
                print(key)
                print(value)
            writer.writerow([key, value])

`process_source(idx)` processes one case, given its index into the `sources` and `ids` lists:
1. Generates a fact sheet for each document in the case's sources.
2. Combines these fact sheets into a single source fact sheet.
3. Generates a fact sheet for the summary (we use the long summary for now).
4. Compares the source and summary fact sheets using cosine similarity.

In [190]:
# Pull the per-case source documents and case ids out of the selected split
# (the variable is called `train`, but it is bound to the "test" split above).
sources, ids = train['sources'], train['id']

def process_source(idx):
    """Build and compare source and summary fact sheets for one case.

    Steps:
      1. Generate a fact sheet for each document of `sources[idx]`.
      2. Merge those sheets into a single source fact sheet.
      3. Generate a fact sheet from the case's long summary.
      4. Score the two sheets against each other with TF-IDF cosine similarity.

    Progress and intermediate results are printed along the way.

    Args:
        idx: Index of the case in the module-level `sources` / `ids` lists.

    Returns:
        Tuple of (source_sheet, summary_sheet, cos_sim).
    """
    documents = sources[idx]
    case_id = ids[idx]
    print(f"Analyzing case {case_id}")

    # One fact sheet per source document.
    print('Generating fact sheet for each document in source...')
    per_document_sheets = [generate_fact_sheet(document) for document in documents]

    # Merge the per-document sheets into one.
    print('Combining source fact sheets...')
    source_sheet = combine_fact_sheets(per_document_sheets, verbose=False)
    print(source_sheet)

    # Fact sheet for the reference (long) summary.
    print('Generating summary sheet...')
    long_summary = train['summary/long'][idx]
    summary_sheet = generate_fact_sheet(long_summary, verbose=False)
    print(summary_sheet)

    # Cosine similarity between the two sheets serves as the accuracy measure.
    cos_sim = compute_cosine_similarity(source_sheet, summary_sheet, verbose=False)
    print(f"Cosine Similarity: {cos_sim}")

    return source_sheet, summary_sheet, cos_sim

## Generate Factsheets and metrics for First Two Sources

In [191]:
# Output folder for the generated fact-sheet CSVs. Keep the trailing slash:
# later cells build file paths by plain string concatenation with this value.
save_path = '/content/drive/MyDrive/CS 159 Project/factsheet/'

In [192]:
# Run the pipeline for the case at index 0 and save both fact sheets as CSV.
idx = 0
source0, sum0, sim0 = process_source(idx)
case_prefix = save_path + f'{ids[idx]}'
save_factsheet(source0, case_prefix + '_source')
save_factsheet(sum0, case_prefix + '_summary')

Analyzing case CJ-AL-0007
Generating fact sheet for each document in source...
{
    "case_info": "Case 2:14-cv-00186-MEF-CSC Document 1 Filed 03/18/14 Page 1 of 24",
    "parties": "Plaintiffs: SHARNALLE MITCHELL, LORENZO BROWN, DEB -RA P. HACKETT, COURTNEY TUBBS, TITO WILLIAMS. Defendant: THE CITY OF MONTGOMERY",
    "legal_basis": "Civil rights action under 42 U.S.C. § 1983, 18 U.S.C. § 1595, and 28 U.S.C. § 2201, based on violations of the Fourth, Sixth, Thirteenth, and Fourteenth Amendments to the United States Constitution.",
    "case_background": "Plaintiffs impoverished and jailed for nonpayment of debts owed to the City of Montgomery from traffic tickets. City policy and practice of incarcerating indigent individuals without an inquiry into their ability to pay or providing alternatives to jail.",
    "court_proceedings": "Plaintiffs seek vindication of rights, compensation, injunctive relief, and declaration of City's conduct as unlawful. Court jurisdiction pursuant to 28 U.

In [193]:
# Run the pipeline for the case at index 1 and save both fact sheets as CSV.
idx = 1
source1, sum1, sim1 = process_source(idx)
case_prefix = save_path + f'{ids[idx]}'
save_factsheet(source1, case_prefix + '_source')
save_factsheet(sum1, case_prefix + '_summary')

Analyzing case CJ-AL-0020
Generating fact sheet for each document in source...
```json
{
  "case_info": "Case Number: 2:13-cv-732\nFiled Date: 10/04/2013\nTerminated Date: 11/17/2014\nJurisdiction: Federal Question\nCourt: U.S. District Court Alabama Middle District (Montgomery)\nCause: 42:1983 Civil Rights Act",
  "parties": "Plaintiff: Harriet Delores Cleveland, represented by David Carter Dinielli, Sara Michelle Zampierin, Samuel Jacob Brooke\nConsol Plaintiff: Markis Antwuan Watts, represented by David Carter Dinielli, Sara Michelle Zampierin, Samuel Jacob Brooke\nDefendants: City of Montgomery, Milton J. Westry, Les Hayes, III, represented by Jason Cole Paulk, Robert David Segall, Shannon Lynn Holliday",
  "legal_basis": "Civil Rights Act of 1983 (42 U.S. Code § 1983)",
  "case_background": "The case involved allegations related to civil rights and was filed on 10/04/2013. The plaintiff, Harriet Delores Cleveland, and the consol plaintiff, Markis Antwuan Watts, were represented by