## Generating Natural Language Summaries from Factsheets

### Setup

In [1]:
!pip install openai
!pip install datasets
!pip install evaluate

Collecting openai
  Downloading openai-1.30.1-py3-none-any.whl (320 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.6/320.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: h11, httpcore, httpx, openai
Successfully installed h11-0.14.0 httpcore-1.0.5 ht

In [2]:
import getpass
import json
import os
import re

import openai
import pandas
from IPython import get_ipython
from IPython.utils.capture import capture_output
from openai import OpenAI

In [3]:
# Mount Google Drive into the Colab runtime so the project files are reachable.
from google.colab import drive
drive.mount('/content/drive')


# Switch into the project folder; relative paths used later in the notebook
# (e.g. 'factsheet/...') are resolved against this directory.
%cd /content/drive/MyDrive/CS 159 Project

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/1LeLbrNZayyv5i4FVOL44lq8EKgwzSuZl/CS 159 Project


### OpenAI Key

In [4]:
ipython = get_ipython() # run generated code

# SECURITY: API keys must never be written into the notebook. Any key that has
# appeared in a saved or shared copy of this notebook has to be treated as
# compromised and revoked in the OpenAI dashboard.
# The key is read from the OPENAI_API_KEY environment variable; when that is
# not set, it is requested through a hidden prompt so that it never ends up in
# the notebook source or in a cell output.
YOUR_KEY = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")

client = OpenAI(api_key = YOUR_KEY)

pattern = re.compile(r'```python\n(.*?)```', re.DOTALL)  # extract code from llm generation

### Preprocessing

In [73]:
from datasets import load_dataset

def load_preprocess(split="test", max_cases=None):
  """Load Multi-LexSum and collect sources and reference summaries of one split.

  Args:
    split: Name of the dataset split to read. Defaults to "test", which is
      the split this notebook has always used.
    max_cases: If given, stop after this many cases (useful for quick debug
      runs). None (the default) reads the whole split.

  Returns:
    A tuple (raw_data_train, summaries_train):
      raw_data_train: list with one entry per case, holding that case's
        "sources" field.
      summaries_train: dict with the keys "long", "short" and "tiny"; each
        maps to a list, aligned with raw_data_train, of the case's
        "summary/<length>" field.
        NOTE(review): entries are stored as-is; confirm whether the dataset
        can return None for a missing short/tiny summary.
  """
  multi_lexsum = load_dataset("allenai/multi_lexsum", name="v20220616")
  cases = multi_lexsum[split]

  raw_data_train = []
  summaries_train = {"long": [], "short": [], "tiny": []}

  for case_idx, case in enumerate(cases):
    if max_cases is not None and case_idx >= max_cases:
      break
    raw_data_train.append(case["sources"])
    for sum_len, collected in summaries_train.items():
      collected.append(case["summary/" + sum_len])

  return raw_data_train, summaries_train

### Generating Natural Language Summaries from CSVs/Worksheets

In [74]:
import csv
from evaluate import load
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def process_csv(csv_file_path):
  """Read a CSV file and return all of its rows as a list of lists of strings.

  Every row is returned, including the first one.
  """
  with open(csv_file_path, newline='') as handle:
    return list(csv.reader(handle))


def generate_string(csv_data):
  """Render CSV rows as text, one line per row, skipping the first row.

  The cells of each remaining row are joined with ': ' and every line is
  terminated with a newline. Returns '' when there is nothing after the
  first row.
  """
  data_rows = csv_data[1:]
  return ''.join(': '.join(cells) + '\n' for cells in data_rows)



def generate_summary(processed_string, num_words=130, num_summaries=10,
                     tolerance=20, max_attempts=None):
    """Ask the model for several summaries of a factsheet with a target length.

    The model is queried repeatedly through get_openai_response(prompt, system
    text). That helper is not defined in this part of the notebook; it is
    assumed to return the reply as a string. A reply is kept only when its
    whitespace-separated word count differs from num_words by less than
    tolerance.

    Args:
        processed_string: Factsheet text, one "category: detail" line per row.
        num_words: Target summary length in words (also stated in the prompt).
        num_summaries: How many accepted summaries to collect.
        tolerance: Accepted deviation from num_words (exclusive bound).
        max_attempts: Upper bound on model calls. Defaults to
            5 * num_summaries. Without a bound the loop would never end (and
            keep spending API credits) if the model never hits the window.

    Returns:
        List of accepted summaries. It has num_summaries entries unless the
        attempt budget ran out first, in which case it is shorter.

    Raises:
        RuntimeError: if summaries were requested but not a single reply fell
            inside the accepted length window within max_attempts calls.
    """
    if max_attempts is None:
        max_attempts = 5 * num_summaries

    template = '''
    Setting: You are a helpful and concise assistant designed to assist users in summarizing factsheets.

    Provide clear and precise answers to the user's questions. Avoid unnecessary details and keep your responses brief and to the point.

    Your goal is to understand the user's request, and provide text to
    fulfill the request.

    Your input will be a factsheet with the following format -->
    category1 : detail1
    category2 : detail2
    ...

    Your output will be text.
    '''

    # NOTE(review): "understable" below is presumably a typo for
    # "understandable"; it is left untouched so that the prompt sent to the
    # model stays exactly as it was.
    prompt_create = f'''
    You are a lawyer describing the court case to the general public.
    Given the factsheet in the following text format -->
    category1 : detail1
    category2 : detail2
    ...

    Summarize the factsheet so that it is understable to the general public.
    The summary should be paragraph form around {num_words} words. Below is the factsheet:
    {processed_string}

     '''

    chat_summaries_short = []
    attempts = 0
    while len(chat_summaries_short) < num_summaries and attempts < max_attempts:
        attempts += 1
        summary = get_openai_response(prompt_create, template)
        # split() without arguments counts words across newlines and repeated
        # spaces, which split(" ") would miscount.
        sum_words = len(summary.split())
        if abs(sum_words - num_words) < tolerance:
            chat_summaries_short.append(summary)

    if num_summaries > 0 and not chat_summaries_short:
        raise RuntimeError(
            f"No summary within {tolerance} words of {num_words} "
            f"after {attempts} attempts"
        )
    return chat_summaries_short

### Evaluation Metrics

In [75]:
_EXACT_MATCH_METRIC = None  # exact-match metric, loaded on first use and then reused


def exact_match(pred, ground_truth):
    """Score a prediction against a reference with the "exact_match" metric.

    Both inputs are first cut to the length of the shorter one. The metric is
    loaded once and reused, instead of being loaded again on every call.

    The strings themselves (not lists of strings) are handed to the metric,
    exactly as before.
    NOTE(review): the scores printed by this notebook (roughly 0.05-0.08)
    suggest this yields a position-by-position character match rate; confirm
    that this is the metric that is actually wanted.

    Args:
        pred: Predicted text.
        ground_truth: Reference text.

    Returns:
        The metric's "exact_match" value rounded to two decimals, or 0.0 when
        either input is empty (nothing can be compared).
    """
    global _EXACT_MATCH_METRIC

    common_len = min(len(pred), len(ground_truth))
    if common_len == 0:
        return 0.0
    pred = pred[:common_len]
    ground_truth = ground_truth[:common_len]

    if _EXACT_MATCH_METRIC is None:
        _EXACT_MATCH_METRIC = load("exact_match")

    results = _EXACT_MATCH_METRIC.compute(references=ground_truth, predictions=pred)
    return round(results["exact_match"], 2)


def evaluate(chat_summaries, ground_truth):
  """Score every candidate summary against the reference with exact_match.

  Returns the list of scores in the same order as chat_summaries.
  """
  return [exact_match(candidate, ground_truth) for candidate in chat_summaries]

def find_best_summary(exact_match_scores):
  """Return the index of the highest score; on ties the first one wins."""
  positions = range(len(exact_match_scores))
  return max(positions, key=lambda pos: exact_match_scores[pos])


def compute_cosine_similarity(chat_summaries, ground_truth, verbose=True):
  """
  Compute the TF-IDF cosine similarity of each candidate summary to the reference.

  chat_summaries: iterable of candidate summary strings
  ground_truth: reference summary string
  verbose: accepted, but not used by this function

  Returns a list with one similarity score per candidate, in input order.
  """
  vectorizer = TfidfVectorizer()

  def score(candidate):
    # The vectorizer is refitted on just this pair: row 0 is the candidate,
    # row 1 is the reference.
    pair_matrix = vectorizer.fit_transform([candidate, ground_truth])
    candidate_row, reference_row = pair_matrix[0:1], pair_matrix[1:2]
    return cosine_similarity(candidate_row, reference_row)[0][0]

  return [score(candidate) for candidate in chat_summaries]

### Main Loop

In [76]:

def main(fact_sheet_path, case_index=0, summary_length='short'):
  """Summarise one factsheet, score the candidates and print the results.

  Args:
    fact_sheet_path: Path of the factsheet CSV to summarise.
    case_index: Position, within the split returned by load_preprocess, of
      the case whose reference summary belongs to this factsheet. The
      caller has to make sure it matches the factsheet.
    summary_length: Which reference summary to compare against: 'long',
      'short' or 'tiny' (pick according to the factsheet type / size).

  Prints the exact-match scores, the cosine-similarity scores and the
  candidate with the highest cosine similarity. Returns nothing.

  Raises:
    ValueError: if the selected reference summary is missing or empty.
  """
  _, summaries_train = load_preprocess()
  ground_truth = summaries_train[summary_length][case_index]
  if not ground_truth:
    # Fail early with a clear message instead of deep inside the metrics.
    raise ValueError(
        f"No '{summary_length}' reference summary for case {case_index}")

  processed_csv = generate_string(process_csv(fact_sheet_path))
  chat_summaries_short = generate_summary(processed_csv)
  exact_match_scores = evaluate(chat_summaries_short, ground_truth)
  cosine_sim_scores = compute_cosine_similarity(chat_summaries_short, ground_truth)
  # The "best" candidate is chosen by cosine similarity, not by exact match.
  best_idx = find_best_summary(cosine_sim_scores)

  print("Exact similarity scores: ", exact_match_scores)
  print("Cosine similarity scores: ", cosine_sim_scores)
  print("Here is the best summary: ", chat_summaries_short[best_idx])



In [77]:
# Factsheet to summarise. The path is relative, so it is resolved against the
# notebook's current working directory.
fact_sheet_path = 'factsheet/CJ-AL-0020_summary.csv'
# Generate candidate summaries for this factsheet, score them and print the results.
main(fact_sheet_path)

Exact similarity scores:  [0.06, 0.06, 0.05, 0.07, 0.07, 0.07, 0.07, 0.05, 0.08, 0.05]
Cosine similarity scores:  [0.4538061785234016, 0.41453297094674457, 0.4479310312073994, 0.45540559761938754, 0.4144960736739172, 0.4770789469498656, 0.4834145675095602, 0.4974333667609191, 0.35644706061362696, 0.453882105174925]
Here is the best summary:  In August 2013, an indigent detainee in Montgomery Municipal Jail filed a lawsuit in the Circuit Court of Montgomery County, Alabama against the City of Montgomery and Judge Westry. The lawsuit, based on federal law, alleged violations of constitutional rights. The detainee, facing fines from traffic tickets, was arrested and given a choice by Judge Westry to pay or serve jail time without legal representation. The case moved to federal court, underwent discovery and mediation, leading to a settlement in 2014. The settlement included attorney fees and established procedures for indigent defendants. The final judgment in November 2014 affirmed the c