##Generating Natural Language Summaries from Factsheets

###Setup

In [25]:
!pip install openai
!pip install datasets
!pip install evaluate



In [49]:
!pip install rouge-score
!pip install bert-score
from rouge_score import rouge_scorer
from bert_score import score



In [27]:
import os
import json
import openai
from openai import OpenAI
import pandas

import re

from IPython import get_ipython
from IPython.utils.capture import capture_output

In [28]:
from google.colab import drive
drive.mount('/content/drive')


%cd /content/drive/MyDrive/CS 159 Project

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/.shortcut-targets-by-id/1LeLbrNZayyv5i4FVOL44lq8EKgwzSuZl/CS 159 Project


###OpenAI Key

In [29]:
ipython = get_ipython() # run generated code
# YOUR_KEY = os.getenv("sk-proj-W5F0FE87iGYN9VS679VeT3BlbkFJDAjPPplBSgT4gqd5IPMZ")
YOUR_KEY = "sk-proj-n5GwGeTfcKLlcFi8qAyrT3BlbkFJKcYBL0VwMgaEVJowisbT"

client = OpenAI(api_key = YOUR_KEY)

pattern = re.compile(r'```python\n(.*?)```', re.DOTALL)  # extract code from llm generation

### Preprocessing

In [30]:
from datasets import load_dataset

def load_preprocess():
  # Load the dataset
  multi_lexsum = load_dataset("allenai/multi_lexsum", name="v20220616")
  train = multi_lexsum["test"] # The first instance of the dev set
  raw_data_train = []
  summaries_train = {"long": [], "short": [], "tiny": []}
  i = 0
  DEBUG = False

  for case1 in train:
    # each case has 4-5 sources
    if DEBUG and i == 10:
      break
    raw_data_train.append(case1["sources"])

    for sum_len in ["long", "short", "tiny"]:
      summaries_train[sum_len].append(case1["summary/" + sum_len])
    if DEBUG:
      i += 1
  return raw_data_train, summaries_train

### Generating Natural Language Summaries from CSVs/Worksheets

In [31]:
def get_openai_response(prompt, template):
    response = client.chat.completions.create(
            model = "gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": template},
                {"role": "user", "content": prompt}
            ]
    )
    return response.choices[0].message.content

In [32]:
import csv
from evaluate import load
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def process_csv(csv_file_path):
  csv_data = []
  with open(csv_file_path, newline='') as csvfile:
      csvreader = csv.reader(csvfile)
      for row in csvreader:
          csv_data.append(row)
  return csv_data


def generate_string(csv_data):
  csv_string = ''
  for i in range(len(csv_data)):
    if i == 0:
        continue
    else:
      csv_string += ': '.join(csv_data[i]) + '\n'
  return csv_string



def generate_summary(processed_string):
    NUM_WORDS = 130
    template = '''
    Setting: You are a helpful and concise assistant designed to assist users in summarizing factsheets.

    Provide clear and precise answers to the user's questions. Avoid unnecessary details and keep your responses brief and to the point.

    Your goal is to understand the user's request, and provide text to
    fulfill the request.

    Your input will be a factsheet with the following format -->
    category1 : detail1
    category2 : detail2
    ...

    Your output will be text.
    '''

    prompt_create = f'''
    You are a lawyer describing the court case to the general public.
    Given the factsheet in the following text format -->
    category1 : detail1
    category2 : detail2
    ...

    Summarize the factsheet so that it is understable to the general public.
    The summary should be paragraph form around {NUM_WORDS} words. Below is the factsheet:
    {processed_string}

     '''
    # Prompting till we get 10 summaries w 650 words
    try_times = 0
    chat_summaries_short = []
    TOTAL_TRIES = 10
    while try_times < TOTAL_TRIES:
      summary = get_openai_response(prompt_create, template)
      sum_words = len(summary.split(" "))
      if sum_words > NUM_WORDS - 20 and sum_words < NUM_WORDS + 20:
        try_times += 1
        chat_summaries_short.append(summary)

    return chat_summaries_short

### Evaluation Metrics

In [33]:
def exact_match(pred, ground_truth):
    if len(pred) < len(ground_truth):
        ground_truth = ground_truth[:len(pred)]
    elif len(pred) > len(ground_truth):
        pred = pred[:len(ground_truth)]
    exact_match = load("exact_match")
    results = exact_match.compute(references=pred, predictions=ground_truth)
    return round(results["exact_match"], 2)


def evaluate(chat_summaries, ground_truth):
  exact_match_scores = []
  for summary in chat_summaries:
    exact_match_scores.append(exact_match(summary, ground_truth))
  return exact_match_scores

def find_best_summary(exact_match_scores):
  best_index = exact_match_scores.index(max(exact_match_scores))
  return best_index


def compute_cosine_similarity(chat_summaries, ground_truth, verbose=True):
  """
  Compute the cosine similarity between the source and summary sheets.
  source_sheet: string representation of the source sheet
  summary_sheet: string representation of the summary sheet
  """
  cosine_scores = []
  vectorizer = TfidfVectorizer()
  for summary in chat_summaries:
    tfidf_matrix = vectorizer.fit_transform([summary, ground_truth])
    cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    cosine_scores.append(cosine_sim[0][0])
  return cosine_scores

### Main Loop

In [46]:
def main(fact_sheet_path):
  raw_data_train, summaries_train = load_preprocess()
  # change short vs long depending on factsheet type / size
  # specify correct index
  ground_truth = summaries_train['short'][0]

  processed_csv = generate_string(process_csv(fact_sheet_path))
  chat_summaries_short = generate_summary(processed_csv)
  exact_match_scores = evaluate(chat_summaries_short, ground_truth)
  cosine_sim_scores = compute_cosine_similarity(chat_summaries_short, ground_truth)
  best_idx = find_best_summary(cosine_sim_scores)

  # scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
  # scores = []
  # for summary in chat_summaries_short:
  #   scores.append(scorer.score(summary, ground_truth))

  print("Exact similarity scores: ", exact_match_scores)
  print("Cosine similarity scores: ", cosine_sim_scores)
  print("Here is the best summary: ", chat_summaries_short[int(best_idx)])
  return chat_summaries_short[int(best_idx)], ground_truth

In [47]:
fact_sheet_path = 'factsheet/CJ-AL-0020_summary.csv'
best_summary, ground_truth = main(fact_sheet_path)

Exact similarity scores:  [0.07, 0.06, 0.06, 0.05, 0.05, 0.06, 0.07, 0.07, 0.07, 0.07]
Cosine similarity scores:  [0.3395783381585084, 0.4656917420334311, 0.4710055045716709, 0.3961153833087837, 0.3949031330025527, 0.49403565788997245, 0.46429992905023587, 0.5465971942850533, 0.5002320312341256, 0.4495282103141212]
Here is the best summary:  In a legal case from August 28, 2013, an indigent detainee at Montgomery Municipal Jail filed a lawsuit in the Circuit Court of Montgomery County, Alabama citing violations under 42 U.S.C. § 1983. The plaintiff sued the City of Montgomery and Judge Westry, represented by the Southern Poverty Law Center, alleging violations of the Sixth Amendment, due process, and equal protection clause. The plaintiff, facing imprisonment for unpaid traffic fines, moved the case to the United States District Court for the Middle District of Alabama. After a motion to consolidate similar cases was granted, parties engaged in mediation leading to a settlement approve

In [50]:
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge_scores = scorer.score(best_summary, ground_truth)
print(rouge_scores)

P, R, F1 = score([best_summary], [ground_truth], lang="en", verbose=True)

{'rouge1': Score(precision=0.45454545454545453, recall=0.38461538461538464, fmeasure=0.41666666666666663), 'rouge2': Score(precision=0.12844036697247707, recall=0.10852713178294573, fmeasure=0.11764705882352941), 'rougeL': Score(precision=0.2545454545454545, recall=0.2153846153846154, fmeasure=0.23333333333333334)}


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 4.14 seconds, 0.24 sentences/sec


Compute Rouge/BERT Scores for best summary.

In [51]:
print(rouge_scores) # Rouge Scores
print(P, R, F1) # Bert Scores

{'rouge1': Score(precision=0.45454545454545453, recall=0.38461538461538464, fmeasure=0.41666666666666663), 'rouge2': Score(precision=0.12844036697247707, recall=0.10852713178294573, fmeasure=0.11764705882352941), 'rougeL': Score(precision=0.2545454545454545, recall=0.2153846153846154, fmeasure=0.23333333333333334)}
tensor([0.8608]) tensor([0.8841]) tensor([0.8723])
