In [24]:
import pandas as pd
import numpy as np
import statistics

from bert_score import BERTScorer
from rouge_score import rouge_scorer
import ast
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from scipy.spatial import distance

import sys

In [25]:
import os
from openai import OpenAI

import os
from openai import OpenAI

def get_chatpgt_scores(prompt):

    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt
            }
        ],
        model="gpt-4o",
    )
    return chat_completion.choices[0].message.content

os.environ["LAS_API_TOKEN"] = "a4adfef6de531c1c258c9eda6f9be5c7a5701e365c32babe72273f63214811df"

client = OpenAI(
    # This is the default and can be omitted
    api_key=os.environ.get("LAS_API_TOKEN"),
)

In [26]:
def fn(a, b):
    
    product = a*b
    if(product > 25):
        product = 25
    den = abs(10 - (a+b)) + 1
    
    metric = product/den
    
    return metric

def normalize(s, smin, smax):
    
    return (s - smin) / (smax - smin)

def compute_coverage_metric(scores1, scores2):

    if(len(scores1) != len(scores2)):
        return None
    
    coverage_score = []
    for s1, s2 in zip(scores1, scores2):
        scr = normalize(fn(s1, s2), 0.222, 26)    ## 0.22 in the min and 26 is the max possible for 2 documents on a scale of 1-10. We use these to conduct min max normalization
        # print(s1, s2, scr)
        coverage_score.append(scr)

    return statistics.mean(coverage_score)


def compute_attribution_metric(scores1, scores2):

    if(len(scores1) != len(scores2)):
        return None
    
    attr_score_diff = [x-y for x, y in zip(scores1, scores2)]
    
    return abs(statistics.mean(attr_score_diff))


def Convert(score):
    
    score = score.strip('][')
    score = [int(x) for x in list(score.split(","))]
    return score

def coverage_score(docs, summary):   ## docs: list of documents, summary: one summary for all documents

    eval_prompt = open('../prompts/eval_coverage.txt', 'r').read()
    prompt1 = eval_prompt + "\n" + document + ": " + docs[0] + "\n\n=======\n\nSummary sentences: " +  "\n-".join(sent_tokenize(summary))
    prompt2 = eval_prompt + "\n" + document + ": " + docs[1] + "\n\n=======\n\nSummary sentences: " +  "\n-".join(sent_tokenize(summary))

    scores1 = get_chatpgt_scores(prompt1)
    scores1 = Convert(scores1)

    scores2 = get_chatpgt_scores(prompt2)
    scores2 = Convert(scores2)
    
    return scores1, scores2

def get_bert_scores(docs, summary):
    # BERTScore calculation
    scorer = BERTScorer(model_type='bert-base-uncased')
    f1_scores = []
    for d in docs:
        P, R, F1 = scorer.score([summary], [d])
        F1 = float(F1)
        f1_scores.append(F1)
        
    return statistics.mean(f1_scores)


def get_rouge_scores(docs, summary):
    # BERTScore calculation
    f1_scores = []
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL', 'rougeLsum'], use_stemmer=True)
    for d in docs:
        scr = scorer.score(d, summary)
        f1_scores.append(scr['rouge1'].fmeasure)
        
    return statistics.mean(f1_scores)


def get_compression_ratio(docs, summary):
    
    return len(summary.split(' '))/len(" ".join(docs).split(' '))


def get_rating_metric(docs, summary, neg_topic, pos_topic, topic_df):

    topic = list(set(neg_topic).intersection(set(pos_topic)))[0]
    description = topic_df[topic_df.Topic == topic].Description.values[0]

    prompt = open('../prompts/review_rating_prompts.txt').read()
    prompt = prompt.replace('{topic}', topic).replace('{description}', description)
    rating_doc1 = int(get_chatpgt_scores(prompt + "\n" + docs[0]))
    rating_doc2 = int(get_chatpgt_scores(prompt + "\n" + docs[1]))
    rating_sum = int(get_chatpgt_scores(prompt + "\n" + summary))

    # print(rating_doc1, rating_doc2, rating_sum)
    
    expected_rating = (rating_doc1 + rating_doc2)/2
    diff = abs(rating_sum - expected_rating)
    
    return diff/10


def get_cosine_sim_scores(ans_doc1, ans_doc2, ans_sum):
    
    cos1, cos2, diff = [], [], []
    
    model = SentenceTransformer("all-MiniLM-L6-v2")
    doc1_emb = model.encode(ans_doc1.split("\n"))
    doc2_emb = model.encode(ans_doc2.split("\n"))
    sum_emb = model.encode(ans_sum.split("\n"))
    
    # print(doc1_emb.shape)
    
    for d1, d2, s in zip(doc1_emb, doc2_emb, sum_emb):
        
        # print(d1.shape)
        cosA = 1 - distance.cosine(s, d1)
        cosB = 1 - distance.cosine(s, d2)
        diff.append(cosA - cosB)
        cos1.append(cosA)
        cos2.append(cosB)
        
    return abs(statistics.mean(diff))

In [28]:
def get_rev_data():
    
    df = pd.read_csv('../results/hotel_review_summaries.csv')
    # df = pd.read_csv('results/hotel_review_summaries.csv')
    topics = ['Service', 'Cleanliness', 'Value', 'Sleep Quality', 'Rooms', 'Business service (e.g., internet access)', 'Check in / front desk']
    topic_df = pd.read_csv('../data/topic_definitions.csv')
    
    return df, topics, topic_df


def get_news_data():

    df_news = pd.read_csv('../data/conflicting_news_dataset.csv')
    df_summaries = pd.read_csv('../results/news_summaries.csv')
    df_news_qa = pd.read_csv('../data/news_questions.csv')
    
    return df_news, df_summaries


def evaluate_rev_sum(df, topic_df, approach):
    
    attr_corel, attr_scores, coverage_scores, bert_scores, rouge_scores, sum_rating, compression_ratio = [], [], [], [], [], [], []
    scoresA, scoresB = [], []

    for i, row in df.iloc[0:2].iterrows():

        docs = [df.iloc[i]['rev_neg'], df.iloc[i]['rev_pos']]
        summary = df.iloc[i][approach]

        scores1, scores2 = coverage_score(docs, summary)
        corel = np.corrcoef(scores1, scores2)[0,1]
        cvg_scores = compute_coverage_metric(scores1, scores2)
        # attr_scr = compute_attribution_metric(scores1, scores2)
        attr_scr = normalize(compute_attribution_metric(scores1,scores2), 0, 9)
        b_scores = get_bert_scores(docs, summary)
        r_scores = get_rouge_scores(docs, summary)

        rev_topic_neg = ast.literal_eval(df.iloc[0].attr_neg)
        rev_topic_pos = ast.literal_eval(df.iloc[0].attr_pos)
        sum_rating_score = get_rating_metric(docs, summary, rev_topic_neg, rev_topic_pos, topic_df)

        comp_ratio = get_compression_ratio(docs, summary)

        attr_scores.append(attr_scr)
        attr_corel.append(corel)
        coverage_scores.append(cvg_scores)
        bert_scores.append(b_scores)
        rouge_scores.append(r_scores)
        sum_rating.append(sum_rating_score)
        compression_ratio.append(comp_ratio)

        scoresA.append(scores1)
        scoresB.append(scores2)
        
    return attr_corel, attr_scores, coverage_scores, bert_scores, rouge_scores, sum_rating, compression_ratio, scoresA, scoresB


def evaluate_news_sum(approach, df_news, df_summaries):

    attr_corel, attr_scores, coverage_scores, bert_scores, rouge_scores, qa_cos_diff, compression_ratio = [], [], [], [], [], [], []
    scoresA, scoresB = [], []
    
    ## Iterating over each cluster. This is based on the evaluation dataset used (which has 15 clusters numbered from 1-15)
    for c in range(1, 2):

        news = df_news[df_news.cluster_id == c]
        docs = [news[news.side == 'L'].text.values[0], news[news.side == 'R'].text.values[0]]

        summary = df_summaries[df_summaries.cluster == c][approach + '_summary'].values[0]

        Q = df_summaries[df_summaries.cluster == c]['questions']
        # print(Q.values[0])
        # sys.exit()
        qa_prompt = open('../prompts/question_answering_prompt.txt').read()

        qa_prompt_doc1 = qa_prompt + "\n" + document + ":\n" + docs[0] + "\n\n=======\n\nQuestions about the news:\n" + Q.values[0]
        qa_prompt_doc2 = qa_prompt + "\n" + document + ":\n" + docs[1] + "\n\n=======\n\nQuestions about the news:\n" + Q.values[0]
        qa_prompt_sum = qa_prompt + "\n" + document + ":\n" + summary + "\n\n=======\n\nQuestions about the news:\n" + Q.values[0]

        ans_doc1 = get_chatpgt_scores(qa_prompt_doc1)
        ans_doc2 = get_chatpgt_scores(qa_prompt_doc2)
        ans_sum = get_chatpgt_scores(qa_prompt_sum)

        cos_diff = get_cosine_sim_scores(ans_doc1, ans_doc2, ans_sum)
        # print("cos_diff", cos_diff)

        scores1, scores2 = coverage_score(docs, summary)

        corel = np.corrcoef(scores1, scores2)[0,1]
        cvg_scores = compute_coverage_metric(scores1, scores2)
        # attr_scr = compute_attribution_metric(scores1, scores2)
        attr_scr = normalize(compute_attribution_metric(scores1,scores2), 0, 9)
        b_scores = get_bert_scores(docs, summary)
        r_scores = get_rouge_scores(docs, summary)

        # rev_topic_neg = ast.literal_eval(df.iloc[0].attr_neg)
        # rev_topic_pos = ast.literal_eval(df.iloc[0].attr_pos)
        # sum_rating_score = get_rating_metric(docs, summary, rev_topic_neg, rev_topic_pos)

        comp_ratio = get_compression_ratio(docs, summary)

        attr_corel.append(corel)
        attr_scores.append(attr_scr)
        coverage_scores.append(cvg_scores)
        bert_scores.append(b_scores)
        rouge_scores.append(r_scores)
        qa_cos_diff.append(cos_diff)
        compression_ratio.append(comp_ratio)
        
        scoresA.append(scores1)
        scoresB.append(scores2)
        
    return attr_corel, attr_scores, coverage_scores, bert_scores, rouge_scores, qa_cos_diff, compression_ratio, scoresA, scoresB

In [18]:
### Change the document variable to prduce evaluation metrics for news and reviews
document = 'News'

df = pd.DataFrame()
if(document == 'News'):
    
    ## Specify the approach here
    ## Approaches can be [mutliagent, or writer_critic]
    approach = 'writer_critic'
    df, df_summaries = get_news_data()
    attr_corel, attr_scores, coverage_scores, bert_scores, rouge_scores, sum_rating, compression_ratio, scoresA, scoresB = evaluate_news_sum(approach, df, df_summaries)
    
elif(document == 'Reviews'):
    
    ## Specify the approach here
    ## Approaches can be [mutliagent, or writer_critic]
    approach = 'mutliagent'
    df, topics, topic_df = get_rev_data()
    attr_corel, attr_scores, coverage_scores, bert_scores, rouge_scores, sum_rating, compression_ratio, scoresA, scoresB = evaluate_rev_sum(df, topic_df, approach)

  c /= stddev[:, None]
  c /= stddev[None, :]


In [96]:
# attr_corel, attr_scores, coverage_scores, bert_scores, rouge_scores, sum_rating, compression_ratio, scoresA, scoresB

In [111]:
df_summaries = df_summaries[['cluster', 'multiagent_summary', 'writer_critic_summary', 'questions']]

In [115]:
approach = 'WC'   # Short hand notation for the approach to be used as column header for the metrics
df_summaries['corelation_attr_' + approach] = attr_corel
df_summaries['attribution_' + approach] = attr_scores
df_summaries['coverage_' + approach] = coverage_scores 
df_summaries['bertScore_' + approach] = bert_scores
df_summaries['rougeScore_' + approach] = rouge_scores
df_summaries['qa_cos_diff_' + approach] = sum_rating
df_summaries['compression_ratio_' + approach] = compression_ratio

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_summaries['corelation_attr_' + approach] = attr_corel


In [91]:
df.columns

Index(['Unnamed: 0', 'rev_neg', 'rev_pos', 'attr_neg', 'attr_pos', 'cls',
       'multiagent_summary', 'writer_critic_summary', 'corelation_attr_WC',
       'attribution_WC', 'coverage_WC', 'bertScore_WC', 'rougeScore_WC',
       'rating_offset_WC', 'compression_ratio_WC', 'corelation_attr_MA',
       'attribution_MA', 'coverage_MA', 'bertScore_MA', 'rougeScore_MA',
       'rating_offset_MA', 'compression_ratio_MA'],
      dtype='object')

In [99]:
df.round(3).to_csv('../results/summary_metrics.csv')

In [93]:
# df = pd.read_csv('results/hotel_review_summaries.csv')

In [125]:
from scipy.stats import wilcoxon

### Comparing the distribution of attribution scores. 
### Attribution scores are computed pairwise (on attribution score for each sentence for each reference document)
### We compare the distribution toegteher for the entire evaluation dataset (due to less number of attribution scores per summary)
res = wilcoxon(scoresA, scoresB, alternative='two-sided')
res

WilcoxonResult(statistic=244.0, pvalue=4.743994003612866e-05)