# Import required libraries

In [None]:
!pip install transformers torch accelerate tensorflow-hub bert-tensorflow tensorflow tqdm bert-score sentence_transformers rank_bm25

In [None]:
!pip install datasets

In [None]:
!pip install rouge-score

In [None]:
!pip install sacrebleu

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForCausalLM, Trainer, TrainingArguments, BertTokenizer, BertForSequenceClassification, MarianMTModel, MarianTokenizer, BertConfig
import torch
from transformers import BertTokenizer, BertForSequenceClassification, TFBertForSequenceClassification
from transformers import LlamaTokenizer, LlamaForCausalLM
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize, sent_tokenize
from statistics import mean
import pandas as pd
import tensorflow_hub as hub
import tensorflow as tf
from datetime import datetime
from torch.utils.data import DataLoader
import re
import nltk
from nltk.corpus import wordnet
import random
from tqdm import tqdm
from datasets import load_metric
from bert_score import score as bert_score
import concurrent.futures
from nltk.tokenize import sent_tokenize
import matplotlib.pyplot as plt
import json
import numpy as np
from bert_score import score
from google.colab import drive
drive.mount("/content/drive")


# Import and clean complete posts

In [None]:
# Fetch the NLTK 'punkt' resource before any sent_tokenize/word_tokenize call.
nltk.download('punkt')
dataset_path = "/content/drive/My Drive/Diss_Dataset/dataset500_cleaned.csv"
data = pd.read_csv(dataset_path)
# Expose the first three CSV columns under fixed names, selected by position.
data["userid"] = data.iloc[:, 0]
data["posts"] = data.iloc[:, 1]
data["label"] = data.iloc[:, 2]

# Convert each stringified list into a real list: drop the outer brackets,
# split on the "', '" separator, then trim leftover quotes/spaces per item.
# NOTE(review): presumably every item is single-quoted; an item containing
# "', '" or written with double quotes would be split/trimmed incorrectly —
# confirm against the CSV.
data['posts'] = data['posts'].str.strip('[]').str.split("', '")
data['posts'] = data['posts'].apply(lambda x: [post.strip("' ") for post in x])

def clean_post(post):
    """Normalise one raw post into plain, lower-case ASCII text.

    Removes runs of asterisks, the ``&gt;`` entity, non-ASCII characters,
    double-quoted spans, parenthesised spans, square-bracketed spans and any
    stray brackets that remain. Whitespace is collapsed both before and after
    those removals.

    Args:
        post: Raw post text.

    Returns:
        The cleaned post, stripped and lower-cased, with single spaces only.
    """
    post = re.sub(r'\*+', '', post)
    # First whitespace pass runs before the non-ASCII filter so that Unicode
    # spaces (e.g. non-breaking spaces) become a plain space instead of being
    # deleted and gluing two words together.
    post = re.sub(r'\s+', ' ', post)
    for pattern in (
        r'&gt;',
        r'[^\x00-\x7F]+',
        r'"[^"]*"',
        r'\([^)]*\)',
        r'[()]',
        r'\[[^\]]*\]',
        r'[\[\]]',
    ):
        post = re.sub(pattern, '', post)
    # Second pass: deleting a span from the middle of a sentence leaves two
    # adjacent spaces behind, which the first pass could not see.
    post = re.sub(r'\s+', ' ', post)
    return post.strip().lower()

# Clean every post of every user, then join each user's cleaned posts into a
# single space-separated string.
data["clean_posts"] = data["posts"].apply(lambda posts: [clean_post(post) for post in posts])
data['combined_posts'] = data['clean_posts'].apply(lambda posts: ' '.join(posts))


# Sanity check: show the combined text stored under index label 1.
print(data['combined_posts'][1])

# Import highlights, user risk and supportiveness percentage

In [None]:
# Per-user highlight sentences (passed to create_meta_info as top_5_df).
top_5_path = "/content/drive/My Drive/Diss_Dataset/top_5_sentences_per_user.csv"
top_5_df = pd.read_csv(top_5_path)

# Per-user risk rating and supportiveness ratio (passed to create_meta_info as risk_df).
user_risk_supportiveness_path = "/content/drive/My Drive/Diss_Dataset/user_risk_with_supportiveness.csv"
risk_supportiveness_df = pd.read_csv(user_risk_supportiveness_path)

# Create meta-information

In [None]:
def create_meta_info(user_id, risk_df, top_5_df):
    """Build the one-sentence meta-information string for a user.

    Args:
        user_id: Value matched against the ``userid`` column of ``risk_df``.
        risk_df: DataFrame with ``userid``, ``risk_rating`` and
            ``supportiveness_ratio`` columns. The first matching row is used.
        top_5_df: Not used when building the sentence; kept in the signature
            so existing callers keep working.

    Returns:
        A sentence naming the risk level and the percentage of supportive
        sentences. Both are reported as "Unknown" when ``risk_df`` has no row
        for ``user_id`` (this case used to raise AttributeError because the
        string "Unknown" was multiplied and rounded like a number).
    """
    # Ratings outside 1-5 (including 0 and NaN) fall through to "No".
    risk_labels = {5: "Very High", 4: "High", 3: "Medium", 2: "Low", 1: "Very Low"}

    risk_info = risk_df[risk_df['userid'] == user_id]

    if risk_info.empty:
        risk = "Unknown"
        supportive_percentage = "Unknown"
    else:
        risk_val = risk_info['risk_rating'].values[0]
        risk = risk_labels.get(risk_val, "No")
        supportiveness_ratio = risk_info['supportiveness_ratio'].values[0]
        # Built-in round() works for NumPy scalars and plain Python numbers.
        supportive_percentage = round(supportiveness_ratio * 100, 2)

    meta_info = f"The author indicates {risk} suicidal risk, with {supportive_percentage}% of their sentences supporting other users."
    return meta_info

# Clear GPU RAM and initialise a model

In [None]:
import gc
# Ask PyTorch to release its cached CUDA memory, then run a garbage-collection
# pass, before a model is loaded in the next cell.
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Load tokenizer and causal LM from the DPO7b directory, with device_map="auto".
model_name1 = "/content/drive/My Drive/Diss_Dataset/DPO7b"
tokenizer1 = AutoTokenizer.from_pretrained(model_name1)
model1 = AutoModelForCausalLM.from_pretrained(model_name1, device_map="auto")

In [None]:
# Alternative checkpoint (Mistral7b directory), requested in bfloat16. It binds
# the same names (model_name1 / tokenizer1 / model1), so running this cell
# replaces whatever the previous cell loaded.
model_name1 = "/content/drive/My Drive/Diss_Dataset/Mistral7b"
tokenizer1 = AutoTokenizer.from_pretrained(model_name1)
model1 = AutoModelForCausalLM.from_pretrained(model_name1, torch_dtype=torch.bfloat16, device_map="auto")

# Prompts for Tulu-2-DPO-7b

In [None]:
# Zero-shot prompt without meta-information, using <|user|>/<|assistant|> tags.
# The {posts} placeholder is filled with str.format in the generation cell.
Zero_prompt_model1 = """
<|user|> In one paragraph, summarise the provided text from the author on Reddit:\n

Full input text: {posts}\n

<|assistant|> Summary:\n

"""

In [None]:
# Zero-shot prompt with meta-information, using <|user|>/<|assistant|> tags.
# {posts} and {meta_info} are filled with str.format in the generation cell.
Zero_prompt_meta_model1 = """
<|user|> In one paragraph, summarise the provided text from the author on Reddit:\n
Full input text: {posts}\n
Meta-Information: {meta_info}\n
<|assistant|> Summary:\n

"""

In [None]:
# Aspect-guided ("CoT") prompt with meta-information, using
# <|user|>/<|assistant|> tags. {posts} and {meta_info} are filled with
# str.format in the generation cell.
# NOTE(review): "consier" in the prompt text is a typo for "consider". It is
# left untouched because editing the prompt changes what the model receives;
# if it is fixed, fix every CoT prompt together.
CoT_prompt_meta_model1 = """
<|user|> As a mental health expert, your task is to summarise in one paragraph, directly from the provided input from the author on Reddit. When summarising, consier these aspects:\n
Emotions: Evaluate expressed emotions, from sadness to intense psychological pain.
Cognitions: Explore the individual’s thoughts and perceptions about suicide, including the level and frequency of suicidal thoughts, intentions of suicide, and any existing plans.
Behavior and Motivation: Evaluate the user’s actions related to suicide, such as access to means and concrete plans. Consider their ability to handle difficult/stressful situations and the motivations behind their desire to die.
Interpersonal and Social Support: Investigate the individual’s social support or stable relationships, and understand their feelings toward significant others.
Mental Health-Related Issues: Consider psychiatric diagnoses associated with suicide such as schizophrenia, bipolar, anxiety, eating disorder, previous suicidal attempts, and others.
Additional Risk Factors: Consider other factors like socioeconomic and demographic factors, exposure to suicide behavior by others, chronic medical conditions, etc.
Supportive Nature: Take into account authors who are supporting other users in their writing.

Input to be summarised: {posts}\n
Meta-Information: {meta_info}\n
<|assistant|> Summary:\n

"""

In [None]:
# Aspect-guided ("CoT") prompt without meta-information, using
# <|user|>/<|assistant|> tags. {posts} is filled with str.format in the
# generation cell.
# NOTE(review): "consier" in the prompt text is a typo for "consider". It is
# left untouched because editing the prompt changes what the model receives;
# if it is fixed, fix every CoT prompt together.
CoT_prompt_model1 = """
<|user|> As a mental health expert, your task is to summarise in one paragraph, directly from the provided input from the author on Reddit. When summarising, consier these aspects:\n
Emotions: Evaluate expressed emotions, from sadness to intense psychological pain.
Cognitions: Explore the individual’s thoughts and perceptions about suicide, including the level and frequency of suicidal thoughts, intentions of suicide, and any existing plans.
Behavior and Motivation: Evaluate the user’s actions related to suicide, such as access to means and concrete plans. Consider their ability to handle difficult/stressful situations and the motivations behind their desire to die.
Interpersonal and Social Support: Investigate the individual’s social support or stable relationships, and understand their feelings toward significant others.
Mental Health-Related Issues: Consider psychiatric diagnoses associated with suicide such as schizophrenia, bipolar, anxiety, eating disorder, previous suicidal attempts, and others.
Additional Risk Factors: Consider other factors like socioeconomic and demographic factors, exposure to suicide behavior by others, chronic medical conditions, etc.
Supportive Nature: Take into account authors who are supporting other users in their writing.

Input to be summarised: {posts}\n

<|assistant|> Summary:\n

"""

# Prompts for Mistral-7b-instruct-v0.3

In [None]:
# Zero-shot prompt without meta-information, wrapped in [INST] ... [/INST].
# Rebinds Zero_prompt_model1, so this cell replaces the <|user|> variant above.
# {posts} is filled with str.format in the generation cell.
Zero_prompt_model1 = """
[INST]
In one paragraph, summarise the provided text from the author on Reddit:\n
#Full input text:
{posts}\n
Summary: [/INST]

"""

In [None]:
# Zero-shot prompt with meta-information, wrapped in [INST] ... [/INST].
# Rebinds Zero_prompt_meta_model1, replacing the <|user|> variant above.
# {posts} and {meta_info} are filled with str.format in the generation cell.
Zero_prompt_meta_model1 = """
[INST]
In one paragraph, summarise the provided text from the author on Reddit:\n
#Full input text:
{posts}\n
#Meta-Information:
{meta_info}\n
Summary: [/INST]

"""

In [None]:
# Aspect-guided ("CoT") prompt with meta-information, wrapped in
# [INST] ... [/INST]. Rebinds CoT_prompt_meta_model1, replacing the <|user|>
# variant above. {posts} and {meta_info} are filled with str.format.
# NOTE(review): "consier" in the prompt text is a typo for "consider". It is
# left untouched because editing the prompt changes what the model receives;
# if it is fixed, fix every CoT prompt together.
CoT_prompt_meta_model1 = """
[INST]
As a mental health expert, your task is to summarise in one paragraph, directly from the provided input from the author on Reddit. When summarising, consier these aspects:\n
Emotions: Evaluate expressed emotions, from sadness to intense psychological pain.
Cognitions: Explore the individual’s thoughts and perceptions about suicide, including the level and frequency of suicidal thoughts, intentions of suicide, and any existing plans.
Behavior and Motivation: Evaluate the user’s actions related to suicide, such as access to means and concrete plans. Consider their ability to handle difficult/stressful situations and the motivations behind their desire to die.
Interpersonal and Social Support: Investigate the individual’s social support or stable relationships, and understand their feelings toward significant others.
Mental Health-Related Issues: Consider psychiatric diagnoses associated with suicide such as schizophrenia, bipolar, anxiety, eating disorder, previous suicidal attempts, and others.
Additional Risk Factors: Consider other factors like socioeconomic and demographic factors, exposure to suicide behavior by others, chronic medical conditions, etc.
Supportive Nature: Take into account authors who are supporting other users in their writing.

#Input to be summarised:
{posts}\n
#Meta-Information:
{meta_info}\n
#Summary: [/INST]

"""

In [None]:
# Aspect-guided ("CoT") prompt without meta-information, wrapped in
# [INST] ... [/INST]. Rebinds CoT_prompt_model1, replacing the <|user|>
# variant above. {posts} is filled with str.format.
# NOTE(review): "consier" in the prompt text is a typo for "consider". It is
# left untouched because editing the prompt changes what the model receives;
# if it is fixed, fix every CoT prompt together.
CoT_prompt_model1 = """
[INST]
As a mental health expert, your task is to summarise in one paragraph, directly from the provided input from the author on Reddit. When summarising, consier these aspects:\n
Emotions: Evaluate expressed emotions, from sadness to intense psychological pain.
Cognitions: Explore the individual’s thoughts and perceptions about suicide, including the level and frequency of suicidal thoughts, intentions of suicide, and any existing plans.
Behavior and Motivation: Evaluate the user’s actions related to suicide, such as access to means and concrete plans. Consider their ability to handle difficult/stressful situations and the motivations behind their desire to die.
Interpersonal and Social Support: Investigate the individual’s social support or stable relationships, and understand their feelings toward significant others.
Mental Health-Related Issues: Consider psychiatric diagnoses associated with suicide such as schizophrenia, bipolar, anxiety, eating disorder, previous suicidal attempts, and others.
Additional Risk Factors: Consider other factors like socioeconomic and demographic factors, exposure to suicide behavior by others, chronic medical conditions, etc.
Supportive Nature: Take into account authors who are supporting other users in their writing.

#Input to be summarised:
{posts}\n
#Summary: [/INST]

"""

# Generate summaries

In [None]:
def truncate_text(text, max_input_tokens=1800):
    """Cap ``text`` at ``max_input_tokens`` tokens of the global ``tokenizer1``.

    Returns ``text`` unchanged when it is within the budget; otherwise returns
    the string rebuilt from its first ``max_input_tokens`` tokens.
    """
    pieces = tokenizer1.tokenize(text)
    if len(pieces) <= max_input_tokens:
        return text
    return tokenizer1.convert_tokens_to_string(pieces[:max_input_tokens])


# Subtracted from 2048 to obtain the truncation length passed to truncate_text.
reserved_tokens = 512

def extract_assistant_response(text):
    """Return the part of a decoded generation that follows the assistant marker.

    Everything after the FIRST occurrence of the marker is kept. The previous
    ``text.split(marker)[1]`` silently discarded anything after a second
    occurrence of the marker.

    Args:
        text: Decoded model output (prompt plus generation).

    Returns:
        The stripped text after the marker, or the whole stripped text when
        the marker does not occur.
    """
    # NOTE(review): the prompts in this notebook end with "[/INST]" or
    # "<|assistant|>", so this marker (with its trailing "]]") looks like it
    # never matches and the full decoded text is returned — confirm whether
    # that is intended before changing it.
    assistant_marker = "[/INST]]]"
    _, found, tail = text.partition(assistant_marker)
    return (tail if found else text).strip()


# Generate the four summary variants for each of the first 50 user ids in
# risk_supportiveness_df and collect them as one record per user.
results = []
selected_user_ids = risk_supportiveness_df['userid'].unique()[:50]

for user_id in selected_user_ids:
    user_rows = data[data['userid'] == user_id]
    full_text = user_rows['combined_posts'].values[0]

    meta_info = create_meta_info(user_id, risk_supportiveness_df, top_5_df)
    truncated_text = truncate_text(full_text, max_input_tokens=2048 - reserved_tokens)

    # Output column -> fully formatted prompt. Insertion order is the order in
    # which the variants are generated and stored.
    prompts_by_column = {
        "zero_shot_summary": Zero_prompt_model1.format(posts=truncated_text),
        "zero_shot_meta_summary": Zero_prompt_meta_model1.format(meta_info=meta_info, posts=truncated_text),
        "cot_summary": CoT_prompt_model1.format(posts=truncated_text),
        "cot_meta_summary": CoT_prompt_meta_model1.format(meta_info=meta_info, posts=truncated_text),
    }

    record = {"userid": user_id}
    for column, prompt_text in prompts_by_column.items():
        encoded = tokenizer1(prompt_text, return_tensors="pt").to("cuda")
        generated = model1.generate(**encoded, max_new_tokens=200)
        decoded = tokenizer1.decode(generated[0], skip_special_tokens=True)
        record[column] = extract_assistant_response(decoded)

    results.append(record)

results_df = pd.DataFrame(results)
# NOTE(review): this writes summary_comparisons2.csv while the extraction cell
# reads summary_comparisons.csv — confirm which file is meant.
results_df.to_csv('/content/drive/My Drive/Diss_Dataset/summary_comparisons2.csv', index=False)

print(results_df.head())

# Extract the summaries

In [None]:
def extract_assistant_response(text):
    """Extract the first line of the summary from a stored generation.

    The text after the FIRST "Summary:" marker is taken (the whole text when
    the marker is absent), stripped, and cut at its first newline. The
    previous ``text.split(marker)[1]`` dropped everything after a second
    "Summary:" on the same line.

    Args:
        text: Stored generation. Non-string values are accepted because an
            empty CSV cell is read back by pandas as NaN (a float).

    Returns:
        The extracted one-line summary, or "" for a non-string input.
    """
    if not isinstance(text, str):
        return ""

    assistant_marker = "Summary:"
    _, found, tail = text.partition(assistant_marker)
    summary = (tail if found else text).strip()

    # Keep only the first line of the generated answer.
    return summary.split('\n')[0].strip()

In [None]:
results_df = pd.read_csv('/content/drive/My Drive/Diss_Dataset/summary_comparisons.csv')
print(results_df.head())

# Replace every stored generation with its extracted summary, column by column.
for summary_column in (
    'zero_shot_summary',
    'zero_shot_meta_summary',
    'cot_summary',
    'cot_meta_summary',
):
    results_df[summary_column] = results_df[summary_column].apply(extract_assistant_response)

print(results_df.head())
results_df.to_csv('/content/drive/My Drive/Diss_Dataset/summary_comparisons_cleaned.csv', index=False)


In [None]:
# Source text of each user in results_df, in row order.
original_texts = [
    data[data['userid'] == row_user_id]['combined_posts'].values[0]
    for row_user_id in results_df['userid']
]

# The four summary variants, aligned with original_texts.
zero_shot_summaries = results_df['zero_shot_summary'].tolist()
zero_shot_meta_summaries = results_df['zero_shot_meta_summary'].tolist()
cot_summaries = results_df['cot_summary'].tolist()
cot_meta_summaries = results_df['cot_meta_summary'].tolist()

# Define and produce evaluation metrics

In [None]:
model_sbert = SentenceTransformer('paraphrase-MiniLM-L6-v2')


def compute_sbert_similarity(summary, reference):
    """Return the cosine similarity of the two texts' ``model_sbert`` embeddings.

    Each text is encoded separately as a tensor; the single similarity value
    is returned as a Python number via ``.item()``.
    """
    embeddings = [
        model_sbert.encode(text, convert_to_tensor=True)
        for text in (summary, reference)
    ]
    return util.pytorch_cos_sim(*embeddings).item()

In [None]:
qg_model = pipeline('text2text-generation', model='valhalla/t5-small-qg-hl')
qa_model = pipeline('question-answering')


def generate_questions(text):
    """Run ``qg_model`` on ``text`` and return the generated question strings."""
    generated = qg_model(f"generate questions: {text}")
    return [item['generated_text'] for item in generated]

def answer_questions(context, questions):
    """Answer each question against ``context`` with ``qa_model``.

    Returns the ``'answer'`` field of every result, in question order.
    """
    return [
        qa_model(question=question, context=context)['answer']
        for question in questions
    ]

def evaluate_with_qaeval(original_texts, summaries):
    """Score summaries by how similarly they answer questions about the source.

    For each (original, summary) pair, questions are generated from the
    original, answered once against the original and once against the summary,
    and the two answers to each question are compared by cosine similarity of
    their sentence embeddings.

    Args:
        original_texts: Source texts.
        summaries: Summaries aligned with ``original_texts``.

    Returns:
        A list with the mean answer similarity of each pair.
    """
    # Instantiating the encoder is expensive and does not depend on the pair,
    # so it is done once per call instead of once per loop iteration.
    sbert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

    qa_scores = []
    for original, summary in zip(original_texts, summaries):
        questions = generate_questions(original)
        original_answers = answer_questions(original, questions)
        summary_answers = answer_questions(summary, questions)

        similarities = [
            util.pytorch_cos_sim(sbert_model.encode(o), sbert_model.encode(s)).item()
            for o, s in zip(original_answers, summary_answers)
        ]

        qa_scores.append(np.mean(similarities))
    return qa_scores

# QA-based score lists, one per prompt variant.
zero_shot_qa_scores = evaluate_with_qaeval(original_texts, zero_shot_summaries)
zero_shot_meta_qa_scores = evaluate_with_qaeval(original_texts, zero_shot_meta_summaries)
cot_qa_scores = evaluate_with_qaeval(original_texts, cot_summaries)
cot_meta_qa_scores = evaluate_with_qaeval(original_texts, cot_meta_summaries)

# Mean score per variant.
average_zero_shot, average_zero_shot_meta, average_cot, average_cot_meta = (
    np.mean(variant_scores)
    for variant_scores in (
        zero_shot_qa_scores,
        zero_shot_meta_qa_scores,
        cot_qa_scores,
        cot_meta_qa_scores,
    )
)

for variant_label, variant_average in (
    ("Zero-Shot Summaries", average_zero_shot),
    ("Zero-Shot Meta Summaries", average_zero_shot_meta),
    ("CoT Summaries", average_cot),
    ("CoT Meta Summaries", average_cot_meta),
):
    print(f"{variant_label} - QA Scores: {variant_average}")

In [None]:
def evaluate_with_bm25(original_texts, summaries):
    """Return, per pair, the mean BM25 score of the summary over the source sentences.

    The lower-cased source is split into sentences and tokenised to form the
    BM25 corpus; the lower-cased, tokenised summary is the query. The score of
    a pair is the average of the per-sentence scores.
    """
    def _mean_sentence_score(source_text, summary_text):
        corpus = [
            word_tokenize(sentence)
            for sentence in sent_tokenize(source_text.lower())
        ]
        per_sentence = BM25Okapi(corpus).get_scores(word_tokenize(summary_text.lower()))
        return sum(per_sentence) / len(per_sentence)

    return [
        _mean_sentence_score(original, summary)
        for original, summary in zip(original_texts, summaries)
    ]

# BM25-based score lists, one per prompt variant.
zero_shot_ir_scores = evaluate_with_bm25(original_texts, zero_shot_summaries)
zero_shot_meta_ir_scores = evaluate_with_bm25(original_texts, zero_shot_meta_summaries)
cot_ir_scores = evaluate_with_bm25(original_texts, cot_summaries)
cot_meta_ir_scores = evaluate_with_bm25(original_texts, cot_meta_summaries)

# Mean score per variant.
average_zero_shot_ir, average_zero_shot_meta_ir, average_cot_ir, average_cot_meta_ir = (
    mean(variant_scores)
    for variant_scores in (
        zero_shot_ir_scores,
        zero_shot_meta_ir_scores,
        cot_ir_scores,
        cot_meta_ir_scores,
    )
)

for variant_label, variant_average in (
    ("Zero-Shot Summaries", average_zero_shot_ir),
    ("Zero-Shot Meta Summaries", average_zero_shot_meta_ir),
    ("CoT Summaries", average_cot_ir),
    ("CoT Meta Summaries", average_cot_meta_ir),
):
    print(f"Average IR Score - {variant_label}: {variant_average}")


In [None]:
def compute_coverage(summary, reference):
    """Return the fraction of the reference's first ten tokens found in the summary.

    The "key points" are the first ten whitespace-separated tokens of
    ``reference``; each one counts as covered when it occurs anywhere in
    ``summary`` as a substring.

    Args:
        summary: Text to search in.
        reference: Text whose leading tokens are looked up.

    Returns:
        A float in [0, 1]; 0.0 when ``reference`` contains no tokens (this
        case used to raise ZeroDivisionError).
    """
    key_points = reference.split()[:10]
    if not key_points:
        return 0.0
    return sum(kp in summary for kp in key_points) / len(key_points)

In [None]:
from sklearn.metrics import precision_score
def compute_fidelity(summary, reference):
    """Return ``precision_score`` of key-point presence against an all-True vector.

    The key points are the first ten whitespace-separated tokens of
    ``reference``. The first argument to ``precision_score`` flags which of
    them occur in ``summary`` (substring test); the second is all ``True``.
    """
    key_points = reference.split()[:10]
    found_flags = [kp in summary for kp in key_points]
    all_positive = [True] * len(key_points)
    return precision_score(found_flags, all_positive)


In [None]:
# Source text of each user in results_df, in row order.
original_texts = [
    data[data['userid'] == row_user_id]['combined_posts'].values[0]
    for row_user_id in results_df['userid']
]

# The four summary variants, aligned with original_texts.
zero_shot_summaries = results_df['zero_shot_summary'].tolist()
zero_shot_meta_summaries = results_df['zero_shot_meta_summary'].tolist()
cot_summaries = results_df['cot_summary'].tolist()
cot_meta_summaries = results_df['cot_meta_summary'].tolist()

summary_sets = (
    ("zero_shot", zero_shot_summaries),
    ("zero_shot_meta", zero_shot_meta_summaries),
    ("cot", cot_summaries),
    ("cot_meta", cot_meta_summaries),
)

# Report mean similarity, coverage and fidelity for each variant.
for summary_type, summaries in summary_sets:
    sbert_similarities, coverage_scores, fidelity_scores = [], [], []

    for summary, reference in zip(summaries, original_texts):
        sbert_similarities.append(compute_sbert_similarity(summary, reference))
        coverage_scores.append(compute_coverage(summary, reference))
        fidelity_scores.append(compute_fidelity(summary, reference))

    print(f"{summary_type.capitalize()} Summaries -")
    print(f"Semantic Similarity (Sentence-BERT): {np.mean(sbert_similarities):.4f}")
    print(f"Coverage Score: {np.mean(coverage_scores):.4f}")
    print(f"Fidelity Score: {np.mean(fidelity_scores):.4f}")

In [None]:
import gc

# Drop the references FIRST: while model1/tokenizer1 are still bound, neither
# the garbage collector nor empty_cache() can release what they hold, so the
# original order (empty_cache, collect, then del) freed nothing.
# pop(..., None) also keeps the cell re-runnable once the names are gone,
# where a bare `del` would raise NameError.
for _released_name in ("model1", "tokenizer1"):
    globals().pop(_released_name, None)

gc.collect()
# Only now hand the freed blocks of PyTorch's CUDA cache back to the device.
torch.cuda.empty_cache()

In [None]:
# NLI classifier used by evaluate_nli.
model_name = "MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli"
nli_model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# return_all_scores=True makes the pipeline return a score for every label;
# evaluate_nli reads outputs[0] as a list of {'label', 'score'} dicts.
# NOTE(review): return_all_scores is reportedly deprecated in favour of
# top_k=None in recent transformers releases — confirm the output nesting
# stays the same before switching, since evaluate_nli depends on it.
nli_pipeline = pipeline("text-classification", model=nli_model, tokenizer=tokenizer, return_all_scores=True)

In [None]:
# Metric objects used by compute_rouge_bleu.
# NOTE(review): load_metric is reportedly deprecated in `datasets` in favour of
# the separate `evaluate` package — confirm against the installed version.
rouge = load_metric("rouge")
bleu = load_metric("bleu")

  rouge = load_metric("rouge")


In [None]:
def evaluate_nli(original_texts, summaries, nli_pipeline):
    """Measure how well each summary is supported by its source text via NLI.

    Each pair is classified with the source as premise and the summary as
    hypothesis, so "entailment" means the source supports the summary. The
    pair used to be built the other way round (summary first), which asked
    whether the summary entails the entire source.

    Args:
        original_texts: Source texts (premises).
        summaries: Summaries aligned with ``original_texts`` (hypotheses).
        nli_pipeline: Callable returning, for one input string, a list whose
            first element is a list of ``{'label', 'score'}`` dicts that
            includes 'entailment' and 'contradiction' labels.

    Returns:
        ``(mean_consistency, max_contradiction)``: the mean entailment score
        and the highest contradiction score over all pairs.

    Raises:
        ValueError: If there are no pairs to evaluate (previously an opaque
            ZeroDivisionError).
    """
    consistency_scores = []
    contradiction_scores = []

    for original, summary in tqdm(zip(original_texts, summaries), total=len(original_texts), desc="Evaluating NLI"):
        # Premise first, hypothesis second.
        # NOTE(review): inputs are not truncated here; long originals are
        # passed through whole — confirm the model's length limits.
        input_pair = f"{original} [SEP] {summary}"

        outputs = nli_pipeline(input_pair)
        label_scores = {entry['label'].lower(): entry['score'] for entry in outputs[0]}

        consistency_scores.append(label_scores['entailment'])
        contradiction_scores.append(label_scores['contradiction'])

    if not consistency_scores:
        raise ValueError("evaluate_nli needs at least one (original, summary) pair")

    mean_consistency = sum(consistency_scores) / len(consistency_scores)
    max_contradiction = max(contradiction_scores)

    return mean_consistency, max_contradiction

In [None]:
def compute_bertscore(original_texts, summaries):
    """Return mean BERTScore (precision, recall, F1) of summaries against the originals.

    The summaries are passed as the first argument and the original texts as
    the second; each of the three returned tensors is averaged to one number.
    """
    precision, recall, f1 = bert_score(summaries, original_texts, lang="en", verbose=True)
    return tuple(metric.mean().item() for metric in (precision, recall, f1))

def compute_rouge_bleu(original_texts, summaries):
    """Compute ROUGE and BLEU of the summaries with the originals as references.

    ROUGE receives the raw strings. BLEU receives whitespace-tokenised
    predictions and, per prediction, a one-element list of tokenised
    references.

    Returns:
        ``(rouge_results, bleu_results)`` as returned by the global ``rouge``
        and ``bleu`` metric objects.
    """
    rouge_results = rouge.compute(predictions=summaries, references=original_texts)

    tokenised_predictions = [summary.split() for summary in summaries]
    tokenised_references = [[text.split()] for text in original_texts]
    bleu_results = bleu.compute(predictions=tokenised_predictions,
                                references=tokenised_references)

    return rouge_results, bleu_results

# --- BERTScore: one (P, R, F1) triple per prompt variant ---
print("Computing BERTScore for Zero-Shot Summaries...")
zero_shot_P, zero_shot_R, zero_shot_F1 = compute_bertscore(original_texts, zero_shot_summaries)

print("Computing BERTScore for Zero-Shot Meta Summaries...")
zero_shot_meta_P, zero_shot_meta_R, zero_shot_meta_F1 = compute_bertscore(original_texts, zero_shot_meta_summaries)

print("Computing BERTScore for CoT Summaries...")
cot_P, cot_R, cot_F1 = compute_bertscore(original_texts, cot_summaries)

print("Computing BERTScore for CoT Meta Summaries...")
cot_meta_P, cot_meta_R, cot_meta_F1 = compute_bertscore(original_texts, cot_meta_summaries)

# --- ROUGE and BLEU: one (rouge, bleu) result pair per prompt variant ---
print("Computing ROUGE and BLEU for Zero-Shot Summaries...")
zero_shot_rouge, zero_shot_bleu = compute_rouge_bleu(original_texts, zero_shot_summaries)

print("Computing ROUGE and BLEU for Zero-Shot Meta Summaries...")
zero_shot_meta_rouge, zero_shot_meta_bleu = compute_rouge_bleu(original_texts, zero_shot_meta_summaries)

print("Computing ROUGE and BLEU for CoT Summaries...")
cot_rouge, cot_bleu = compute_rouge_bleu(original_texts, cot_summaries)

print("Computing ROUGE and BLEU for CoT Meta Summaries...")
cot_meta_rouge, cot_meta_bleu = compute_rouge_bleu(original_texts, cot_meta_summaries)

# --- NLI: (mean entailment, max contradiction) per prompt variant ---
print("Evaluating NLI for Zero-Shot Summaries...")
zero_shot_consistency, zero_shot_contradiction = evaluate_nli(original_texts, zero_shot_summaries, nli_pipeline)

print("Evaluating NLI for Zero-Shot Meta Summaries...")
zero_shot_meta_consistency, zero_shot_meta_contradiction = evaluate_nli(original_texts, zero_shot_meta_summaries, nli_pipeline)

print("Evaluating NLI for CoT Summaries...")
cot_consistency, cot_contradiction = evaluate_nli(original_texts, cot_summaries, nli_pipeline)

print("Evaluating NLI for CoT Meta Summaries...")
cot_meta_consistency, cot_meta_contradiction = evaluate_nli(original_texts, cot_meta_summaries, nli_pipeline)

# --- Report: BERTScore, then ROUGE/BLEU, then NLI, each in variant order ---
print(f"Zero-Shot Summaries - BERTScore: P: {zero_shot_P:.4f}, R: {zero_shot_R:.4f}, F1: {zero_shot_F1:.4f}")
print(f"Zero-Shot Meta Summaries - BERTScore: P: {zero_shot_meta_P:.4f}, R: {zero_shot_meta_R:.4f}, F1: {zero_shot_meta_F1:.4f}")
print(f"CoT Summaries - BERTScore: P: {cot_P:.4f}, R: {cot_R:.4f}, F1: {cot_F1:.4f}")
print(f"CoT Meta Summaries - BERTScore: P: {cot_meta_P:.4f}, R: {cot_meta_R:.4f}, F1: {cot_meta_F1:.4f}")

# The ROUGE result object is printed as-is; only BLEU's 'bleu' entry is formatted.
print(f"Zero-Shot Summaries - ROUGE: {zero_shot_rouge}, BLEU: {zero_shot_bleu['bleu']:.4f}")
print(f"Zero-Shot Meta Summaries - ROUGE: {zero_shot_meta_rouge}, BLEU: {zero_shot_meta_bleu['bleu']:.4f}")
print(f"CoT Summaries - ROUGE: {cot_rouge}, BLEU: {cot_bleu['bleu']:.4f}")
print(f"CoT Meta Summaries - ROUGE: {cot_meta_rouge}, BLEU: {cot_meta_bleu['bleu']:.4f}")

print(f"Zero-Shot Summaries - NLI: Mean Consistency: {zero_shot_consistency:.4f}, Max Contradiction: {zero_shot_contradiction:.4f}")
print(f"Zero-Shot Meta Summaries - NLI: Mean Consistency: {zero_shot_meta_consistency:.4f}, Max Contradiction: {zero_shot_meta_contradiction:.4f}")
print(f"CoT Summaries - NLI: Mean Consistency: {cot_consistency:.4f}, Max Contradiction: {cot_contradiction:.4f}")
print(f"CoT Meta Summaries - NLI: Mean Consistency: {cot_meta_consistency:.4f}, Max Contradiction: {cot_meta_contradiction:.4f}")