In [None]:
import pandas as pd
from rouge_score import rouge_scorer
import pickle
from question_answer_functions import *
import numpy as np

In [2]:
# Lift pandas' display limits so DataFrames render without truncation.
pd.options.display.max_rows = None      # show every row
pd.options.display.max_columns = None   # show every column
pd.options.display.max_colwidth = None  # show the full text of each cell


In [3]:
# pip install rouge-score


In [4]:
# Load the pickled question DataFrame from the working directory.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
pickle_file_path = 'random_question_dataframe.pkl'
with open(pickle_file_path, mode='rb') as pickle_handle:
    random_question_dataframe = pickle.load(pickle_handle)

In [None]:
# Render the loaded question DataFrame as this cell's output for inspection.
random_question_dataframe

In [6]:

def model_function(question, document):
    """Answer ``question`` against ``document`` with both QA models.

    ``user_query`` supplies the chunk of the document to answer from; that
    chunk is then handed, together with the question, to ``models_output1``
    and ``models_output2``.

    Args:
        question: the question text.
        document: the document identifier forwarded to ``user_query``.

    Returns:
        dict: the two answers keyed by model label. ``'gpt-neo-1.3B'`` holds
        the ``models_output2`` result and ``'roberta-base-cuad-finetuned'``
        holds the ``models_output1`` result.
    """
    # user_query yields a two-item result; only the chunk is used here.
    best_chunk, _word_count = user_query(question, document)

    # Query the models in a fixed order: output1 first, then output2.
    roberta_answer = models_output1(question, best_chunk)
    gpt_neo_answer = models_output2(question, best_chunk)

    return {
        'gpt-neo-1.3B': gpt_neo_answer,
        'roberta-base-cuad-finetuned': roberta_answer,
    }

In [7]:
# Function to compute ROUGE score
def compute_rouge(predicted_answer, actual_answer):
    """Score a predicted answer against the reference answer with ROUGE.

    Relies on the module-level ``scorer`` object, which must be created
    before this function is called.

    Args:
        predicted_answer: text produced by the model.
        actual_answer: reference (ground-truth) text.

    Returns:
        The result of ``scorer.score`` for this pair.
    """
    # RougeScorer.score expects (target, prediction) in that order. Passing the
    # prediction first would swap the reported precision and recall values.
    return scorer.score(actual_answer, predicted_answer)

In [None]:


# Evaluation driver: run both models on every question, score each answer
# with ROUGE, attach the results to the DataFrame and write it to CSV.

# Initialize ROUGE scorer (compute_rouge reads this module-level `scorer`)
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)


# Initialize empty lists to store results (one entry per DataFrame row, in row order)
predicted_answers = []
rouge_scores_list_bert = []
rouge_scores_list_llm = []

# Loop through each question and get the model's predicted answer
for index, row in random_question_dataframe.iterrows():
    question = row['question_text']
    actual_answer = row['answer_text']
    # The document name handed to the model is the title with a '.PDF' suffix
    document = row['document_title'] + '.PDF'
    
    # Get the predicted answers from the models (a dict keyed by model label)
    predicted_answer = model_function(question, document)
    print(predicted_answer)
    #predicted_answer = ''
    
    # Compute the ROUGE score of each model's answer against the reference answer
    rouge_scores_1 = compute_rouge(predicted_answer['roberta-base-cuad-finetuned'], actual_answer)
    rouge_scores_2 = compute_rouge(predicted_answer['gpt-neo-1.3B'], actual_answer)
    
    # rouge_score_dict = {}
    # rouge_score_dict['gpt-neo-1.3B'] = rouge_scores_2
    # rouge_score_dict['roberta-base-cuad-finetuned'] = rouge_scores_1
    # Store the ROUGE scores per model, then the full prediction dict
    rouge_scores_list_bert.append(rouge_scores_1)
    rouge_scores_list_llm.append(rouge_scores_2)

    predicted_answers.append(predicted_answer)
    # rouge_scores_list.append(rouge_score_dict)

# Add predicted answers and ROUGE scores to the DataFrame
# (this adds three columns to the loaded frame in place)
random_question_dataframe['predicted_answer'] = predicted_answers
random_question_dataframe['rouge_score_bert_model'] = rouge_scores_list_bert
random_question_dataframe['rouge_score_llm_model'] = rouge_scores_list_llm


# Save the DataFrame to CSV for evaluation
# NOTE(review): the three new columns hold Python objects, so the CSV will
# presumably contain their text representation -- confirm downstream readers expect that.
random_question_dataframe.to_csv('model_predictions_rouge_evaluation.csv', index=False)

print("Model predictions and ROUGE scores saved to 'model_predictions_rouge_evaluation.csv'")


In [None]:
# Preview the first rows of the DataFrame as this cell's output.
random_question_dataframe.head()

In [None]:

# Per-question ROUGE results stored for the roberta-base-cuad-finetuned answers.
bert_scores = list(random_question_dataframe['rouge_score_bert_model'])

# Each metric entry is read positionally: [0] precision, [1] recall, [2] F1.
rouge1_p_bert = [entry['rouge1'][0] for entry in bert_scores]
rouge1_r_bert = [entry['rouge1'][1] for entry in bert_scores]
rouge1_f_bert = [entry['rouge1'][2] for entry in bert_scores]

rouge2_p_bert = [entry['rouge2'][0] for entry in bert_scores]
rouge2_r_bert = [entry['rouge2'][1] for entry in bert_scores]
rouge2_f_bert = [entry['rouge2'][2] for entry in bert_scores]

rougeL_p_bert = [entry['rougeL'][0] for entry in bert_scores]
rougeL_r_bert = [entry['rougeL'][1] for entry in bert_scores]
rougeL_f_bert = [entry['rougeL'][2] for entry in bert_scores]

# Mean (precision, recall, F1) over all questions for each ROUGE variant.
rouge1_avg = tuple(np.mean(values) for values in (rouge1_p_bert, rouge1_r_bert, rouge1_f_bert))
rouge2_avg = tuple(np.mean(values) for values in (rouge2_p_bert, rouge2_r_bert, rouge2_f_bert))
rougeL_avg = tuple(np.mean(values) for values in (rougeL_p_bert, rougeL_r_bert, rougeL_f_bert))

# Report the averages for this model.
print(f"For roberta-base-cuad-finetuned model ROUGE-1 avg: Precision: {rouge1_avg[0]:.2f}, Recall: {rouge1_avg[1]:.2f}, F1: {rouge1_avg[2]:.2f}")
print(f"For roberta-base-cuad-finetuned model ROUGE-2 avg: Precision: {rouge2_avg[0]:.2f}, Recall: {rouge2_avg[1]:.2f}, F1: {rouge2_avg[2]:.2f}")
print(f"For roberta-base-cuad-finetuned model ROUGE-L avg: Precision: {rougeL_avg[0]:.2f}, Recall: {rougeL_avg[1]:.2f}, F1: {rougeL_avg[2]:.2f}")


In [None]:

# Per-question ROUGE results stored for the gpt-neo-1.3B answers.
llm_scores = list(random_question_dataframe['rouge_score_llm_model'])

# Each metric entry is read positionally: [0] precision, [1] recall, [2] F1.
rouge1_p_llm = [entry['rouge1'][0] for entry in llm_scores]
rouge1_r_llm = [entry['rouge1'][1] for entry in llm_scores]
rouge1_f_llm = [entry['rouge1'][2] for entry in llm_scores]

rouge2_p_llm = [entry['rouge2'][0] for entry in llm_scores]
rouge2_r_llm = [entry['rouge2'][1] for entry in llm_scores]
rouge2_f_llm = [entry['rouge2'][2] for entry in llm_scores]

rougeL_p_llm = [entry['rougeL'][0] for entry in llm_scores]
rougeL_r_llm = [entry['rougeL'][1] for entry in llm_scores]
rougeL_f_llm = [entry['rougeL'][2] for entry in llm_scores]

# Mean (precision, recall, F1) over all questions for each ROUGE variant.
rouge1_avg = tuple(np.mean(values) for values in (rouge1_p_llm, rouge1_r_llm, rouge1_f_llm))
rouge2_avg = tuple(np.mean(values) for values in (rouge2_p_llm, rouge2_r_llm, rouge2_f_llm))
rougeL_avg = tuple(np.mean(values) for values in (rougeL_p_llm, rougeL_r_llm, rougeL_f_llm))

# Report the averages for this model.
print(f"For gpt-neo-1.3B model ROUGE-1 avg: Precision: {rouge1_avg[0]:.2f}, Recall: {rouge1_avg[1]:.2f}, F1: {rouge1_avg[2]:.2f}")
print(f"For gpt-neo-1.3B model ROUGE-2 avg: Precision: {rouge2_avg[0]:.2f}, Recall: {rouge2_avg[1]:.2f}, F1: {rouge2_avg[2]:.2f}")
print(f"For gpt-neo-1.3B model ROUGE-L avg: Precision: {rougeL_avg[0]:.2f}, Recall: {rougeL_avg[1]:.2f}, F1: {rougeL_avg[2]:.2f}")
