In [None]:
import torch
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import torch
import re

In [None]:
def lcs(X, Y):
    """Return the length of the longest common subsequence of X and Y.

    X and Y are indexable sequences (strings, lists, ...) whose elements are
    compared with ``==``. Either sequence may be empty, in which case the
    result is 0.
    """
    rows, cols = len(X), len(Y)

    # Only the previous DP row is needed to fill in the current one.
    prev_row = [0] * (cols + 1)
    for r in range(1, rows + 1):
        cur_row = [0] * (cols + 1)
        for c in range(1, cols + 1):
            if X[r - 1] == Y[c - 1]:
                # Matching elements extend the subsequence found for both prefixes.
                cur_row[c] = prev_row[c - 1] + 1
            else:
                # Otherwise keep the better result of dropping one element.
                cur_row[c] = max(prev_row[c], cur_row[c - 1])
        prev_row = cur_row

    return prev_row[cols]


def compute_lcs_score_matrix(per_sentence, documents):
    """Build a (sentences x documents) tensor of LCS lengths.

    Entry ``[i, j]`` holds ``lcs(per_sentence[i], documents[j])``. The tensor
    is created with ``torch.zeros``, so it is returned unchanged (all zeros,
    possibly with a zero-sized dimension) when either input is empty.
    """
    scores = torch.zeros((len(per_sentence), len(documents)))

    for row_idx, sentence_text in enumerate(per_sentence):
        for col_idx, document_text in enumerate(documents):
            scores[row_idx, col_idx] = lcs(sentence_text, document_text)

    return scores


def compute_score_matrix(per_sentence, documents, score_type='bleu', n_gram=1):
    """Score every sentence against every document with BLEU or ROUGE.

    Args:
        per_sentence: sequence of sentence strings.
        documents: sequence of document texts; non-string items are converted
            with ``str()`` before tokenization.
        score_type: ``'bleu'`` or ``'rouge'``. Any other value leaves the
            matrix at zero.
        n_gram: n-gram order passed through to the scoring helper.

    Returns:
        A ``torch`` tensor of shape ``(len(per_sentence), len(documents))``
        where entry ``[i, j]`` is the score of sentence ``i`` (candidate)
        against document ``j`` (reference).
    """
    score_matrix = torch.zeros((len(per_sentence), len(documents)))

    # Pick the scorer once instead of re-testing score_type for every cell.
    if score_type == 'bleu':
        score_fn = calculate_bleu
    elif score_type == 'rouge':
        score_fn = calculate_rouge
    else:
        # NOTE(review): an unrecognised score_type silently produces an
        # all-zero matrix; callers should pass 'bleu' or 'rouge'.
        score_fn = None

    # Tokenize each document once; the tokens do not depend on the sentence,
    # so doing this inside the sentence loop repeats the same work.
    doc_token_lists = [
        word_tokenize(doc if isinstance(doc, str) else str(doc))
        for doc in documents
    ]

    for i, sentence in enumerate(per_sentence):
        sentence_tokens = word_tokenize(sentence)
        if score_fn is None:
            continue
        for j, doc_tokens in enumerate(doc_token_lists):
            score_matrix[i][j] = score_fn(doc_tokens, sentence_tokens, n_gram)

    return score_matrix


def calculate_bleu(reference, candidate, n_gram):
    """Return the sentence-level BLEU score of candidate against reference.

    Both arguments are token sequences. The n-gram orders 1..n_gram are
    weighted equally; the single reference is wrapped in a list because
    ``sentence_bleu`` is called with a list of references.
    """
    uniform_weight = 1.0 / n_gram
    weights = [uniform_weight for _ in range(n_gram)]
    references = [reference]
    return sentence_bleu(references, candidate, weights=weights)

def calculate_rouge(reference, candidate, n_gram):
    """Return the ROUGE-n F-measure of candidate against reference.

    Both arguments are token sequences; they are joined with single spaces
    before scoring. A scorer with stemming enabled is created for the
    ``rouge{n_gram}`` metric on every call.
    """
    metric_name = f'rouge{n_gram}'
    scorer = rouge_scorer.RougeScorer([metric_name], use_stemmer=True)

    reference_text = ' '.join(reference)
    candidate_text = ' '.join(candidate)

    result = scorer.score(reference_text, candidate_text)
    return result[metric_name].fmeasure


def jaccard_similarity(set1, set2):
    """Return |set1 ∩ set2| / |set1 ∪ set2|, or 0 when the union is empty."""
    shared_count = len(set1.intersection(set2))
    total_count = len(set1.union(set2))

    # Two empty sets have no elements to compare; report no similarity.
    if total_count == 0:
        return 0
    return shared_count / total_count


def compute_jaccard_matrix(keyword_sentences, keyword_documents):
    """Build a (sentences x documents) tensor of Jaccard similarities.

    Entry ``[i, j]`` is ``jaccard_similarity(keyword_sentences[i],
    keyword_documents[j])``, i.e. the overlap between the keyword set of
    sentence ``i`` and the keyword set of document ``j``.
    """
    matrix_shape = (len(keyword_sentences), len(keyword_documents))
    similarities = torch.zeros(matrix_shape)

    for row_idx, sentence_kw in enumerate(keyword_sentences):
        for col_idx, document_kw in enumerate(keyword_documents):
            similarities[row_idx, col_idx] = jaccard_similarity(sentence_kw, document_kw)

    return similarities


def flatten_keyword_list(keyword_list):
    """Merge a mix of keyword strings and keyword sets into one flat set.

    Each item of ``keyword_list`` must be either a ``str`` (added as a single
    keyword) or a ``set`` (all of its members are added).

    Raises:
        TypeError: if an item is neither a ``set`` nor a ``str``.
    """
    merged = set()
    for entry in keyword_list:
        if isinstance(entry, str):
            merged.add(entry)
        elif isinstance(entry, set):
            merged.update(entry)
        else:
            raise TypeError(f"Unexpected type in keyword list: {type(entry)}")
    return merged
    

def compute_tfidf_similarity_matrix(per_sentence, documents):
    """Return cosine similarities between TF-IDF vectors of sentences and documents.

    One vectorizer is fitted on the sentences followed by the documents, so
    both share the same vocabulary. The first ``len(per_sentence)`` rows of
    the fitted matrix are compared with the remaining rows, giving one row
    per sentence and one column per document.
    """
    sentence_count = len(per_sentence)

    corpus = per_sentence + documents
    tfidf_rows = TfidfVectorizer().fit_transform(corpus)

    sentence_rows = tfidf_rows[:sentence_count]
    document_rows = tfidf_rows[sentence_count:]
    return cosine_similarity(sentence_rows, document_rows)



In [None]:
def get_text(per_sentence, documents, score_matrix, debug=False):
    """Render the sentences with a citation to each one's best-scoring document.

    Args:
        per_sentence: sequence of sentence strings.
        documents: list of objects exposing ``.metadata`` (a dict) and
            ``.page_content``. May be empty.
        score_matrix: NumPy array or torch tensor with one row per sentence
            and one column per document; the arg-max of each row selects the
            cited document.
        debug: unused; kept so existing callers keep working.

    Returns:
        A string with one ``"<sentence>."`` line per sentence followed by its
        citation line, then a reference list with one entry (plus the page
        content) for every distinct document that was cited.

    Documents whose metadata has a ``'source'`` key are cited by ``'title'``;
    all others by ``'first_author'`` and ``'year'`` (and ``'doi'`` in the
    reference list).
    """
    # argmax over a zero-width axis raises in both NumPy and PyTorch, so skip
    # it when there is nothing to cite; every sentence then falls through to
    # the "No relevant document found" branch below.
    if not documents:
        best_doc_indices = None
    elif isinstance(score_matrix, np.ndarray):
        best_doc_indices = np.argmax(score_matrix, axis=-1)
    else:
        best_doc_indices = torch.argmax(score_matrix, dim=-1)

    txt = ''
    used_docs = []

    for i in range(len(per_sentence)):
        txt += f'{per_sentence[i]}.'

        # Convert the NumPy/torch scalar to a plain int before indexing the list.
        doc_idx = -1 if best_doc_indices is None else int(best_doc_indices[i])

        if 0 <= doc_idx < len(documents):
            doc = documents[doc_idx]
            # Work on a copy so the caller's document metadata is not modified.
            metadata = dict(doc.metadata)
            metadata['page_content'] = doc.page_content
            if 'source' in metadata:
                txt += f'\n(Medline: {metadata["title"]})'
            else:
                txt += f'\n({metadata["first_author"]}, {metadata["year"]})'
            # Keep first-citation order and list each distinct document once.
            if metadata not in used_docs:
                used_docs.append(metadata)
        else:
            txt += '\n(No relevant document found)'
        txt += '\n'
    txt += '\n\n'

    for x in used_docs:
        if 'source' in x:
            txt += f'\nMedline {x["title"]} url: {x["source"]}'
        else:
            txt += f'\n({x["first_author"]}, {x["year"]}) {x["title"]} url: doi.org/{x["doi"]}'
        txt += f'\n{x["page_content"]}\n'

    return txt



### import data

In [None]:
import pandas as pd
# Load the input frame. Cells further down read the columns 'generated_answer',
# 'sources', 'keyword_sentences' and 'keyword_documents' from it.
# NOTE(review): unpickling can execute arbitrary code; only load a pickle file
# that comes from a trusted source.
df = pd.read_pickle('appendix_data.pkl')
# Preview the first rows.
df.head()

In [None]:
from tqdm import tqdm
import re

keyword_docs = []
keyword_sents = []
temperature = 0.07

# Collect one dict per input row and build the result frame once after the
# loop: growing a DataFrame with pd.concat on every iteration copies all
# earlier rows each time and leaves every row with index 0.
records = []

# df.iterrows() has no length, so pass the row count to let tqdm show progress.
for i, row in tqdm(df.iterrows(), total=len(df)):
    output = row['generated_answer']
    documents = row['sources']

    cur_dict = row.to_dict()

    # Split the generated answer on '.' or '"' that is not surrounded by digits.
    # NOTE(review): the pattern matches three characters, so the character on
    # each side of the separator is dropped from the sentences as well. It is
    # left unchanged here because the precomputed 'keyword_sentences' may
    # depend on the same split — confirm before changing it.
    per_sentence = [x.strip() for x in re.split('[^0-9]["."][^0-9]', output) if x.strip() != '']
    document_contents = [x.page_content for x in documents]

    lcs_score_matrix = compute_lcs_score_matrix(per_sentence, document_contents)

    keyword_sentences = row['keyword_sentences'][0]
    keyword_documents = row['keyword_documents'][0]
    # Flatten the list of sets for sentences and documents
    flattened_sentences = [flatten_keyword_list(s) for s in keyword_sentences]
    flattened_documents = [flatten_keyword_list(d) for d in keyword_documents]

    jaccard_mat = compute_jaccard_matrix(flattened_sentences, flattened_documents)
    bleu1_score_matrix = compute_score_matrix(per_sentence, document_contents, score_type='bleu', n_gram=1)
    rouge1_score_matrix = compute_score_matrix(per_sentence, document_contents, score_type='rouge', n_gram=1)
    bleu2_score_matrix = compute_score_matrix(per_sentence, document_contents, score_type='bleu', n_gram=2)
    rouge2_score_matrix = compute_score_matrix(per_sentence, document_contents, score_type='rouge', n_gram=2)
    tfidf_similarity_matrix = compute_tfidf_similarity_matrix(per_sentence, document_contents)

    cur_dict['cited_lcs'] = get_text(per_sentence, documents, lcs_score_matrix)
    # Cite with the Jaccard matrix itself (it has one row per entry of
    # keyword_sentences and one column per entry of keyword_documents).
    # NOTE(review): this assumes keyword_sentences lines up one-to-one with
    # per_sentence and keyword_documents with documents — confirm against the
    # code that produced those columns.
    cur_dict['cited_jaccard'] = get_text(per_sentence, documents, jaccard_mat)
    cur_dict['cited_bleu1'] = get_text(per_sentence, documents, bleu1_score_matrix)
    cur_dict['cited_rouge1'] = get_text(per_sentence, documents, rouge1_score_matrix)
    cur_dict['cited_bleu2'] = get_text(per_sentence, documents, bleu2_score_matrix)
    cur_dict['cited_rouge2'] = get_text(per_sentence, documents, rouge2_score_matrix)
    cur_dict['cited_tfidf'] = get_text(per_sentence, documents, tfidf_similarity_matrix)

    records.append(cur_dict)

# Original columns of df plus the seven 'cited_*' columns, one row per input row.
output_df = pd.DataFrame.from_records(records)

In [None]:
#output_df.to_pickle('appendix_result.pkl')

### Checking Processing time for each method

In [None]:
import time

def process_rows(df, num_rows, metric_function, metric_type, *args):
    """Time one scoring method over the first ``num_rows`` rows of ``df``.

    Args:
        df: frame with the columns 'generated_answer' and 'sources' (plus
            'keyword_sentences' and 'keyword_documents' for the Jaccard run).
        num_rows: number of leading rows to process.
        metric_function: scoring callable; only used when ``metric_type`` is
            not one of 'Jaccard', 'LCS' or 'TF-IDF'.
        metric_type: 'Jaccard', 'LCS' or 'TF-IDF' select the dedicated matrix
            builders. 'bleu' and 'rouge' are forwarded to ``metric_function``
            as its score type, followed by ``*args``. Any other value calls
            ``metric_function(per_sentence, document_contents, *args)``.
        *args: extra positional arguments for ``metric_function`` (for
            example the n-gram order).

    Returns:
        Elapsed wall-clock time in seconds. The score matrices themselves are
        discarded.
    """
    # perf_counter is monotonic and has the best available resolution, which
    # matters for the short runs timed here.
    start_time = time.perf_counter()

    for _, row in df.head(num_rows).iterrows():
        output = row['generated_answer']
        documents = row['sources']
        per_sentence = [x.strip() for x in re.split('[^0-9]["."][^0-9]', output) if x.strip() != '']
        document_contents = [x.page_content for x in documents]

        if metric_type == 'Jaccard':
            keyword_sentences = row['keyword_sentences'][0]
            keyword_documents = row['keyword_documents'][0]

            flattened_sentences = [flatten_keyword_list(s) for s in keyword_sentences]
            flattened_documents = [flatten_keyword_list(d) for d in keyword_documents]

            compute_jaccard_matrix(flattened_sentences, flattened_documents)
        elif metric_type == 'LCS':
            compute_lcs_score_matrix(per_sentence, document_contents)
        elif metric_type == 'TF-IDF':
            compute_tfidf_similarity_matrix(per_sentence, document_contents)
        elif metric_type in ('bleu', 'rouge'):
            # The score type must reach the scorer; without it the first
            # extra argument (the n-gram order) would be taken as the score
            # type and no BLEU/ROUGE score would actually be computed.
            metric_function(per_sentence, document_contents, metric_type, *args)
        else:
            metric_function(per_sentence, document_contents, *args)

    return time.perf_counter() - start_time


In [None]:
# Sample sizes (number of leading rows of df) to time each method on.
row_counts = [9, 49, 99, 199]

# Plot label -> positional arguments handed to process_rows after (df, count).
benchmark_calls = {
    'Jaccard': (compute_jaccard_matrix, 'Jaccard'),
    'LCS': (compute_lcs_score_matrix, 'LCS'),
    'BLEU-1': (compute_score_matrix, 'bleu', 1),
    'BLEU-2': (compute_score_matrix, 'bleu', 2),
    'ROUGE-1': (compute_score_matrix, 'rouge', 1),
    'ROUGE-2': (compute_score_matrix, 'rouge', 2),
    'TF-IDF': (compute_tfidf_similarity_matrix, 'TF-IDF'),
}

# One list of elapsed seconds per method, in the same order as row_counts.
times = {label: [] for label in benchmark_calls}

for count in row_counts:
    for label, call_args in benchmark_calls.items():
        times[label].append(process_rows(df, count, *call_args))



In [None]:
#times
import math

# Natural log of every measured duration, keyed by method like `times`.
log_times = {}
for metric_name, durations in times.items():
    log_times[metric_name] = [math.log(seconds) for seconds in durations]

In [None]:
import matplotlib.pyplot as plt

# Draw one line per method: log processing time against sample size.
plt.figure(figsize=(10, 6))
for metric, time_values in log_times.items():
    # A series can only be drawn when it has one value per sample size.
    if len(time_values) != len(row_counts):
        print(f"Error: Mismatched lengths for {metric}")
        continue
    plt.plot(row_counts, time_values, label=metric)

plt.title('Processing Time Comparison')
plt.xlabel("Sample Size")
plt.ylabel("Processing Time (Log Scale, in seconds)")
plt.grid(True)
plt.legend()

# Save before show(): the figure is written to disk, then displayed.
plt.savefig('time_complexity.pdf', bbox_inches = 'tight')
plt.show()