In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
def get_top_n_relevant_sentences_tfidf(query, sentences, n=3):
    """Return the ``n`` sentences that best match ``query`` under TF-IDF.

    A new ``TfidfVectorizer`` is fitted on ``sentences`` on every call, the
    query is projected into that vocabulary, and the sentences are ranked by
    their ``linear_kernel`` score against the query.

    Args:
        query: Text to search for.
        sentences: Iterable of candidate strings. It is materialised into a
            list, so positional lookup works for any iterable (including a
            pandas Series with a non-default index).
        n: Maximum number of sentences to return.

    Returns:
        A list of at most ``n`` sentences, best match first. Sentences with
        equal scores keep their original relative order. The list is empty
        when ``n <= 0`` or when there are no sentences.
    """
    sentences = list(sentences)

    # Handle the degenerate cases before touching the vectorizer: slicing with
    # ``[-0:]`` would select *every* sentence instead of none, and fitting on
    # an empty corpus raises inside scikit-learn.
    if n <= 0 or not sentences:
        return []

    # Fit TF-IDF on the candidate sentences and embed the query with the same
    # vocabulary and IDF weights.
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
    query_tfidf = tfidf_vectorizer.transform([query])

    # TfidfVectorizer L2-normalises rows by default, so the plain dot product
    # computed by linear_kernel is the cosine similarity.
    cosine_similarities = linear_kernel(query_tfidf, tfidf_matrix).flatten()

    # A stable sort on the negated scores yields descending order while
    # breaking ties deterministically in favour of the earlier sentence.
    top_n_indices = (-cosine_similarities).argsort(kind="stable")[:n]

    return [sentences[i] for i in top_n_indices]

In [4]:
# Spreadsheet holding the claim/evidence pairs, and how many leading claims
# are used as queries. Both are meant to be edited here, in one place.
DATA_PATH = '/content/orignal_dataset_1.xlsx'
N_CLAIMS = 500

# Keep only the two columns used below. The explicit copy makes `df` an
# independent frame, so the column assignment underneath cannot raise
# SettingWithCopyWarning.
df = pd.read_excel(DATA_PATH)[['claim', 'evidence']].copy()

# Coerce every evidence cell to str so the vectorizer only receives strings
# (a missing NaN cell becomes the literal text 'nan').
df['evidence'] = df['evidence'].apply(str)

# Queries are the first N_CLAIMS claims; the retrieval corpus is every
# evidence passage in the sheet.
text = df['claim'].iloc[:N_CLAIMS]
sentences = df['evidence'].tolist()

In [7]:
# Parallel accumulators: lst1[i] is a claim, lst2[i] the passages retrieved
# for it, and ground_truth[i] a placeholder to be replaced with the real
# reference passage for that claim.
lst1, lst2, ground_truth = [], [], []

# For every claim, echo it and fetch its three best TF-IDF matches.
for claim in text:
    print("Claim:", claim)
    top_3_relevant_sentences = get_top_n_relevant_sentences_tfidf(
        query=claim, sentences=sentences, n=3
    )

    lst1.append(claim)
    lst2.append(top_3_relevant_sentences)

    # Placeholder only: substitute the actual ground-truth passage here.
    ground_truth.append("ground_truth_passage")

Claim: Asif Zardari has announced support for Imran Khan, saying Shehbaz Sharif cannot lead Pakistan
Claim: A video shows a Punjab Police officer “misbehaving” and “insulting” an older woman amid a recent crackdown by authorities following protests across Pakistan against the 9 May arrest of Imran Khan.
Claim: Video shows Akshay Kumar condemning Imran Khan's arrest
Claim: Former prime minister and Pakistan Tehreek-e-Insaf (PTI) chief Imran Khan has died.
Claim: Agreement signed between detained ex-Pak PM Imran Khan, Pakistan government and US ambassador with bizarre “no-rape” conditions
Claim: sri Lanka to gift two female elephants to Pakistan !!
Claim: US President Joe Biden has allocated $500,000 to “transgender Pakistani youth English lessons”.
Claim: New Zealand cricket commentator Simon Doull says living in Pakistan is like living in jail
Claim: Astrologists and geologists predict that a massive earthquake similar to the quake that hit Turkey on Monday will strike India and Pakist

In [9]:
# Reference passages: the evidence cells of the first 500 rows, i.e. the same
# rows the evaluated claims are taken from.
ground = df['evidence'].iloc[:500]

In [10]:
# Display the reference passages as a plain Python list for visual inspection.
list(ground)

['Asif Zardari has not announced support for Khan, nor has he said Sharif cannot lead Pakistan. the press conference shown in the video is actually from 2019. Another clip with the same caption but a different press conference is actually from 2022.On 24 May 2023, Facebook page ‘Entertainment Videos’ posted a live video (archive) showing the former president and the co-chairperson of the Pakistan People’s Party (PPP), Asif Ali Zardari, addressing a press conference.The video — which had over 520,000 views, 600 comments, and 500 shares as of writing time.',
 "The video is not recent, nor does it have anything to do with the May 2023 crackdown by Punjab Police and they were not misbehaving and insulting an older women. It is, in fact, from 2019, when Imran Khan was the prime minister of Pakistan.On 20 May 2023, Abdulla Alamadi — a media consultant from Qatar whose Twitter bio says he is a “Columnist & Author” — posted a video (archive) showing a Punjab Police officer seemingly misbehavin

In [11]:
def reciprocal_rank(true_passage, predicted_passages):
    """Return 1/rank of ``true_passage`` within ``predicted_passages``.

    Ranks are 1-based and the first exact match counts. Returns 0.0 when the
    passage does not appear among the predictions.
    """
    for rank, passage in enumerate(predicted_passages, start=1):
        if passage == true_passage:
            return 1 / rank
    return 0.0


# One reciprocal rank per (reference passage, retrieved passages) pair.
# zip() stops at the shorter input, so `ground` and `lst2` must be row-aligned
# and of equal length for the score to cover every claim.
rr_list = [
    reciprocal_rank(true_passage, predicted_passages)
    for true_passage, predicted_passages in zip(ground, lst2)
]

# Mean Reciprocal Rank; defined as 0.0 when there is nothing to score so an
# empty evaluation set does not raise ZeroDivisionError.
mrr = sum(rr_list) / len(rr_list) if rr_list else 0.0

print("Mean Reciprocal Rank:", mrr)

Mean Reciprocal Rank: 0.8903333333333333
