# 2024 COMP90042 Project
*Make sure you rename this file to include your group ID.*

# Readme
*If there is anything the marker should note, please mention it here.*

*If you are planning to implement your program in an object-oriented style, please put that code at the bottom of this ipynb file.*

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [1]:
import json
import operator
import subprocess
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from statistics import mean
from collections import Counter

# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [2]:
def _load_json(path):
    """Return the parsed contents of the UTF-8 encoded JSON file at *path*."""
    # The encoding is given explicitly: without it, open() falls back to the
    # platform's locale encoding, which can fail on (or silently mis-decode)
    # non-ASCII text on systems whose default is not UTF-8.
    with open(path, 'r', encoding='utf-8') as input_file:
        return json.load(input_file)


# Read in training data (claim)
train_claim_data = _load_json('data/train-claims.json')

# Read in development data (claim)
dev_claim_data = _load_json('data/dev-claims.json')

# Read in test data (claim)
test_claim_data = _load_json('data/test-claims-unlabelled.json')

# Read in evidence data
evi_data = _load_json('data/evidence.json')

# Parallel lists: index i of an "id" list refers to the same entry as index i
# of the matching "text" list (dicts preserve insertion order).
full_evidence_id = list(evi_data.keys())
full_evidence_text = list(evi_data.values())
train_claim_id = list(train_claim_data.keys())
train_claim_text = [v["claim_text"] for v in train_claim_data.values()]

In [3]:
# TF-IDF
# Claim-to-claim space: fitted on the training claims only (no stop-word
# filtering) and used to embed those same claims in a single step.
claim_tfidf_vectorizer = TfidfVectorizer(use_idf=True)
train_claim_emb_list = claim_tfidf_vectorizer.fit_transform(train_claim_text)

# Claim-to-evidence space: English stop words are dropped and the vectorizer
# is fitted on the training claims plus every evidence text. fit() returns
# the fitted vectorizer itself, so construction and fitting can be chained.
evidence_tfidf_vectorizer = TfidfVectorizer(
    stop_words="english",
    use_idf=True,
).fit(train_claim_text + full_evidence_text)

# Embed all evidence texts once so they can be compared against each claim.
full_evi_emb_list = evidence_tfidf_vectorizer.transform(full_evidence_text)

In [4]:
import numpy as np

# Number of highest-scoring evidence entries kept for each claim.
TOP_K_EVIDENCE = 2

# Explicit UTF-8 so decoding does not depend on the platform's locale encoding.
with open('data/dev-claims.json', 'r', encoding='utf-8') as input_file:
    test_claims = json.load(input_file)

# Prepare output dictionary: claim id -> {"evidences": [...], "claim_label": ...}
output_results = {}

# Iterate over each claim
for claim_id, claim_value in test_claims.items():
    claim_text = claim_value['claim_text']

    # --- Evidence retrieval ---
    # Embed the claim in the evidence TF-IDF space and score it against
    # every evidence vector.
    test_claim_vector = evidence_tfidf_vectorizer.transform([claim_text])
    similarities = cosine_similarity(test_claim_vector, full_evi_emb_list)[0]
    # Negating the scores makes argsort return indices from most to least
    # similar; keep only the first TOP_K_EVIDENCE of them.
    evidence_ranking = np.argsort(-similarities)[:TOP_K_EVIDENCE]
    selected_evidence_ids = [full_evidence_id[idx] for idx in evidence_ranking]

    # --- Label prediction ---
    # Embed the claim in the claim TF-IDF space and copy the label of the
    # single most similar training claim.
    test_claim_vector = claim_tfidf_vectorizer.transform([claim_text])
    claim_similarities = cosine_similarity(test_claim_vector, train_claim_emb_list)[0]
    most_similar_claim_idx = np.argmax(claim_similarities)
    nearest_train_claim = train_claim_data[train_claim_id[most_similar_claim_idx]]

    output_results[claim_id] = {
        "evidences": selected_evidence_ids,
        "claim_label": nearest_train_claim["claim_label"],
    }

# Write results to file
with open("data/dev_predict.json", "w", encoding="utf-8") as outfile:
    json.dump(output_results, outfile, indent=4)


# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [5]:
# %%cmd
# python eval.py --predictions dev-claims-baseline.json --groundtruth dev-claims.json
# python eval.py --predictions dev_predict.json --groundtruth dev-claims.json

import sys

# Run the evaluation script as an argument list (no shell involved), using
# sys.executable so it runs under the same interpreter as this notebook
# rather than whichever "python" happens to be first on PATH.
eval_command = [
    sys.executable, "eval.py",
    "--predictions", "data/dev_predict.json",
    "--groundtruth", "data/dev-claims.json",
]
# check_output raises CalledProcessError if eval.py exits with a non-zero status.
output = subprocess.check_output(eval_command)

print(output.decode("utf-8"))

Evidence Retrieval F-score (F)    = 0.10510204081632651
Claim Classification Accuracy (A) = 0.474025974025974
Harmonic Mean of F and A          = 0.17205555936935077



## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*