# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [12]:
import pandas as pd
import json
import numpy as np
import nltk
from nltk.corpus import stopwords
import re
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from gensim.models import Doc2Vec

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
stopwords = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/felikskong/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/felikskong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/felikskong/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/felikskong/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
evidence = json.load(open('data/evidence.json', 'r'))
train_claims = json.load(open('data/train-claims.json', 'r'))
dev_claims = json.load(open('data/dev-claims.json', 'r'))
test_claims = json.load(open('data/test-claims-unlabelled.json', 'r'))

In [3]:
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

def lemmatize(word):
    lemma = lemmatizer.lemmatize(word,'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word,'n')
    return lemma
	
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')
def data_preprocess(data, remove_stopwords, lemma, stem):
	tokens = nltk.word_tokenize(data.lower())
	tokens = [token for token in tokens if token.isalpha()]
	if remove_stopwords:
		tokens = [token for token in tokens if token not in stopwords]
	if lemma:
		tokens = [lemmatize(token) for token in tokens]
	if stem:
		tokens = [stemmer.stem(token) for token in tokens]
	return ' '.join(tokens)

In [4]:
english_evidence = {}
for evidence_id, evidence_text in evidence.items():
    tokens = nltk.word_tokenize(evidence_text)
    english_tokens = [token for token in tokens if token.isalpha()]
    if len(english_tokens) / len(tokens) > 0.5:
        english_tokens = [token for token in english_tokens if token.lower() not in stopwords]
        english_text = ' '.join(english_tokens)
        english_evidence[evidence_id] = english_text

print(f"English Evidence Count:, {len(english_evidence)}/{len(evidence)}")


English Evidence Count:, 1181638/1208827


In [5]:
train_claims_text = []
for claim in train_claims.values():
    train_claims_text.append(data_preprocess(claim['claim_text'], True, True, False))

dev_claims_text = []
for claim in dev_claims.values():
    dev_claims_text.append(data_preprocess(claim['claim_text'], True, True, False))

claims_words = []
for claims_text in [train_claims_text, dev_claims_text]:
    for claim in claims_text:
        tokens = nltk.word_tokenize(claim)
        for token in tokens:
            claims_words.append(token.lower())

top_words = [word for word, count in Counter(claims_words).most_common(150)]


In [6]:
evidence_with_top_words = {}
for evidence_id, evidence_text in english_evidence.items():
    words = nltk.word_tokenize(evidence_text)
    if any(word.lower() in top_words for word in words):
        evidence_with_top_words[evidence_id] = evidence_text

for value in train_claims.values():
    if "evidences" in value:
        for evidence_id in value["evidences"]:
            if evidence_id in evidence:
                evidence_with_top_words[evidence_id] = evidence[evidence_id]
            else:
                print("Evidence ID not found:", evidence_id)

In [7]:
def convert_to_df(data, labelled):
	data_for_dataframe = []
	for claim_id, claim_info in data.items():
		claims_text = data_preprocess(claim_info['claim_text'], True, True, False)
		if labelled:
			claim_label = claim_info['claim_label']
			evidence_id = claim_info['evidences']
			data_for_dataframe.append({
					'claim_id': claim_id,
					'claim_text': claims_text,
					'claim_label': claim_label,
					'evidence_id': evidence_id
				})
		else:
			data_for_dataframe.append({
					'claim_id': claim_id,
					'claim_text': claims_text
				})

	df = pd.DataFrame(data_for_dataframe)
	return df

In [8]:
evidence_processed = {id: data_preprocess(text, True, True, False) for id, text in evidence.items()}
filtered_evidence_processed = {id: data_preprocess(text, True, True, False) for id, text in evidence_with_top_words.items()}
with open("filtered_evidence_processed.json", "w") as outfile:
    json.dump(filtered_evidence_processed, outfile, indent=2)

evidence_processed_df = pd.DataFrame(evidence_processed.items(), columns=['id', 'evidence'])
filtered_evidence_processed_df = pd.DataFrame(filtered_evidence_processed.items(), columns=['id', 'evidence'])

train_claims_df = convert_to_df(train_claims, True)
train_claims_df['evidence_texts'] = train_claims_df['evidence_id'].apply(
	lambda x: [evidence_processed[evidence_id] for evidence_id in x]
)

dev_claims_df = convert_to_df(dev_claims, True)

train_claims_list = train_claims_df['claim_text'].tolist()
dev_claims_list = dev_claims_df['claim_text'].tolist()
dev_claims_id = dev_claims_df['claim_id'].tolist()

evidence_id = list(evidence_processed.keys())
evidence_texts  = list(evidence_processed.values())
filtered_evidence_texts = list(filtered_evidence_processed.values())

# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

Tf-idf

In [9]:
vectorizer = TfidfVectorizer()
vectorizer.fit(train_claims_list + filtered_evidence_texts)
evidence_vec = vectorizer.transform(filtered_evidence_texts)
dev_claims_vec = vectorizer.transform(dev_claims_list)

Word2vec

In [10]:
from gensim.models import Word2Vec
all_texts = train_claims_list + filtered_evidence_texts
sentences = [text.lower().split() for text in all_texts]
model = Word2Vec(
    sentences=sentences,
    vector_size=400,    
    window=5,           
    min_count=1,        
    workers=4,          
    sg=1              
)
model.save("word2vec.model")

model = Word2Vec.load("word2vec.model")

def sentence_to_vec(sentence, model):
    words = sentence.lower().split()
    word_vecs = [model.wv[word] for word in words if word in model.wv]
    if len(word_vecs) == 0:
        return np.zeros(model.vector_size)
    return np.mean(word_vecs, axis=0)

claims_vec = np.array([sentence_to_vec(text, model) for text in dev_claims_list])
evidence_vec = np.array([sentence_to_vec(text, model) for text in filtered_evidence_texts])

Doc2vec

In [13]:
from gensim.models.doc2vec import TaggedDocument
all_texts = train_claims_list + filtered_evidence_texts
tagged_data = [TaggedDocument(words=text.lower().split(), tags=[str(i)]) for i, text in enumerate(all_texts)]

model = Doc2Vec(
    vector_size=300,
    window=5,
    min_count=1,
    workers=4,
    epochs=20
)

model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

model.save("doc2vec.model")

# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

topk

In [18]:
def knn_retrieve(claims_id, claims_emb, evidence_emb, evidence_df, k):

    neigh = NearestNeighbors(n_neighbors=k, metric='cosine')
    neigh.fit(evidence_emb)
    
    distances, indices = neigh.kneighbors(claims_emb)
    
    top_evidence_id = {}
    for i in range(len(claims_id)):
        top_evidence_id[claims_id[i]] = [evidence_df.iloc[int(idx)]['id'] for idx in indices[i]]
    return top_evidence_id

top_evidence_id = knn_retrieve(dev_claims_id, dev_claims_vec, evidence_vec, filtered_evidence_processed_df, k=3)

with open('data/dev-claims.json', 'r') as input_file:
    test_out_temp = json.load(input_file)

for claim_id, _ in test_out_temp.items():
	test_out_temp[claim_id]["evidences"] = top_evidence_id[claim_id]

with open("dev_predict.json", "w") as outfile:
    json.dump(test_out_temp, outfile)

ValueError: X has 288558 features, but NearestNeighbors is expecting 400 features as input.

knn

In [19]:
def knn_retrieve(claims_id, claims_emb, evidence_emb, evidence_df, k):

    neigh = NearestNeighbors(n_neighbors=k, metric='cosine')
    neigh.fit(evidence_emb)
    
    distances, indices = neigh.kneighbors(claims_emb)
    
    top_evidence_id = {}
    for i in range(len(claims_id)):
        top_evidence_id[claims_id[i]] = [evidence_df.iloc[int(idx)]['id'] for idx in indices[i]]
    return top_evidence_id

top_evidence_id = knn_retrieve(dev_claims_id, dev_claims_vec, evidence_vec, filtered_evidence_processed_df, k=3)

with open('data/dev-claims.json', 'r') as input_file:
    test_out_temp = json.load(input_file)

for claim_id, _ in test_out_temp.items():
	test_out_temp[claim_id]["evidences"] = top_evidence_id[claim_id]

with open("dev_predict.json", "w") as outfile:
    json.dump(test_out_temp, outfile)

ValueError: X has 288558 features, but NearestNeighbors is expecting 400 features as input.

python eval.py --predictions dev_predict.json --groundtruth data/dev-claims.json