In [1]:
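# This notebook computes the Word Mover's Distance (WMD) between the title of each topic article and the title of each candidate article, and saves the result as a CSV feature file.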

In [68]:
import json
import os
import re
import xml.etree.ElementTree as ET

import gensim.downloader as api
import nltk
import pandas as pd
from nltk.corpus import stopwords

In [90]:
# Stop-word sets that have already been loaded, keyed by language name.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop words as a set, reading the corpus only once."""
    if "english" not in _STOP_WORDS_CACHE:
        # A set gives constant-time membership tests; the original list was
        # re-read and scanned linearly for every token of every title.
        _STOP_WORDS_CACHE["english"] = set(stopwords.words("english"))
    return _STOP_WORDS_CACHE["english"]


def preprocess(text):
    """Turn a raw title into a list of lower-cased, stop-word-free tokens.

    Parameters
    ----------
    text : str or None
        Raw text; ``None`` is treated as the empty string.

    Returns
    -------
    list of str
        Tokens produced by ``nltk.word_tokenize`` after lower-casing and
        removing every character that is neither a word character nor
        whitespace, with English stop words filtered out. Empty for empty input.
    """
    if text is None:
        text = ''

    raw_text = text.lower()
    # Drop punctuation: anything that is not a word character or whitespace.
    # NOTE(review): this runs before the stop-word filter, so a contraction such
    # as "don't" reaches the filter as "dont" -- confirm the stop list is
    # expected to catch (or not catch) such tokens.
    raw_text = re.sub(r"[^\w\s]", "", raw_text)
    tokens = nltk.word_tokenize(raw_text)
    stop = _english_stop_words()
    cleaned_tokens = [token for token in tokens if token not in stop]

    return cleaned_tokens

def calculate(sentence1, sentence2):
    """Return ``model.wmdistance`` of the two inputs.

    Both arguments are forwarded unchanged to the notebook-level ``model``
    (the word-embedding model loaded in a later cell), so that cell must have
    been executed before this function is called.
    """
    distance = model.wmdistance(sentence1, sentence2)
    return distance

def find_news_title(newsid, news_dir="../../news_json/"):
    """Look up the title of a news article stored as ``<news_dir>/<newsid>.json``.

    Parameters
    ----------
    newsid : str
        Identifier of the article; also the stem of its JSON file name.
    news_dir : str, optional
        Directory holding the per-article JSON files. Defaults to the
        notebook's original relative location.

    Returns
    -------
    The value stored under the ``"title"`` key of the JSON document, or ``""``
    when the file is missing or unreadable, is not valid JSON, or has no
    ``"title"`` entry (best-effort lookup).
    """
    file_path = os.path.join(news_dir, newsid + ".json")
    try:
        # JSON is read as UTF-8 explicitly so the result does not depend on
        # the platform's default encoding.
        with open(file_path, 'r', encoding="utf-8") as f:
            news = json.load(f)
        return news['title']
    except (OSError, ValueError, KeyError, TypeError):
        # OSError: file missing or unreadable.
        # ValueError: malformed JSON or undecodable bytes (JSONDecodeError and
        #             UnicodeDecodeError are both subclasses).
        # KeyError / TypeError: no "title" entry, or the document is not a mapping.
        # Anything else is unexpected and is allowed to propagate.
        return ""

In [28]:
# Load the pre-trained word embeddings used for the distance computation.
EMBEDDING_NAME = 'word2vec-google-news-300'
model = api.load(EMBEDDING_NAME)

In [91]:
# Read the <topicid, newsid> pairs from the topic XML file.
tree = ET.parse('../../query_topics/2018BL_topic.xml')
root = tree.getroot()

# Topic id -> news id. For every child of the root element, the key is the
# last three characters of its first sub-element's text and the value is the
# text of its second sub-element.
t2n = {topic[0].text[-3:]: topic[1].text for topic in root}

In [105]:
# Read the relevance judgements (qrels): a space-separated file without a header row.
qrel_columns = ["topicid", "q0", "newsid", "rel"]
qrel = pd.read_csv(
    "../../query_topics/2018BL_answer.txt",
    sep=" ",
    header=None,
)
qrel.columns = qrel_columns

In [109]:
# Empty result table: one row per (topic, candidate news) pair will be added.
wmd_columns = ["topicid", "newsid", "wmd"]
df_wmd = pd.DataFrame(columns=wmd_columns)

In [110]:
# Compute the WMD between each topic title and each judged candidate title.
#
# Rows are collected in a plain list and turned into a DataFrame once at the
# end: DataFrame.append copies the whole frame on every call and no longer
# exists in pandas 2.0. Rebuilding df_wmd from scratch also makes this cell
# safe to re-run without duplicating rows.
records = []

# Tokens of each topic's title, computed once per topic rather than once per row.
topic_tokens = {}

for index, row in enumerate(qrel.itertuples(index=False)):

    topicid = str(row.topicid)
    newsid = row.newsid

    if topicid not in topic_tokens:
        topic_tokens[topicid] = preprocess(find_news_title(t2n[topicid]))
    candidate_tokens = preprocess(find_news_title(newsid))

    wmd = calculate(topic_tokens[topicid], candidate_tokens)

    # convert infinite because of empty title to 0
    # NOTE(review): 0 is also what two identical titles would score, so a
    # missing title becomes indistinguishable from a perfect match -- confirm
    # this is the intended encoding for downstream use.
    if wmd == float("inf"):
        wmd = 0

    records.append({"topicid": topicid, "newsid": newsid, "wmd": wmd})

    # lightweight progress indicator
    if index % 1000 == 0:
        print(index)

df_wmd = pd.DataFrame(records, columns=["topicid", "newsid", "wmd"])

0
1000
2000
3000
4000
5000
6000
7000
8000


In [None]:
# Write the WMD feature table as comma-separated values, without the row index.
output_path = "../../features/wmd.csv"
df_wmd.to_csv(output_path, sep=",", index=False)