In [None]:
"""
This notebook merges the test dataset's ground-truth labels with their corresponding news article texts.
Cleaning and filtering are also performed to remove empty or exceptionally short articles
(as written, the filter below only drops pairs whose title and text are missing or empty).

task website: 
https://competitions.codalab.org/competitions/33835#learn_the_details-timetable
"""

In [2]:
import transformers
import pandas as pd
from tqdm import tqdm 
import json
import os

In [2]:
outerFolder = "/shared/3/projects/newsDiffusion/data/raw/test"

# Walk <outerFolder>/<folderNum>/<file>.json and collect one DataFrame per JSON file.
# The "id" column is the file name up to its first "." .
dfList = []
for folderNum in tqdm(os.listdir(outerFolder)):
    innerFolder = os.path.join(outerFolder, folderNum)
    # Skip stray non-directory entries so os.listdir below cannot fail on them.
    if not os.path.isdir(innerFolder):
        continue
    for innerFileName in os.listdir(innerFolder):
        # Test the extension on the file name itself, not on the full path:
        # a "json" substring anywhere in a directory name must not make every file match.
        if not innerFileName.endswith(".json"):
            continue
        innerPathName = os.path.join(innerFolder, innerFileName)
        # Explicit encoding so parsing does not depend on the machine's locale.
        with open(innerPathName, "r", encoding="utf-8") as f:
            data = json.load(f)
        df = pd.json_normalize(data)
        df["id"] = innerFileName.split(".")[0]
        dfList.append(df)
        

100%|██████████| 100/100 [00:43<00:00,  2.32it/s]


In [3]:
# Stack the per-file frames into one table with a fresh 0..n-1 index,
# then keep only the article fields used further down.
fullDf = pd.concat(dfList, ignore_index=True)
leanDf = fullDf.loc[:, ["id", "title", "url", "source_url", "text"]]

In [47]:
# Load the ground-truth pair annotations and report (rows, columns).
groundTruthPath = "/shared/3/projects/benlitterer/agendaSetting/final_evaluation_data.csv"
groundTruths = pd.read_csv(groundTruthPath)
print(groundTruths.shape)

(4902, 14)


Unnamed: 0,url1_lang,url2_lang,pair_id,link1,link2,ia_link1,ia_link2,GEO,ENT,TIME,NAR,Overall,STYLE,TONE
0,en,en,1484189203_1484121193,https://wsvn.com/news/local/broward/police-2-m...,https://wsvn.com/news/local/no-swim-advisory-l...,https://web.archive.org/web/https://wsvn.com/n...,https://web.archive.org/web/https://wsvn.com/n...,1.5,4.0,2.0,4.0,3.5,1.0,1.5
1,en,en,1484011097_1484011106,https://www.zdnet.com/article/autoclerk-databa...,https://securityboulevard.com/2019/10/best-wes...,https://web.archive.org/web/https://www.zdnet....,https://web.archive.org/web/https://securitybo...,1.0,2.0,1.0,1.0,1.0,3.5,2.5
2,en,en,1484039488_1484261803,https://www.presstelegram.com/2019/12/31/ducks...,https://boingboing.net/2020/01/01/woody-guthri...,https://web.archive.org/web/https://www.presst...,https://web.archive.org/web/https://boingboing...,5.0,4.0,3.0,4.0,4.0,4.0,3.666667


In [57]:
# Size of the English-only subset: both articles of a pair must be tagged "en".
bothEnglish = groundTruths["url1_lang"].eq("en") & groundTruths["url2_lang"].eq("en")
groundTruths[bothEnglish].shape

(236, 16)

In [85]:
def splitIds(inStr):
    """Split a pair id such as "<id1>_<id2>" on underscores and return the pieces as a list."""
    return inStr.split("_")

# Split every pair_id once, then store the first and second piece as id1 / id2.
idParts = groundTruths["pair_id"].apply(splitIds)
groundTruths["id1"] = [parts[0] for parts in idParts]
groundTruths["id2"] = [parts[1] for parts in idParts]

In [87]:
# Only the id, text and title are needed for the merges below.
leanDf = leanDf.loc[:, ["id", "text", "title"]]

In [88]:
# Build one copy of the article table per side of the pair, with columns
# suffixed 1 / 2 so they line up with id1 / id2 in the ground truth.
leanDf1 = leanDf.rename(columns={"id": "id1", "text": "text1", "title": "title1"})
leanDf2 = leanDf.rename(columns={"id": "id2", "text": "text2", "title": "title2"})

# Left joins keep every ground-truth pair, even when an article is missing.
firstMerge = groundTruths.merge(leanDf1, on="id1", how="left")
merged = firstMerge.merge(leanDf2, on="id2", how="left")
merged.head(3)

In [100]:
#NOTE: if the merges above are done as inner joins instead of left joins, only 4901 of the
#4902 pairs remain, meaning that we don't have the data for one article

In [103]:
# Keep a pair only if each side has at least one of title / text present
# (a side is dropped only when both of its fields are missing).
hasArticle1 = merged[["title1", "text1"]].notna().any(axis=1)
hasArticle2 = merged[["title2", "text2"]].notna().any(axis=1)
mergedClean = merged[hasArticle1 & hasArticle2]

In [108]:
# Pair ids that were present before cleaning but are gone afterwards,
# and the corresponding rows of the uncleaned table.
idsBefore = set(merged["pair_id"])
idsAfter = set(mergedClean["pair_id"])
removed = list(idsBefore - idsAfter)
removedDf = merged.loc[merged["pair_id"].isin(removed)]

In [110]:
# Second-article ids of the removed pairs.
removedDf["id2"].tolist()

['1568361410']

In [105]:
# The article we lost is completely empty: for some reason we only have
# empty html for this file (1568361410). Show the link it came from.
lostPair = merged["pair_id"] == "1525127987_1568361410"
merged.loc[lostPair, "link2"]

2659    https://www.onet.pl/?utm_source=_viasg_fakt&ut...
Name: link2, dtype: object

In [106]:
#do some more thoughtful dropping of na's
#this time for empty text and title fields

# Work on an explicit copy: mergedClean was produced by filtering `merged`, and
# assigning columns on such a derived frame can trigger SettingWithCopyWarning.
mergedClean = mergedClean.copy()

# Missing titles/texts become "" so the string concatenation below never yields NaN.
textCols = ["title1", "title2", "text1", "text2"]
mergedClean[textCols] = mergedClean[textCols].fillna("")

# NOTE(review): title and text are joined with no separator, so the last word of the
# title runs into the first word of the text - confirm this is what downstream expects.
mergedClean["titleText1"] = mergedClean["title1"] + mergedClean["text1"]
mergedClean["titleText2"] = mergedClean["title2"] + mergedClean["text2"]

#NOTE: we lose 29 records here
#NOTE: we lose 10 records here according to the SemEval paper
def splitLen(inStr):
    """Return the number of whitespace-separated tokens in inStr."""
    return len(inStr.split())

# Keep only the pairs where both sides have a non-empty title+text (character length > 0).
# NOTE(review): an earlier comment described this as "over 2 tokens", but splitLen is not
# applied and only empty strings are removed - confirm which threshold is intended.
hasText1 = mergedClean["titleText1"].str.len() > 0
hasText2 = mergedClean["titleText2"].str.len() > 0
mergedClean = mergedClean[hasText1 & hasText2]
len(mergedClean)

4901

In [107]:
# Restrict the cleaned pairs to those where both articles are English
# (this subset is what gets written to file afterwards).
#NOTE: here we keep all 236 records
enMask = mergedClean["url1_lang"].eq("en") & mergedClean["url2_lang"].eq("en")
mergedEn = mergedClean[enMask]
mergedEn.shape

(236, 22)

In [73]:
# Persist the English-only test pairs as a tab-separated file.
enTestPath = "/shared/3/projects/newsDiffusion/data/processed/enTestData.tsv"
mergedEn.to_csv(enTestPath, sep="\t")