In [None]:
"""
This notebook merges the ground-truth labels of the training dataset with their corresponding news article texts.
Cleaning and filtering are also performed to remove empty or exceptionally short articles.

task website: 
https://competitions.codalab.org/competitions/33835#learn_the_details-timetable
"""

In [3]:
import transformers
from tqdm import tqdm 
import pandas as pd
import json
import os

## start by getting the data in the right format for training 

In [8]:
# Load every article JSON file under the raw training folder into a one-row dataframe.
# Directory layout read here: <outerFolder>/<folderNum>/<fileName>.json ; the part of the
# file name before the first "." is stored in the "id" column.
outerFolder = "/shared/3/projects/newsDiffusion/data/raw/train"

dfList = []
for folderNum in tqdm(os.listdir(outerFolder)):
    innerFolder = os.path.join(outerFolder, folderNum)
    # skip stray files sitting next to the sub-folders; os.listdir would raise on them
    if not os.path.isdir(innerFolder):
        continue
    for innerFileName in os.listdir(innerFolder):
        # test the file extension only: a substring test on the whole path would accept
        # every file as soon as any parent directory name contains "json"
        if not innerFileName.endswith(".json"):
            continue
        innerPathName = os.path.join(innerFolder, innerFileName)
        # JSON is UTF-8 by specification; do not depend on the locale's default encoding
        with open(innerPathName, "r", encoding="utf-8") as f:
            data = json.load(f)
        df = pd.json_normalize(data)
        df["id"] = innerFileName.split(".")[0]
        dfList.append(df)
        

100%|██████████| 100/100 [00:43<00:00,  2.32it/s]


In [9]:
# stack the per-article frames into one table with a fresh 0..n-1 row index
fullDf = pd.concat(dfList, ignore_index=True)

In [1]:
# narrow the article table to the id, title, url, source_url and text columns
leanDf = fullDf.loc[:, ["id", "title", "url", "source_url", "text"]]

NameError: name 'fullDf' is not defined

In [24]:
# read the ground-truth annotations (one row per article pair, keyed by pair_id)
# downloaded from: https://competitions.codalab.org/competitions/33835#learn_the_details-timetable
groundTruthPath = "/shared/3/projects/benlitterer/agendaSetting/trainingDataV2.csv"
groundTruths = pd.read_csv(groundTruthPath)

(4964, 14)


Unnamed: 0,url1_lang,url2_lang,pair_id,link1,link2,ia_link1,ia_link2,Geography,Entities,Time,Narrative,Overall,Style,Tone
0,en,en,1484084337_1484110209,https://www.washingtonpost.com/local/virginia-...,https://www.washingtonpost.com/world/the_ameri...,https://web.archive.org/web/www.washingtonpost...,https://web.archive.org/web/www.washingtonpost...,4.0,4.0,1.0,4.0,4.0,1.666667,2.0
1,en,en,1484396422_1483924666,https://www.stlucianewsonline.com/guyana-three...,https://www.thestar.com/news/world/europe/2020...,https://web.archive.org/web/www.stlucianewsonl...,https://web.archive.org/web/www.thestar.com/ne...,4.0,4.0,1.0,4.0,3.666667,1.666667,1.333333
2,en,en,1484698254_1483758694,https://www.teaparty.org/trump-brings-in-2020-...,https://www.timesofisrael.com/trump-says-he-do...,https://web.archive.org/web/www.teaparty.org/t...,https://web.archive.org/web/www.timesofisrael....,1.0,2.0,1.0,2.333333,2.333333,1.0,1.333333


In [25]:
# pair_id has the form "<id1>_<id2>"; split it into one column per article id
pairIdParts = groundTruths["pair_id"].str.split("_", expand=True)
groundTruths[["id1", "id2"]] = pairIdParts

# only id, text and title are carried into the merge with the ground truth
leanDf = leanDf.loc[:, ["id", "text", "title"]]

In [67]:
# number of rows (article pairs) in the ground truth
groundTruths.shape[0]

4964

In [68]:
# attach title/text of the first article of each pair (matched on id1)
leanDf1 = leanDf.rename(columns={col: col + "1" for col in ("id", "text", "title")})
firstMerge = groundTruths.merge(leanDf1, on="id1", how="left")

# attach title/text of the second article of each pair (matched on id2)
leanDf2 = leanDf.rename(columns={col: col + "2" for col in ("id", "text", "title")})
merged = firstMerge.merge(leanDf2, on="id2", how="left")
merged.head(3)

In [70]:
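#NOTE: the two merges above are left joins (how="left"), so the merges themselves drop no pairs;
#pairs without a matching article are only removed by the dropna in the next cell.
#This note originally described inner joins losing 14 articles, going down to 4,951 total pairs.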

In [71]:
# a pair is dropped when either of its articles has BOTH title and text missing
article1Cols = ["title1", "text1"]
article2Cols = ["title2", "text2"]
hasArticle1 = merged.dropna(subset=article1Cols, how="all")
mergedClean = hasArticle1.dropna(subset=article2Cols, how="all")

In [72]:
# pair_ids present in merged but absent from mergedClean, and the rows of merged they belong to
allPairIds = set(merged["pair_id"])
keptPairIds = set(mergedClean["pair_id"])
removed = list(allPairIds - keptPairIds)
removedDf = merged.loc[merged["pair_id"].isin(removed)]

In [81]:
# number of pairs left after the NA filter
mergedClean.shape[0]

4951

In [82]:
# number of pairs in which both articles are English
bothEnglish = mergedClean["url1_lang"].eq("en") & mergedClean["url2_lang"].eq("en")
len(mergedClean[bothEnglish])

1791

In [83]:
# second cleaning pass: treat a missing title or text as an empty string so that
# title and text can be concatenated per article
textCols = ["title1", "title2", "text1", "text2"]
mergedClean[textCols] = mergedClean[textCols].fillna("")

# titleText<N> = title<N> immediately followed by text<N> (no separator is inserted)
for side in ("1", "2"):
    mergedClean["titleText" + side] = mergedClean["title" + side] + mergedClean["text" + side]

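#NOTE: we lose 29 records here
#NOTE: we lose 10 records here according to the SemEval paper 
#NOTE(review): neither count matches the length printed by this cell (4,950, versus 4,951 before it) - confirm which filter these counts refer to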
def splitLen(inStr): 
    """Return the number of whitespace-separated tokens in inStr."""
    tokens = inStr.split()
    return len(tokens)

# keep only the pairs where both articles have a non-empty title+text string
# (the test is character length > 0; the token-counting splitLen helper is not applied here)
nonEmpty1 = mergedClean["titleText1"].map(len) > 0
nonEmpty2 = mergedClean["titleText2"].map(len) > 0
mergedClean = mergedClean[nonEmpty1 & nonEmpty2]
len(mergedClean)

4950

In [84]:
# remove the temporary concatenated columns once the emptiness filter has used them;
# errors="ignore" keeps this cell re-runnable after the columns are already gone
mergedClean = mergedClean.drop(columns=["titleText1", "titleText2"], errors="ignore")

In [114]:
# rows of merged whose pair_id no longer appears in mergedClean after all cleaning steps
# (original note: looks like 14 were removed in cleaning)
pairIdsBefore = set(merged["pair_id"])
pairIdsAfter = set(mergedClean["pair_id"])
removed = list(pairIdsBefore - pairIdsAfter)
removedDf = merged.loc[merged["pair_id"].isin(removed)]

In [117]:
# From the removed pairs, collect the ids of the articles whose text is missing (NaN),
# separately for the first and the second article of each pair.
shortRemoved = removedDf[["pair_id", "id1", "id2", "title1", "text1", "title2", "text2"]]

removed1 = shortRemoved[["id1", "title1", "text1"]]
missingText1 = removed1["text1"].isna()
ids1 = list(removed1.loc[missingText1, "id1"])

# (a stray bare `removed2` expression used to sit here; mid-cell it displayed nothing)
removed2 = shortRemoved[["id2", "title2", "text2"]]
missingText2 = removed2["text2"].isna()
ids2 = list(removed2.loc[missingText2, "id2"])

In [119]:
# write the cleaned pairs for all languages to a tab-separated file
allTrainPath = "/shared/3/projects/newsDiffusion/data/processed/allTrainData.tsv"
mergedClean.to_csv(allTrainPath, sep="\t")

In [123]:
# NOTE(review): hard-coded article id; presumably the one article whose text is present but empty - confirm
noTextId = "1514227906"

In [125]:
# every article id flagged above: missing text on either side of a pair, plus noTextId
allProblemIds = [*ids1, *ids2, noTextId]

In [64]:
# subset of the cleaned pairs in which both articles are English
# (original note: we don't lose any en-en pairs during cleaning!)
enMask = mergedClean["url1_lang"].eq("en") & mergedClean["url2_lang"].eq("en")
mergedCleanEn = mergedClean[enMask]
len(mergedCleanEn)

In [66]:
# write the English-only pairs to a tab-separated file
enTrainPath = "/shared/3/projects/newsDiffusion/data/processed/enTrainData.tsv"
mergedCleanEn.to_csv(enTrainPath, sep="\t")