In [1]:
import transformers
import pandas as pd
import json
import os

## Resources for SBERT
- https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/sts/training_stsbenchmark.py
- https://www.sbert.net/examples/training/sts/README.html#loss-function
- https://github.com/cerlymarco/MEDIUM_NoteBook/blob/master/Siamese_Dual_BERT/Siamese_Dual_BERT.ipynb
    

## Start by getting the data into the right format for training

In [2]:
# Root directory holding one sub-folder per batch; each sub-folder contains
# per-article JSON files named "<article id>.json".
outerFolder = "/shared/3/projects/benlitterer/agendaSetting/trainingHtml"

# Build one single-row frame per article JSON file; they are concatenated later.
dfList = []
# Sort directory listings so the row order is reproducible across runs
# (os.listdir returns entries in arbitrary order).
for folderNum in sorted(os.listdir(outerFolder)):
    innerFolder = os.path.join(outerFolder, folderNum)
    # Skip stray files at the top level; os.listdir would raise on them.
    if not os.path.isdir(innerFolder):
        continue
    for innerFileName in sorted(os.listdir(innerFolder)):
        # Match on the file extension rather than a "json" substring anywhere
        # in the full path, which could also match folder or unrelated names.
        if not innerFileName.endswith(".json"):
            continue
        innerPathName = os.path.join(innerFolder, innerFileName)
        # Read as UTF-8 explicitly so decoding does not depend on the locale.
        with open(innerPathName, "r", encoding="utf-8") as f:
            data = json.load(f)
        df = pd.json_normalize(data)
        # The article id is the part of the file name before the first dot.
        df["id"] = innerFileName.split(".")[0]
        dfList.append(df)
        

In [None]:
# Stack the per-article frames into one table with a fresh 0..n-1 index.
fullDf = pd.concat(dfList, ignore_index=True)

In [None]:
# Keep only the article id, a few metadata fields, and the body text.
keptColumns = ["id", "title", "url", "source_url", "text"]
leanDf = fullDf[keptColumns]

In [None]:
# Load the ground-truth table of article pairs and preview it.
groundTruthPath = "/shared/3/projects/benlitterer/agendaSetting/trainingDataV2.csv"
groundTruths = pd.read_csv(groundTruthPath)
print(groundTruths.shape)
groundTruths.head(n=3)

In [13]:
# Shape of the subset of pairs in which both articles are tagged English.
bothEnglish = groundTruths["url1_lang"].eq("en") & groundTruths["url2_lang"].eq("en")
groundTruths.loc[bothEnglish].shape

(1800, 14)

In [15]:
# pair_id has the form "<id1>_<id2>"; split it into the two article ids
# (kept as strings) so each side can be joined to its article text.
pairIdParts = groundTruths["pair_id"].str.split("_", expand=True)
groundTruths[["id1", "id2"]] = pairIdParts

AttributeError: 'PandasArray' object has no attribute '_str_split'

In [30]:
# Only the id and the body text are needed for the joins that follow.
leanDf = leanDf.loc[:, ["id", "text"]]

In [31]:
# Attach the body text of the first article of each pair, matched on id1.
# A left join keeps every ground-truth row; pairs without text get NaN.
leanDf1 = leanDf.rename({"id": "id1", "text": "text1"}, axis="columns")
firstMerge = groundTruths.merge(leanDf1, on="id1", how="left")
firstMerge.head(n=3)

Unnamed: 0,url1_lang,url2_lang,pair_id,link1,link2,ia_link1,ia_link2,Geography,Entities,Time,Narrative,Overall,Style,Tone,id1,id2,text1
0,en,en,1484084337_1484110209,https://www.washingtonpost.com/local/virginia-...,https://www.washingtonpost.com/world/the_ameri...,https://web.archive.org/web/www.washingtonpost...,https://web.archive.org/web/www.washingtonpost...,4.0,4.0,1.0,4.0,4.0,1.666667,2.0,1484084337,1484110209,"MARTINSBURG, W.Va. — A suspected drunken drive..."
1,en,en,1484396422_1483924666,https://www.stlucianewsonline.com/guyana-three...,https://www.thestar.com/news/world/europe/2020...,https://web.archive.org/web/www.stlucianewsonl...,https://web.archive.org/web/www.thestar.com/ne...,4.0,4.0,1.0,4.0,3.666667,1.666667,1.333333,1484396422,1483924666,Share This On:\n\nPin 11 Shares\n\n(NEWS ROOM ...
2,en,en,1484698254_1483758694,https://www.teaparty.org/trump-brings-in-2020-...,https://www.timesofisrael.com/trump-says-he-do...,https://web.archive.org/web/www.teaparty.org/t...,https://web.archive.org/web/www.timesofisrael....,1.0,2.0,1.0,2.333333,2.333333,1.0,1.333333,1484698254,1483758694,(Breitbart) – President Donald Trump welcomed ...


In [32]:
# Attach the body text of the second article of each pair, matched on id2.
# A left join keeps every row of firstMerge; pairs without text get NaN.
leanDf2 = leanDf.rename({"id": "id2", "text": "text2"}, axis="columns")
merged = firstMerge.merge(leanDf2, on="id2", how="left")
merged.head(n=3)

Unnamed: 0,url1_lang,url2_lang,pair_id,link1,link2,ia_link1,ia_link2,Geography,Entities,Time,Narrative,Overall,Style,Tone,id1,id2,text1,text2
0,en,en,1484084337_1484110209,https://www.washingtonpost.com/local/virginia-...,https://www.washingtonpost.com/world/the_ameri...,https://web.archive.org/web/www.washingtonpost...,https://web.archive.org/web/www.washingtonpost...,4.0,4.0,1.0,4.0,4.0,1.666667,2.0,1484084337,1484110209,"MARTINSBURG, W.Va. — A suspected drunken drive...","PORT-AU-PRINCE, Haiti — Haitian President Jove..."
1,en,en,1484396422_1483924666,https://www.stlucianewsonline.com/guyana-three...,https://www.thestar.com/news/world/europe/2020...,https://web.archive.org/web/www.stlucianewsonl...,https://web.archive.org/web/www.thestar.com/ne...,4.0,4.0,1.0,4.0,3.666667,1.666667,1.333333,1484396422,1483924666,Share This On:\n\nPin 11 Shares\n\n(NEWS ROOM ...,BERLIN - A fire at a zoo in western Germany in...
2,en,en,1484698254_1483758694,https://www.teaparty.org/trump-brings-in-2020-...,https://www.timesofisrael.com/trump-says-he-do...,https://web.archive.org/web/www.teaparty.org/t...,https://web.archive.org/web/www.timesofisrael....,1.0,2.0,1.0,2.333333,2.333333,1.0,1.333333,1484698254,1483758694,(Breitbart) – President Donald Trump welcomed ...,"PALM BEACH, United States — US President Donal..."


In [33]:
# Keep only the pairs for which both article texts were found.
hasBothTexts = merged["text1"].notna() & merged["text2"].notna()
mergedFull = merged[hasBothTexts]

In [34]:
# (rows, columns) of the pair table after dropping rows with missing text.
mergedFull.shape

(4032, 18)

In [35]:
# Preview the pairs in which both articles are tagged English.
# The result is only displayed here; it is not stored.
bothEnglishMerged = mergedFull["url1_lang"].eq("en") & mergedFull["url2_lang"].eq("en")
mergedFull.loc[bothEnglishMerged]

Unnamed: 0,url1_lang,url2_lang,pair_id,link1,link2,ia_link1,ia_link2,Geography,Entities,Time,Narrative,Overall,Style,Tone,id1,id2,text1,text2
0,en,en,1484084337_1484110209,https://www.washingtonpost.com/local/virginia-...,https://www.washingtonpost.com/world/the_ameri...,https://web.archive.org/web/www.washingtonpost...,https://web.archive.org/web/www.washingtonpost...,4.0,4.000000,1.000000,4.000000,4.000000,1.666667,2.000000,1484084337,1484110209,"MARTINSBURG, W.Va. — A suspected drunken drive...","PORT-AU-PRINCE, Haiti — Haitian President Jove..."
1,en,en,1484396422_1483924666,https://www.stlucianewsonline.com/guyana-three...,https://www.thestar.com/news/world/europe/2020...,https://web.archive.org/web/www.stlucianewsonl...,https://web.archive.org/web/www.thestar.com/ne...,4.0,4.000000,1.000000,4.000000,3.666667,1.666667,1.333333,1484396422,1483924666,Share This On:\n\nPin 11 Shares\n\n(NEWS ROOM ...,BERLIN - A fire at a zoo in western Germany in...
2,en,en,1484698254_1483758694,https://www.teaparty.org/trump-brings-in-2020-...,https://www.timesofisrael.com/trump-says-he-do...,https://web.archive.org/web/www.teaparty.org/t...,https://web.archive.org/web/www.timesofisrael....,1.0,2.000000,1.000000,2.333333,2.333333,1.000000,1.333333,1484698254,1483758694,(Breitbart) – President Donald Trump welcomed ...,"PALM BEACH, United States — US President Donal..."
3,en,en,1576314516_1576455088,https://gadgets.ndtv.com/apps/news/zomato-uber...,https://gadgets.ndtv.com/internet/news/indian-...,https://web.archive.org/web/gadgets.ndtv.com/a...,https://web.archive.org/web/gadgets.ndtv.com/i...,1.0,2.333333,2.666667,1.666667,2.000000,1.666667,1.666667,1576314516,1576455088,Uber has sold its online food-ordering busines...,Rapid digitisation and growth in both online b...
4,en,en,1484036253_1483894099,https://news.yahoo.com/india-approves-third-mo...,https://www.channelnewsasia.com/news/asia/indi...,https://web.archive.org/web/news.yahoo.com/ind...,https://web.archive.org/web/www.channelnewsasi...,1.0,1.250000,1.000000,1.250000,1.250000,1.000000,1.000000,1484036253,1483894099,BENGALURU (Reuters) - India has approved its t...,BANGALORE: India plans to make a fresh attempt...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3444,en,en,1602628318_1522567203,https://communities.vmware.com/message/2948357...,https://communities.vmware.com/message/2927239...,https://web.archive.org/web/communities.vmware...,https://web.archive.org/web/communities.vmware...,4.0,3.000000,1.000000,2.000000,3.000000,1.000000,1.000000,1602628318,1522567203,"Hello,\n\nWe are preparing to move our environ...",I'm running vCenter 6.7 w/ three host also run...
3445,en,en,1558724916_1578120789,https://www.cnbctv18.com/healthcare/you-can-no...,https://www.business-standard.com/article/opin...,https://web.archive.org/web/www.cnbctv18.com/h...,https://web.archive.org/web/www.business-stand...,1.0,4.000000,3.000000,4.000000,4.000000,2.000000,1.000000,1558724916,1578120789,You can now consult with your doctors through ...,India is doing a commendable job in fighting t...
3446,en,en,1516170185_1515943366,http://www.looptonga.com/global-news/thai-sold...,https://www.itv.com/news/2020-02-09/thai-soldi...,https://web.archive.org/web/www.looptonga.com/...,https://web.archive.org/web/www.itv.com/news/2...,1.0,2.000000,1.000000,2.000000,1.000000,3.000000,1.000000,1516170185,1515943366,Jakraphanth Thomma on Saturday killed his comm...,Medics carry a stretcher towards a Thai shoppi...
3447,en,en,1511216591_1511221676,https://www.sowetanlive.co.za/news/south-afric...,https://www.iol.co.za/news/south-africa/gauten...,https://web.archive.org/web/www.sowetanlive.co...,https://web.archive.org/web/www.iol.co.za/news...,1.0,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1511216591,1511221676,The bodies of the victims were found lying in ...,Johannesburg - Gauteng police on Saturday said...


In [36]:
# The output is named "enTrainData", so write only the pairs in which both
# articles are tagged English. Previously the unfiltered mergedFull was
# written, which would also include any non-English pairs.
enTrainDf = mergedFull[(mergedFull["url1_lang"] == "en") & (mergedFull["url2_lang"] == "en")]
# NOTE: the file is tab-separated despite its .csv extension, and the frame's
# index is written as the first column; read it back with sep="\t".
enTrainDf.to_csv("/shared/3/projects/benlitterer/localNews/NetworkMVP/enTrainData.csv", sep="\t")