In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval
import re
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from scipy.spatial import distance 
import seaborn as sns

In [28]:
#load the articles together with their content, embeddings and named entities
#(only the first 5000 rows are read here)
#old path: "/shared/3/projects/benlitterer/localNews/NetworkMVP/dataWithEmbeddings.tsv"
df = pd.read_csv(
    "/shared/3/projects/newsDiffusion/data/interim/NEREmbedding/first500000withEmbeddings.tsv",
    sep="\t",
    nrows=5000,
    #the embedding column is stored as "[v1, v2, ...]" text; turn it back into a float array
    converters={
        "embedding": lambda x: np.array(x.strip("[]").split(", "), dtype=float)
    },
#keep only the data unique to this step of the pipeline
).drop(columns=["Unnamed: 0", "date", "national", "headTail"])

In [29]:
#original merged news data with named entities (first 1000 rows only)
ogDf = pd.read_csv(
    "/shared/3/projects/benlitterer/localNews/mergedNewsData/mergedNER.tsv",
    sep="\t",
    nrows=1000,
)

In [30]:
#the article text is not needed from this frame
#errors="ignore" keeps the cell re-runnable: once "content" is gone a second run is a no-op
#instead of a KeyError
ogDf = ogDf.drop(columns=["content"], errors="ignore")

In [36]:
#join the original data onto the data that has embeddings attached (inner join on article_id)
#comparing row counts before/after shows how many articles were lost while getting embeddings

#NOTE: the only articles we expect to lose are those that had
#nothing left for the head + tail
merged = ogDf.merge(df, how="inner", on="article_id")

In [38]:
#from here on df refers to the merged frame (the same object as merged, not a copy)
df = merged

In [39]:
print(df.shape)
#remove surrounding whitespace from the ids; .str.strip() is vectorized and
#leaves missing ids as NaN instead of raising like a per-row x.strip() would
df["article_id"] = df["article_id"].str.strip()
#if any article_id were duplicated, this second shape would have fewer rows than the first
print(df.drop_duplicates(subset=["article_id"]).shape)

(1000, 46)
(1000, 46)


In [40]:
#parse the date column once, then report the latest and earliest date
#Series.max()/min() skip missing dates, whereas the builtin max()/min() compare
#element by element and give order-dependent results when NaT is present
dates = pd.to_datetime(df["date"])
print(dates.max())
print(dates.min())
print(df.shape)

2020-04-04 00:00:00
2020-04-01 00:00:00
(1000, 46)


In [41]:
#.copy() makes testDf an independent frame, so the column assignments further down
#modify it directly instead of a slice of df (which raised SettingWithCopyWarning)
testDf = df[["key", "topics", "embedding"]].copy()

print("parsing")

def cleanList(inList): 
    """Return a new list with every string in ``inList`` normalized.

    Each item has all characters other than ASCII letters, digits and
    spaces removed and is then lower-cased. The order and length of the
    list are preserved.
    """
    cleaned = []
    for item in inList:
        lettersOnly = re.sub("[^a-zA-Z0-9 ]", "", item)
        cleaned.append(str(lettersOnly.lower()))
    return cleaned

def parseList(inStr): 
    """Parse the text form of the topics column into a list of cleaned lists.

    The string is split on ``'), ('`` to separate the entries and each entry
    is split on ``', '`` into its parts; every part is normalized with
    ``cleanList``. Presumably each entry is an (ent_type, entity) pair --
    verify against the data that produces the column.

    Values that are not strings (e.g. NaN/None for articles without topics)
    are returned unchanged, so a later ``dropna`` can discard them instead of
    this function raising AttributeError on ``.split``.
    """
    if not isinstance(inStr, str):
        return inStr
    split = inStr.split("\'), (\'")
    return [cleanList(item.split("', '")) for item in split]

#drop articles without topics before parsing so parseList only ever sees strings
#.copy() gives an independent frame so the assignments below don't write to a slice
testDf = testDf.dropna(subset=["topics"]).copy()

#parse topics from string to an actual list of [ent_type, entity] lists
testDf["topics"] = testDf["topics"].apply(parseList)

print("parsed")

#bring each [ent_type, entity] entry into its own row
testDf = testDf.explode("topics")

#bring each entry of the pair into its own column
#split ent_type, entity pairs to columns 
testDf[["ent_type","entity"]] = pd.DataFrame(testDf["topics"].tolist(), index=testDf.index)

print("formatted") 

#keep only the entity types that may be interesting 
toKeep = ["org","event", "person", "work_of_art", "product"]
testDf = testDf[testDf["ent_type"].isin(toKeep)]

#for every (ent_type, entity) collect the embeddings and keys of the articles mentioning it
grouped = testDf[["embedding", "ent_type", "entity", "key"]].groupby(by=["ent_type", "entity"]).agg(list)

parsing
parsed
formatted


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testDf["topics"] = testDf["topics"].apply(parseList)


In [42]:
#inspect the grouping: one row per (ent_type, entity) with its lists of embeddings and keys
grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,embedding,key
ent_type,entity,Unnamed: 2_level_1,Unnamed: 3_level_1
event,a memorial day,"[[0.004712148569524288, 0.14361728727817535, 0...",[waterburyrepublicanamerican_My_Ride:_Classic_...
event,a world series,"[[-0.07535392791032791, 0.05455530807375908, 0...",[chicagosuntimes_A_novel_approach_to_Cubs_hist...
event,allstar classic,"[[-0.16040632128715515, -0.02758200466632843, ...",[troymessenger_Galloway_finishes_career_at_Gos...
event,allstar game,"[[-0.03063148818910122, 0.0081802261993289, 0....",[dailyherald_Go_Figure:_Ed_Farmer's_numbers_te...
event,an olympic games,"[[-0.11521651595830917, 0.17485880851745605, -...",[dailybreeze_Coronavirus_delays_UCLA_pole_vaul...
...,...,...,...
product,webex,"[[0.015168684534728527, -0.015776410698890686,...","[fortmorgantimes_Recruiting,_advising_and_more..."
product,wheelbase,"[[-0.07119020819664001, -0.08963277190923691, ...",[waterburyrepublicanamerican_Auto_review:_Don’...
product,x9cworldwide,"[[-0.0026862930972129107, 0.0596192441880703, ...","[dailyherald_Gov._Cuomo:_China_sending_1,000_v..."
product,xtramath,"[[-0.013016817159950733, -0.10538842529058456,...",[fortbraggadvocatenews_New_remote_teaching_is_...


In [43]:
#articleNum = length of the key list collected for each (ent_type, entity) group
grouped = grouped.assign(articleNum=grouped["key"].apply(len))
grouped.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,embedding,key,articleNum
ent_type,entity,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
event,a memorial day,"[[0.004712148569524288, 0.14361728727817535, 0...",[waterburyrepublicanamerican_My_Ride:_Classic_...,1
event,a world series,"[[-0.07535392791032791, 0.05455530807375908, 0...",[chicagosuntimes_A_novel_approach_to_Cubs_hist...,1
event,allstar classic,"[[-0.16040632128715515, -0.02758200466632843, ...",[troymessenger_Galloway_finishes_career_at_Gos...,1
event,allstar game,"[[-0.03063148818910122, 0.0081802261993289, 0....",[dailyherald_Go_Figure:_Ed_Farmer's_numbers_te...,2
event,an olympic games,"[[-0.11521651595830917, 0.17485880851745605, -...",[dailybreeze_Coronavirus_delays_UCLA_pole_vaul...,3


In [45]:
#NOTE: arbitrary cutoff, we keep named entities that have more than 5 AND fewer than 1000 articles
groupedLean = grouped.loc[
    (grouped["articleNum"] > 5) & (grouped["articleNum"] < 1000)
]
groupedLean.shape

(434, 3)

In [46]:
#inspect the entities that passed the article-count cutoff
groupedLean

Unnamed: 0_level_0,Unnamed: 1_level_0,embedding,key,articleNum
ent_type,entity,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
event,division ii,"[[0.04572198912501335, -0.06260434538125992, 0...","[contracostatimes_Kobe_Bryant,_Tim_Duncan_head...",8
event,olympics,"[[-0.11521651595830917, 0.17485880851745605, -...",[dailybreeze_Coronavirus_delays_UCLA_pole_vaul...,16
event,wimbledon,"[[-0.0977105125784874, 0.10482001304626465, -0...",[westhawaiitoday_Virus_forces_Wimbledon_cancel...,7
org,3m,"[[-0.07977821677923203, -0.11823803931474686, ...",[campbellreporter_Simple_DIY_face_mask-making_...,8
org,adams,"[[-0.08972325176000595, -0.043380528688430786,...",[dailyreviewatlas_Coronavirus_live_updates:_Cl...,12
...,...,...,...,...
product,quest,"[[-0.07424215227365494, 0.05049649998545647, -...",[campbellreporter_Coronavirus:_How_many_Califo...,6
product,sharks,"[[0.026582151651382446, 0.09881743788719177, 0...",[dailydemocrat_The_next_Johnny_Hockey?_Maybe_n...,10
product,the uss theodore roosevelt,"[[0.0010143928229808807, 0.07253465056419373, ...","[dailyreviewatlas_Navy_Capt._Crozier,_fired_fo...",6
product,twitter,"[[0.08284326642751694, 0.14152759313583374, 0....",[juneauempire_Juneauites_express_concerns_for_...,24


In [47]:
#now, within named entity categories we want to get a matrix of the embeddings we want to compare 

def getPairwise(inList): 
    """Return every ordered pair of items from ``inList`` as a list of tuples.

    For n items the result has n*n tuples: it includes each item paired with
    itself as well as both (a, b) and (b, a). The second tuple element varies
    slowest, the first fastest.
    """
    pairs = []
    for second in inList:
        for first in inList:
            pairs.append((first, second))
    return pairs


#work with a small df first 
#.copy() so the column assignments below modify an independent frame rather than
#a slice of groupedLean (avoids SettingWithCopyWarning and leaves groupedLean untouched)
testDf = groupedLean[["key", "embedding"]].head(2000).copy()

#expand each entity's lists into all article pairs; both columns go through
#getPairwise, so the i-th key pair and the i-th embedding pair describe the same articles
testDf["embedding"] = testDf["embedding"].apply(getPairwise)
testDf["key"] = testDf["key"].apply(getPairwise)

#one row per pair; a key pair that shows up under several entities is kept only once
exploded = testDf.apply(pd.Series.explode).drop_duplicates(subset=["key"])
testDf.head(3)


Unnamed: 0_level_0,Unnamed: 1_level_0,key,embedding
ent_type,entity,Unnamed: 2_level_1,Unnamed: 3_level_1
event,division ii,"[(contracostatimes_Kobe_Bryant,_Tim_Duncan_hea...","[([0.04572198912501335, -0.06260434538125992, ..."
event,olympics,[(dailybreeze_Coronavirus_delays_UCLA_pole_vau...,"[([-0.11521651595830917, 0.17485880851745605, ..."
event,wimbledon,[(westhawaiitoday_Virus_forces_Wimbledon_cance...,"[([-0.0977105125784874, 0.10482001304626465, -..."


In [49]:
#exploded now holds one row per article pair, i.e. the pairwise embedding comparisons to make
#ALSO the index tells us which entity each pair is associated with, which is good information to have
#might need this for the paper 
exploded

Unnamed: 0_level_0,Unnamed: 1_level_0,key,embedding
ent_type,entity,Unnamed: 2_level_1,Unnamed: 3_level_1
event,division ii,"(contracostatimes_Kobe_Bryant,_Tim_Duncan_head...","([0.04572198912501335, -0.06260434538125992, 0..."
event,division ii,"(dailydemocrat_Kobe_Bryant,_Tim_Duncan_head_20...","([0.061869993805885315, -0.06245817244052887, ..."
event,division ii,"(redbluffdailynews_Kobe_Bryant,_Tim_Duncan_hea...","([0.05388658121228218, -0.08418846130371094, 0..."
event,division ii,"(theenterpriserecord_Kobe_Bryant,_Tim_Duncan_h...","([0.05186019837856293, -0.08002828806638718, 0..."
event,division ii,"(thedenverpost_Kobe_Bryant,_Tim_Duncan,_Kevin_...","([0.07538960874080658, -0.012295975349843502, ..."
...,...,...,...
product,twitter,(westhawaiitoday_Column:_Remembering_the_Final...,"([-0.09581921249628067, 0.04146881774067879, 0..."
product,twitter,(chicagosuntimes_Polling_Place:_Which_World_Se...,"([-0.11595650762319565, -0.05029453709721565, ..."
product,twitter,(dailyreviewatlas_Leaked_memo:_Amazon_strategy...,"([-0.11139538139104843, -0.015247128903865814,..."
product,twitter,(journalstar_OSF_preparation_for_COVID_influx_...,"([-0.06296169012784958, 0.03564852103590965, -..."


In [48]:
#(number of article pairs kept after drop_duplicates, number of columns)
exploded.shape

(31372, 2)

In [104]:
from tqdm.notebook import tqdm 
from multiprocessing import Pool 

def getCos(inList): 
    """Return the cosine similarity of the first two vectors in ``inList``.

    ``inList`` is indexed at positions 0 and 1; the similarity is computed as
    one minus scipy's cosine distance between those two vectors.
    """
    first = inList[0]
    second = inList[1]
    cosineDistance = distance.cosine(first, second)
    return 1 - cosineDistance

def getCosSeries(inSeries): 
    """Apply ``getCos`` to every element of ``inSeries`` and return the result.

    Used as the per-chunk worker function for the multiprocessing pool.
    """
    similarities = inSeries.apply(getCos)
    return similarities

#single-process alternative with a progress bar:
#tqdm.pandas()
#exploded.head(1000000)["embedding"].progress_map(getCos)

#testDf is the same object as exploded, so the similarity column is added to both names
testDf = exploded
testEmbeddings = testDf["embedding"]

#cut the embedding pairs into 10 chunks and score them with a pool of 12 worker processes
splitList = np.array_split(testEmbeddings, 10)
with Pool(12) as pool: 
    similarityArrs = list(tqdm(pool.imap(getCosSeries, splitList), total=10))

#imap returns the chunks in submission order, so concatenating restores the original row order
similarity = pd.concat(similarityArrs)

testDf["similarity"] = similarity

  0%|          | 0/10 [00:02<?, ?it/s]

In [112]:
#quick sanity check: look at a pair whose cosine similarity is at least 0.7
sanity = testDf[testDf["similarity"] >= .7]
#.iloc makes the positional lookup explicit: the index holds (ent_type, entity) labels,
#so a bare integer key only worked through the deprecated positional fallback of Series[...]
sanity["key"].iloc[203]

("dailyherald_'Pure_business'_at_Biden-Putin_summit:_No_hugs,_no_brickbats",
 'waterburyrepublicanamerican_‘Pure_business’_at_Biden-Putin_summit:_No_hugs,_no_brickbats')

# Version to run as script on server!

In [13]:
import pandas as pd
import numpy as np
import csv
import re
from scipy.spatial import distance 
from tqdm import tqdm 
from multiprocessing import Pool 

#Output path for the dataframe after filtering and getting cosine similarity of filtered pairs 
outputPath = "/home/blitt/projects/localNews/data/interim/similarityAnalysis/articlePairsCosineSim.tsv"

#read in data; the embedding column is stored as bracketed, whitespace-separated numbers
#and is converted back into a float array while reading
df = pd.read_csv("/shared/3/projects/benlitterer/localNews/NetworkMVP/dataWithEmbeddings.tsv", sep="\t", converters={"embedding":lambda x: np.array(x.strip("[]").split(), dtype=float)})

print("Input data shape: " + str(df.shape))

#keep only the columns this script uses; .copy() makes df an independent frame so the
#column assignments further down don't write to a slice (SettingWithCopyWarning)
df = df[["key", "topics", "embedding"]].copy()

print("parsing")

def cleanList(inList): 
    """Return a new list with every string in ``inList`` normalized.

    Each item has all characters other than ASCII letters, digits and
    spaces removed and is then lower-cased. The order and length of the
    list are preserved.
    """
    cleaned = []
    for item in inList:
        lettersOnly = re.sub("[^a-zA-Z0-9 ]", "", item)
        cleaned.append(str(lettersOnly.lower()))
    return cleaned

def parseList(inStr): 
    """Parse the text form of the topics column into a list of cleaned lists.

    The string is split on ``'), ('`` to separate the entries and each entry
    is split on ``', '`` into its parts; every part is normalized with
    ``cleanList``. Presumably each entry is an (ent_type, entity) pair --
    verify against the data that produces the column.

    Values that are not strings (e.g. NaN/None for articles without topics)
    are returned unchanged, so a later ``dropna`` can discard them instead of
    this function raising AttributeError on ``.split``.
    """
    if not isinstance(inStr, str):
        return inStr
    split = inStr.split("\'), (\'")
    return [cleanList(item.split("', '")) for item in split]

#drop articles without topics before parsing so parseList only ever sees strings
#.copy() gives an independent frame so the assignments below don't write to a slice
df = df.dropna(subset=["topics"]).copy()

#parse topics from string to an actual list of [ent_type, entity] lists
df["topics"] = df["topics"].apply(parseList)

print("parsed")

#bring each [ent_type, entity] entry into its own row
df = df.explode("topics")

#bring each entry of the pair into its own column
#split ent_type, entity pairs to columns 
df[["ent_type","entity"]] = pd.DataFrame(df["topics"].tolist(), index=df.index)

print("formatted") 

#keep only the entity types that may be interesting 
toKeep = ["org","event", "person", "work_of_art", "product"]
df = df[df["ent_type"].isin(toKeep)]

#the data grouped into entity clusters 
grouped = df[["embedding", "ent_type", "entity", "key"]].groupby(by=["ent_type", "entity"]).agg(list)

#get the cluster length 
grouped["articleNum"] = grouped["key"].apply(len)

#keep clusters with more than 5 and fewer than 1000 entries: very common named
#entities likely aren't that meaningful/specific, and tiny clusters are dropped too
groupedLean = grouped[(grouped["articleNum"] > 5) & (grouped["articleNum"] < 1000)]

print("Shape after grouping: " + str(groupedLean.shape))

def getPairwise(inList): 
    """Return every ordered pair of items from ``inList`` as a list of tuples.

    For n items the result has n*n tuples: it includes each item paired with
    itself as well as both (a, b) and (b, a). The second tuple element varies
    slowest, the first fastest.
    """
    pairs = []
    for second in inList:
        for first in inList:
            pairs.append((first, second))
    return pairs

#we only needed the articleNum column for filtering 
#.copy() so the column assignments below modify an independent frame rather than
#a slice of groupedLean (avoids SettingWithCopyWarning)
df = groupedLean[["key", "embedding"]].copy()

#get all of the pairs for a given list of articles and a given list of embeddings 
#both columns go through getPairwise, so the i-th key pair and the i-th embedding pair match
df["embedding"] = df["embedding"].apply(getPairwise)
df["key"] = df["key"].apply(getPairwise)

#"explode" the pairs into a separate row each (this is when we have a TON of data)
#a key pair that shows up under several entities is kept only once
exploded = df.apply(pd.Series.explode).drop_duplicates(subset=["key"])

print("Exploded size: " + str(exploded.shape))

def getCos(inList): 
    """Return the cosine similarity of the first two vectors in ``inList``.

    ``inList`` is indexed at positions 0 and 1; the similarity is computed as
    one minus scipy's cosine distance between those two vectors.
    """
    first = inList[0]
    second = inList[1]
    cosineDistance = distance.cosine(first, second)
    return 1 - cosineDistance

def getCosSeries(inSeries): 
    """Apply ``getCos`` to every element of ``inSeries`` and return the result.

    Used as the per-chunk worker function for the multiprocessing pool.
    """
    similarities = inSeries.apply(getCos)
    return similarities

embeddings = exploded["embedding"]

#cut the embedding pairs into 10 chunks and score them with a pool of 12 worker processes
splitList = np.array_split(embeddings, 10)
with Pool(12) as pool: 
    similarityArrs = list(tqdm(pool.imap(getCosSeries, splitList), total=10))

#imap returns the chunks in submission order, so concatenating restores the original row order
similarity = pd.concat(similarityArrs)

exploded["similarity"] = similarity

print("writing")
#NOTE(review): the export below is commented out, so nothing is written to outputPath
#even though "written" is printed -- re-enable it before running this on the server
#exploded[["key", "similarity"]].to_csv(outputPath, sep="\t", quoting=csv.QUOTE_NONNUMERIC)
print("written")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Input data shape: (308837, 48)
parsing
parsed
formatted
Shape after grouping: (921, 3)
Exploded size: (55110, 2)


  0%|          | 0/10 [00:00<?, ?it/s]

KeyboardInterrupt: 