In [1]:
"""
This file...
1. finds the cosine similarity between articles with at least 1 named entity 
2. filters down to only article pairs closer than 1 week 
3. creates a graph of articles with cosine sim. weighted edges
4. outputs connected components of that graph after considering only weighted edges above a certain 
similarity threshold
"""

'\nThis file...\n1. finds the cosine similarity between articles with at least 1 named entity \n2. filters down to only article pairs closer than 1 week \n3. creates a graph of articles with cosine sim. weighted edges\n4. outputs connected components of that graph after considering only weighted edges above a certain \nsimilarity threshold\n'

In [2]:
from numpy import dot
import math
import pandas as pd
import numpy as np
from ast import literal_eval
import re
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from scipy.spatial import distance 
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine
import seaborn as sns
from tqdm import tqdm 
import networkx as nx
import pickle



In [3]:
# NOTE: very important -- the entity categories we keep.
# The article linked below shows all of the entity types:
# https://www.kaggle.com/code/curiousprogrammer/entity-extraction-and-classification-using-spacy
TO_KEEP = [
    "org",
    "event",
    "person",
    "work_of_art",
    "product",
]

# A named entity is kept only if the number of articles associated with it
# falls in this range.
CLUSTER_CUTOFF = [2, 20000]

# Lowest cosine similarity threshold we will use.
SIM_THRESH = 0.8

# File locations.
INVERTED_ENT_PATH = "/shared/3/projects/newsDiffusion/data/interim/NEREmbedding/invertedEntityIndex.pkl"
EMBEDS_PATH = "/shared/3/projects/newsDiffusion/data/processed/articleEmbeddings/embeddings.pkl"
CLEANED_DF_PATH = "/shared/3/projects/newsDiffusion/data/processed/newsData/fullDataWithNERCleaned.tsv"

# How many days apart two articles may be if we are to consider adding an
# edge between them based on cosine similarity.
# 7 allows for Sunday - Sunday but NOT Sunday - Monday of the following week.
DATE_FILTER = 7

## Load data

In [9]:
#load the df holding our inverted index, order it so the entities with the
#most associated articles come first, then drop every entity whose article
#count is below the lower cutoff (i.e. entities with only one article)
invertedDf = (
    pd.read_pickle(INVERTED_ENT_PATH)
    .sort_values("numArticles", ascending=False)
    .loc[lambda frame: frame["numArticles"] >= CLUSTER_CUTOFF[0]]
)

In [14]:
#row count of the filtered index: all entities with at least two associated articles
invertedDf.shape[0]

2614309

In [15]:
#number of entities which get cut off: those with more associated articles
#than the upper bound in CLUSTER_CUTOFF (read from config, not hard-coded,
#so this count follows the setting)
len(invertedDf[invertedDf["numArticles"] > CLUSTER_CUTOFF[1]])

133

In [16]:
#number remaining: total entities minus those above the upper cutoff.
#computed from invertedDf and CLUSTER_CUTOFF so the value stays correct
#when the data or the cutoffs change
len(invertedDf) - int((invertedDf["numArticles"] > CLUSTER_CUTOFF[1]).sum())

2614176

In [24]:
#restrict the index to the entity, its type, and its article count
invertedDf = invertedDf.loc[:, ["entity", "ent_type", "numArticles"]]

In [25]:
#entities above the upper article-count cutoff; the bound is read from
#CLUSTER_CUTOFF so it cannot drift from the config cell
removedEnts = invertedDf[invertedDf["numArticles"] > CLUSTER_CUTOFF[1]]

In [29]:
#write the removed-entities table to storage (tab-separated) for github
removedEntsPath = "/shared/3/projects/newsDiffusion/data/processed/pubData/removedEntities.tsv"
removedEnts.to_csv(removedEntsPath, sep="\t")

In [30]:
#easy way to look at a list of the entities above the upper cutoff.
#the bound comes from CLUSTER_CUTOFF, and only the first 10 rows are
#converted to a list instead of the whole column
aboveCutoff = invertedDf["numArticles"] > CLUSTER_CUTOFF[1]
invertedDf.loc[aboveCutoff, "entity"].head(10).tolist()

['trump',
 'biden',
 'donald trump',
 'joe biden',
 'trump',
 'congress',
 'senate',
 'house',
 'twitter',
 'facebook']