In [None]:
"""
Get an inverted index where the entities are keys and each key is associated with a list of news articles 
"""

In [3]:
import pandas as pd
import numpy as np
from ast import literal_eval
import re
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from scipy.spatial import distance 
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
from tqdm import tqdm 
import networkx as nx

In [4]:
#NOTE: very important, which entity categories to keep 
#article showing all entity types below
# https://www.kaggle.com/code/curiousprogrammer/entity-extraction-and-classification-using-spacy
TO_KEEP = ["org","event", "person", "work_of_art", "product"]

#for testing 
#NROWS = 20000

OUT_PATH = "/shared/3/projects/newsDiffusion/data/interim/NEREmbedding/embeddingClusterList_NewPiplineTest.tsv"
INVERTED_OUT_PATH = "/shared/3/projects/newsDiffusion/data/interim/NEREmbedding/invertedEntityIndex.pkl"
CLEANED_DF_PATH = ""

In [17]:
#load in main data source 
#we don't want to use "content", because it takes up a lot of space and
#we have already embedded the content. Can always merge back in later so long as we 
#keep the "key" column
LOAD_COLS = list(pd.read_csv("/shared/3/projects/newsDiffusion/data/processed/newsData/fullDataWithNERCleaned.tsv", \
                     nrows = 1, sep="\t").columns)
LOAD_COLS.remove("content")

In [18]:
#load in main data source 
print("loading news data")
df = pd.read_csv("/shared/3/projects/newsDiffusion/data/processed/newsData/fullDataWithNERCleaned.tsv",\
                 sep="\t", usecols = LOAD_COLS)

df["date"] = pd.to_datetime(df["date"], format="mixed")

df["year"] = df["date"].dt.year

loading news data


  df = pd.read_csv("/shared/3/projects/newsDiffusion/data/processed/newsData/fullDataWithNERCleaned.tsv",\


'\nTODO: rewrite this so that we don\'t do any merging. We just use an embedding dictionary loaded from a pickled object  \n\n#load in Embeddings, which haven\'t been merged yet\n#we merge them in this step because they are very large and don\'t\n#want to write them to disk again if we can help it\nprint("loading embeddings")\nembeddingsDf = pd.read_csv("/shared/3/projects/newsDiffusion/data/interim/NEREmbedding/embeddingsKeys.tsv", sep="\t", names=["key", "embedding"], converters={"embedding":lambda x: np.array(x.strip("[]").split(","), dtype=float)})\n\nprint("merging embeddings")\ndf = pd.merge(df, embeddingsDf, how="inner", on="key")\ndf.dropna(subset=["key", "embedding"])\nprint(str(len(df)) + " rows after merging, dropping na keys, embeddings")\n\n'

In [19]:
firstLocal = min(df.loc[(df["national"] == False), "date"])

In [20]:
#filter to only the overlapping sections 
#should give us everything from April 1, 2020 - December 31, 2021 
df = df[df["date"] >= firstLocal]

#filter so we only use the part of 2020 where we have overlap 
#df = df[df["year"] == 2020]

#get length of new rows 
print(str(len(df)) + " rows in overlapping period")

4228898 rows in overlapping period


In [21]:
print("date range: ")
print(max(pd.to_datetime(df["date"])))
print(min(pd.to_datetime(df["date"])))

date range: 
2021-12-31 00:00:00
2020-04-01 00:00:00


In [22]:
#NOTE: used to have embeddings here, but don't need that anymore with current method 
leanDf = df[["key", "NamedEntities"]]

print("parsing")

def cleanList(inList): 
    return [str(re.sub("[^a-zA-Z0-9 ]", "", item).lower()) for item in inList]

def parseList(inStr): 
    split = inStr.split("\'), (\'")
    return [cleanList(item.split("', '")) for item in split]

#parse topics from string to actual list of tuples 
leanDf["NamedEntities"] = leanDf["NamedEntities"].apply(parseList)

print("parsed")

#test out idea for creating reverse mapping 
#how many na vals do we have in "NamedEntities"? 
print(str(sum(leanDf["NamedEntities"].isna())) + " NA values in Named Entities column")
print("Filling with '' instead")
leanDf["NamedEntities"] = leanDf["NamedEntities"].fillna("")


parsing


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  leanDf["NamedEntities"] = leanDf["NamedEntities"].apply(parseList)


parsed
0 NA values in Named Entities column
Filling with '' instead


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  leanDf["NamedEntities"] = leanDf["NamedEntities"].fillna("")


Note: we see below that we have things like "date: week" as named entities. This must be addressed somewhere 

In [23]:
#bring each tuple into its own row 
print("exploding #1")
invertedDf = leanDf.explode("NamedEntities")

#bring each tuple entry into its own column 
#split ent_type, entity pairs to columns 
print("splitting entity, type")
invertedDf[["ent_type","entity"]] = pd.DataFrame(invertedDf["NamedEntities"].tolist(), index=invertedDf.index)

#remove occurences where we double count an entity for the same article 
invertedDf = invertedDf.drop_duplicates(subset=["key", "ent_type", "entity"])

print("filtering by entity type, grouping")
#keep only the entity types that may be interesting 
invertedDf = invertedDf[invertedDf["ent_type"].isin(TO_KEEP)]

exploding #1
splitting entity, type
filtering by entity type, grouping


In [24]:
#group articles by their named entities  
invertedDf = invertedDf[["ent_type", "entity", "key"]].groupby(by=["ent_type", "entity"]).agg(list)

print(str(len(invertedDf)) + " rows in entity-grouped df")

7436473 rows in entity-grouped df


In [25]:
invertedDf.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,key
ent_type,entity,Unnamed: 2_level_1
event,,[3731573]
event,730am,[4353276]
event,arab,"[4933735, 4933736]"
event,duty,[1622968]
event,immunoprivilege,[2500373]


In [26]:
invertedDf["numArticles"] = invertedDf["key"].apply(len)


In [27]:
len(invertedDf)

7436473

In [29]:
INVERTED_OUT_PATH = "/shared/3/projects/newsDiffusion/data/interim/NEREmbedding/invertedEntityIndex.pkl"

#NOTE: start again here
# export our named entity inverted index so that we can do analysis as we need to in another script 
invertedDf.reset_index().to_pickle(INVERTED_OUT_PATH, compression=None)