### DATA PREPROCESSING

In this notebook we will pre-process the NELA-GT-22 dataset. It will allow us to filter the data by time and by newspaper. Then, we will compute similarities between items (pieces of news) and from that transition probabilities

In [1]:
# import python dependencies
import os
import tqdm
import pickle
import numpy as np
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# import utils dependencies
from preprocessing_utils import execute_query_pandas, zero_diagonal, \
                                cut_links, row_normalize

In [2]:
# create instance of Transformer to compute the embedding of each item
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

  return self.fget.__get__(instance, owner)()


#### Case Study
- News from the same newspaper (**The Guardian**) for a three month period (January to April)
- News from the same newspaper (**The New York Times**) for a two week perdiod (2022-01-01 to 2022-01-15)

In [3]:
path = 'nela-gt-2022.db'
sources = ['theguardian']
#sources = ['abcnews']
# Note that we need to add extra quotes around each source's name
# for the query to work properly e.g.: "'thenewyorktimes'"
sources_str = ["'%s'" % s for s in sources]
query = "SELECT * FROM newsdata WHERE source IN (%s)" % ",".join(sources_str)

df = execute_query_pandas(path, query)
# convert date to datetime format
df['date'] = pd.to_datetime(df['date'])

# map item id for convenience
map_dict_items = {old_id:new_id for new_id,old_id in enumerate(df.id.unique())}
df.replace({'id':map_dict_items}, inplace=True)


# compute a similarity_matrix between all the items within each time slot
time_slots = [('2022-01-01','2022-04-01')]
for time_slot in time_slots:
    # temporal df of the time slot
    tmp_df = df[(df.date>=time_slot[0])&(df.date<time_slot[1])]
    # drop duplicate items
    tmp_df = tmp_df.drop_duplicates(subset='content')
    # save
    tmp_df.to_csv(f'pandas_df/df_{sources[0]}_from_{time_slot[0]}_to_{time_slot[1]}')
    
    # compute embeddings
    embeddings = list()
    for sentence in tqdm.tqdm(tmp_df.content):
        embeddings.append(model.encode(sentence))
    
    # compute similarities between pieces of news
    similarities = cosine_similarity(np.array(embeddings))
    
    # zero diagonal
    similarities = zero_diagonal(similarities)
    
    # save similarity_matrix between all items (not normalized)
    with open(f'similarity_matrices/similarity_matrix_{sources[0]}_from_{time_slot[0]}_to_{time_slot[1]}', 'wb') as f:
        pickle.dump(similarities,f)

    # cut links to get the top_k
    for k in [5,10,20]:
        similarities_k = cut_links(similarities,k).astype(np.float64)

        # normalize the rows of the matrix so that they represent transition probabilities
        trans_prob = row_normalize(similarities_k)
        # save transition_probabilities
        with open(f'transition_probabilities/transition_probabilities_{sources[0]}_from_{time_slot[0]}_to_{time_slot[1]}_k_{k}', 'wb') as f:
            pickle.dump(trans_prob,f)

100%|██████████| 620/620 [00:12<00:00, 51.09it/s]
