In [1]:
import gensim
import spacy
import pandas as pd
import numpy as np
from tqdm import tqdm
import multiprocessing as mp
tqdm.pandas()
print(pd.__version__)

2.0.1


# Data Preprocessing


## Load the Data

In [2]:
# Initial data cleaning: drop rows that are missing a publication or an article
# body, then renumber the remaining rows with a fresh 0..n-1 index.
data = (
    pd.read_csv('all-the-news-2-1.csv')
    .dropna(subset=['publication', 'article'])
    .reset_index(drop=True)
)

In [3]:
data.columns

Index(['date', 'year', 'month', 'day', 'author', 'title', 'article', 'url',
       'section', 'publication'],
      dtype='object')

In [4]:
# Leading slice of the cleaned data (at most 10,000 rows) used by the tokenization speed test.
test_data = data.iloc[:10000]

In [3]:
# NOTE(review): Tokenizer and English are imported here but are not referenced
# in any of the visible cells.
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
# Load the small English pipeline with the parser, NER and tagger components disabled.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner', 'tagger'])

In [27]:
# Tokenization speed test: time nlp.pipe over the test sample twice, once with
# the default settings and once with two worker processes and large batches.
import time

docs = test_data['article']
# Only the token texts are used, so every other pipeline component is disabled
# in both runs.
disabled_pipes = ["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer", "ner"]

# Run 1: default nlp.pipe settings, with a progress bar.
t1 = time.time()
final_tokens = [
    [token.text for token in doc]
    for doc in tqdm(nlp.pipe(docs, disable=disabled_pipes))
]
t2 = time.time()
print(f'default nlp.pipe took {t2-t1:0.04f} seconds')

# Run 2: same disabled components, but split across 2 processes in batches of 2000.
t1 = time.time()
final_tokens = [
    [token.text for token in doc]
    for doc in nlp.pipe(docs, disable=disabled_pipes, n_process=2, batch_size=2000)
]
t2 = time.time()
print(f'n_process=2, batch_size=2000 took {t2-t1:0.04f} seconds')

30000it [00:44, 675.24it/s]


without Batching took 44.4306 seconds
other parts disabled took 114.1044 seconds


In [None]:
# Tokenize every article. The corpus is split into 100 chunks and each chunk
# is wrapped in its own tqdm call, so one progress bar is printed per chunk.
from tqdm import tqdm

docs = data['article']
batches = np.array_split(docs, 100)
tokenizer_only = ["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer","ner"]
final_tokens = []
for chunk in batches:
    parsed_chunk = nlp.pipe(chunk, disable=tokenizer_only)
    final_tokens.extend([token.text for token in parsed] for parsed in tqdm(parsed_chunk))

25842it [00:31, 826.78it/s]
25842it [00:33, 768.38it/s]
25842it [00:32, 792.19it/s]
25842it [00:31, 820.47it/s]
25842it [00:32, 797.57it/s]
25842it [00:30, 843.84it/s] 
25842it [00:30, 838.06it/s]
25842it [00:16, 1538.18it/s]
25842it [00:26, 982.05it/s] 
25842it [00:28, 912.38it/s] 
25842it [00:25, 1028.01it/s]
25842it [00:18, 1424.39it/s]
25842it [00:22, 1157.12it/s]
25842it [00:24, 1056.85it/s]
25842it [00:21, 1196.06it/s]
25842it [00:22, 1136.00it/s]
25842it [00:22, 1155.93it/s]
25842it [00:25, 1000.09it/s]
25842it [00:25, 1026.49it/s]
25842it [00:19, 1358.57it/s]
25842it [00:22, 1164.87it/s]
25842it [00:27, 938.04it/s] 
25842it [00:21, 1214.84it/s]
25842it [00:19, 1292.81it/s]
25842it [00:55, 467.55it/s] 
25842it [00:28, 912.13it/s] 
25842it [00:24, 1069.77it/s]
25842it [00:20, 1268.76it/s]
25842it [00:20, 1272.50it/s]
25842it [00:20, 1262.20it/s]
25842it [00:21, 1227.23it/s]
25842it [00:19, 1293.96it/s]
25842it [00:22, 1132.53it/s]
25842it [00:23, 1123.24it/s]
25842it [00:24, 1071

## Doc2vec model

In [6]:
# First will try on a downsampled corpus: 3332 articles drawn from every publication.
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# random_state pins the draw. The integer document tags used later are the row
# positions of this frame, so an unseeded sample would change what every tag
# refers to on each run. The whole pipeline is one assignment so that
# re-running the cell always starts again from `data`.
df_resample = (
    data.groupby('publication', as_index=False)
    .progress_apply(lambda group: group.sample(3332, random_state=123))
    .reset_index(drop=True)
)

100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [00:00<00:00, 39.96it/s]


In [77]:
# Lower-cased spaCy tokens for every sampled article, stored in a new 'tokens' column.
docs = df_resample['article']
tokenizer_only = ["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer","ner"]
final_tokens = [
    [token.text.lower() for token in parsed]
    for parsed in tqdm(nlp.pipe(docs, disable=tokenizer_only))
]
df_resample['tokens'] = final_tokens

86632it [02:25, 595.47it/s] 


In [9]:
# Initialize logging at INFO level (timestamp : level : message) to see progress
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [80]:
# Build one TaggedDocument per sampled article. Every document carries two tags:
# its row position in df_resample and the name of its publication.
# No train/test split is made here: this is already a small sample of the
# original data, so test documents can be pulled from the full data later.
documents = [
    TaggedDocument(tokens, [doc_id, publication])
    for doc_id, (tokens, publication) in enumerate(
        zip(df_resample.tokens, df_resample.publication)
    )
]

In [81]:
# Rename: train_corpus is a second name for the same list object, not a copy.
train_corpus = documents

In [83]:
# Untrained Doc2Vec model: 50-dimensional vectors, window of 2, min_count of 2,
# 16 worker threads and 25 epochs. Vocabulary building and training happen in later cells.
model = Doc2Vec(vector_size=50, window=2, min_count=2, workers=16, epochs=25)

2023-05-15 15:58:19,487 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec<dm/m,d50,n5,w2,mc2,s0.001,t16>', 'datetime': '2023-05-15T15:58:19.487430', 'gensim': '4.3.1', 'python': '3.11.3 | packaged by Anaconda, Inc. | (main, Apr 19 2023, 23:46:34) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'created'}


In [84]:
# Scan the tagged corpus to collect the words and tags the model will train on.
model.build_vocab(train_corpus)

2023-05-15 15:58:20,561 : INFO : collecting all words and their counts
2023-05-15 15:58:20,561 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags
2023-05-15 15:58:21,732 : INFO : PROGRESS: at example #10000, processed 7222590 words (6170069 words/s), 104582 word types, 4 tags
2023-05-15 15:58:22,954 : INFO : PROGRESS: at example #20000, processed 14856279 words (6252672 words/s), 150504 word types, 7 tags
2023-05-15 15:58:24,302 : INFO : PROGRESS: at example #30000, processed 22807456 words (5900340 words/s), 216028 word types, 10 tags
2023-05-15 15:58:26,616 : INFO : PROGRESS: at example #40000, processed 36819116 words (6057233 words/s), 279756 word types, 13 tags
2023-05-15 15:58:27,684 : INFO : PROGRESS: at example #50000, processed 43323200 words (6089949 words/s), 299827 word types, 16 tags
2023-05-15 15:58:28,337 : INFO : PROGRESS: at example #60000, processed 47275519 words (6062764 words/s), 319858 word types, 19 tags
2023-05-15 15:58:29,673 

In [85]:
# Train on the same corpus, taking the example count and the number of epochs
# from the model's own attributes rather than hard-coding them again.
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

2023-05-15 15:58:35,444 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 16 workers on 194541 vocabulary and 50 features, using sg=0 hs=0 sample=0.001 negative=5 window=2 shrink_windows=True', 'datetime': '2023-05-15T15:58:35.444338', 'gensim': '4.3.1', 'python': '3.11.3 | packaged by Anaconda, Inc. | (main, Apr 19 2023, 23:46:34) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'train'}
2023-05-15 15:58:36,456 : INFO : EPOCH 0 - PROGRESS: at 5.30% examples, 1618667 words/s, in_qsize 30, out_qsize 1
2023-05-15 15:58:37,458 : INFO : EPOCH 0 - PROGRESS: at 8.34% examples, 1719191 words/s, in_qsize 31, out_qsize 0
2023-05-15 15:58:38,458 : INFO : EPOCH 0 - PROGRESS: at 11.22% examples, 1763469 words/s, in_qsize 31, out_qsize 0
2023-05-15 15:58:39,460 : INFO : EPOCH 0 - PROGRESS: at 15.63% examples, 1746591 words/s, in_qsize 30, out_qsize 1
2023-05-15 15:58:40,462 : INFO : EPOCH 0 - PROGRESS: at 19.23% examples, 1774572 words/s, in_qsize 31, out_

In [86]:
# Persist the trained model to 'sampled_articles.model' in the working directory.
model.save('sampled_articles.model')

2023-05-15 16:15:01,377 : INFO : Doc2Vec lifecycle event {'fname_or_handle': 'sampled_articles.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2023-05-15T16:15:01.377173', 'gensim': '4.3.1', 'python': '3.11.3 | packaged by Anaconda, Inc. | (main, Apr 19 2023, 23:46:34) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'saving'}
2023-05-15 16:15:01,378 : INFO : not storing attribute cum_table
2023-05-15 16:15:01,577 : INFO : saved sampled_articles.model


In [98]:
# Export only the document-tag vectors (no word vectors) in word2vec format.
# NOTE(review): the tags in this model include publication names, and some of
# those contain spaces (e.g. "The New York Times"). That may be what breaks the
# word2vec2tensor conversion with a ValueError -- confirm.
model.save_word2vec_format('doc_tensor.w2v', doctag_vec=True, word_vec=False)

2023-05-15 16:18:19,595 : INFO : storing 86658x50 projection weights into doc_tensor.w2v


In [81]:
%run ./env/Lib/site-packages/gensim/scripts/word2vec2tensor.py -i doc_tensor.w2v -o sample_articles

2023-05-09 22:38:26,445 : INFO : running ./env/Lib/site-packages/gensim/scripts/word2vec2tensor.py -i doc_tensor.w2v -o sample_articles
2023-05-09 22:38:26,446 : INFO : loading projection weights from doc_tensor.w2v


ValueError: could not convert string to float: ''

## Now we need to get a sample from each publication
Then find the most related articles to that sample, and pull those out

Then, see if the publications are the same, or authors are the same

Then, compare the named entities (NER) in the articles to see whether they share a topic. This moves into topic modeling, another NLP technique that I need to research further.

In [100]:
# Grab a sample from each publication: 10 articles per publication, drawn with a
# fixed random_state.
sample = df_resample.groupby('publication', as_index = False).progress_apply(lambda x: x.sample(10, random_state = 123))

100%|█████████████████████████████████████████████████████████████████████████████████| 26/26 [00:00<00:00, 331.35it/s]


In [102]:
# Flatten the two-level index left by the groupby-apply: discard the outer
# level and keep the inner one as a 'tag' column, which is later passed to
# model.dv.most_similar.
sample = (
    sample.reset_index()
    .drop(columns='level_0')
    .rename(columns={'level_1': 'tag'})
)

In [201]:
# Compute top 10 similar documents to each sample, insert in new dataframe.
# Each output row pairs one retrieved neighbour with the sampled article it
# was retrieved for.
rows = []
for _, row in sample.iterrows():
    tag = row.tag
    for similar_tag, cos_sim in model.dv.most_similar(tag, topn = 10):
        # Bound to `match`, not `data`: `data` is the full news DataFrame and
        # other cells still call data.groupby(...), so it must not be rebound
        # to a single row here.
        # NOTE(review): documents were tagged with publication names as well as
        # integers. If most_similar returns a publication name, .iloc will
        # raise -- confirm whether such tags should be filtered out.
        match = df_resample.iloc[similar_tag]
        rows.append([
            similar_tag, match.author, match.article, match.publication,
            tag, cos_sim,
            row.author, row.publication, row.article,
        ])
most_similar = pd.DataFrame(rows, columns = ['tag','author','article','publication','similar_to','cos_sim','original_author','original_pub','original_article'])

In [202]:
most_similar.head(5)

Unnamed: 0,tag,author,article,publication,similar_to,cos_sim,original_author,original_pub,original_article
0,22,Alison Snyder,Major companies are bringing together new mach...,Axios,465,0.74494,Jeff Nesbit (contributor),Axios,Scientists have developed a way to pull water ...
1,2983,Erin Ross,"Bats are world-class echolocators, capable of ...",Axios,465,0.741492,Jeff Nesbit (contributor),Axios,Scientists have developed a way to pull water ...
2,65475,Knvul Sheikh,Trilobites Scientists have created “soft” magn...,The New York Times,465,0.725941,Jeff Nesbit (contributor),Axios,Scientists have developed a way to pull water ...
3,2525,Alison Snyder,"When it comes to computing, much of the focus ...",Axios,465,0.721293,Jeff Nesbit (contributor),Axios,Scientists have developed a way to pull water ...
4,21208,Chris Ciaccia,close Video Phone charging clothes in our futu...,Fox News,465,0.718649,Jeff Nesbit (contributor),Axios,Scientists have developed a way to pull water ...


In [192]:
# Group the neighbour rows by 'similar_to', i.e. by the tag of the sampled
# article each neighbour was retrieved for.
# NOTE(review): pub_grouped is not used in any of the visible cells.
pub_grouped = most_similar.groupby(['similar_to'])

## Now, we must figure out if the authors are the same
This will give us an idea as to whether or not authorship is a key bias point

In [212]:
# Simple test to see number of similar authors: count the neighbour rows whose
# author equals the author of the sampled article. The two columns are compared
# in one vectorised step; NaN never equals NaN, so rows with a missing author
# are not counted as matches.
same_author = int((most_similar['author'] == most_similar['original_author']).sum())
print(f'{(same_author/len(most_similar))*100:0.2f}% of authors are the same, or {same_author} authors')

5.73% of authors are the same, or 149 authors


In [211]:
# Count the neighbour rows that come from the same publication as the sampled
# article, comparing the two columns in one vectorised step.
same_pub = int((most_similar['publication'] == most_similar['original_pub']).sum())
print(f'{(same_pub/len(most_similar))*100:0.2f}% of publications are the same, or {same_pub} publishers')

24.23% of publications are the same, or 630 publishers


According to these results, publication seems to matter more than authorship, which is very surprising to me.

## We should now investigate if the topics of these articles are similar, and we can perhaps attempt to pull out important words

## Attempted tensorboard sample visualization
This is completely separate from the rest, but if you want a cool interactive visualization you can find it here:



In [7]:
def read_corpus(documents):
    """Lazily turn raw article texts into gensim TaggedDocument objects.

    Each article is tokenized with gensim.utils.simple_preprocess (max_len=30)
    and tagged with a single integer: its position in `documents`.
    """
    for doc_id, article in enumerate(documents):
        words = gensim.utils.simple_preprocess(article, max_len=30)
        tagged = gensim.models.doc2vec.TaggedDocument(words, [doc_id])
        yield tagged

In [4]:
# Separate downsample for the TensorBoard experiment: 3323 articles from every
# publication. random_state pins the draw so that the saved CSV, the document
# tags and the metadata file all describe the same rows on every run.
# NOTE(review): the recorded len(df_resample2) output is 13000, which does not
# match 26 publications x 3323 -- confirm the intended per-publication size.
df_resample2 = (
    data.groupby('publication', as_index=False)
    .progress_apply(lambda group: group.sample(3323, random_state=123))
    .reset_index(drop=True)
)

100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [00:00<00:00, 39.65it/s]


In [59]:
# Save the downsampled frame to CSV. index=False is not passed, so the row
# index is written out as well.
df_resample2.to_csv('downsampled_data.csv')

In [58]:
len(df_resample2)

13000

In [71]:
# Materialize the generator into a list, because the corpus is passed to both
# build_vocab and train.
# Note: this rebinds `train_corpus`, which earlier held the spaCy-tokenized documents.
train_corpus = list(read_corpus(df_resample2.article))

In [None]:
len(train_corpus)

In [61]:
# Second, smaller model for the TensorBoard experiment: 25-dimensional vectors,
# min_count of 2, 20 epochs.
# Note: this rebinds `model`, replacing the 50-dimensional model trained earlier.
model = gensim.models.doc2vec.Doc2Vec(vector_size=25, min_count=2, epochs=20)
# Build the vocabulary, then train on the same corpus.
model.build_vocab(train_corpus)
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

2023-05-14 00:21:20,179 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec<dm/m,d25,n5,w5,mc2,s0.001,t3>', 'datetime': '2023-05-14T00:21:20.179941', 'gensim': '4.3.1', 'python': '3.10.11 (tags/v3.10.11:7d4cc5a, Apr  5 2023, 00:38:17) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'created'}
2023-05-14 00:21:20,384 : INFO : collecting all words and their counts
2023-05-14 00:21:20,385 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags
2023-05-14 00:21:21,303 : INFO : PROGRESS: at example #10000, processed 6418079 words (6994058 words/s), 102443 word types, 0 tags
2023-05-14 00:21:21,585 : INFO : collected 115561 word types and 13000 unique tags from a corpus of 13000 examples and 8557376 words
2023-05-14 00:21:21,586 : INFO : Creating a fresh vocabulary
2023-05-14 00:21:21,774 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=2 retains 71665 unique words (62.01% of original 115561, drops 43896)', 'datetime

In [64]:
# Export only the document-tag vectors (no word vectors) in word2vec format.
# The tags in this corpus are the integer positions assigned by read_corpus.
model.save_word2vec_format('doc_tensor2.w2v', doctag_vec=True, word_vec=False)

2023-05-14 00:24:18,539 : INFO : storing 13000x25 projection weights into doc_tensor2.w2v


In [65]:
%run ./env/Lib/site-packages/gensim/scripts/word2vec2tensor.py -i doc_tensor2.w2v -o downsampled

2023-05-14 00:24:33,892 : INFO : running ./env/Lib/site-packages/gensim/scripts/word2vec2tensor.py -i doc_tensor2.w2v -o downsampled
2023-05-14 00:24:33,893 : INFO : loading projection weights from doc_tensor2.w2v
2023-05-14 00:24:34,145 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (13000, 25) matrix of type float32 from doc_tensor2.w2v', 'binary': False, 'encoding': 'utf8', 'datetime': '2023-05-14T00:24:34.145676', 'gensim': '4.3.1', 'python': '3.10.11 (tags/v3.10.11:7d4cc5a, Apr  5 2023, 00:38:17) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'load_word2vec_format'}
2023-05-14 00:24:34,326 : INFO : 2D tensor file saved to downsampled_tensor.tsv
2023-05-14 00:24:34,327 : INFO : Tensor metadata file saved to downsampled_metadata.tsv
2023-05-14 00:24:34,327 : INFO : finished running word2vec2tensor.py


In [98]:
df_resample2['author']

0                     Orion Rummler
1                  Kia Kokalitcheva
2                         Ben Geman
3                     Haley Britzky
4                     Ursula Perano
                    ...            
12995               Aarian Marshall
12996                   WIRED Staff
12997                   WIRED Staff
12998    Will Bedingfield, WIRED UK
12999                   WIRED Staff
Name: author, Length: 13000, dtype: object

In [89]:
# Replace missing authors with the literal string 'None' and force every entry
# to str. This overwrites the 'author' column of df_resample2 in place.
df_resample2['author'] = df_resample2['author'].fillna('None').astype(str)

In [100]:
# Overwrite the projector metadata file with a tab-separated table: a header
# line, then one "publication<TAB>url" line per row of df_resample2.
with open('downsampled_metadata.tsv','w', encoding = "utf-8") as metadata_file:
    metadata_file.write('Publication\tURL\n')
    metadata_file.writelines(
        "%s\t%s\n" % (publication, url)
        for publication, url in zip(df_resample2.publication, df_resample2.url)
    )

In [94]:
# Sanity check: count how many (publication, url, author) triples zip yields.
test = sum(1 for _ in zip(df_resample2.publication, df_resample2.url, df_resample2.author))
print(test)

13000


Unnamed: 0,Publication\tAuthor\tURL
0,Axios\tOrion Rummler\thttps://www.axios.com/el...
1,Axios\tKia Kokalitcheva\thttps://www.axios.com...
2,Axios\tBen Geman\thttps://www.axios.com/democr...
3,Axios\tHaley Britzky\thttps://www.axios.com/tr...
4,Axios\tUrsula Perano\thttps://www.axios.com/tr...
...,...
12425,Wired\tWIRED Staff\thttps://www.wired.com/stor...
12426,Wired\tAarian Marshall\thttps://www.wired.com/...
12427,Wired\tWIRED Staff\thttps://www.wired.com/2016...
12428,Wired\tWIRED Staff\thttps://www.wired.com/stor...
