In [None]:
# Importing packages
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
import torch
from torch import nn
import torch.optim as optim
import random
import re

### Reading the news dataset

In [60]:
news_data = pd.read_csv("data/data.csv")

In [61]:
# checking NA counts across columns
news_data.isna().sum()

article_id          0
source_id       80880
source_name         0
author           8219
title              40
description       383
url                 0
url_to_image     5624
published_at        0
content             0
category           42
full_content    46943
dtype: int64

In [62]:
# total number of rows
len(news_data)

105375

In [89]:
# retaining only those rows where full_content of the news is available
news_data_sub = news_data.dropna(subset=['full_content'])
# retaining only the news article column for our word2vec purpose
news_data_sub = news_data_sub[['full_content']]

In [90]:
# verifying again that there are no na values in our full_content column in news_data_sub
news_data_sub.isna().sum()

full_content    0
dtype: int64

In [91]:
# slicing only sample_size rows to reduce processing time
random.seed(1)
sample_size = 100
news_data_sub = news_data_sub.iloc[random.sample(range(len(news_data_sub)), sample_size)]
news_data_sub.reset_index(drop=True, inplace=True)

In [92]:
news_data_sub

Unnamed: 0,full_content
0,In closing arguments for Sam Bankman-Fried's c...
1,Perry Carpenter is Chief Evangelist for KnowBe...
2,"When you buy through our links, Insider may ea..."
3,NEW DELHI: The BJP has lodged a complaint with...
4,Wittenberg Investment Management Inc. lowered ...
...,...
95,Cowen AND Company LLC raised its position in s...
96,"Dublin, Nov. 20, 2023 (GLOBE NEWSWIRE) -- Th..."
97,"THE HAGUE, Netherlands --Anti-Islam populist G..."
98,An elusive echidna feared extinct after disapp...


In [93]:
print(f"We have {len(news_data_sub)} articles available to process!")

We have 100 articles available to process!


In [94]:
def pre_process_data(doc: str) -> list:
    '''
    INPUT:
        doc: in our use case, it will be a string of multiple sentences
    OUTPUT:
        processed_doc: a list of tokens for the input doc
    '''
    # convert to lower case
    doc = doc.lower()

    # remove urls, punctuations, special characters, numbers
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    doc = re.sub(r"_+", "", doc)
    doc = url_pattern.sub('', doc)
    doc = re.sub(r'[^\w\s]','', doc) 
    doc = re.sub(r'\d+', '', doc)

    # tokenize
    doc = doc.split()

    # remove stopwords
    doc = list(filter(lambda x: x not in stopwords.words('english'), doc))

    # stemming
    stemmer = SnowballStemmer("english", True)
    doc = [stemmer.stem(word) for word in doc]

    # lemmatisation
    wnl = WordNetLemmatizer()
    doc = list(map(lambda word: wnl.lemmatize(word, pos="v"),doc))
    

    return doc

In [95]:
print(pre_process_data(' https://www.google.com ______ 12345 @#$% The quick brown foxes are jumping over the lazy dogs!!!!'))

['quick', 'brown', 'fox', 'jump', 'lazi', 'dog']


In [96]:
# pre-processing every news article

# we will store the pre-process article in a new column 'preprocessed_full_content'

tqdm.pandas(desc="Pre-processing news articles") 
news_data_sub['preprocessed_full_content'] = news_data_sub['full_content'].progress_apply(pre_process_data)

Pre-processing news articles: 100%|██████████| 100/100 [00:03<00:00, 26.71it/s]


In [97]:
# Example of an unprocessed article
news_data_sub['full_content'][1]

"Perry Carpenter is Chief Evangelist for KnowBe4 Inc., provider of the popular Security Awareness Training & Simulated Phishing platform.   Numerous studies confirm that the absence of security technology isn’t what tends to get organizations into trouble. On the contrary, it’s humans. People are the ones who make poor security decisions and judgment errors: They click on bogus links, visit the wrong websites, download malware-loaded files, take security for granted and use weak passwords. Knowingly or unknowingly, they can put organizations at risk. While a robust security culture has been hailed as an answer to most human-related security challenges, it continues to elude many businesses because it requires chief information security officers (CISCOs) to build relationships at various levels and understand the idiosyncrasies of various business units. Various reports highlight how many CISOs struggle with competing priorities and how their security strategies often lack alignment wit

In [98]:
# Example of the same article but after preprocessing (first 10 tokens)
news_data_sub['preprocessed_full_content'][1][:10]

['perri',
 'carpent',
 'chief',
 'evangelist',
 'knowb',
 'inc',
 'provid',
 'popular',
 'secur',
 'awar']

In [99]:
# creating our vocabulary
vocab = sorted(list(set([word for pre_processed_article_list in news_data_sub.preprocessed_full_content for word in pre_processed_article_list])))

In [100]:
len(vocab)

6381

In [101]:
vocab[:10] #need to pre-process better

['abandon',
 'abba',
 'abbi',
 'abc',
 'abdel',
 'abduct',
 'abdullah',
 'abdulrahman',
 'abdurahman',
 'aberdeen']

### Creating (target, context) pairs

In [102]:
def target_context_pairs(doc: list, window_size: int) -> list:
    '''
    INPUT: 
        doc: a list of pre-processed tokens of a news article
        window_size: an integer specifying the window size around the centre/context word to create the context-target pairs
    OUTPUT:
        target_context_pairs: a list of the form [(target1, context1), (target1, context2),..], etc.
        The number of targets for a given context depends on the window_size.
        The example above is for window_size=1.
    '''
    target_context_pairs = []
    for index in range(window_size,len(doc)-window_size-1):

        target = doc[index]
        context = doc[index-window_size:index+window_size+1]
        context.remove(target)
        target_context_pairs = target_context_pairs + [(target, word) for word in context]
    return target_context_pairs

In [103]:
# creating the target, context pairs for every news article

# we will store the context target pairs in a new column 'target_context_pairs'

tqdm.pandas(desc="Creating (target, context) pairs for each news article") 
news_data_sub['target_context_pairs'] = news_data_sub['preprocessed_full_content'].progress_apply(lambda x: target_context_pairs(x, window_size=3))

Creating (target, context) pairs for each news article: 100%|██████████| 100/100 [00:01<00:00, 55.45it/s]


In [104]:
news_data_sub.head(5)

Unnamed: 0,full_content,preprocessed_full_content,target_context_pairs
0,In closing arguments for Sam Bankman-Fried's c...,"[close, argument, sam, bankmanfri, crimin, tri...","[(bankmanfri, close), (bankmanfri, argument), ..."
1,Perry Carpenter is Chief Evangelist for KnowBe...,"[perri, carpent, chief, evangelist, knowb, inc...","[(evangelist, perri), (evangelist, carpent), (..."
2,"When you buy through our links, Insider may ea...","[buy, link, insid, may, earn, affili, commissi...","[(may, buy), (may, link), (may, insid), (may, ..."
3,NEW DELHI: The BJP has lodged a complaint with...,"[new, delhi, bjp, lodg, complaint, elect, comm...","[(lodg, new), (lodg, delhi), (lodg, bjp), (lod..."
4,Wittenberg Investment Management Inc. lowered ...,"[wittenberg, invest, manag, inc, lower, hold, ...","[(inc, wittenberg), (inc, invest), (inc, manag..."


In [105]:
# appending all the context-target pairs to create our data set for the cbow model
model_data = [word for sub_target_context_pairs_list in news_data_sub.target_context_pairs for word in sub_target_context_pairs_list]

In [106]:
model_data[100:110]

[('decis', 'crimin'),
 ('decis', 'case'),
 ('testifi', 'madeth'),
 ('testifi', 'unusu'),
 ('testifi', 'decis'),
 ('testifi', 'crimin'),
 ('testifi', 'case'),
 ('testifi', 'potentiallyrisk'),
 ('crimin', 'unusu'),
 ('crimin', 'decis')]

In [107]:
print(f"We have {len(model_data)} (target, context) pairs!")

We have 351150 (target, context) pairs!


In [108]:
# mapping our tokens to indices and vice versa  
def token_mapping(vocabulary: list) -> tuple:
    '''
    INPUT:
        vocabulary: a list of unique tokens across our corpus, in alphabetical order
    OUTPUT:
        (word2index, index2word):
            word2index: a dictionary with the keys being tokens, and the values being indices
            index2word: a dictionary with the keys being indices, and the values being tokens
    '''
    word2index = dict()
    index2word = dict()
    for index, token in enumerate(vocabulary):
        word2index[token] = index
        index2word[index] = token

    return (word2index, index2word)

In [109]:
word2index_dict, index2word_dict = token_mapping(vocabulary=vocab)

### Preparing training data set

In [112]:
training_pairs = list()

for val in model_data:
    target = val[0]
    context = val[1]
    training_pairs.append((word2index_dict[target],word2index_dict[context]))

In [113]:
training_pairs[:5]

[(498, 1022), (498, 353), (498, 4940), (498, 1320), (498, 5824)]

### SkipGram Model

In [114]:
class SkipGram(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(SkipGram, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)
    def forward(self, center_word):
        embedding = self.embed(center_word)
        output = self.linear(embedding)
        return output

### Training the SkipGram Model

In [121]:
def train(model, data, optimizer, loss_fn, epochs=100, batch_size=32):
    model.train()
    for epoch in range(epochs):
        for i in range(0, len(data), batch_size):
            batch = data[i:i + batch_size]
            center_words, context_words = zip(*batch)
            center_words = torch.LongTensor(center_words)
            context_words = torch.LongTensor(context_words)
            optimizer.zero_grad()
            outputs = model(center_words)
            loss = loss_fn(outputs, context_words)
            loss.backward()
            optimizer.step()
        # if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item():.4f}')

In [122]:
vocab_size = len(vocab)
embedding_dim = 100
model = SkipGram(vocab_size, embedding_dim)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)
train(model, training_pairs, optimizer, loss_fn, epochs=10, batch_size=10)

Epoch 0, Loss: 10.3306
Epoch 1, Loss: 9.8582
Epoch 2, Loss: 11.7958
Epoch 3, Loss: 11.7787
Epoch 4, Loss: 11.7625
Epoch 5, Loss: 11.6886
Epoch 6, Loss: 10.6799
Epoch 7, Loss: 9.5930
Epoch 8, Loss: 11.3218
Epoch 9, Loss: 11.2477


In [None]:
model.embed.weight.data.shape # V x N matrix

torch.Size([6381, 100])

In [125]:
def get_embedding(model, word_to_index, word):
    model.eval()
    index = word_to_index[word]
    embedding = model.embed(torch.LongTensor([index]))
    return embedding.detach().numpy()

sample_embedding = get_embedding(model, word2index_dict, "crimin")
print(f"\nEmbedding for 'crimin': {sample_embedding}")
print(f"Embedding shape: {sample_embedding.shape}")


Embedding for 'crimin': [[ 0.6524583  -0.11030392  0.08138298  0.0712603  -0.01790532  0.5110589
   0.14788385  0.07459371  0.24618366  0.17944013 -0.03617151  0.4555257
  -0.1924063   0.10443083  0.51467943  0.16954361  1.108788    0.09267556
  -0.05829034  0.24714412 -0.27283612 -0.21425495 -0.3757515  -0.10037919
   0.1660098  -0.17708589  0.49903297  0.09123883 -0.18389769 -0.74992573
  -0.01808611  0.23615855  0.12774436 -0.3017926   0.62091327 -0.06487557
   0.11615281  0.01948647 -0.20326275 -0.38602903  0.1141483  -0.24586317
  -0.29403657  0.15456884 -0.6076997  -0.36643687  0.13225085 -0.3912071
   0.42355117 -0.23048699 -1.1307319   0.07886155 -0.29142532 -0.4941296
  -0.08560681 -1.1921178  -0.27141258 -0.23502968  0.40122065 -0.15172422
  -0.33440393 -0.22674124  0.3290078  -0.2649746  -0.30342585  0.06082836
   0.05070811  0.15376125 -0.17219119 -0.4478474   0.1294718  -0.06213101
   0.05291691 -0.06071489 -0.11313146  0.187523    0.0077584  -0.05461296
   0.19394918  0.

In [126]:
sample_embedding[0].shape

(100,)

### Word Similarities

In [127]:
# extract the model weights
word_vectors = model.embed.weight.data


In [128]:
word_vectors.shape

torch.Size([6381, 100])

In [210]:
# pick a random sample of word indices from vocabulary
random.seed(123)
sample_size = len(vocab) #use len(vocab) if you want to use entire vocabulary
random_sample = random.sample(range(len(vocab)), sample_size)
random_words = [vocab[i] for i in random_sample]

In [211]:
random_words[:5]

['attorney', 'forb', 'braverman', 'would', 'mahmoud']

In [212]:
# get the word embeddings
word_vectors = [get_embedding(model, word2index_dict, w)[0].tolist() for w in random_words]

In [213]:
np.array(word_vectors).shape

(6381, 100)

In [214]:
def similarity(x: list,y: list) -> float:
    '''
    INPUT:
        x, y: input word vectors
    OUTPUT:
        cosine_score: cosine similarity score between x and y
    '''
    x = np.array(x)
    y = np.array(y)

    return np.dot(x,y)/(np.linalg.norm(x)*np.linalg.norm(y))

In [None]:
def display_word_similarity(word_list: list, embedding_list: list, k: int):
    '''
    INPUT:
        word_list: a list of words for which iteratively word similarity with every word in the vocabulary (except itself) will be calculated
        embedding_list: a list where the 'i'th element is the word vector for the word 'i' in word_list
        k: int value of the number of most similar words to display for each input word
    OUTPUT:
        similarity_df: a dataframe with cols 'input_word', 'top_1', 'top_2',...,'top_k', 
        where 'top_i' will store (output_word, cosine_score)
        top_df: storing the a filtered version of similarity_df where rows that have at least one cosine score >=0.75 is retained
    '''
    similarity_df = pd.DataFrame()
    top_df = pd.DataFrame()
    c=1
    for input_word in tqdm(word_list):
        input_word_dict = dict()
        for word in filter(lambda x: x !=input_word, word_list):
            word_vec_x = word_vectors[word_list.index(input_word)]
            word_vec_y = word_vectors[word_list.index(word)]
            score = similarity(word_vec_x, word_vec_y)
            input_word_dict[word] = score # cosine score of input_word with an output word
        
        # sort the dictionary by value of scores
        input_word_dict = sorted(input_word_dict.items(), key=lambda x: x[1], reverse=True)
        row_df = pd.DataFrame({'input_word':input_word}, index = [c])
        row_df = pd.concat([row_df, 
                            pd.DataFrame({f'top_{i+1}': list(map(lambda x: f'({x[0]},{round(x[1],2)})',input_word_dict))[i]
        for i in range(k)}, index = [c])], axis=1)
        similarity_df = pd.concat([similarity_df, row_df], axis = 0)

        if all([y > 0.6 for x,y in input_word_dict[:k]]):
            top_df = pd.concat([top_df, row_df], axis = 0)


        c+=1
    return similarity_df, top_df



In [223]:
df1, df2 = display_word_similarity(word_list=random_words,
                            embedding_list=word_vectors,
                            k=3)

100%|██████████| 6381/6381 [47:06<00:00,  2.26it/s]


In [253]:
# row 84, 839, 932, 2188, 2422, 2537, 4074, 5219, 6016
specific_indices = [84, 839, 932, 2188, 2422, 2537, 4074, 5219, 6016]
df2.loc[specific_indices]

Unnamed: 0,input_word,top_1,top_2,top_3
84,fish,"(sheep,0.75)","(poultri,0.71)","(cattl,0.63)"
839,cardiovascular,"(fibrot,0.67)","(hematolog,0.65)","(immunolog,0.63)"
932,icici,"(mahindra,0.65)","(laggard,0.54)","(tcs,0.52)"
2188,mussolini,"(dictat,0.68)","(fascist,0.67)","(benito,0.63)"
2422,earthquak,"(himalayan,0.67)","(seismic,0.63)","(ncs,0.6)"
2537,acid,"(amino,0.65)","(vitamin,0.56)","(antioxid,0.53)"
4074,bipolar,"(disord,0.55)","(candid,0.54)","(schizophrenia,0.5)"
5219,inject,"(liquid,0.54)","(dosag,0.53)","(semisolid,0.51)"
6016,swine,"(livestock,0.67)","(cattl,0.63)","(sheep,0.61)"


In [225]:
df1

Unnamed: 0,input_word,top_1,top_2,top_3
1,attorney,"(prosecutor,0.55)","(roo,0.48)","(forest,0.46)"
2,forb,"(pick,0.38)","(fair,0.31)","(punish,0.3)"
3,braverman,"(andi,0.34)","(deislam,0.33)","(kwara,0.32)"
4,would,"(globe,0.38)","(politicallyconnect,0.37)","(coalit,0.37)"
5,mahmoud,"(abba,0.5)","(abdullah,0.4)","(pray,0.37)"
...,...,...,...,...
6377,alnasr,"(wish,0.5)","(rantisi,0.44)","(chanc,0.4)"
6378,hydrat,"(humancaus,0.41)","(networkdens,0.34)","(liga,0.33)"
6379,profession,"(model,0.43)","(daili,0.4)","(prior,0.38)"
6380,renu,"(vig,0.52)","(vicechancellor,0.44)","(düsseldorf,0.4)"
