# NLP2022: Assignment 2

### -Avirup Das (MDS202013)

In [None]:
# Importing necessary packages
import os, re, json, math, nltk, pickle, spacy, string
import scipy, logging, gensim, gc
import pandas as pd
import numpy as np
import numpy.matlib as mat
from gensim.models import Word2Vec, word2vec

# Mounting google drive
# NOTE: this import only resolves inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Setting paths
# NOTE: both paths are relative to the current working directory.
# path_to_json is not referenced by any other cell visible in this notebook.
path_to_json = 'pdf_json/'
# Directory holding the token batches and vocab file read below, and where the
# COALS vectors and the trained word2vec model are written.
drive_directory = 'drive/MyDrive/Data/NLP2022/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Record the installed gensim version next to the results; the training code
# below uses the `vector_size` / `key_to_index` names of the gensim API.
gensim.__version__

'4.1.2'

In [None]:
# Function to construct initial co-occurence matrix with a ramped 4-word window
def cooc_matrix(most_freq, tokens, window=4):
  '''
  Build a symmetric, ramp-weighted co-occurrence matrix.

  most_freq : iterable of unique vocabulary words; the position of a word in
              this iterable is its row/column index in the result.
  tokens    : sequence of tokens (the corpus, in reading order).
  window    : number of following tokens paired with each token. A neighbour at
              distance d (1 <= d <= window) adds a weight of window + 1 - d,
              i.e. 4, 3, 2, 1 for the default window of 4.

  Returns a float32 array of shape (len(vocabulary), len(vocabulary)). Every
  pair is added in both directions, so the matrix is symmetric.

  Tokens that are not in the vocabulary are skipped individually: a missing
  neighbour only drops its own pair, not the remaining pairs of the window.
  '''
  # Row/column index of every vocabulary word
  index = {word: position for position, word in enumerate(most_freq)}

  dim = len(index)
  matrix = np.zeros((dim, dim), np.float32)

  n_tokens = len(tokens)
  lookup = index.get
  for pos in range(n_tokens - 1):
    row = lookup(tokens[pos])
    if row is None:
      # Centre word is out of vocabulary: nothing to record for this position
      continue
    # Clamp the window at the end of the corpus instead of relying on IndexError
    last = min(pos + window, n_tokens - 1)
    for nxt in range(pos + 1, last + 1):
      col = lookup(tokens[nxt])
      if col is None:
        # Skip only this neighbour; farther neighbours are still counted
        continue
      weight = window + 1 - (nxt - pos)
      matrix[row, col] += weight
      matrix[col, row] += weight

  return matrix

Calculating the correlation using the definition provided in Table 4 of the COALS paper:
$\begin{align*}
w'_{a,b} &= \dfrac{Tw_{a,b}-\sum_jw_{a,j}\cdot \sum_iw_{i,b}}{\left(\sum_jw_{a,j}\cdot \left(T-\sum_jw_{a,j}\right)\cdot \sum_i w_{i,b}\cdot \left(T-\sum_i w_{i,b}\right)\right)^{1/2}}\\
T &= \sum_i\sum_j w_{i,j}
\end{align*}$

In [None]:
# Function to convert raw counts to correlations
def corr_norm(matrix):
  '''
  Convert raw co-occurrence weights w[a, b] into correlations:

    (T * w[a, b] - R[a] * C[b]) / sqrt(R[a] * (T - R[a]) * C[b] * (T - C[b]))

  where R[a] is the sum of row a, C[b] is the sum of column b and T is the sum
  of all entries.

  matrix : 2-D array of non-negative weights.

  Returns a new array of the same shape; the input is not modified. Entries
  whose denominator is zero (e.g. a word with no co-occurrences at all) are
  set to 0 instead of NaN so that later steps receive finite values.
  '''
  T = np.sum(matrix)
  # Keep the totals as a column / row vector and let broadcasting expand them,
  # rather than materialising two full dim x dim copies.
  row_tot = np.sum(matrix, axis=1, keepdims=True)   # shape (rows, 1)
  col_tot = np.sum(matrix, axis=0, keepdims=True)   # shape (1, cols)

  numer = T * matrix - row_tot * col_tot
  # sqrt(x * y) is computed as sqrt(x) * sqrt(y) on the small vectors; this
  # keeps the intermediate product small enough for float32.
  denom = np.sqrt(row_tot * (T - row_tot)) * np.sqrt(col_tot * (T - col_tot))

  corr = np.zeros(numer.shape, dtype=np.result_type(numer, denom))
  np.divide(numer, denom, out=corr, where=denom > 0)
  return corr

def filt_neg(matrix):
  '''
  Function to perform step 3 of the COALS method:
  Discard negative correlations and take square root
  of positive values.

  matrix : array of correlations.

  Returns a new array in which negative entries are replaced by 0 and every
  other entry by its square root. The input array is left unchanged.
  '''
  # Clip into a fresh array so the caller's data is not overwritten
  clipped = np.maximum(matrix, 0)
  if isinstance(clipped, np.ndarray) and np.issubdtype(clipped.dtype, np.floating):
    # Reuse the private copy as the output buffer to avoid a second allocation
    return np.sqrt(clipped, out=clipped)
  return np.sqrt(clipped)

Now we calculate the SVD as $X = U\Sigma V^T$, after which a $k$-dimensional vector (in our case $k=50$) for any word $c$ is generated by computing $X_c\hat{V}\hat{\Sigma}^{-1}$, where $X_c$ is the COALS vector for the word $c$. Note that the $\hat{\Sigma}^{-1}$ term removes the influence of the singular values from the resultant vector; failing to include it would place too much weight on the first few components.

In [None]:
def coals_svd(matrix, k=50, save=True):
  '''
  Reduce the COALS matrix to k dimensions with a truncated SVD.

  matrix : 2-D array (the normalised co-occurrence matrix X).
  k      : number of singular triplets to keep; svds requires it to be smaller
           than both dimensions of the matrix.
  save   : when True, the result is also written with np.save to
           drive_directory + 'coals_svd'.

  Returns the array X @ V_k @ inv(Sigma_k) with one row per row of `matrix`
  and k columns, in the column order returned by svds.
  '''
  # Import the submodule explicitly: a bare `import scipy` is not guaranteed to
  # load scipy.sparse.linalg.
  from scipy.sparse.linalg import svds

  # Truncated SVD: only the k largest singular triplets are computed
  _, sigma, v_t = svds(matrix, k=k)
  # Multiplying by inv(diag(sigma)) is the same as dividing column j by sigma[j]
  vec = (matrix @ v_t.T) / sigma
  if save:
    np.save(drive_directory+'coals_svd', vec)
  return vec

In [None]:
def word_to_vec(corpus, frequency, vocab, initial_matrix, total_words, size=50, epochs=50, save_model=True, workers=10):
  '''
  Train a word2vec model whose word vectors start from the COALS vectors.

  corpus         : path to a whitespace-separated text file (read with Text8Corpus).
  frequency      : dict mapping word -> raw count, used to build the vocabulary.
  vocab          : dict mapping word -> row index of that word in initial_matrix.
  initial_matrix : 2-D array with `size` columns holding the initial vectors.
  total_words    : number of tokens in the corpus, passed on to model.train.
  size           : dimensionality of the word vectors.
  epochs         : number of training passes over the corpus.
  save_model     : when True, the model is saved to drive_directory + 'w2vmodel2.bin'.
  workers        : number of training worker threads.

  Returns the trained model.
  Raises ValueError when initial_matrix is not 2-D with `size` columns.
  '''
  initial_matrix = np.asarray(initial_matrix)
  if initial_matrix.ndim != 2 or initial_matrix.shape[1] != size:
    raise ValueError(
        f'initial_matrix must be 2-D with {size} columns, got shape {initial_matrix.shape}')

  # Calculating word vectors with the COALS matrix (of size 50) as the initial matrix
  logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
  text = word2vec.Text8Corpus(corpus)
  model = word2vec.Word2Vec(vector_size=size, window=5, min_count=1, workers=workers)
  model.build_vocab_from_freq(frequency)
  # The model orders its vocabulary itself, so map each of its words back to
  # the matching row of initial_matrix.
  index = [vocab[w] for w in model.wv.key_to_index]
  # gensim keeps its vectors as float32; enforce dtype and layout explicitly
  # rather than depending on what the caller happened to pass in.
  model.wv.vectors = np.ascontiguousarray(initial_matrix[index, :], dtype=np.float32)
  model.train(text, total_words=total_words, epochs=epochs)
  if save_model:
    model.save(drive_directory+'w2vmodel2.bin')
  return model

In [None]:
# Loading tokens
token_name = 'token_batch'
# Number of most frequent words kept for the COALS matrix
VOCAB_SIZE = 14000
# Number of tokens joined per write when dumping the corpus
CORPUS_CHUNK = 1_000_000

def _batch_order(name):
  # Natural sort key: digit runs compare as integers, so e.g. batch 2 sorts before batch 10
  parts = re.split(r'(\d+)', name)
  return [int(part) if pos % 2 else part for pos, part in enumerate(parts)]

# os.listdir returns entries in arbitrary order; sort so that the batches are
# always concatenated in the same order.
token_files = sorted(
    (tok_batch for tok_batch in os.listdir(drive_directory) if tok_batch.startswith(token_name)),
    key=_batch_order)
tokens = []
for i in token_files:
  with open(drive_directory+i, 'rb') as f:
    # NOTE: pickle.load can execute arbitrary code; only load batches produced by this project
    tokens.extend(pickle.load(f))

# Creating corpus from tokens
# Written chunk by chunk so the whole corpus is never held as one extra string;
# the file content is the tokens joined by single spaces.
with open('corp.txt', 'w', encoding='utf-8') as f:
  for start in range(0, len(tokens), CORPUS_CHUNK):
    if start:
      f.write(" ")
    f.write(" ".join(tokens[start:start+CORPUS_CHUNK]))

# Loading vocab
with open(drive_directory+'vocab_file.json', 'r') as read_file:
  frequency = json.load(read_file)

# Taking 14000 most frequent words for constructing COALS matrix
# NOTE(review): this takes the first VOCAB_SIZE keys, so it assumes the JSON
# file is ordered by descending frequency - confirm against the file's producer.
vocab = list(frequency.keys())[:VOCAB_SIZE]
# Each JSON value is indexed with [0] to obtain the count
frequency = {k:frequency[k][0] for k in vocab}
# word -> position in the (truncated) vocabulary list
vocab = {k:v for v,k in enumerate(vocab)}

### COALS method:

In [None]:
%%time
# Step 1: Construct Co-Occurence matrix
# The rows of the matrix are later selected with the indices stored in `vocab`
# (see word_to_vec), so the words must be passed in exactly that index order.
# Alphabetical order would attach the COALS vectors to the wrong words.
matrix = cooc_matrix(sorted(vocab, key=vocab.get), tokens)
print('*** Co-Occurence Matrix Calculated ***')
# Counting no. of tokens for word2vec model
tok_count = len(tokens)
# Deleting tokens to clear up RAM
del tokens
gc.collect()

*** Co-Occurence Matrix Calculated ***
CPU times: user 46min 5s, sys: 6.18 s, total: 46min 11s
Wall time: 46min 22s


In [None]:
%%time
# Step 2: Calculate correlation matrix
# `matrix` is deliberately rebound at each step so that only one full-size
# array stays referenced between cells.
matrix = corr_norm(matrix)
print('*** Correlations Calculated ***')

# Step 3: Filter negative values and take square root
matrix = filt_neg(matrix)

*** Correlations Calculated ***
CPU times: user 29.5 s, sys: 3.1 s, total: 32.6 s
Wall time: 32.8 s


In [None]:
%%time
# Step 4: Reduce the COALS matrix to 50-dimensional vectors with a truncated SVD.
# With the default save=True, coals_svd also writes the vectors to drive_directory.
initial_coals = coals_svd(matrix, 50)
print('*** SVD Calculated ***')

*** SVD Calculated ***
CPU times: user 37.6 s, sys: 14.2 s, total: 51.8 s
Wall time: 26.8 s


### Word2Vec Model:

In [None]:
# Training word2vec model
print('\n\n*** Starting Word2Vec ***\n\n')
# Reload the COALS vectors from the file written by coals_svd, so this cell can
# be run without recomputing the SVD in the same session.
initial_coals = np.load(drive_directory+'coals_svd.npy')
# epochs=20 overrides the function's default of 50
word_to_vec('corp.txt', frequency, vocab, initial_coals, tok_count, epochs=20)
print('\n\n***Word2Vec Done ***\n\n')

2022-05-18 06:35:05,767 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec(vocab=0, vector_size=50, alpha=0.025)', 'datetime': '2022-05-18T06:35:05.767300', 'gensim': '4.1.2', 'python': '3.7.13 (default, Apr 24 2022, 01:04:09) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.188+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'created'}
2022-05-18 06:35:05,769 : INFO : Processing provided word frequencies
2022-05-18 06:35:05,775 : INFO : collected 14000 unique word types, with total frequency of 108760376
2022-05-18 06:35:05,779 : INFO : Creating a fresh vocabulary
2022-05-18 06:35:05,851 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 14000 unique words (100.0%% of original 14000, drops 0)', 'datetime': '2022-05-18T06:35:05.850930', 'gensim': '4.1.2', 'python': '3.7.13 (default, Apr 24 2022, 01:04:09) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.188+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'prepare_vocab'}
2022-05-18 06:35:05,853 : INFO : Word2Vec lifecycle event {'msg': '



*** Starting Word2Vec ***




2022-05-18 06:35:05,970 : INFO : deleting the raw counts dictionary of 14000 items
2022-05-18 06:35:05,972 : INFO : sample=0.001 downsamples 25 most-common words
2022-05-18 06:35:05,975 : INFO : Word2Vec lifecycle event {'msg': 'downsampling leaves estimated 105340405.21541853 word corpus (96.9%% of prior 108760376)', 'datetime': '2022-05-18T06:35:05.975768', 'gensim': '4.1.2', 'python': '3.7.13 (default, Apr 24 2022, 01:04:09) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.188+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'prepare_vocab'}
2022-05-18 06:35:06,154 : INFO : estimated required memory for 14000 words and 50 dimensions: 12600000 bytes
2022-05-18 06:35:06,156 : INFO : resetting layer weights
2022-05-18 06:35:06,178 : INFO : Word2Vec lifecycle event {'msg': 'training model with 10 workers on 14000 vocabulary and 50 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2022-05-18T06:35:06.178924', 'gensim': '4.1.2', 'python': '3.7.13 (default, Apr



***Word2Vec Done ***




### Similar Words:

In [None]:
# Loading trained word2vec model
model = word2vec.Word2Vec.load(drive_directory+'w2vmodel2.bin')
# List of keywords related to Covid-19
keywords = ['ill', 'ambulance', 'pulmonary', 'oxygen', 'antibiotic', 'serum', 
            'symptom', 'treatment', 'mortality', 'seroconversion']
# Top 5 most similar words for each of the above keywords along with the similarity score
for keyword in keywords:
  print(f'\n\nTop 5 most similar words for the root word: {keyword}')
  # topn must be passed by keyword: the second positional parameter of
  # most_similar is `negative`, not the number of results.
  print(model.wv.most_similar(keyword, topn=5))

2022-05-18 07:56:24,214 : INFO : loading Word2Vec object from drive/MyDrive/Data/NLP2022/w2vmodel2.bin
2022-05-18 07:56:24,257 : INFO : loading wv recursively from drive/MyDrive/Data/NLP2022/w2vmodel2.bin.wv.* with mmap=None
2022-05-18 07:56:24,260 : INFO : setting ignored attribute cum_table to None
2022-05-18 07:56:24,426 : INFO : Word2Vec lifecycle event {'fname': 'drive/MyDrive/Data/NLP2022/w2vmodel2.bin', 'datetime': '2022-05-18T07:56:24.426341', 'gensim': '4.1.2', 'python': '3.7.13 (default, Apr 24 2022, 01:04:09) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.188+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'loaded'}




Top 5 most similar words for the root word: ill
[('appra', 0.5576099157333374), ('injured', 0.47327178716659546), ('satisfied', 0.47108936309814453), ('unsuitable', 0.4583607017993927), ('unwell', 0.4580364227294922), ('meet', 0.4530877470970154), ('resuscitate', 0.4453481435775757), ('sedate', 0.44295671582221985), ('comfortable', 0.44224822521209717), ('glasgow', 0.41757169365882874)]


Top 5 most similar words for the root word: ambulance
[('battery', 0.6321083307266235), ('instal', 0.627493143081665), ('port', 0.6249685287475586), ('dispatch', 0.6136040091514587), ('telecommunication', 0.6005160212516785), ('logistics', 0.5930819511413574), ('dedicated', 0.5923635363578796), ('station', 0.5919446349143982), ('workstation', 0.585475742816925), ('rfid', 0.5816987752914429)]


Top 5 most similar words for the root word: pulmonary
[('coronary', 0.5944786071777344), ('vascular', 0.5902355313301086), ('myocardial', 0.577247679233551), ('congest', 0.5585733652114868), ('portal', 0.55699

We observe that, in most cases, the similar words make sense (with a few exceptions); we therefore conclude that the model is performing well.