# NLP2022: Assignment 3

### -Avirup Das (MDS202013)

In [1]:
# Importing necessary packages
import os, re, json, math, nltk, pickle, spacy, string
import scipy, logging, gensim, gc, random, torch
import pandas as pd
import numpy as np
import numpy.matlib as mat
from tqdm import tqdm
from gensim.models import Word2Vec, word2vec
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

# Mounting google drive
from google.colab import drive
drive.mount('/content/drive')

# Setting paths
path_to_json = 'pdf_json/'
drive_directory = 'drive/MyDrive/Data/NLP2022/'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
# Loading english documents from drive
with open(drive_directory+'eng_corpus.txt','r') as f:
  eng_corpus= f.read()

eng_corpus = eng_corpus.split('\n')

In [11]:
# Pre-processing corpus
def rem_quote(corpus):
  # Remove apostrophe (\') and quotes
  return re.sub('[\"\']', '', re.sub('\'', '', corpus))

def rem_contractions(corpus):
  '''
    Removes all contractions from the corpus
  '''
  # Removing specific contractions
  corpus = re.sub(r"won\'t", " will not ", corpus)
  corpus = re.sub(r"can\'t", " can not ", corpus)
  corpus = re.sub(r"e.g.", " example ", corpus)
  corpus = re.sub(r"i.e.", " that is ", corpus)

  # Removing general contractions
  corpus = re.sub(r"n\'t", " not ", corpus)
  corpus = re.sub(r"\'re", " are ", corpus)
  corpus = re.sub(r"\'s", " is ", corpus)
  corpus = re.sub(r"\'d", " would ", corpus)
  corpus = re.sub(r"\'ll", " will ", corpus)
  corpus = re.sub(r"\'t", " not ", corpus)
  corpus = re.sub(r"\'ve", " have ", corpus)
  corpus = re.sub(r"\'m", " am ", corpus)

  return corpus

def rem_url(corpus):
  '''
  Remove all urls from the corpus. Searching for the regex pattern http(s)://
  or http(s):\\ followed by any set of characters unless it is a blank space (' '),
  newline ('\n'), tab-space ('\t'), any white space ('\s') or ending brackets ')'
  '''
  regex = re.compile(r'https?:/\/\.*[^\r\n\s\t\)]*')
  corpus = re.sub(regex, '', corpus)

  # Removing email IDs
  return re.sub(re.compile(r"\S*@\S*\s?"),'', corpus)

def rem_brackets(corpus):
  '''
  Remove parenthesis and all characters appearing between them (if any).
  Also remove citations which are in third brackets (eg: [3])
  '''
  regex = re.compile(r'[\[\(]+[^\n\[\(]*[\)\]]+')
  return re.sub(regex, '', corpus)

def rem_nums_chars(corpus):
  '''
   Remove all special characters (except sentence delimiters .?!),
   digits and single letters (except 'a') and then encode and
   decode in ASCII to remove all invalid characters
  '''
  corpus = re.sub(r'''[\[\]\(\)\{\}['",:;#$%&><=*\\\/\+\-\@]''', ' ', corpus)
  corpus = re.sub(r'[0-9?]', ' ', corpus)
  corpus = re.sub(r' [b-zB-Z] ', ' ', corpus)
  corpus = corpus.encode("ascii", "ignore")
  return corpus.decode()

def remove_extra_space(corpus):
  # Remove extra spaces and tabs to be just one white space. 
  return re.sub(re.compile(r'(?<= ) +'), '', corpus)

def correct_sentence_end(text):
  '''
   Function to correct sentence structure. There should not be any whitespace before .,! or ? and 
   a whitespace after .! or ?. This is important to extract out sentences from a text. 
   We also handle decimals since there should not be spaces after decimal points.
  '''
  pattern1 = re.compile(r'( +)([.!?])')
  text = re.sub(pattern1, '\g<2>', text)
  pattern2 = re.compile(r'([?!])([^ ])')
  text = re.sub(pattern2, '\g<1>' + ' ' + '\g<2>', text)
  return re.sub('(\.)([^ \d])', '\g<1>' + ' ' + '\g<2>', text)

def preprocess(corpus):
  # Defining our sequence of pre-processing steps
  corpus = rem_contractions(corpus)
  corpus = rem_url(corpus)
  corpus = rem_brackets(corpus)
  corpus = rem_quote(corpus)
  corpus = rem_nums_chars(corpus)
  corpus = remove_extra_space(corpus)
  corpus = correct_sentence_end(corpus)
  return corpus

In [12]:
# Pre-process entire corpus
def preprocess_corpus(docs, batch_size=6000):
  '''
   Pre-processing the whole corpus in batches to avoid memory-overflow
   and writing batches to disk
  '''
  n_batches = math.ceil(len(docs)/batch_size)
  corpus = ''
  for i in range(n_batches):
    batch_begin= batch_size*i
    batch = docs[batch_begin:batch_begin+batch_size]
    if len(corpus)!=0:
      corpus = corpus+'\n'+ preprocess('\n'.join(batch))
    else:
      corpus = preprocess('\n'.join(batch))
    print(f'Pre-processing {i+1}-th batch: Done')
  # Save processed corpus
  with open(drive_directory+f'processed_corpus.txt', 'w') as f:
      f.write(corpus)
  return corpus

In [13]:
%%time
eng_corpus = preprocess_corpus(eng_corpus)

Pre-processing 1-th batch: Done
Pre-processing 2-th batch: Done
Pre-processing 3-th batch: Done
Pre-processing 4-th batch: Done
Pre-processing 5-th batch: Done
Pre-processing 6-th batch: Done
Pre-processing 7-th batch: Done
Pre-processing 8-th batch: Done
Pre-processing 9-th batch: Done
Pre-processing 10-th batch: Done
CPU times: user 7min 2s, sys: 17.2 s, total: 7min 19s
Wall time: 7min 24s


In [2]:
with open(drive_directory+f'processed_corpus.txt', 'r') as f:
  eng_corpus=f.read()

In [24]:
# Taking 2000 random samples from english docs
doc_idx=random.sample(range(0, len(eng_corpus.split('\n'))), 2000)
print(doc_idx)

[2852, 24831, 17791, 33971, 26516, 3930, 26813, 10909, 19192, 52220, 38346, 52619, 4106, 32009, 6057, 20074, 26593, 21600, 50588, 19388, 52806, 50363, 41305, 13372, 7065, 19563, 93, 30302, 11357, 10419, 49000, 4150, 28202, 20810, 30075, 3112, 41684, 37611, 22029, 2330, 32915, 42384, 28824, 23960, 26260, 50034, 22795, 38728, 26796, 2048, 14614, 37166, 10952, 17072, 49845, 41960, 46945, 40132, 5574, 21974, 1467, 8489, 48358, 34020, 25710, 44096, 6493, 6375, 19057, 34451, 29988, 14869, 37013, 14002, 23415, 41536, 7834, 14657, 22581, 12145, 10725, 20103, 39195, 20841, 22118, 18799, 3229, 14390, 27217, 14805, 15625, 49610, 51459, 9705, 28118, 17660, 6070, 31178, 28754, 46599, 13744, 30570, 46395, 32255, 15858, 15627, 8022, 26176, 2516, 42461, 46114, 11349, 12072, 11257, 19972, 37745, 42901, 13961, 46088, 10202, 41773, 35516, 1546, 22988, 19570, 17356, 17954, 13317, 34704, 41914, 28876, 42048, 5980, 11063, 37399, 25244, 26009, 15023, 52447, 44108, 32803, 5782, 16355, 21812, 15250, 47221, 303

In [26]:
# Extracting sentences from the sample which have length greater than 5 but less than 20
sentences, docs = [], eng_corpus.split('\n')
for doc in [docs[i] for i in doc_idx]:
  try:
    for sentence in sent_tokenize(doc):
      if len(sentence.split())>5 and len(sentence.split())<20:
        sentences.append(sentence)
  except:
    pass

del docs

In [8]:
# Compute Word vectors
def word_vecs(corpus_path, size=50, save_model=True): 
  logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
  text = word2vec.Text8Corpus(corpus_path)
  model = word2vec.Word2Vec(text, vector_size=size, window=5, min_count=20, workers=10, epochs=10)
  if save_model:
    model.save(drive_directory+'word2vec3.bin')
  return model

# model = word_vecs(drive_directory+'processed_corpus.txt')
model = word_vecs(drive_directory+'processed_corpus.txt')

2022-05-19 03:16:51,969 : INFO : collecting all words and their counts
2022-05-19 03:16:54,304 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-05-19 03:17:30,063 : INFO : PROGRESS: at sentence #10000, processed 100000000 words, keeping 474310 word types
2022-05-19 03:18:03,407 : INFO : PROGRESS: at sentence #20000, processed 200000000 words, keeping 715089 word types
2022-05-19 03:18:11,151 : INFO : collected 764945 word types from a corpus of 222967702 raw words and 22297 sentences
2022-05-19 03:18:11,153 : INFO : Creating a fresh vocabulary
2022-05-19 03:18:11,934 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=20 retains 94068 unique words (12.297354711776663%% of original 764945, drops 670877)', 'datetime': '2022-05-19T03:18:11.934110', 'gensim': '4.1.2', 'python': '3.7.13 (default, Apr 24 2022, 01:04:09) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.188+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'prepare_vocab'}
2022-05-19 03:18:11,936 : IN

In [102]:
# Vocab size of word2vec model
print(f"Number of words= {len(list(model.wv.index_to_key))}")

Number of words= 94068


In [103]:
'''
 Function to implement RNN with specifications mentioned in class where
 the sum of the hidden values are taken as the sentence vectors. We have
 also randomly initialised the matrix embeddings.
'''
def sentence_embedding(word_vec_size, sent_vec_size):
  # Initialise null matrix for sentence vectors
  sent_vec_mat = np.zeros((len(sentences), sent_vec_size))
  # Initialising random weights
  W = np.random.normal(0, 0.01, (sent_vec_size, word_vec_size))
  U = np.random.normal(0, 0.01, (sent_vec_size, sent_vec_size))
  h = np.zeros(sent_vec_size)
  for n, sent in tqdm(enumerate(sentences)):
    for word in re.split(' ', sent.strip()):
      try:
        x = model.wv[word]
        x = scipy.stats.zscore(x)
        h = np.tanh((U@h.T) + (W@x.T))
      except:
        None
      sent_vec_mat[n] += h
      h = h/sent_vec_size
  return sent_vec_mat

# Function to calculate cosine similarity between two vectors
def cosine_similarity(x, y):
  return 1-scipy.spatial.distance.cosine(x , y)

'''
 Function which takes index of a sentence, calculates cosine 
 similarity between that and the rest of the sentences and displays
 10 most similar sentences
'''
def print_sim_sentences(sent_idx):
  print('-'*150, '\n')
  cos = list()
  for i in range(len(sentences)):
    cos.append(cosine_similarity(sv_mat[i], sv_mat[sent_idx]))
  cos = torch.from_numpy(np.array(cos))

  values, idx = torch.topk(cos, k=11, axis=-1)
  for i in range(1,11):
    print(f"{sentences[idx[i]]}\nScore= {float(cos[idx[i]])}", end='\n\n')

'''
 Function which takes an array of sentence indices and displays
 10 most similar sentence for each of them along with the cosine
 similarity scores
'''
def sim_sentences(rand_idx):
  for i, idx in enumerate(rand_idx):
    print('='*150)
    print(f"\033[1m Case {i+1}: \033[0;0m")
    print('Most similar sentences to: \n', sentences[idx], '\n')
    print_sim_sentences(idx)
  print('='*150)

# Obtain sentence embedding from RNN
sv_mat= sentence_embedding(50,100)

130082it [05:17, 409.45it/s]


In [104]:
# Randomly choose 5 sentences to find similar sentences
idx = np.random.randint(0, len(sentences), 5)
print('5 random sentences: ')
print('-'*150, '\n')
for i in idx:
    print(sentences[i], '\n') 

5 random sentences: 
------------------------------------------------------------------------------------------------------------------------------------------------------ 

rapamycin has no effect on the expression of hla abc in culture treated with ifn as well. 

coronaviruses are nonsegmented enveloped posit that is ense single strand rna viruses. 

some tumors had marked cellular anaplasia or nuclear atypia. 

ppm respect that is wh that is carbon appeared at. 

low histam that is diet was initiated and repeat tryptase rema that is elevated. 



In [105]:
# Display 10 similar sentences for each of the 5 randomly chosen sentence
sim_sentences(idx)

[1m Case 1: [0;0m
Most similar sentences to: 
 rapamycin has no effect on the expression of hla abc in culture treated with ifn as well. 

------------------------------------------------------------------------------------------------------------------------------------------------------ 

in normal culture mtor inhibition has no effect on the expression of hla abc.
Score= 0.8599215903032725

on the other hand dihydroxyvitamin induced ddit in a concentration dependent manner in hulm cells.
Score= 0.8051176835340293

ifn production in the absence of fucose treatment was set as the control.
Score= 0.7845211469280885

as shown in figure the metabolic activity of vero cells pretreated with the intracellular extracts of ln.
Score= 0.7804139060725089

when htra was depleted by shrna the expression of id and was largely unaffected.
Score= 0.7787930942158968

after of treatment with pge the bim fluorescence intensity was attenuated compared with that in the dkd group.
Score= 0.7739735649535

**Distance Measure:** The distance measure used in this RNN is the cosine similarity which is given by:
$$sim(x,y) = \dfrac{x.y}{||x||\ ||y||}$$
where $||x||$ is the Euclidean norm of the vector x. This measure is prevalently used in such cases since it measures the angle between the vectors in the vector-space (which is 100 dimensional in our case). Another measure that could have been used is the usual Euclidean distance. The cosine similarity was used during word vectors and it is justified using it for sentence vectors since we are obtaining the sentence vectors by adding up the hidden values which is essentially averaging. Since there is no training involved we are taking the sum, otherwise we would have taken the last hidden value.