# NLP2022: Assignment 4

### -Avirup Das (MDS202013)

In [None]:
# Importing necessary libraries
import os, gensim, logging, pickle
from gensim.models import word2vec
import pandas as pd
from tqdm import tqdm
import networkx as nx
import matplotlib.pyplot as plt

# Mounting google drive (Colab-only: mounts the user's Drive at /content/drive)
from google.colab import drive
drive.mount('/content/drive')

# Setting paths
# path_to_json is not referenced by any of the cells visible in this notebook.
path_to_json = 'pdf_json/'
# NOTE(review): this is a relative path, so it only points inside the mount
# above when the working directory is /content (presumably the Colab default
# - confirm before running elsewhere). The trailing slash matters: the path is
# used below via plain string concatenation.
drive_directory = 'drive/MyDrive/Data/NLP2022/'

Mounted at /content/drive


In [None]:
%%time
def word_vecs(token_name, vector_size=128, window=5, min_count=50, workers=10, epochs=10):
  """Train a Word2Vec model on the pickled token batches stored in `drive_directory`.

  Every file in `drive_directory` whose name starts with `token_name` is
  unpickled and its tokens are appended, space separated, to the local file
  'corp.txt'. That file is then read back through `word2vec.Text8Corpus` and
  used to train the model, which is saved to
  `drive_directory + 'text8.model.bin'`.

  Args:
    token_name: filename prefix selecting the token batch files.
    vector_size: dimensionality of the word vectors.
    window: context window size passed to Word2Vec.
    min_count: words occurring fewer times than this are dropped.
    workers: number of training worker threads.
    epochs: number of passes over the corpus.

  Returns:
    The trained `Word2Vec` model.

  Raises:
    FileNotFoundError: if no file in `drive_directory` starts with `token_name`.
  """
  # Loading tokens from drive
  print('*** Loading Tokens ***')
  # os.listdir returns entries in arbitrary order; sorting makes the corpus
  # (and therefore training) identical from run to run.
  token_files = sorted(
      tok_batch for tok_batch in os.listdir(drive_directory)
      if tok_batch.startswith(token_name)
  )
  if not token_files:
    raise FileNotFoundError(
        f"No token files starting with {token_name!r} found in {drive_directory!r}")

  # Stream one batch at a time into the corpus file instead of holding every
  # token (plus one giant joined string) in memory. The resulting file content
  # is the same single space-separated line.
  with open('corp.txt', 'w', encoding='utf-8') as corpus_file:
    wrote_any = False
    for batch_name in token_files:
      # SECURITY: pickle.load can execute arbitrary code - only use on files
      # you produced yourself.
      with open(os.path.join(drive_directory, batch_name), 'rb') as f:
        batch = list(pickle.load(f))
      if not batch:
        continue
      if wrote_any:
        corpus_file.write(" ")
      corpus_file.write(" ".join(batch))
      wrote_any = True

  text = word2vec.Text8Corpus('corp.txt')
  print('*** Text Extracted ***')
  logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

  # gensim 4 renamed `size` -> `vector_size` and `iter` -> `epochs`; pick the
  # keyword names that match the installed major version.
  if int(gensim.__version__.split('.')[0]) >= 4:
    training_kwargs = {'vector_size': vector_size, 'epochs': epochs}
  else:
    training_kwargs = {'size': vector_size, 'iter': epochs}
  model = word2vec.Word2Vec(text, window=window, min_count=min_count,
                            workers=workers, **training_kwargs)
  model.save(os.path.join(drive_directory, 'text8.model.bin'))
  return model
  

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 20 µs


In [None]:
# Train on every pickled batch whose filename starts with 'token_batch'.
# word_vecs also saves the trained model to Drive as 'text8.model.bin'.
model = word_vecs('token_batch')

*** Loading Tokens ***
*** Text Extracted ***


2022-04-30 14:06:30,355 : INFO : collecting all words and their counts
2022-04-30 14:06:30,363 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-04-30 14:07:06,695 : INFO : PROGRESS: at sentence #10000, processed 100000000 words, keeping 576691 word types
2022-04-30 14:07:12,299 : INFO : collected 629005 word types from a corpus of 115314245 raw words and 11532 sentences
2022-04-30 14:07:12,300 : INFO : Loading a fresh vocabulary
2022-04-30 14:07:13,122 : INFO : effective_min_count=50 retains 43600 unique words (6% of original 629005, drops 585405)
2022-04-30 14:07:13,123 : INFO : effective_min_count=50 leaves 112957994 word corpus (97% of original 115314245, drops 2356251)
2022-04-30 14:07:13,282 : INFO : deleting the raw counts dictionary of 629005 items
2022-04-30 14:07:13,333 : INFO : sample=0.001 downsamples 21 most-common words
2022-04-30 14:07:13,335 : INFO : downsampling leaves estimated 109731446 word corpus (97.1% of prior 112957994)
2022-04-30 1

In [None]:
# Loading model from Drive
# Reads back the file written by word_vecs ('text8.model.bin') and rebinds
# `model`; presumably this lets the cells below run without retraining.
model = word2vec.Word2Vec.load(drive_directory+'text8.model.bin')

In [None]:
# Calculate similar words
def calc_sim_word(model, source, topn=5):
  """Return `source` followed by its `topn` most similar words.

  Looks the word up with `model.wv.most_similar`. If that raises a KeyError,
  the error arguments are printed and the result contains only `source`.
  """
  print(f'Computing similar words for source - {source}')
  try:
    # most_similar yields (word, score) pairs; only the word is kept.
    neighbours = [hit[0] for hit in model.wv.most_similar(source, topn=topn)]
  except KeyError as err:
    print(err.args)
    neighbours = []
  return [source, *neighbours]

# Function to find similar words three levels deep. Going four levels deep made the graph too cluttered, so the search stops at level 3.
def find_sim_word(model, keywords, save=True, topn=5):
  '''
  Collect similar words three levels deep for every keyword.

  keywords receives a list of keywords from the vocabulary
  which in turn would be used to find similar words.

  For each keyword one row is recorded for the keyword itself, one for each
  of its similar words, and one for each similar word of those. A row is
  [word, similar_1, ..., similar_topn]; rows are emitted depth-first, in the
  order the words are visited, and repeated words produce repeated rows.

  Words for which calc_sim_word finds no similar words (for example words
  missing from the vocabulary) contribute no row; previously such a word
  made the row assignment fail on a column-count mismatch. Rows with fewer
  than topn similar words are padded with None.

  Args:
    model: model handed to calc_sim_word.
    keywords: iterable of seed words.
    save: when True, write the table to drive_directory + 'sim_words.csv'.
    topn: number of similar words requested per word.

  Returns:
    pandas DataFrame with columns word_0 .. word_{topn}.
  '''
  columns = [f'word_{i}' for i in range(topn + 1)]
  rows = []

  def expand(word):
    # Look up `word`, record its row when it has neighbours, and hand back
    # the full [word, neighbours...] list so the caller can descend further.
    sim_word = calc_sim_word(model, word, topn=topn)[:len(columns)]
    if len(sim_word) > 1:
      rows.append(sim_word + [None] * (len(columns) - len(sim_word)))
    return sim_word

  for source in keywords:
    sim_word = expand(source)
    print(sim_word)

    for first_level in sim_word[1:]:
      for second_level in expand(first_level)[1:]:
        expand(second_level)

  # Build the frame once instead of appending row by row (quadratic).
  df = pd.DataFrame(rows, columns=columns)

  if save:
    df.to_csv(drive_directory+'sim_words.csv')

  return df

In [None]:
# Earlier candidate keyword list, kept for reference exactly as it was noted:
# [c'patient', 'ambulance', 'pulmonary', 'oxygen', 'antibiotic', c'respiration', 'treatment', 'mortality', c'blood', 'symptom']
# Seed keywords actually used for the similarity search.
keywords = [
    'ill', 'ambulance', 'pulmonary', 'oxygen', 'antibiotic',
    'serum', 'symptom', 'treatment', 'mortality', 'seroconversion',
]
df = find_sim_word(model, keywords, save=False)

# Colour of each keyword: colours are paired with the keywords in list order.
cmap_components = dict(zip(keywords, [
    '#E74C3C', '#8E44AD', '#F4D03F', '#27AE60', '#5499C7',
    '#A04000', '#5B2C6F', '#138D75', '#717D7E', '#34495E',
]))
df

Computing similar words for source - ill
['ill', 'unwell', 'dyspneic', 'afflicted', 'sick', 'anemic']
Computing similar words for source - unwell
Computing similar words for source - ill
Computing similar words for source - dyspneic
Computing similar words for source - sick
Computing similar words for source - afebr
Computing similar words for source - breathless
Computing similar words for source - dyspneic
Computing similar words for source - tachypneic
Computing similar words for source - unwell
Computing similar words for source - afebr
Computing similar words for source - tachycardic
Computing similar words for source - breathless
Computing similar words for source - afflicted
Computing similar words for source - afflict
Computing similar words for source - affected
Computing similar words for source - undernourished
Computing similar words for source - distressed
Computing similar words for source - unwell
Computing similar words for source - sick
Computing similar words for sour

Unnamed: 0,word_0,word_1,word_2,word_3,word_4,word_5
0,ill,unwell,dyspneic,afflicted,sick,anemic
1,unwell,ill,dyspneic,sick,afebr,breathless
2,ill,unwell,dyspneic,afflicted,sick,anemic
3,dyspneic,tachypneic,unwell,afebr,tachycardic,breathless
4,sick,unwell,caretaker,afraid,refuse,home
...,...,...,...,...,...,...
305,seroconversion,seroconversions,seropositivity,seroconverted,seroposit,viraemia
306,recrudescence,reactivation,recrudescent,persistence,reinfection,reactivations
307,viruria,bkv,dnaemia,cand,hypogammaglobul,hyperhomocyste
308,viraemic,viraemia,recrudescence,seronegat,seroconverted,seroconvert


In [None]:
words = df

# Construct the graph.
# Each row of `words` holds a source word in its first column followed by the
# words most similar to it, so the source is linked to every other
# (non-missing) entry of its row. The graph starts empty: seeding it with a
# stock example graph would inject unrelated integer nodes.
g = nx.Graph()
for source, *neighbours in words.itertuples(index=False, name=None):
  g.add_node(source)
  g.add_edges_from((source, neighbour) for neighbour in neighbours if pd.notna(neighbour))

# Keep only edges between well-connected words. The thresholds (3 for the
# first endpoint, 5 for the second) are applied to the endpoints in the order
# g.edges() reports them.
# NOTE(review): the graph is undirected, so that order is not semantically
# meaningful - confirm whether asymmetric thresholds are intended.
degree = dict(g.degree())
f = nx.Graph()
f.add_edges_from((u, v) for u, v in g.edges() if degree[u] >= 3 and degree[v] >= 5)

# Create the figure with its size up front; changing rcParams after the figure
# exists would only affect figures created later (i.e. a re-run of the cell).
fig, ax = plt.subplots(figsize=(80, 50))
options = {'font_size': 20}

# One shared, seeded layout so every component is placed consistently and the
# picture is reproducible; separate nx.draw calls without `pos` would each
# compute their own random layout on the same axes.
pos = nx.spring_layout(f, seed=0)

for keyword in keywords:
  if keyword not in f:
    # node_connected_component fails for nodes absent from the graph.
    print(f'Skipping {keyword!r}: no edges left after degree filtering')
    continue
  component = nx.node_connected_component(f, keyword)
  # Keywords that share a component are drawn over each other; the colour of
  # the keyword drawn last is the one that remains visible.
  nx.draw(g.subgraph(component), pos=pos, ax=ax, with_labels=True, **options,
          edge_color=cmap_components[keyword], node_color=cmap_components[keyword],
          label=keyword)

ax.legend(loc='upper left', prop={'size': 20})
ax.set_title("Knowledge graph", fontsize=30)
fig.savefig(drive_directory+'knowledge_graph.pdf')
plt.show()

Output hidden; open in https://colab.research.google.com to view.