<a href="https://colab.research.google.com/github/cemreefe/cmpe493-project/blob/main/gensim.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Gensim Doc2Vec

In [43]:
!pip3 install xmltodict

import os
import io   
import re
import json
import math
import pickle
import string
import tarfile
import xmltodict
import numpy as np
import pandas as pd

import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords



**Dataset download**

In [None]:
# Mount Google Drive at /content/drive (Colab only).
# NOTE(review): later cells use the relative path 'drive/MyDrive/...', which
# presumably relies on the working directory being /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [71]:
def read_file(path, encoding='utf-8'):
  """Return the entire contents of a text file as one string.

  Args:
    path: Location of the file to read.
    encoding: Text encoding used to decode the file. Defaults to UTF-8 so the
      result does not depend on the platform's locale settings.

  Returns:
    The decoded contents of the file.
  """
  with open(path, 'r', encoding=encoding) as f:
    return f.read()

In [None]:
# Create the Drive folder that caches the downloaded dataset files.
# exist_ok=True turns the call into a no-op when the folder is already there,
# so no separate existence check is needed.
os.makedirs('drive/MyDrive/CMPE/CMPE493', exist_ok=True)

In [None]:
if not os.path.exists('drive/MyDrive/CMPE/CMPE493/topics-rnd5.xml'):
  !curl https://ir.nist.gov/covidSubmit/data/topics-rnd5.xml --output drive/MyDrive/CMPE/CMPE493/topics-rnd5.xml

if not os.path.exists('drive/MyDrive/CMPE/CMPE493/qrels-covid_d5_j0.5-5.txt'):
  !curl https://ir.nist.gov/covidSubmit/data/qrels-covid_d5_j0.5-5.txt --output drive/MyDrive/CMPE/CMPE493/qrels-covid_d5_j0.5-5.txt

if not os.path.exists('drive/MyDrive/CMPE/CMPE493/cord-19_2020-07-16.tar.gz'):
  !curl https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/historical_releases/cord-19_2020-07-16.tar.gz --output drive/MyDrive/CMPE/CMPE493/cord-19_2020-07-16.tar.gz

In [None]:
# Unpack the CORD-19 archive into the working directory, unless a '2020-07-16'
# path already exists there (presumably left by an earlier extraction).
if not os.path.exists('2020-07-16'):
  # The context manager closes the archive even when extraction fails part-way.
  # NOTE(review): extractall() trusts the member paths stored in the archive;
  # only use it on archives from a trusted source.
  with tarfile.open('drive/MyDrive/CMPE/CMPE493/cord-19_2020-07-16.tar.gz', "r:gz") as tar:
    tar.extractall()

**Using pandas dataframes to read and prepare the data**



In [49]:
# Load the metadata table from the extracted release folder.
metadata_csv_path = '2020-07-16/metadata.csv'
df_metadata = pd.read_csv(metadata_csv_path)

  interactivity=interactivity, compiler=compiler, result=result)


In [50]:
# Delete unused data columns (cord_uid, title and abstract are kept).
# errors='ignore' keeps this cell re-runnable: a second execution finds the
# columns already gone and does nothing instead of raising KeyError.
unused_columns = [
  'sha', 'source_x', 'doi', 'pmcid', 'pubmed_id', 'license', 'publish_time',
  'authors', 'journal', 'mag_id', 'who_covidence_id', 'arxiv_id',
  'pdf_json_files', 'pmc_json_files', 'url', 's2_id',
]
df_metadata.drop(columns=unused_columns, inplace=True, errors='ignore')

In [51]:
# Delete duplicate document entries:
# when a cord_uid occurs more than once, only its first row is retained.
df_metadata = df_metadata.drop_duplicates(subset=['cord_uid'], keep='first')

In [52]:
# Display the metadata frame as the cell's output.
df_metadata

Unnamed: 0,cord_uid,title,abstract
0,ug7v899j,Clinical features of culture-proven Mycoplasma...,OBJECTIVE: This retrospective chart review des...
1,02tnwd4m,Nitric oxide: a pro-inflammatory mediator in l...,Inflammatory diseases of the respiratory tract...
2,ejv2xln0,Surfactant protein-D and pulmonary host defense,Surfactant protein-D (SP-D) participates in th...
3,2b73a28n,Role of endothelin-1 in lung disease,Endothelin-1 (ET-1) is a 21 amino acid peptide...
4,9785vg6d,Gene expression in epithelial cells in respons...,Respiratory syncytial virus (RSV) and pneumoni...
...,...,...,...
192504,z4ro6lmh,Rapid radiological improvement of COVID-19 pne...,
192505,hi8k8wvb,SARS E protein in phospholipid bilayers: an an...,Abstract We report on an anomalous X-ray refle...
192506,ma3ndg41,Italian Society of Interventional Cardiology (...,COVID‐19 pandemic raised the issue to guarante...
192507,wh10285j,"Nimble, Together: A Training Program's Respons...",


In [53]:
# Read relevances file
# Column names are prepended to the file's text so that it can be parsed as a
# space-separated table with a header row.
qrels_header = 'topic iter document_id judgement\n'
topic_relevances = qrels_header + read_file('drive/MyDrive/CMPE/CMPE493/qrels-covid_d5_j0.5-5.txt')

# The 'iter' column is not needed afterwards, so it is dropped right away.
df_relevances = pd.read_csv(io.StringIO(topic_relevances), sep=" ").drop(columns='iter')

df_relevances

Unnamed: 0,topic,document_id,judgement
0,1,005b2j4b,2
1,1,00fmeepz,1
2,1,010vptx3,2
3,1,0194oljo,1
4,1,021q9884,1
...,...,...,...
69313,50,zvop8bxh,2
69314,50,zwf26o63,1
69315,50,zwsvlnwe,0
69316,50,zxr01yln,1


In [54]:
# Read topics file
topics_obj = xmltodict.parse(read_file('drive/MyDrive/CMPE/CMPE493/topics-rnd5.xml'))
# Round-trip through JSON so the parsed structure becomes plain dicts and lists.
topics     = json.loads(json.dumps(topics_obj))

# Every topic entry has the fields '@number', 'query', 'question' and 'narrative'.
# The three text fields are joined with single spaces and stored under the
# topic's '@number'.
topics_dict = {
  entry['@number']: ' '.join((entry['query'], entry['question'], entry['narrative']))
  for entry in topics['topics']['topic']
}

# Data so far

* `topics_dict` 
      has `topic-id` for keys, and topic description for values
* `df_relevances` 
      has the following three columns:
      topic	document_id	judgement
* `df_metadata`
      holds information about the documents
      has the following three columns (others are deleted):
      cord_uid	title	abstract


In [55]:
# Download the NLTK stopwords corpus; the English list from it is loaded with
# stopwords.words('english') in the preprocessing cell.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [56]:
# Title and abstracts of the documents are concatenated
# Each row of `docs` is read positionally: [0] is used as the document id,
# [1] and [2] are formatted into the text, separated by a single space.
docs = np.array(df_metadata)
contents = {row[0]: f'{row[1]} {row[2]}' for row in docs}

### `contents` is a dictionary with document id keys and f'{document title} {document abstract}' values.
```
document_id: f'{document_title} {document_abstract}'
```

In [57]:
porter_stemmer = PorterStemmer()
sw = stopwords.words('english')

# Lookup helpers built once instead of on every preprocess() call.
_STOPWORDS = frozenset(sw)  # set membership test instead of scanning the `sw` list per token
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
_DIGITS_RE = re.compile(r"\d+")  # raw string: "\d" in a plain literal is an invalid escape sequence


def preprocess(s):
  """Turn raw text into a list of stemmed tokens.

  Steps, in order:
    1. case folding
    2. every punctuation character is replaced by a space
    3. runs of digits are deleted
    4. the text is split on whitespace
    5. English stopwords and the literal token 'nan' are dropped
       (presumably 'nan' comes from missing titles/abstracts formatted into text)
    6. the remaining tokens are Porter-stemmed

  Args:
    s: The text to process.

  Returns:
    A list of stemmed tokens; empty when nothing survives the filtering.
  """
  s = s.casefold()
  s = s.translate(_PUNCT_TO_SPACE)
  s = _DIGITS_RE.sub("", s)
  return [
    porter_stemmer.stem(word)
    for word in s.split()
    if word not in _STOPWORDS and word != 'nan'
  ]

In [58]:
import gensim

# Create a gensim training corpus
def get_corpus():
  """Yield one TaggedDocument per entry of `contents`.

  The document text is tokenised with preprocess() and the document id is
  used as the document's only tag.
  """
  for doc_id, text in contents.items():
    yield gensim.models.doc2vec.TaggedDocument(preprocess(text), [doc_id])

# Materialise the generator so the corpus can be indexed and iterated repeatedly.
train_corpus = list(get_corpus())

In [59]:
# Inspect the first tagged document of the training corpus.
train_corpus[0]

TaggedDocument(words=['clinic', 'featur', 'cultur', 'proven', 'mycoplasma', 'pneumonia', 'infect', 'king', 'abdulaziz', 'univers', 'hospit', 'jeddah', 'saudi', 'arabia', 'object', 'retrospect', 'chart', 'review', 'describ', 'epidemiolog', 'clinic', 'featur', 'patient', 'cultur', 'proven', 'mycoplasma', 'pneumonia', 'infect', 'king', 'abdulaziz', 'univers', 'hospit', 'jeddah', 'saudi', 'arabia', 'method', 'patient', 'posit', 'pneumonia', 'cultur', 'respiratori', 'specimen', 'januari', 'decemb', 'identifi', 'microbiolog', 'record', 'chart', 'patient', 'review', 'result', 'patient', 'identifi', 'requir', 'admiss', 'infect', 'commun', 'acquir', 'infect', 'affect', 'age', 'group', 'common', 'infant', 'pre', 'school', 'children', 'occur', 'year', 'round', 'common', 'fall', 'spring', 'three', 'quarter', 'patient', 'comorbid', 'twenti', 'four', 'isol', 'associ', 'pneumonia', 'upper', 'respiratori', 'tract', 'infect', 'bronchiol', 'cough', 'fever', 'malais', 'common', 'symptom', 'crepit', 'whee

In [60]:
# Create model and build vocabulary
# Hyper-parameters are collected in one place so they are easy to spot and tune.
doc2vec_params = dict(vector_size=50, min_count=0, epochs=10)
model = gensim.models.doc2vec.Doc2Vec(**doc2vec_params)
model.build_vocab(train_corpus)

In [61]:
# Train the model for 10 epochs, saving a checkpoint after every epoch
# NOTE(review): every train() call below runs exactly one epoch. gensim's
# documentation cautions against calling train() repeatedly in a manual loop
# because of how the learning rate is scheduled per call — confirm this is intended.
n_epochs = 10
for epoch in range(n_epochs):
  print(f"epoch {epoch}")
  model.train(train_corpus, total_examples=model.corpus_count, epochs=1)
  checkpoint = f"model_epoch_{epoch}"
  # Save under the name built above; the undefined name `ckpnt` was used here
  # before, which raises NameError on a fresh kernel.
  model.save(checkpoint)
  print(f"Saving {checkpoint}")

epoch 0
Saving model_epoch_0
epoch 1
Saving model_epoch_1
epoch 2
Saving model_epoch_2
epoch 3
Saving model_epoch_3
epoch 4
Saving model_epoch_4
epoch 5
Saving model_epoch_5
epoch 6
Saving model_epoch_6
epoch 7
Saving model_epoch_7
epoch 8
Saving model_epoch_8
epoch 9
Saving model_epoch_9


In [62]:
# Calculate document vectors from trained model
def get_doc_vectors():
  """Yield an inferred vector for each document, in `contents` order.

  The i-th vector is inferred from the words of train_corpus[i]; one vector
  is produced per key of `contents`.
  """
  for position in range(len(contents)):
    tagged_doc = train_corpus[position]
    yield model.infer_vector(tagged_doc.words)

doc_vectors = list(get_doc_vectors())

In [63]:
# Calculate topic vectors
def get_topic_vectors():
  """Yield an inferred vector for each topic text, in `topics_dict` order.

  Topic texts go through the same preprocess() step as the documents before
  the model infers a vector for them.
  """
  for topic_text in topics_dict.values():
    topic_tokens = preprocess(topic_text)
    yield model.infer_vector(topic_tokens)

topic_vectors = list(get_topic_vectors())

In [64]:
from sklearn.metrics.pairwise import cosine_similarity

# Calculate cosine similarity between document vectors and topic vectors
# Rows of the result follow doc_vectors, columns follow topic_vectors.
sims = cosine_similarity(X=doc_vectors, Y=topic_vectors)
sims.shape

(191175, 50)

In [65]:
# Turn similarity matrix into dataframe
# Column labels are the topic numbers taken from `topics_dict`, in the same
# order the topic vectors were generated, instead of a hard-coded 1..50 range.
# They are converted to int because the keys come from the XML as strings
# (assumes the relevance table's `topic` column is numeric — it is merged on later).
topic_numbers = [int(number) for number in topics_dict]
all_sims = pd.DataFrame(sims, columns=topic_numbers)
# One row per document, in `contents` order; list() gives pandas a plain sequence.
all_sims.insert(loc=0, column='doc_id', value=list(contents))
all_sims

Unnamed: 0,doc_id,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50
0,ug7v899j,0.510217,0.290419,-0.420940,0.104117,0.459698,0.759546,0.597959,0.446059,0.096564,0.767767,-0.375335,0.757279,0.686751,0.813195,0.602045,-0.401058,0.640280,0.639647,-0.531942,-0.145571,0.558234,0.634011,-0.286233,0.768077,0.614874,0.419839,0.553364,-0.139056,0.877275,0.446061,0.170305,0.132401,0.782284,0.810898,0.709978,0.746385,0.779310,0.808731,-0.383410,0.111775,0.740194,0.790909,0.789767,-0.476659,0.836568,0.406703,0.149617,0.581618,0.470618,-0.739610
1,02tnwd4m,0.514904,0.223475,-0.380493,-0.054793,0.478527,0.675941,0.556421,0.556519,0.075754,0.751291,-0.410097,0.710592,0.833871,0.794844,0.582653,-0.269018,0.616772,0.670298,-0.549024,-0.220630,0.572700,0.675672,-0.379126,0.627551,0.558553,0.344319,0.502025,-0.079718,0.780689,0.224118,0.065047,0.193168,0.757221,0.757302,0.734572,0.661427,0.796414,0.769710,-0.442722,-0.017816,0.721909,0.802311,0.760348,-0.404099,0.852284,0.406241,0.086892,0.570414,0.510869,-0.739583
2,ejv2xln0,0.119593,-0.127364,-0.176322,-0.046515,0.051053,0.146200,0.190187,0.031149,-0.165073,0.307142,-0.300394,0.234618,0.211119,0.272406,0.346283,-0.042242,0.055709,0.179917,-0.226061,-0.252174,0.137456,0.256052,-0.075660,0.075394,-0.047610,0.054308,0.067804,-0.193455,0.256446,-0.153333,0.007550,-0.178446,0.103225,0.260846,0.238690,0.066439,0.216625,0.219421,-0.243380,-0.183005,0.228825,0.221843,0.102959,-0.276622,0.362590,0.178283,0.095110,0.059794,-0.033423,-0.304027
3,2b73a28n,-0.252970,-0.102425,0.233077,0.326855,-0.215103,-0.290404,-0.148630,-0.339220,0.025364,-0.287894,0.112645,-0.318244,-0.178540,-0.329668,-0.167548,0.163472,-0.361229,-0.240506,0.245890,0.053780,-0.271122,-0.251024,0.198065,-0.322041,-0.221492,-0.206439,-0.207404,-0.030195,-0.323066,-0.110438,0.076241,-0.044999,-0.305407,-0.365039,-0.179620,-0.312128,-0.377078,-0.326621,0.042361,-0.074436,-0.185509,-0.373001,-0.345405,0.131767,-0.271041,0.029712,0.045976,-0.251117,-0.295455,0.299066
4,9785vg6d,0.432292,0.236456,-0.204592,0.100294,0.308543,0.648237,0.536486,0.320575,-0.008198,0.625461,-0.415829,0.608649,0.678770,0.610987,0.394375,-0.394170,0.501200,0.513636,-0.381566,-0.334832,0.535407,0.518045,-0.211124,0.579513,0.535040,0.379470,0.388972,-0.197963,0.602544,0.220784,0.115885,0.192031,0.600664,0.625938,0.594389,0.525987,0.637149,0.559649,-0.389648,0.144379,0.533795,0.590827,0.521855,-0.484675,0.707862,0.361442,0.209930,0.415796,0.285103,-0.545125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191170,z4ro6lmh,0.370104,0.311163,-0.243361,-0.037196,0.375336,0.700008,0.585929,0.465128,0.142060,0.756703,-0.369333,0.611788,0.624607,0.681085,0.569559,-0.287415,0.543607,0.672209,-0.472094,-0.306943,0.431862,0.510536,-0.469758,0.619907,0.544428,0.418270,0.393195,-0.119072,0.739344,0.488561,0.091545,0.169037,0.749646,0.641888,0.581889,0.594761,0.673785,0.705289,-0.496052,0.164606,0.604438,0.710658,0.676877,-0.451153,0.752073,0.249669,0.096457,0.527149,0.387299,-0.516231
191171,hi8k8wvb,-0.021561,-0.090377,0.004497,-0.054203,-0.087996,-0.032113,0.060502,0.161954,-0.074414,-0.024302,0.079966,0.047421,0.108065,0.040190,0.013113,0.150036,-0.015522,0.063635,-0.180612,-0.032246,-0.018342,0.071417,-0.018194,-0.124397,-0.148712,-0.081691,0.106115,0.032403,0.033577,-0.063273,-0.035811,0.086053,-0.011449,-0.033587,-0.037290,-0.068084,0.014158,0.035433,0.046710,0.028278,0.027857,0.003511,0.034324,0.120412,0.016883,-0.056238,-0.221573,0.014968,0.225055,-0.057898
191172,ma3ndg41,0.432971,0.218505,-0.333119,-0.001757,0.500271,0.718146,0.603150,0.606181,0.037073,0.820836,-0.419061,0.746810,0.758152,0.827496,0.621225,-0.380229,0.741129,0.696993,-0.537444,-0.258266,0.712775,0.676903,-0.452610,0.742873,0.608899,0.307736,0.525526,-0.127777,0.841754,0.356557,0.130438,0.237814,0.843323,0.811127,0.703298,0.762378,0.856425,0.813703,-0.584246,-0.039519,0.777738,0.834642,0.804377,-0.454182,0.862518,0.421289,0.096089,0.527068,0.496766,-0.734823
191173,wh10285j,0.539469,0.318638,-0.388899,0.036802,0.540763,0.697379,0.623631,0.373331,0.070623,0.788027,-0.559659,0.714497,0.720056,0.749880,0.631461,-0.366651,0.624806,0.595258,-0.436413,-0.258798,0.613679,0.601124,-0.392440,0.660292,0.560189,0.481845,0.438107,-0.209752,0.824415,0.287437,0.070388,0.179708,0.732805,0.723135,0.707326,0.703474,0.751507,0.848889,-0.458974,0.018723,0.724166,0.769958,0.743375,-0.468656,0.836362,0.428140,-0.002563,0.581530,0.416721,-0.655158


In [66]:
# Reshape the dataframe
# Wide (one column per topic) -> long (one row per document/topic pair),
# then rename the id column to 'document_id'.
all_sims = (
    all_sims
    .melt(id_vars=['doc_id'], var_name='topic', value_name='similarity')
    .rename(columns={'doc_id': 'document_id'})
)
all_sims

Unnamed: 0,document_id,topic,similarity
0,ug7v899j,1,0.510217
1,02tnwd4m,1,0.514904
2,ejv2xln0,1,0.119593
3,2b73a28n,1,-0.252970
4,9785vg6d,1,0.432292
...,...,...,...
9558745,z4ro6lmh,50,-0.516231
9558746,hi8k8wvb,50,-0.057898
9558747,ma3ndg41,50,-0.734823
9558748,wh10285j,50,-0.655158


In [67]:
# Only use document-topic pairs that are present in df_relevances
# Inner join on (document_id, topic); the judgement column is discarded
# because only the similarity scores are written out.
merge_keys = ['document_id', 'topic']
new_results = (
    pd.merge(all_sims, df_relevances, on=merge_keys)
    .drop(columns='judgement')
)
new_results

Unnamed: 0,document_id,topic,similarity
0,sw4wtxdk,1,0.377638
1,6wu024ng,1,0.311055
2,sbxqwfmy,1,0.135647
3,1rhy8td0,1,0.153037
4,t7rxmzvi,1,-0.051768
...,...,...,...
69313,j6y806qu,50,0.165518
69314,bv6xa8v8,50,-0.618492
69315,7g3p570l,50,0.302385
69316,eqfz0wpm,50,-0.667896


In [72]:
# Prepare results for writing to file
# Only take even topics for evaluation
# Each line has the form: <topic> 0 <document_id> 0 <similarity> 0
# (row[0] is the document id, row[1] the topic, row[2] the similarity).
nae = np.array(new_results)
results = [
  f'{row[1]} 0 {row[0]} 0 {row[2]} 0'
  for row in nae
  if row[1] % 2 == 0
]

In [73]:
# Preview the first ten formatted result lines.
results[:10]

['2 0 bbvxu8op 0 -0.022221462801098824 0',
 '2 0 s4y6uxsb 0 0.15705905854701996 0',
 '2 0 1qo1krxv 0 0.3636695146560669 0',
 '2 0 0rq0wdpq 0 -0.00459859287366271 0',
 '2 0 1rzcrkmt 0 0.16667301952838898 0',
 '2 0 gbdaad4l 0 -0.02000168338418007 0',
 '2 0 h3yxymh3 0 0.2522464692592621 0',
 '2 0 ni6iyzdn 0 0.11236879229545593 0',
 '2 0 1rhy8td0 0 0.24380475282669067 0',
 '2 0 yci0a6bt 0 0.00887655932456255 0']

In [74]:
# Write results to file
# One result per line; no newline is written after the last line.
results_text = '\n'.join(results)
with open('results_gensim.txt', 'w') as out_file:
    out_file.write(results_text)