<a href="https://colab.research.google.com/github/cemreefe/cmpe493-project/blob/main/gensim.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Gensim Doc2Vec

In [1]:
!pip3 install xmltodict

import os
import io   
import re
import json
import math
import pickle
import string
import tarfile
import xmltodict
import numpy as np
import pandas as pd

import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

Collecting xmltodict
  Downloading https://files.pythonhosted.org/packages/28/fd/30d5c1d3ac29ce229f6bdc40bbc20b28f716e8b363140c26eff19122d8a5/xmltodict-0.12.0-py2.py3-none-any.whl
Installing collected packages: xmltodict
Successfully installed xmltodict-0.12.0


**Dataset download**

In [2]:
# Mount Google Drive at /content/drive; the dataset files used below are stored
# under drive/MyDrive/CMPE/CMPE493.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
def read_file(path, encoding='utf-8'):
  """Return the whole content of the text file at `path` as a single string.

  Args:
    path: location of the file to read.
    encoding: text encoding used to decode the file. Defaults to UTF-8 so the
      result does not depend on the platform's default locale encoding.

  Returns:
    The decoded file content.
  """
  with open(path, 'r', encoding=encoding) as f:
    return f.read()

In [4]:
# Make sure the project folder on the mounted drive exists.
# exist_ok=True makes the call a no-op when the folder is already there, which
# replaces the separate exists() check.
os.makedirs('drive/MyDrive/CMPE/CMPE493', exist_ok=True)

In [5]:
# Download each input file into the project folder on the drive, skipping any
# file that is already present there.
# NOTE(review): curl is called without --fail, so an HTTP error page would
# presumably be saved under the dataset name and then be treated as "already
# downloaded" on the next run — confirm the downloads succeeded.

# Topic definitions (XML).
if not os.path.exists('drive/MyDrive/CMPE/CMPE493/topics-rnd5.xml'):
  !curl https://ir.nist.gov/covidSubmit/data/topics-rnd5.xml --output drive/MyDrive/CMPE/CMPE493/topics-rnd5.xml

# Relevance judgements (qrels) for the topics.
if not os.path.exists('drive/MyDrive/CMPE/CMPE493/qrels-covid_d5_j0.5-5.txt'):
  !curl https://ir.nist.gov/covidSubmit/data/qrels-covid_d5_j0.5-5.txt --output drive/MyDrive/CMPE/CMPE493/qrels-covid_d5_j0.5-5.txt

# CORD-19 historical release archive dated 2020-07-16.
if not os.path.exists('drive/MyDrive/CMPE/CMPE493/cord-19_2020-07-16.tar.gz'):
  !curl https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/historical_releases/cord-19_2020-07-16.tar.gz --output drive/MyDrive/CMPE/CMPE493/cord-19_2020-07-16.tar.gz

In [6]:
# Unpack the CORD-19 archive into the working directory, unless the
# '2020-07-16' folder it produces is already there.
if not os.path.exists('2020-07-16'):
  # The with-block closes the archive even if extraction raises.
  # NOTE(review): extractall() writes whatever member paths the archive
  # contains; acceptable only because the archive comes from a trusted source.
  with tarfile.open('drive/MyDrive/CMPE/CMPE493/cord-19_2020-07-16.tar.gz', "r:gz") as tar:
    tar.extractall()

**Using pandas dataframes to read and prepare the data**



In [7]:
# Load the metadata table of the extracted 2020-07-16 release into a DataFrame.
df_metadata = pd.read_csv('2020-07-16/metadata.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [8]:
# Remove, in place, every metadata column that the rest of the notebook does not read.
unused_columns = [
  'sha', 'source_x', 'doi', 'pmcid', 'pubmed_id', 'license', 'publish_time',
  'authors', 'journal', 'mag_id', 'who_covidence_id', 'arxiv_id',
  'pdf_json_files', 'pmc_json_files', 'url', 's2_id',
]
for column_name in unused_columns:
  del df_metadata[column_name]

In [9]:
# Delete duplicate document entries
# Rows sharing a cord_uid are collapsed to the first occurrence; the frame is
# modified in place, so re-running this cell is harmless.
df_metadata.drop_duplicates(subset='cord_uid', keep='first', inplace=True)

In [10]:
# Show the metadata frame after column removal and de-duplication.
df_metadata

Unnamed: 0,cord_uid,title,abstract
0,ug7v899j,Clinical features of culture-proven Mycoplasma...,OBJECTIVE: This retrospective chart review des...
1,02tnwd4m,Nitric oxide: a pro-inflammatory mediator in l...,Inflammatory diseases of the respiratory tract...
2,ejv2xln0,Surfactant protein-D and pulmonary host defense,Surfactant protein-D (SP-D) participates in th...
3,2b73a28n,Role of endothelin-1 in lung disease,Endothelin-1 (ET-1) is a 21 amino acid peptide...
4,9785vg6d,Gene expression in epithelial cells in respons...,Respiratory syncytial virus (RSV) and pneumoni...
...,...,...,...
192504,z4ro6lmh,Rapid radiological improvement of COVID-19 pne...,
192505,hi8k8wvb,SARS E protein in phospholipid bilayers: an an...,Abstract We report on an anomalous X-ray refle...
192506,ma3ndg41,Italian Society of Interventional Cardiology (...,COVID‐19 pandemic raised the issue to guarante...
192507,wh10285j,"Nimble, Together: A Training Program's Respons...",


In [11]:
# Read relevances file
# The file has no header row and is space separated, so the column names are
# passed to read_csv directly instead of prepending a header line to the whole
# file content and parsing it through an in-memory buffer.
df_relevances = pd.read_csv(
  'drive/MyDrive/CMPE/CMPE493/qrels-covid_d5_j0.5-5.txt',
  sep=" ",
  names=['topic', 'iter', 'document_id', 'judgement'],
)
# The 'iter' column is not used anywhere below.
del df_relevances['iter']

df_relevances

Unnamed: 0,topic,document_id,judgement
0,1,005b2j4b,2
1,1,00fmeepz,1
2,1,010vptx3,2
3,1,0194oljo,1
4,1,021q9884,1
...,...,...,...
69313,50,zvop8bxh,2
69314,50,zwf26o63,1
69315,50,zwsvlnwe,0
69316,50,zxr01yln,1


In [12]:
# Parse the topics XML file into nested Python containers.
topics_xml = read_file('drive/MyDrive/CMPE/CMPE493/topics-rnd5.xml')
topics_obj = xmltodict.parse(topics_xml)
# The JSON round trip turns the parsed object into built-in dict/list/str values.
topics     = json.loads(json.dumps(topics_obj))

# Each topic carries the fields @number, query, question and narrative.
# topics_dict maps the topic number to the three text fields joined by spaces.
topics_dict = {}
for topic in topics['topics']['topic']:
  text_fields = (topic['query'], topic['question'], topic['narrative'])
  topics_dict[topic['@number']] = ' '.join(text_fields)

# Data so far

* `topics_dict` 
      maps each topic number (the `@number` attribute) to the concatenated query, question and narrative text of that topic
* `df_relevances` 
      has the following three columns:
      topic	document_id	judgement
* `df_metadata`
      holds information about the documents
      has the following three columns (others are deleted):
      cord_uid	title	abstract


In [13]:
# Download the nltk stopwords corpus; stopwords.words('english') in the
# preprocessing cell reads from it.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [14]:
# Title and abstracts of the documents are concatenated
# Missing titles/abstracts are replaced by '' first; otherwise the NaN value
# would be formatted into the document text as the literal word 'nan'.
# Columns are selected by name so the result does not depend on column order.
docs = df_metadata[['cord_uid', 'title', 'abstract']].fillna({'title': '', 'abstract': ''})

# contents maps cord_uid -> '<title> <abstract>'
contents = {
  cord_uid: f'{title} {abstract}'
  for cord_uid, title, abstract in docs.itertuples(index=False, name=None)
}

### `contents` is a dictionary that maps each document id (`cord_uid`) to the document's title and abstract joined by a space.
```
document_id: f'{document_title} {document_abstract}'
```

In [15]:
porter_stemmer = PorterStemmer()
sw = stopwords.words('english')

# Helpers built once instead of on every preprocess() call.
# Set copy of the stopword list, so each membership test is a hash lookup
# rather than a scan of the whole list.
_sw_lookup = frozenset(sw)
# Translation table that turns every ASCII punctuation character into a space.
_punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
# Raw string: "\d" inside a normal string literal is an invalid escape sequence.
_digits_re = re.compile(r"\d+")

# preprocessing
# case folding
# punctuation removal
# number deletion
# stemming & stopword removal
def preprocess(s):
  """Turn raw text into a list of stemmed tokens.

  Steps, in order: case-fold the text, replace punctuation with spaces, delete
  digit runs (without inserting a space, so 'covid19' becomes 'covid'), split
  on whitespace, drop stopwords and the literal token 'nan', then stem what is
  left with the Porter stemmer.

  Args:
    s: the text to tokenize.

  Returns:
    List of stemmed tokens; empty when nothing survives the filters.
  """
  s = s.casefold()
  s = s.translate(_punct_to_space)
  s = _digits_re.sub("", s) #Delete numbers
  # 'nan' is filtered because a missing title/abstract formatted into the
  # document text shows up as that literal word.
  s = [porter_stemmer.stem(word) for word in s.split() if word not in _sw_lookup and word != 'nan']
  return s

In [16]:
import gensim

# Build the gensim training corpus from `contents`
def get_corpus():
  """Yield one TaggedDocument per entry of `contents`.

  The words are the preprocessed tokens of the document text and the single
  tag is the document id, in the iteration order of `contents`.
  """
  for doc_id, text in contents.items():
    yield gensim.models.doc2vec.TaggedDocument(preprocess(text), [doc_id])

train_corpus = list(get_corpus())

In [17]:
# Inspect the first TaggedDocument as a spot check of the preprocessing.
train_corpus[0]

TaggedDocument(words=['clinical', 'features', 'culture', 'proven', 'mycoplasma', 'pneumoniae', 'infections', 'king', 'abdulaziz', 'university', 'hospital', 'jeddah', 'saudi', 'arabia', 'objective', 'retrospective', 'chart', 'review', 'describes', 'epidemiology', 'clinical', 'features', 'patients', 'culture', 'proven', 'mycoplasma', 'pneumoniae', 'infections', 'king', 'abdulaziz', 'university', 'hospital', 'jeddah', 'saudi', 'arabia', 'methods', 'patients', 'positive', 'pneumoniae', 'cultures', 'respiratory', 'specimens', 'january', 'december', 'identified', 'microbiology', 'records', 'charts', 'patients', 'reviewed', 'results', 'patients', 'identified', 'required', 'admission', 'infections', 'community', 'acquired', 'infection', 'affected', 'age', 'groups', 'common', 'infants', 'pre', 'school', 'children', 'occurred', 'year', 'round', 'common', 'fall', 'spring', 'three', 'quarters', 'patients', 'comorbidities', 'twenty', 'four', 'isolates', 'associated', 'pneumonia', 'upper', 'respirat

In [18]:
# Create model and build vocabulary
# vector_size=50: dimensionality of every inferred document/topic vector
# min_count=0: no frequency cut-off, so every token of the corpus stays in the vocabulary
# epochs=10: number of passes stored on the model as its default
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=0, epochs=10)
# Scan the tagged documents once to register the words and document tags.
model.build_vocab(train_corpus)

In [28]:
# Train the model for 10 epochs, one pass per iteration, writing a checkpoint
# file named model_epoch_<n> after every pass.
# NOTE(review): each train(..., epochs=1) call presumably restarts the
# learning-rate decay from the model's initial alpha; gensim's documentation
# advises against repeated train() calls unless start_alpha/end_alpha are
# managed explicitly — confirm that the per-epoch loop is intended.
for epoch in range(10):
  # Interpolate the loop variable (the literal {0} always printed "epoch 0").
  print(f"epoch {epoch}")
  model.train(train_corpus, total_examples=model.corpus_count, epochs=1)
  checkpoint = f"model_epoch_{epoch}"
  # Save under the name built above (the previous `ckpnt` was never defined).
  model.save(checkpoint)
  print(f"Saving {checkpoint}")

epoch 0
Saving model_epoch_0
epoch 0
Saving model_epoch_1
epoch 0
Saving model_epoch_2
epoch 0
Saving model_epoch_3
epoch 0
Saving model_epoch_4
epoch 0
Saving model_epoch_5
epoch 0
Saving model_epoch_6
epoch 0
Saving model_epoch_7
epoch 0
Saving model_epoch_8
epoch 0
Saving model_epoch_9


In [None]:
# If the model is already saved, load it
# The existence check keeps Run All working when no saved model is present:
# the model trained above stays in use instead of the cell raising on a
# missing file.
if os.path.exists("/content/model_saved"):
  model = gensim.models.Doc2Vec.load("/content/model_saved")

In [32]:
# Calculate document vectors from trained model
def get_doc_vectors():
  """Yield one inferred vector per training document.

  Vectors come out in the order of `train_corpus`, which was built by
  iterating `contents`, so position i corresponds to the i-th key of
  `contents`. Iterating the corpus directly avoids the positional lookup
  through `contents.keys()` and the unused `id` variable that shadowed the
  builtin.
  """
  for tagged_doc in train_corpus:
    yield model.infer_vector(tagged_doc.words)

doc_vectors = list(get_doc_vectors())

In [33]:
# Infer one vector per topic description
def get_topic_vectors():
  """Yield the inferred vector of every topic text, in `topics_dict` order."""
  for topic_text in topics_dict.values():
    topic_tokens = preprocess(topic_text)
    yield model.infer_vector(topic_tokens)

topic_vectors = list(get_topic_vectors())

In [22]:
from sklearn.metrics.pairwise import cosine_similarity

# Calculate cosine similarity between document vectors and topic vectors
# Rows follow doc_vectors and columns follow topic_vectors, so sims[i, j] is
# the similarity between document i and topic j.
sims = cosine_similarity(doc_vectors, topic_vectors)
# Show (number of documents, number of topics) as a sanity check.
sims.shape

(191175, 50)

In [37]:
# Turn similarity matrix into dataframe
# Column labels are the topic numbers taken from topics_dict, in the same order
# in which the topic vectors were produced, rather than a hard-coded 1..50.
topic_numbers = [int(number) for number in topics_dict]
all_sims = pd.DataFrame(sims, columns=topic_numbers)
# Document ids go first; the key view is materialised as a list so pandas
# receives a plain sequence aligned with the rows of `sims`.
all_sims.insert(loc=0, column='doc_id', value=list(contents.keys()))
all_sims

Unnamed: 0,doc_id,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50
0,ug7v899j,0.596043,0.114696,0.569689,0.238675,-0.127176,0.515202,0.456173,0.653287,-0.536386,0.503201,-0.287935,0.259197,0.235555,0.608375,0.357777,-0.378631,0.458299,0.614574,-0.238087,0.464924,0.538290,0.300126,-0.337744,0.542985,0.620747,0.662459,0.644062,0.400177,0.673938,0.637860,0.226369,-0.241241,-0.127543,0.630616,0.234150,0.654674,0.478214,-0.098738,0.443163,0.300569,0.644467,0.444021,0.468761,-0.496850,0.589724,0.001500,-0.017369,0.684079,0.539584,-0.204076
1,02tnwd4m,0.656149,0.165370,0.610869,0.132833,0.080571,0.717309,0.623897,0.485581,-0.678951,0.293575,-0.450048,0.265266,0.433568,0.756284,0.418513,-0.384763,0.377901,0.582398,-0.290251,0.588656,0.577001,0.161587,-0.534044,0.459400,0.651151,0.640244,0.623260,0.480368,0.639249,0.583449,0.014136,-0.407766,-0.223433,0.713289,0.151208,0.667113,0.552959,-0.275068,0.537903,0.130020,0.659636,0.592693,0.580974,-0.597770,0.572516,-0.059740,0.217653,0.678070,0.710973,-0.162762
2,ejv2xln0,0.503608,0.143631,0.365914,0.046706,-0.120822,0.415910,0.325123,0.372858,-0.603359,0.162930,-0.325481,0.237815,0.209885,0.469082,0.261437,-0.215242,0.273188,0.337350,-0.051667,0.295950,0.433364,0.245466,-0.368732,0.444684,0.439286,0.435452,0.458754,0.217094,0.419068,0.333199,0.133156,-0.370596,-0.289526,0.366026,0.102792,0.506419,0.284314,-0.249283,0.149176,0.013223,0.450067,0.291749,0.321596,-0.434872,0.491575,-0.047966,0.137739,0.455059,0.330447,-0.187430
3,2b73a28n,0.790354,0.267563,0.489167,0.081084,0.003402,0.724547,0.626692,0.753026,-0.739408,0.509820,-0.397364,0.509335,0.362914,0.808294,0.504698,-0.401835,0.649564,0.655198,-0.294628,0.620412,0.610512,0.242995,-0.471842,0.651785,0.832867,0.782230,0.791959,0.508068,0.831725,0.518505,0.144765,-0.282651,-0.248390,0.713912,0.425446,0.774063,0.625803,-0.090331,0.489296,0.290405,0.712139,0.640716,0.559953,-0.488264,0.711151,0.064795,0.217072,0.755713,0.717317,-0.130817
4,9785vg6d,0.644597,0.338061,0.458881,0.070150,-0.015888,0.558650,0.473704,0.628643,-0.635602,0.360529,-0.410649,0.396382,0.344974,0.598681,0.391070,-0.309860,0.484950,0.400675,-0.262776,0.468335,0.593520,0.183372,-0.375889,0.399032,0.656626,0.610456,0.565504,0.106015,0.672027,0.404817,0.160718,-0.367060,-0.308177,0.564198,0.121206,0.652034,0.583035,-0.011258,0.352291,0.438707,0.570343,0.447755,0.464247,-0.428481,0.547659,0.074129,0.282718,0.645842,0.626474,0.118887
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191170,z4ro6lmh,0.813713,0.374905,0.608504,0.067827,-0.008511,0.712775,0.703821,0.802897,-0.744554,0.507535,-0.421446,0.555457,0.484747,0.776452,0.550110,-0.377455,0.642911,0.669504,-0.197367,0.644339,0.642197,0.151296,-0.527909,0.571084,0.886674,0.770175,0.796110,0.448347,0.858879,0.634033,0.246687,-0.414748,-0.243374,0.802374,0.332260,0.748461,0.675483,-0.226129,0.531397,0.355722,0.762515,0.675058,0.682007,-0.519126,0.763705,-0.067752,0.163741,0.826348,0.740035,-0.005892
191171,hi8k8wvb,0.649675,0.114893,0.462234,-0.068550,0.018868,0.644254,0.505011,0.501255,-0.632540,0.332476,-0.219932,0.298587,0.383206,0.677271,0.425010,-0.249967,0.517149,0.602181,-0.233736,0.490201,0.414454,0.141578,-0.512417,0.411307,0.640041,0.636786,0.590064,0.485456,0.583818,0.469067,0.078790,-0.279283,-0.162722,0.561244,0.221699,0.601676,0.374632,-0.224308,0.558843,0.172132,0.614412,0.502536,0.496785,-0.407790,0.544263,-0.078205,0.107230,0.625533,0.627175,-0.266682
191172,ma3ndg41,0.894369,0.362060,0.598848,-0.016565,-0.068264,0.770542,0.749940,0.743219,-0.810962,0.499976,-0.440890,0.562289,0.466509,0.876474,0.577003,-0.396449,0.710532,0.709765,-0.345624,0.605658,0.729826,0.126267,-0.597500,0.634788,0.897372,0.847421,0.792024,0.510327,0.891980,0.654358,0.230885,-0.433814,-0.196575,0.801364,0.359183,0.829799,0.637401,-0.373251,0.561746,0.335523,0.831983,0.695148,0.670259,-0.630591,0.822286,-0.072720,0.225691,0.805893,0.790287,0.018404
191173,wh10285j,0.802626,0.176393,0.507098,0.056303,-0.020775,0.718485,0.654296,0.703406,-0.716990,0.519769,-0.373201,0.404194,0.400792,0.818562,0.507517,-0.476489,0.633749,0.655930,-0.424602,0.596194,0.629921,0.119599,-0.467928,0.693819,0.834927,0.803754,0.732844,0.541977,0.812887,0.623481,-0.009405,-0.406031,-0.245299,0.727019,0.333128,0.820710,0.628126,-0.186122,0.598490,0.289245,0.731841,0.656103,0.655946,-0.591733,0.708342,0.008535,0.226135,0.737017,0.760663,-0.150136


In [38]:
# Go from one column per topic to one row per (document, topic) pair
all_sims = (
    all_sims
    .melt(id_vars=['doc_id'], var_name='topic', value_name='similarity')
    .rename(columns={'doc_id': 'document_id'})
)
all_sims

Unnamed: 0,document_id,topic,similarity
0,ug7v899j,1,0.596043
1,02tnwd4m,1,0.656149
2,ejv2xln0,1,0.503608
3,2b73a28n,1,0.790354
4,9785vg6d,1,0.644597
...,...,...,...
9558745,z4ro6lmh,50,-0.005892
9558746,hi8k8wvb,50,-0.266682
9558747,ma3ndg41,50,0.018404
9558748,wh10285j,50,-0.150136


In [40]:
# Keep just the (document, topic) pairs that have a relevance judgement:
# inner-join on both keys, then discard the judgement column again.
new_results = (
    pd.merge(all_sims, df_relevances, on=['document_id', 'topic'])
    .drop(columns='judgement')
)
new_results

Unnamed: 0,document_id,topic,similarity
0,sw4wtxdk,1,0.699380
1,6wu024ng,1,0.637197
2,sbxqwfmy,1,0.275676
3,1rhy8td0,1,0.333234
4,t7rxmzvi,1,0.338207
...,...,...,...
69313,j6y806qu,50,0.331123
69314,bv6xa8v8,50,0.128356
69315,7g3p570l,50,0.107246
69316,eqfz0wpm,50,-0.008581


In [41]:
# Format every result row as the text line '<topic> 0 <document_id> 0 <similarity> 0'
# NOTE(review): presumably a TREC-style run line with constant placeholder
# fields — confirm against the evaluation tool.
nae = np.array(new_results)
results = [f'{row[1]} 0 {row[0]} 0 {row[2]} 0' for row in nae]

In [42]:
# Store the result lines in results_gensim.txt, one per line (no trailing newline)
output_text = '\n'.join(results)
with open('results_gensim.txt', 'w') as f:
    f.write(output_text)