<a href="https://colab.research.google.com/github/cemreefe/cmpe493-project/blob/main/tfidf-cosine_similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TF-IDF Vectorizer


In [None]:
!pip3 install xmltodict

import os
import io   
import re
import json
import math
import pickle
import string
import tarfile
import xmltodict
import numpy as np
import pandas as pd

import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

Collecting xmltodict
  Downloading https://files.pythonhosted.org/packages/28/fd/30d5c1d3ac29ce229f6bdc40bbc20b28f716e8b363140c26eff19122d8a5/xmltodict-0.12.0-py2.py3-none-any.whl
Installing collected packages: xmltodict
Successfully installed xmltodict-0.12.0


In [None]:
# Mount Google Drive; later cells store the downloaded dataset files under drive/MyDrive.
# NOTE(review): those cells use the relative path 'drive/MyDrive/...', which only
# resolves to this mount point when the working directory is /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive



**Dataset download**


In [None]:
def read_file(path):
  """Return the whole content of the text file at `path` as one string.

  The file is decoded as UTF-8 explicitly, so the result does not depend on
  the platform's default encoding.

  Raises:
    OSError: if the file cannot be opened.
    UnicodeDecodeError: if the file is not valid UTF-8.
  """
  with open(path, 'r', encoding='utf-8') as f:
    return f.read()

In [None]:
# Make sure the Drive folder that holds the dataset files exists.
# exist_ok=True replaces the separate exists() check, so the cell can be re-run safely.
os.makedirs('drive/MyDrive/CMPE/CMPE493', exist_ok=True)

In [None]:
if not os.path.exists('drive/MyDrive/CMPE/CMPE493/topics-rnd5.xml'):
  !curl https://ir.nist.gov/covidSubmit/data/topics-rnd5.xml --output drive/MyDrive/CMPE/CMPE493/topics-rnd5.xml

if not os.path.exists('drive/MyDrive/CMPE/CMPE493/qrels-covid_d5_j0.5-5.txt'):
  !curl https://ir.nist.gov/covidSubmit/data/qrels-covid_d5_j0.5-5.txt --output drive/MyDrive/CMPE/CMPE493/qrels-covid_d5_j0.5-5.txt

if not os.path.exists('drive/MyDrive/CMPE/CMPE493/cord-19_2020-07-16.tar.gz'):
  !curl https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/historical_releases/cord-19_2020-07-16.tar.gz --output drive/MyDrive/CMPE/CMPE493/cord-19_2020-07-16.tar.gz

In [None]:
# Unpack the CORD-19 archive into the working directory, but only when the
# '2020-07-16' folder is not there yet.
if not os.path.exists('2020-07-16'):
  # The context manager closes the archive even if extraction raises.
  # NOTE(review): extractall() writes whatever member paths the archive contains;
  # only use it on archives from a trusted source.
  with tarfile.open('drive/MyDrive/CMPE/CMPE493/cord-19_2020-07-16.tar.gz', "r:gz") as tar:
    tar.extractall()

**Using pandas dataframes to read and prepare the data**

In [None]:
# Load the CORD-19 metadata table.
# low_memory=False parses each column in one pass, which avoids pandas'
# mixed-type DtypeWarning that chunked type inference produces on this file.
df_metadata = pd.read_csv('2020-07-16/metadata.csv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# Delete unused data columns
# drop(..., errors='ignore') returns a new frame and skips columns that are
# already gone, so re-running this cell does not raise KeyError the way the
# chained `del` statements did.
unused_columns = [
    'sha', 'source_x', 'doi', 'pmcid', 'pubmed_id', 'license',
    'publish_time', 'authors', 'journal', 'mag_id', 'who_covidence_id',
    'arxiv_id', 'pdf_json_files', 'pmc_json_files', 'url', 's2_id',
]
df_metadata = df_metadata.drop(columns=unused_columns, errors='ignore')

In [None]:
# Several rows can share one cord_uid; keep only the first row seen for each id.
df_metadata = df_metadata.drop_duplicates(subset='cord_uid', keep='first')

In [None]:
# Display the trimmed, de-duplicated frame (remaining columns: cord_uid, title, abstract).
df_metadata

Unnamed: 0,cord_uid,title,abstract
0,ug7v899j,Clinical features of culture-proven Mycoplasma...,OBJECTIVE: This retrospective chart review des...
1,02tnwd4m,Nitric oxide: a pro-inflammatory mediator in l...,Inflammatory diseases of the respiratory tract...
2,ejv2xln0,Surfactant protein-D and pulmonary host defense,Surfactant protein-D (SP-D) participates in th...
3,2b73a28n,Role of endothelin-1 in lung disease,Endothelin-1 (ET-1) is a 21 amino acid peptide...
4,9785vg6d,Gene expression in epithelial cells in respons...,Respiratory syncytial virus (RSV) and pneumoni...
...,...,...,...
192504,z4ro6lmh,Rapid radiological improvement of COVID-19 pne...,
192505,hi8k8wvb,SARS E protein in phospholipid bilayers: an an...,Abstract We report on an anomalous X-ray refle...
192506,ma3ndg41,Italian Society of Interventional Cardiology (...,COVID‐19 pandemic raised the issue to guarante...
192507,wh10285j,"Nimble, Together: A Training Program's Respons...",


In [None]:
# Load the relevance judgements (qrels) into a dataframe.
# A header line is prepended to the raw text before parsing, presumably because
# the file itself carries no column names.
qrels_header = 'topic iter document_id judgement\n'
topic_relevances = qrels_header + read_file('drive/MyDrive/CMPE/CMPE493/qrels-covid_d5_j0.5-5.txt')

# Parse the space-separated text and discard the 'iter' column, which is not used.
df_relevances = pd.read_csv(io.StringIO(topic_relevances), sep=" ").drop(columns='iter')

df_relevances

Unnamed: 0,topic,document_id,judgement
0,1,005b2j4b,2
1,1,00fmeepz,1
2,1,010vptx3,2
3,1,0194oljo,1
4,1,021q9884,1
...,...,...,...
69313,50,zvop8bxh,2
69314,50,zwf26o63,1
69315,50,zwsvlnwe,0
69316,50,zxr01yln,1


In [None]:
# Parse the topics XML file.
topics_xml = read_file('drive/MyDrive/CMPE/CMPE493/topics-rnd5.xml')
topics_obj = xmltodict.parse(topics_xml)
# The JSON round trip yields plain dicts/lists/strings regardless of the
# mapping type the parser returned.
topics     = json.loads(json.dumps(topics_obj))

# Each topic entry is read through four keys: @number, query, question, narrative.
# The three text fields are joined with single spaces and stored under the
# topic's @number.
topics_dict = {}
for topic in topics['topics']['topic']:
  text_fields = (topic['query'], topic['question'], topic['narrative'])
  topics_dict[topic['@number']] = ' '.join(text_fields)

# Data so far

* `topics_dict` 
      has `topic-id` for keys, and topic description for values
* `df_relevances` 
      has the following three columns:
      topic	document_id	judgement
* `df_metadata`
      holds information about the documents
      has the following three columns (others are deleted):
      cord_uid	title	abstract


In [None]:
# Download the nltk 'stopwords' corpus; stopwords.words('english') in the
# preprocessing cell reads from it.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Title and abstracts of the documents are concatenated
# docs is the frame as a plain array; each row is read as
# [cord_uid, title, abstract] by position.
docs = np.array(df_metadata)
contents = {}

for doc in docs:
  # A missing title/abstract is NaN; formatting it directly would put the literal
  # text "nan" into the document, so missing fields are left out instead.
  present_fields = [str(field) for field in (doc[1], doc[2]) if not pd.isna(field)]
  contents[doc[0]] = ' '.join(present_fields)

In [None]:
porter_stemmer = PorterStemmer()
sw = stopwords.words('english')

# Built once instead of on every call: preprocess runs once per document.
_stopword_set = frozenset(sw)  # set membership instead of scanning the list per word
_punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
_digit_run = re.compile(r"\d+")  # raw string: "\d" is an invalid escape in a normal literal

def preprocess(s):
  """Normalise a raw text string into a space-separated string of stemmed tokens.

  Steps, in order:
    1. case folding
    2. every ASCII punctuation character is replaced by a space
    3. digit runs are deleted (no space is inserted, so "h1n1" becomes "hn")
    4. the text is split on whitespace; English stopwords and the literal
       token 'nan' (what a missing value reads as once formatted into a
       string) are dropped, and the remaining words are Porter-stemmed

  Args:
    s: the text to normalise (str).

  Returns:
    The processed tokens joined by single spaces; '' if nothing survives.
  """
  s = s.casefold()
  s = s.translate(_punctuation_to_space)
  s = _digit_run.sub("", s)
  return ' '.join(
      porter_stemmer.stem(word)
      for word in s.split()
      if word not in _stopword_set and word != 'nan'
  )

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from the document texts and vectorize
# them in one pass; preprocess() is applied to every text before tokenization.
corpus = list(contents.values())
doc_vectorizer = TfidfVectorizer(preprocessor=preprocess)
doc_vectors = doc_vectorizer.fit_transform(corpus)

vocabulary_size = len(doc_vectorizer.vocabulary_)
print("There are", vocabulary_size, "tokens in our vocabulary.")

There are 145387 tokens in our vocabulary.


In [None]:
# Project the topic texts into the vector space fitted on the documents
# (transform only, so the vocabulary and IDF weights stay those of the documents).
topic_texts = list(topics_dict.values())
topic_vectors = doc_vectorizer.transform(topic_texts)

In [None]:
# (number of documents, vocabulary size) — one TF-IDF row per document
doc_vectors.shape

(191175, 145387)

In [None]:
# (number of topics, vocabulary size) — same vector length as the document rows
topic_vectors.shape

(50, 145387)

In [None]:
# Map each document id (first field of a docs row) to that document's TF-IDF
# row; row i of doc_vectors belongs to row i of docs.
doc_vectors_dict = {
    doc_row[0]: doc_vectors[row_index]
    for row_index, doc_row in enumerate(docs)
}

In [None]:
# Map each topic id to its TF-IDF row; rows of topic_vectors follow the
# iteration order of topics_dict.
topic_vectors_dict = {
    topic_id: topic_vectors[row_index]
    for row_index, topic_id in enumerate(topics_dict)
}

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Take a random document and print the topic it is most similar to.
# .iloc[0] reads the sampled id directly instead of parsing the Series' printed form.
ex_document = df_metadata.sample().cord_uid.iloc[0]
similarity = cosine_similarity(doc_vectors_dict[ex_document], topic_vectors).flatten()

best_row = int(np.argmax(similarity))
best_sim = similarity[best_row]
# Rows of topic_vectors follow the order of topics_dict, so the id is looked up
# by position rather than assumed to be row + 1.
best_topic_id = list(topics_dict.keys())[best_row]

print(f"Example document id: {ex_document}")
print(f"Maximum cosine similarity among all topics: {best_sim:.4f}")
print(f"Argmax of maximum cos. sim.: {best_topic_id}")
print(f"Contents of document '{ex_document}': {contents[ex_document]}")
print(f"Contents of topic '{best_topic_id}': {topics_dict[best_topic_id]}")


Example document id: 6mrd9axh
Maximum cosine similarity among all topics: 0.0921
Argmax of maximum cos. sim.: 3
Contents of document '6mrd9axh': Genetic Characterization of Middle East Respiratory Syndrome Coronavirus, South Korea, 2018 We evaluated genetic variation in Middle East respiratory syndrome coronavirus (MERS-CoV) imported to South Korea in 2018 using specimens from a patient and isolates from infected Caco-2 cells. The MERS-CoV strain in this study was genetically similar to a strain isolated in Riyadh, Saudi Arabia, in 2017.
Contents of topic '3': coronavirus immunity will SARS-CoV2 infected people develop immunity? Is cross protection possible? seeking studies of immunity developed due to infection with SARS-CoV2 or cross protection gained due to infection with other coronavirus types


In [None]:
# Cosine similarity of every topic against every document:
# sims[j][i] is the score of topic row j versus document row i.
sims = cosine_similarity(topic_vectors, doc_vectors)
sims.shape

(50, 191175)

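**`results_tfidf.txt` columns (space-separated, one line per topic–document pair):**
qid, iter, docno, rank, sim, run_id 

`iter`, `rank` and `run_id` are written as the placeholder `0`; only every second topic (2, 4, …) is included.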

In [None]:
# Even results
# One line per (topic, document) pair in the form "qid iter docno rank sim run_id",
# with iter, rank and run_id written as 0.
# The list is built by appending, so no slot can be left holding a placeholder
# array that would break the later '\n'.join(results).
doc_ids = list(contents.keys())
results = []

for j, topic in enumerate(topics_dict.keys()):
  # j is 0-based, so odd j picks every second topic starting with the second one
  # (topics 2, 4, ... when topics_dict is in numeric order).
  if j % 2 != 0:
    topic_sims = sims[j]
    results.extend(
        f'{topic} 0 {doc_id} 0 {topic_sims[i]} 0'
        for i, doc_id in enumerate(doc_ids)
    )

In [None]:
# Peek at the first ten result lines to check the format.
results[:10]

['2 0 ug7v899j 0 0.0 0',
 '2 0 02tnwd4m 0 0.011011974322833466 0',
 '2 0 ejv2xln0 0 0.012277492779388261 0',
 '2 0 2b73a28n 0 0.0 0',
 '2 0 9785vg6d 0 0.06059505130597204 0',
 '2 0 zjufx4fo 0 0.00864037385739897 0',
 '2 0 5yhe786e 0 0.0 0',
 '2 0 8zchiykl 0 0.0 0',
 '2 0 8qnrcgnk 0 0.008868282023550493 0',
 '2 0 jg13scgo 0 0.0035325109038701072 0']

In [None]:
# Write results to file
with open('results_tfidf.txt', 'w') as f:
    f.write('\n'.join(results))