# Semantic Search on Computer Science Research Papers abstract using TF-IDF

Utilizing TF-IDF to generate vectors for research paper abstracts and search query.
Finding similarity using cosine similarity metric between the query vector and each abstract vector.

In [17]:
import numpy as np
import pandas as pd

import dask.bag as db
import json

from string import digits
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

import re
from tqdm import tqdm, notebook
import spacy

from sklearn.feature_extraction.text import TfidfVectorizer
import operator

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.5.0/en_core_web_lg-3.5.0-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [2]:
docs = db.read_text('../input/arxiv/arxiv-metadata-oai-snapshot.json').map(json.loads)

#### Sample data

In [3]:
docs.take(1)

({'id': '0704.0001',
  'submitter': 'Pavel Nadolsky',
  'authors': "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan",
  'title': 'Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies',
  'comments': '37 pages, 15 figures; published version',
  'journal-ref': 'Phys.Rev.D76:013009,2007',
  'doi': '10.1103/PhysRevD.76.013009',
  'report-no': 'ANL-HEP-PR-07-12',
  'categories': 'hep-ph',
  'license': None,
  'abstract': '  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with d

In [None]:
# docs.count().compute()

Will be working on only the text fields and try to keep the data from 2010 onwards for CS to reduce the dataset.

In [4]:
# get_latest_version = lambda x: x['versions'][-1]['created']
# get_cs_papers = lambda x: any(category.startswith('cs') for category in x['categories'].split(' '))


# # get required fields
# trim = lambda x: {'id': x['id'],
#                   'authors': x['authors'],
#                   'title': x['title'],
#                   'doi': x['doi'],
#                   'category':x['categories'].split(' '),
#                   'abstract':x['abstract'],}

# # filter for papers published on or after 2019-01-01
# columns = ['id','category','abstract']
# docs_df = (docs
#            .filter(lambda x: int(get_latest_version(x).split(' ')[3]) > 2010)
#            .filter(lambda x: get_cs_papers(x))
#            .map(trim)
#            .compute())

# # convert to pandas
# docs_df = pd.DataFrame(docs_df)

# docs_df.drop(['doi'], axis=1, inplace=True)
# docs_df.head()

# docs_df.to_csv("cs_arxiv_from_2010.csv", index=False)

#### Reading structured data

In [5]:
docs_df = pd.read_csv('/kaggle/input/arxiv-cs-papers-abstract-from-2010/cs_arxiv_from_2010.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [6]:
len(docs_df)

484027

In [7]:
docs_df.head()

Unnamed: 0,id,authors,title,category,abstract
0,704.0213,Ketan D. Mulmuley Hariharan Narayanan,Geometric Complexity Theory V: On deciding non...,['cs.CC'],This article has been withdrawn because it h...
1,704.1409,Yao HengShuai,Preconditioned Temporal Difference Learning,"['cs.LG', 'cs.AI']",This paper has been withdrawn by the author....
2,704.1829,"Stefan Felsner, Kamil Kloch, Grzegorz Matecki,...",On-line Chain Partitions of Up-growing Semi-or...,['cs.DM'],On-line chain partition is a two-player game...
3,705.0561,Jing-Chao Chen,Iterative Rounding for the Closest String Problem,"['cs.DS', 'cs.CC']",The closest string problem is an NP-hard pro...
4,705.1025,David Eppstein,Recognizing Partial Cubes in Quadratic Time,['cs.DS'],We show how to test whether a graph with n v...


## Pre-processing

In [8]:
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}

In [9]:
def clean_text(text, remove_stopwords=True):
    text = text.lower()
    text = text.split()
    tmp = []
    for word in text:
        if word in contractions:
            tmp.append(contractions[word])
        else:
            tmp.append(word)
    text = ' '.join(tmp)
    
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text) 
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    
    if remove_stopwords:
        text = text.split()
        stops = set(stopwords.words('english'))
        text = [w for w in text if w not in stops]
        text = ' '.join(text)
        
    return text

In [12]:
preprocessor = spacy.load('en_core_web_lg')

In [13]:
def preprocess_data(data):
    processed_data = []
    data_list = data.values.tolist()
    vocab = set()
    for (_, _, title, _, abstract) in notebook.tqdm(data_list):
        title_abstract = title + '. ' + abstract
        cleaned_text = clean_text(title_abstract)
        sentence = preprocessor(cleaned_text)
        tokens = [word.lemma_ for word in sentence]
        vocab.update(tokens)
        processed_data.append(' '.join(tokens))
        vocab.update(tokens)
    return processed_data, list(vocab)

In [15]:
docs_text, vocab = preprocess_data(docs_df[:100000])
docs_text[:5]

  0%|          | 0/100000 [00:00<?, ?it/s]

['geometric complexity theory v decide nonvanishe generalize littlewood richardson coefficient article withdraw merge early article gct3 arxiv cs 0501076 cs cc series merge article available geometric complexity theory iii decide nonvanishe littlewood richardson coefficient journal algebraic combinatoric vol 36 issue 1 2012 pp 103 110 author ketan mulmuley hari narayanan milind sohoni new article gct5 slot series geometric complexity theory v equivalence blackbox derandomization polynomial identity test derandomization noether normalization lemma proceeding foc 2012 abstract arxiv 1209 5993 cs cc full version author ketan mulmuley',
 'precondition temporal difference learn paper withdraw author draft withdraw poor quality english unfortunately produce author start science route look icml version instead',
 'line chain partition grow semi order line chain partition two player game spoiler algorithm spoiler present partially order set point point algorithm assign incoming point immediate

## TF-IDF

TF-IDF is used to determine importance of a term(token) in a document compared to a collection of documents.

There are 2 components to this metric, which are multiple to compute the final score.

TF -> **Term Frequency** is the ratio of the frequency of the term in a document and total number of terms in the document.

IDF -> **Inverse Document Frequency** is the logarithm of the ratio of the total number of documents to the number of documents containing that term. Terms that appear in many documents receive a lower IDF score, while terms that appear in fewer documents receive a higher IDF score.

Multiplying the 2 values we get the TF-IDF score for a term for a document. TF_IDF vector for corpus is of the dimension -> (corpus_size, vocab_size). For a list of 10000 sentences and 100 unique words(term) it will be -> (10000, 100), where each value is the TF-IDF score.



#### Calcuating TF-IDF vectors for the the abstracts dataset

In [18]:
tfidf = TfidfVectorizer(lowercase=False)
doc_vectors = tfidf.fit_transform(docs_text)
doc_vectors.shape

(100000, 93368)

#### Calcuating TF-IDF vector for the query

In [19]:
cleaned_text = clean_text('temporal expression extraction')
sentence = preprocessor(cleaned_text)
tokens = [word.lemma_ for word in sentence]
test_sentence = ' '.join(tokens)
test_sentence

'temporal expression extraction'

In [20]:
test_vector = tfidf.transform([test_sentence])
test_vector

<1x93368 sparse matrix of type '<class 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [21]:
from sklearn.metrics.pairwise import cosine_similarity

#### Using cosine similarity as metric to determine the most similar documents for the query

In [23]:
cosine_similarities = cosine_similarity(test_vector, doc_vectors).flatten()
document_scores = [item.item() for item in cosine_similarities]
len(document_scores)

100000

In [24]:
train_df = docs_df[:100000]
train_df['cosine_score'] = document_scores
train_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,id,authors,title,category,abstract,cosine_score
0,704.0213,Ketan D. Mulmuley Hariharan Narayanan,Geometric Complexity Theory V: On deciding non...,['cs.CC'],This article has been withdrawn because it h...,0.0
1,704.1409,Yao HengShuai,Preconditioned Temporal Difference Learning,"['cs.LG', 'cs.AI']",This paper has been withdrawn by the author....,0.083492
2,704.1829,"Stefan Felsner, Kamil Kloch, Grzegorz Matecki,...",On-line Chain Partitions of Up-growing Semi-or...,['cs.DM'],On-line chain partition is a two-player game...,0.0
3,705.0561,Jing-Chao Chen,Iterative Rounding for the Closest String Problem,"['cs.DS', 'cs.CC']",The closest string problem is an NP-hard pro...,0.0
4,705.1025,David Eppstein,Recognizing Partial Cubes in Quadratic Time,['cs.DS'],We show how to test whether a graph with n v...,0.0


#### Sorting documents based on cosine score

In [25]:
sorted_score_df = train_df.sort_values('cosine_score', ascending=False).head()

#### Looking at the top results

In [26]:
sorted_score_df.iloc[:5]['abstract'].values.tolist()

['  It is commonly acknowledged that temporal expression extractors are important\ncomponents of larger natural language processing systems like information\nretrieval and question answering systems. Extraction and normalization of\ntemporal expressions in Turkish has not been given attention so far except the\nextraction of some date and time expressions within the course of named entity\nrecognition. As TimeML is the current standard of temporal expression and event\nannotation in natural language texts, in this paper, we present an analysis of\ntemporal expressions in Turkish based on the related TimeML classification\n(i.e., date, time, duration, and set expressions). We have created a lexicon\nfor Turkish temporal expressions and devised considerably wide-coverage\npatterns using the lexical classes as the building blocks. We believe that the\nproposed patterns, together with convenient normalization rules, can be readily\nused by prospective temporal expression extraction tools f