In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# IMPORTS

In [None]:
import numpy as np
import pandas as pd
import re
import spacy
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
nlp = spacy.load('en_core_web_sm')

# LOADING FILES

In [None]:
# Dataset folder, given relative to the notebook's working directory. It ends
# with "/" because the cells below build file paths by plain string
# concatenation (dir + 'documents.csv', ...).
# NOTE(review): this name shadows the built-in dir() for the rest of the notebook.
dir = "drive/MyDrive/Ranked-Retrieval/dataset/"

In [None]:
documents = pd.read_csv(dir + 'documents.csv')
documents.head()

Unnamed: 0,docid,author,bibliography,body,title
0,2,ting-yili,"department of aeronautical engineering, rensse...",simple shear flow past a flat plate in an inco...,simple shear flow past a flat plate in an inco...
1,3,m. b. glauert,"department of mathematics, university of manch...",the boundary layer in simple shear flow past a...,the boundary layer in simple shear flow past a...
2,5,"wasserman,b.","j. ae. scs. 24, 1957, 924.",one-dimensional transient heat conduction into...,one-dimensional transient heat conduction into...
3,6,"campbell,w.f.","j. ae. scs. 25, 1958, 340.",one-dimensional transient heat flow in a multi...,one-dimensional transient heat flow in a multi...
4,12,"bisplinghoff,r.l.","j. ae. scs. 23, 1956, 289.",some structural and aerelastic considerations ...,some structural and aerelastic considerations ...


In [None]:
queries = pd.read_csv(dir + 'queries.csv')
queries.head()

Unnamed: 0,qid,query
0,1,what similarity laws must be obeyed when const...
1,2,what are the structural and aeroelastic proble...
2,3,what problems of heat conduction in composite ...
3,8,what methods -dash exact or approximate -dash ...
4,10,are real-gas transport properties for air avai...


In [None]:
qrel = pd.read_csv(dir + 'qrel.csv')
qrel.head()

Unnamed: 0,qid,docid
0,1,184
1,1,29
2,1,31
3,1,57
4,1,378


In [None]:
queries_val = pd.read_csv(dir + 'queries_val.csv')
queries_val.head()

Unnamed: 0,qid,query
0,189,is there a design method for calculating therm...
1,190,will an analysis of panel flutter based on arb...
2,191,"what is the criterion for true panel flutter, ..."
3,194,how can the analytical solution of the bucklin...
4,196,the problem of similarity for representative i...


In [None]:
qrel_val = pd.read_csv(dir + 'qrel_val.csv')
qrel_val.head()

Unnamed: 0,qid,docid
0,189,395
1,189,866
2,189,869
3,189,865
4,189,868


# TEXT PREPROCESSING

In [None]:
def preprocess(text):
  """Normalize raw text and return its lemmatized, stop-word-free tokens.

  The text is lowercased, hyphens become spaces, every character other than
  a-z and space is removed, runs of whitespace are collapsed, and the result
  is tokenized and lemmatized with the module-level spaCy pipeline `nlp`.

  Parameters
  ----------
  text : str
      Raw document body or query string.

  Returns
  -------
  list of str
      Lemmas of the tokens that are neither stop words nor whitespace.
  """
  # Lowercase first: the filter below keeps only a-z, so an uppercase letter
  # would otherwise be deleted from the word it belongs to.
  text = text.lower()
  text = text.replace("-", " ")
  text = re.sub(r"[^a-z ]+", "", text)
  # Raw string: "\s" inside a normal string literal is an invalid escape.
  # strip() keeps a leading/trailing blank from reaching the tokenizer.
  text = re.sub(r"\s+", " ", text).strip()
  doc = nlp(text)
  return [token.lemma_ for token in doc if not token.is_stop and not token.is_space]

In [None]:
documents['tokens'] = documents['body'].apply(preprocess)
documents.head()

Unnamed: 0,docid,author,bibliography,body,title,tokens
0,2,ting-yili,"department of aeronautical engineering, rensse...",simple shear flow past a flat plate in an inco...,simple shear flow past a flat plate in an inco...,"[simple, shear, flow, past, flat, plate, incom..."
1,3,m. b. glauert,"department of mathematics, university of manch...",the boundary layer in simple shear flow past a...,the boundary layer in simple shear flow past a...,"[boundary, layer, simple, shear, flow, past, f..."
2,5,"wasserman,b.","j. ae. scs. 24, 1957, 924.",one-dimensional transient heat conduction into...,one-dimensional transient heat conduction into...,"[dimensional, transient, heat, conduction, dou..."
3,6,"campbell,w.f.","j. ae. scs. 25, 1958, 340.",one-dimensional transient heat flow in a multi...,one-dimensional transient heat flow in a multi...,"[dimensional, transient, heat, flow, multilaye..."
4,12,"bisplinghoff,r.l.","j. ae. scs. 23, 1956, 289.",some structural and aerelastic considerations ...,some structural and aerelastic considerations ...,"[structural, aerelastic, consideration, high, ..."


In [None]:
queries['tokens'] = queries['query'].apply(preprocess)
queries.head()

Unnamed: 0,qid,query,tokens
0,1,what similarity laws must be obeyed when const...,"[similarity, law, obey, construct, aeroelastic..."
1,2,what are the structural and aeroelastic proble...,"[structural, aeroelastic, problem, associate, ..."
2,3,what problems of heat conduction in composite ...,"[problem, heat, conduction, composite, slab, s..."
3,8,what methods -dash exact or approximate -dash ...,"[method, dash, exact, approximate, dash, prese..."
4,10,are real-gas transport properties for air avai...,"[real, gas, transport, property, air, availabl..."


In [None]:
queries_val['tokens']=queries_val['query'].apply(preprocess)
queries_val.head()

Unnamed: 0,qid,query,tokens
0,189,is there a design method for calculating therm...,"[design, method, calculate, thermal, fatigue, ..."
1,190,will an analysis of panel flutter based on arb...,"[analysis, panel, flutter, base, arbitrarily, ..."
2,191,"what is the criterion for true panel flutter, ...","[criterion, true, panel, flutter, oppose, smal..."
3,194,how can the analytical solution of the bucklin...,"[analytical, solution, buckle, strength, unifo..."
4,196,the problem of similarity for representative i...,"[problem, similarity, representative, investig..."


# RANKING DOCUMENTS AND EVALUATING THEM USING MEAN AVERAGE PRECISION (MAP)

---



## 1. JACCARD COEFFICIENT


In [None]:
temp_doc = documents[['docid', 'tokens']].copy()

In [None]:
def jaccard_coefficient(dtokens, qtokens):
  """Return the Jaccard coefficient of two token collections.

  The coefficient is (number of distinct shared tokens) divided by
  (number of distinct tokens in either collection).

  Parameters
  ----------
  dtokens : iterable of hashable
      Tokens of the document; duplicates are ignored.
  qtokens : iterable of hashable
      Tokens of the query; duplicates are ignored.

  Returns
  -------
  float
      Value in [0, 1]; 0.0 when both collections are empty.
  """
  doc_terms = set(dtokens)
  query_terms = set(qtokens)
  union_size = len(doc_terms | query_terms)
  # Two empty collections would give 0/0; report "no overlap" instead of
  # raising ZeroDivisionError.
  if union_size == 0:
    return 0.0
  return len(doc_terms & query_terms) / union_size

In [None]:
jaccard_coefficient(temp_doc['tokens'][0],queries['tokens'][0])

0.02702702702702703

In [None]:
def jaccard_rank(qtokens, top_k=5):
  """Rank all documents by Jaccard coefficient with the query.

  Side effect: the per-document scores are written to the 'jaccard' column of
  the module-level `temp_doc` frame, replacing the scores of the previous
  query.

  Parameters
  ----------
  qtokens : list of str
      Preprocessed query tokens.
  top_k : int, default 5
      Number of top-ranked documents to return.

  Returns
  -------
  numpy.ndarray
      docids of the `top_k` highest-scoring documents, best first.
  """
  # Score every document against this query
  temp_doc['jaccard'] = temp_doc['tokens'].apply(lambda dtokens: jaccard_coefficient(dtokens, qtokens))

  # Highest coefficient first, then keep the requested number of documents
  ranked = temp_doc.sort_values(by = 'jaccard', ascending = False)
  return ranked.head(top_k)['docid'].values

### 1.1. EVALUATION ON TRAIN SET

In [None]:
queries['jaccard_rel'] = queries['tokens'].apply(lambda x: jaccard_rank(x))
queries.head()

Unnamed: 0,qid,query,tokens,jaccard_rel
0,1,what similarity laws must be obeyed when const...,"[similarity, law, obey, construct, aeroelastic...","[51, 875, 12, 184, 1111]"
1,2,what are the structural and aeroelastic proble...,"[structural, aeroelastic, problem, associate, ...","[12, 51, 700, 746, 875]"
2,3,what problems of heat conduction in composite ...,"[problem, heat, conduction, composite, slab, s...","[5, 584, 6, 145, 582]"
3,8,what methods -dash exact or approximate -dash ...,"[method, dash, exact, approximate, dash, prese...","[122, 1306, 639, 655, 988]"
4,10,are real-gas transport properties for air avai...,"[real, gas, transport, property, air, availabl...","[405, 302, 436, 616, 583]"


In [None]:
# For every training query, gather the docids that qrel lists for its qid
queries['ground_truth'] = queries['qid'].apply(lambda qid: qrel.loc[qrel['qid'] == qid, 'docid'].values)
queries.head()

Unnamed: 0,qid,query,tokens,jaccard_rel,ground_truth
0,1,what similarity laws must be obeyed when const...,"[similarity, law, obey, construct, aeroelastic...","[51, 875, 12, 184, 1111]","[184, 29, 31, 57, 378]"
1,2,what are the structural and aeroelastic proble...,"[structural, aeroelastic, problem, associate, ...","[12, 51, 700, 746, 875]","[12, 746, 15, 184, 858]"
2,3,what problems of heat conduction in composite ...,"[problem, heat, conduction, composite, slab, s...","[5, 584, 6, 145, 582]","[5, 6, 90, 91, 119]"
3,8,what methods -dash exact or approximate -dash ...,"[method, dash, exact, approximate, dash, prese...","[122, 1306, 639, 655, 988]","[48, 122, 354, 360, 1005]"
4,10,are real-gas transport properties for air avai...,"[real, gas, transport, property, air, availabl...","[405, 302, 436, 616, 583]","[259, 405, 302, 436, 437]"


In [None]:
def average_precision(model_rel, ground_truth, normalize_by_relevant=False):
  """Average precision of one ranked result list.

  Precision is measured at every rank where a relevant document is returned.

  Parameters
  ----------
  model_rel : sequence
      docids returned by the model, best first.
  ground_truth : sequence
      docids judged relevant for the query.
  normalize_by_relevant : bool, default False
      If False, the precisions are averaged over the relevant documents that
      were actually retrieved, so a list with a single hit at rank 1 scores
      1.0. If True, their sum is divided by the number of distinct relevant
      documents instead, so relevant documents that were never retrieved
      lower the score.

  Returns
  -------
  float
      Average precision; 0.0 when no relevant document is retrieved.
  """
  # Set membership avoids rescanning ground_truth for every returned docid
  relevant = set(ground_truth)
  hits = 0
  precisions = []

  # Precision at each rank where a relevant document is returned
  for rank, docid in enumerate(model_rel, start=1):
    if docid in relevant:
      hits += 1
      precisions.append(hits / rank)

  # No relevant document in the list
  if not precisions:
    return 0.0

  if normalize_by_relevant:
    return sum(precisions) / len(relevant)

  return np.mean(precisions)

In [None]:
# Running on a sample
average_precision([5,6,1,2,4],[1,2,3,4,5])

0.8041666666666667

In [None]:
queries['jaccard_ap'] = queries.apply(lambda x: average_precision(x['jaccard_rel'], x['ground_truth']), axis = 1)
queries.head()

Unnamed: 0,qid,query,tokens,jaccard_rel,ground_truth,jaccard_ap
0,1,what similarity laws must be obeyed when const...,"[similarity, law, obey, construct, aeroelastic...","[51, 875, 12, 184, 1111]","[184, 29, 31, 57, 378]",0.25
1,2,what are the structural and aeroelastic proble...,"[structural, aeroelastic, problem, associate, ...","[12, 51, 700, 746, 875]","[12, 746, 15, 184, 858]",0.75
2,3,what problems of heat conduction in composite ...,"[problem, heat, conduction, composite, slab, s...","[5, 584, 6, 145, 582]","[5, 6, 90, 91, 119]",0.833333
3,8,what methods -dash exact or approximate -dash ...,"[method, dash, exact, approximate, dash, prese...","[122, 1306, 639, 655, 988]","[48, 122, 354, 360, 1005]",1.0
4,10,are real-gas transport properties for air avai...,"[real, gas, transport, property, air, availabl...","[405, 302, 436, 616, 583]","[259, 405, 302, 436, 437]",1.0


In [None]:
print('Mean Average Precision =', queries['jaccard_ap'].mean())

Mean Average Precision = 0.48960784313725514


### 1.2. EVALUATION ON VALIDATION SET

In [None]:
# For every validation query, gather the docids that qrel_val lists for its qid
queries_val['ground_truth'] = queries_val['qid'].apply(lambda qid: qrel_val.loc[qrel_val['qid'] == qid, 'docid'].values)

In [None]:
queries_val['jaccard_rel'] = queries_val['tokens'].apply(lambda x: jaccard_rank(x))
queries_val.head()

Unnamed: 0,qid,query,tokens,ground_truth,jaccard_rel
0,189,is there a design method for calculating therm...,"[design, method, calculate, thermal, fatigue, ...","[395, 866, 869, 865, 868]","[868, 1306, 833, 906, 909]"
1,190,will an analysis of panel flutter based on arb...,"[analysis, panel, flutter, base, arbitrarily, ...","[15, 391, 285, 390, 864]","[390, 1008, 285, 21, 391]"
2,191,"what is the criterion for true panel flutter, ...","[criterion, true, panel, flutter, oppose, smal...","[914, 915, 285, 857, 858]","[285, 31, 864, 728, 914]"
3,194,how can the analytical solution of the bucklin...,"[analytical, solution, buckle, strength, unifo...","[739, 740, 742, 743, 744]","[932, 1050, 1172, 744, 1171]"
4,196,the problem of similarity for representative i...,"[problem, similarity, representative, investig...","[51, 185, 874, 875, 876]","[875, 1008, 184, 655, 180]"


In [None]:
queries_val['jaccard_ap'] = queries_val.apply(lambda x: average_precision(x['jaccard_rel'], x['ground_truth']), axis = 1)
queries_val.head()

Unnamed: 0,qid,query,tokens,ground_truth,jaccard_rel,jaccard_ap
0,189,is there a design method for calculating therm...,"[design, method, calculate, thermal, fatigue, ...","[395, 866, 869, 865, 868]","[868, 1306, 833, 906, 909]",1.0
1,190,will an analysis of panel flutter based on arb...,"[analysis, panel, flutter, base, arbitrarily, ...","[15, 391, 285, 390, 864]","[390, 1008, 285, 21, 391]",0.755556
2,191,"what is the criterion for true panel flutter, ...","[criterion, true, panel, flutter, oppose, smal...","[914, 915, 285, 857, 858]","[285, 31, 864, 728, 914]",0.7
3,194,how can the analytical solution of the bucklin...,"[analytical, solution, buckle, strength, unifo...","[739, 740, 742, 743, 744]","[932, 1050, 1172, 744, 1171]",0.25
4,196,the problem of similarity for representative i...,"[problem, similarity, representative, investig...","[51, 185, 874, 875, 876]","[875, 1008, 184, 655, 180]",1.0


In [None]:
print('Mean Average Precision on Validation Set =', queries_val['jaccard_ap'].mean())

Mean Average Precision on Validation Set = 0.4232323232323232


## 2. TERM FREQUENCY

### 2.1. COMPUTING TERM FREQUENCY

In [None]:
# Every distinct token found in any document body, as a sorted list
vocabulary = sorted({term for doc_tokens in documents['tokens'].values for term in doc_tokens})

In [None]:
print('Size of Vocabulary =', len(vocabulary))

Size of Vocabulary = 3068


In [None]:
# One {term: raw count} dictionary per document
tf_list_doc = []

for tokens in documents['tokens']:
  # Start every vocabulary term at zero so all dictionaries share the same keys
  doc_dict = {term: 0 for term in vocabulary}

  # Tally the occurrences of each token of this document
  for term in tokens:
    doc_dict[term] = doc_dict[term] + 1

  tf_list_doc.append(doc_dict)

In [None]:
len(tf_list_doc)

387

In [None]:
# Creating a dataframe of tf for documents
documents_tf = pd.concat([documents['docid'], pd.DataFrame(tf_list_doc)], axis = 1)
documents_tf.head()

Unnamed: 0,docid,ab,abbreviate,ability,ablating,ablation,able,abrupt,abruptly,absence,absolute,absorb,absorption,academic,accelerate,accelerated,acceleration,accept,acceptability,acceptance,accessible,accidental,accommodate,accommodation,accompany,accompanying,accomplish,accord,accordance,accordingly,account,accumulation,accuracy,accurate,accurately,achieve,ackeret,acoustic,acoustical,acquisition,...,width,will,williams,wind,window,windward,wing,winglike,wire,withstand,wkb,womersley,wood,word,work,worker,working,worth,worthy,wrinkle,write,writer,x,xenon,xiii,y,yaw,yawed,yawing,year,yield,york,young,youngs,z,zbrozek,zero,zeroth,zone,zuk
0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Function for Log Normalization
def log_normalize(x):
  """Log-scale a raw term count: 0 stays 0, anything else becomes 1 + log10(x)."""
  return 0 if x == 0 else 1+np.log10(x)

In [None]:
# Build the log-scaled table from the raw counts in `tf_list_doc` rather than
# rescaling `documents_tf` in place: re-running this cell then produces the
# same table instead of applying the log transform a second time.
documents_tf = pd.concat([documents['docid'], pd.DataFrame(tf_list_doc).applymap(log_normalize)], axis = 1)

### 2.2. RANKING

In [None]:
def tf_rank(qtokens, top_k=5):
  """Rank all documents by the sum of log-scaled term frequencies of the query terms.

  Reads the module-level `documents_tf` table and `vocabulary`; neither is
  modified.

  Parameters
  ----------
  qtokens : list of str
      Preprocessed query tokens. Duplicates and tokens outside the vocabulary
      are ignored.
  top_k : int, default 5
      Number of top-ranked documents to return.

  Returns
  -------
  numpy.ndarray
      docids of the `top_k` highest-scoring documents, best first.
  """
  # Only query terms with a column in documents_tf can contribute. Sorting
  # fixes the column order, so the floating-point row sums do not depend on
  # set iteration order, which for strings can change between sessions.
  query_terms = sorted(set(qtokens).intersection(vocabulary))

  # Retrieving tf for query terms
  scores = documents_tf.loc[:, ['docid'] + query_terms].copy()

  # Adding all the frequencies
  scores['tf_sum'] = scores[query_terms].sum(axis=1)

  # Highest TF sum first, then keep the requested number of documents
  return scores.sort_values(by='tf_sum', ascending=False).head(top_k)['docid'].values

### 2.3. EVALUATION ON TRAIN SET

In [None]:
queries['tf_rel'] = queries['tokens'].apply(lambda x: tf_rank(x))
queries.head()

Unnamed: 0,qid,query,tokens,jaccard_rel,ground_truth,jaccard_ap,tf_rel
0,1,what similarity laws must be obeyed when const...,"[similarity, law, obey, construct, aeroelastic...","[51, 875, 12, 184, 1111]","[184, 29, 31, 57, 378]",0.25,"[51, 12, 184, 876, 252]"
1,2,what are the structural and aeroelastic proble...,"[structural, aeroelastic, problem, associate, ...","[12, 51, 700, 746, 875]","[12, 746, 15, 184, 858]",0.75,"[12, 172, 51, 746, 798]"
2,3,what problems of heat conduction in composite ...,"[problem, heat, conduction, composite, slab, s...","[5, 584, 6, 145, 582]","[5, 6, 90, 91, 119]",0.833333,"[5, 584, 625, 980, 91]"
3,8,what methods -dash exact or approximate -dash ...,"[method, dash, exact, approximate, dash, prese...","[122, 1306, 639, 655, 988]","[48, 122, 354, 360, 1005]",1.0,"[122, 234, 1104, 924, 556]"
4,10,are real-gas transport properties for air avai...,"[real, gas, transport, property, air, availabl...","[405, 302, 436, 616, 583]","[259, 405, 302, 436, 437]",1.0,"[302, 185, 616, 1009, 1313]"


In [None]:
queries['tf_ap'] = queries.apply(lambda x: average_precision(x['tf_rel'], x['ground_truth']), axis = 1)
queries.head()

Unnamed: 0,qid,query,tokens,jaccard_rel,ground_truth,jaccard_ap,tf_rel,tf_ap
0,1,what similarity laws must be obeyed when const...,"[similarity, law, obey, construct, aeroelastic...","[51, 875, 12, 184, 1111]","[184, 29, 31, 57, 378]",0.25,"[51, 12, 184, 876, 252]",0.333333
1,2,what are the structural and aeroelastic proble...,"[structural, aeroelastic, problem, associate, ...","[12, 51, 700, 746, 875]","[12, 746, 15, 184, 858]",0.75,"[12, 172, 51, 746, 798]",0.75
2,3,what problems of heat conduction in composite ...,"[problem, heat, conduction, composite, slab, s...","[5, 584, 6, 145, 582]","[5, 6, 90, 91, 119]",0.833333,"[5, 584, 625, 980, 91]",0.7
3,8,what methods -dash exact or approximate -dash ...,"[method, dash, exact, approximate, dash, prese...","[122, 1306, 639, 655, 988]","[48, 122, 354, 360, 1005]",1.0,"[122, 234, 1104, 924, 556]",1.0
4,10,are real-gas transport properties for air avai...,"[real, gas, transport, property, air, availabl...","[405, 302, 436, 616, 583]","[259, 405, 302, 436, 437]",1.0,"[302, 185, 616, 1009, 1313]",1.0


In [None]:
print('Mean Average Precision =', queries['tf_ap'].mean())

Mean Average Precision = 0.5995424836601307


### 2.4. EVALUATION ON VALIDATION SET

In [None]:
queries_val['tf_rel'] = queries_val['tokens'].apply(lambda x: tf_rank(x))
queries_val.head()

Unnamed: 0,qid,query,tokens,ground_truth,jaccard_rel,jaccard_ap,tf_rel
0,189,is there a design method for calculating therm...,"[design, method, calculate, thermal, fatigue, ...","[395, 866, 869, 865, 868]","[868, 1306, 833, 906, 909]",1.0,"[987, 395, 980, 924, 727]"
1,190,will an analysis of panel flutter based on arb...,"[analysis, panel, flutter, base, arbitrarily, ...","[15, 391, 285, 390, 864]","[390, 1008, 285, 21, 391]",0.755556,"[390, 122, 391, 1008, 766]"
2,191,"what is the criterion for true panel flutter, ...","[criterion, true, panel, flutter, oppose, smal...","[914, 915, 285, 857, 858]","[285, 31, 864, 728, 914]",0.7,"[766, 15, 285, 914, 859]"
3,194,how can the analytical solution of the bucklin...,"[analytical, solution, buckle, strength, unifo...","[739, 740, 742, 743, 744]","[932, 1050, 1172, 744, 1171]",0.25,"[932, 1172, 740, 1173, 744]"
4,196,the problem of similarity for representative i...,"[problem, similarity, representative, investig...","[51, 185, 874, 875, 876]","[875, 1008, 184, 655, 180]",1.0,"[662, 184, 966, 572, 1319]"


In [None]:
queries_val['tf_ap'] = queries_val.apply(lambda x: average_precision(x['tf_rel'], x['ground_truth']), axis = 1)
queries_val.head()

Unnamed: 0,qid,query,tokens,ground_truth,jaccard_rel,jaccard_ap,tf_rel,tf_ap
0,189,is there a design method for calculating therm...,"[design, method, calculate, thermal, fatigue, ...","[395, 866, 869, 865, 868]","[868, 1306, 833, 906, 909]",1.0,"[987, 395, 980, 924, 727]",0.5
1,190,will an analysis of panel flutter based on arb...,"[analysis, panel, flutter, base, arbitrarily, ...","[15, 391, 285, 390, 864]","[390, 1008, 285, 21, 391]",0.755556,"[390, 122, 391, 1008, 766]",0.833333
2,191,"what is the criterion for true panel flutter, ...","[criterion, true, panel, flutter, oppose, smal...","[914, 915, 285, 857, 858]","[285, 31, 864, 728, 914]",0.7,"[766, 15, 285, 914, 859]",0.416667
3,194,how can the analytical solution of the bucklin...,"[analytical, solution, buckle, strength, unifo...","[739, 740, 742, 743, 744]","[932, 1050, 1172, 744, 1171]",0.25,"[932, 1172, 740, 1173, 744]",0.366667
4,196,the problem of similarity for representative i...,"[problem, similarity, representative, investig...","[51, 185, 874, 875, 876]","[875, 1008, 184, 655, 180]",1.0,"[662, 184, 966, 572, 1319]",0.0


In [None]:
print('Mean Average Precision on Validation Set =', queries_val['tf_ap'].mean())

Mean Average Precision on Validation Set = 0.48459595959595964


## 3. INVERSE DOCUMENT FREQUENCY


### 3.1. COMPUTING IDF

In [None]:
print('No. of Documents =', documents.shape[0])

No. of Documents = 387


In [None]:
# Document frequency of each term: number of rows with a non-zero TF value
non_zero_count = np.count_nonzero(documents_tf.iloc[:, 1:], axis=0)

# Every vocabulary term starts at 0 and is then given log10(N / document frequency)
idf_dict = dict.fromkeys(vocabulary, 0)
idf_dict.update(
  (term, np.log10(documents.shape[0] / (document_frequency)))
  for term, document_frequency in zip(list(vocabulary), non_zero_count)
)

In [None]:
idf_dict

{'ab': 2.28668096935493,
 'abbreviate': 2.5877109650189114,
 'ability': 2.5877109650189114,
 'ablating': 2.28668096935493,
 'ablation': 1.985650973690949,
 'able': 2.28668096935493,
 'abrupt': 2.5877109650189114,
 'abruptly': 2.28668096935493,
 'absence': 2.28668096935493,
 'absolute': 1.8887409606828927,
 'absorb': 2.5877109650189114,
 'absorption': 2.5877109650189114,
 'academic': 2.5877109650189114,
 'accelerate': 2.110589710299249,
 'accelerated': 2.5877109650189114,
 'acceleration': 2.110589710299249,
 'accept': 2.110589710299249,
 'acceptability': 2.5877109650189114,
 'acceptance': 2.5877109650189114,
 'accessible': 2.5877109650189114,
 'accidental': 2.5877109650189114,
 'accommodate': 2.5877109650189114,
 'accommodation': 2.5877109650189114,
 'accompany': 2.28668096935493,
 'accompanying': 2.5877109650189114,
 'accomplish': 2.28668096935493,
 'accord': 1.7426129250046545,
 'accordance': 2.5877109650189114,
 'accordingly': 1.985650973690949,
 'account': 1.189770956346874,
 'accum

### 3.2. RANKING

In [None]:
temp_doc = documents[['docid','tokens']].copy()

In [None]:
# Function for getting sum of IDF values for a query-document pair
def idf_sum(dtokens, qtokens):
  """Sum the IDF values of the terms shared by a document and a query.

  Reads the module-level `idf_dict`. Shared terms without an entry in
  `idf_dict` contribute nothing.

  Parameters
  ----------
  dtokens : iterable of str
      Document tokens; duplicates are ignored.
  qtokens : iterable of str
      Query tokens; duplicates are ignored.

  Returns
  -------
  float
      Sum of IDF values of the shared terms (0 when nothing is shared).
  """
  # Getting common terms in query and document
  common_term = set(dtokens).intersection(set(qtokens))

  # Look up only the shared terms instead of scanning the whole idf_dict for
  # every document. Terms are added in sorted order, which is the key order
  # of idf_dict when it is built from the sorted vocabulary, so the
  # floating-point sum is the same as a scan through idf_dict would give.
  return sum(idf_dict[term] for term in sorted(common_term) if term in idf_dict)

In [None]:
def idf_rank(qtokens, top_k=5):
  """Rank all documents by the summed IDF of the terms they share with the query.

  Side effect: the per-document scores are written to the 'idf_sum' column of
  the module-level `temp_doc` frame, replacing the scores of the previous
  query.

  Parameters
  ----------
  qtokens : list of str
      Preprocessed query tokens.
  top_k : int, default 5
      Number of top-ranked documents to return.

  Returns
  -------
  numpy.ndarray
      docids of the `top_k` highest-scoring documents, best first.
  """
  # Getting sum of IDF values for all the query-document pairs
  temp_doc['idf_sum'] = temp_doc['tokens'].apply(lambda dtokens: idf_sum(dtokens, qtokens))

  # Highest IDF sum first, then keep the requested number of documents
  ranked = temp_doc.sort_values(by = 'idf_sum', ascending = False)
  return ranked.head(top_k)['docid'].values

### 3.3 EVALUATION ON TRAIN SET

In [None]:
queries['idf_rel'] = queries['tokens'].apply(lambda x: idf_rank(x))
queries.head()

Unnamed: 0,qid,query,tokens,jaccard_rel,ground_truth,jaccard_ap,tf_rel,tf_ap,idf_rel
0,1,what similarity laws must be obeyed when const...,"[similarity, law, obey, construct, aeroelastic...","[51, 875, 12, 184, 1111]","[184, 29, 31, 57, 378]",0.25,"[51, 12, 184, 876, 252]",0.333333,"[51, 184, 12, 601, 62]"
1,2,what are the structural and aeroelastic proble...,"[structural, aeroelastic, problem, associate, ...","[12, 51, 700, 746, 875]","[12, 746, 15, 184, 858]",0.75,"[12, 172, 51, 746, 798]",0.75,"[12, 172, 51, 746, 364]"
2,3,what problems of heat conduction in composite ...,"[problem, heat, conduction, composite, slab, s...","[5, 584, 6, 145, 582]","[5, 6, 90, 91, 119]",0.833333,"[5, 584, 625, 980, 91]",0.7,"[5, 91, 625, 584, 90]"
3,8,what methods -dash exact or approximate -dash ...,"[method, dash, exact, approximate, dash, prese...","[122, 1306, 639, 655, 988]","[48, 122, 354, 360, 1005]",1.0,"[122, 234, 1104, 924, 556]",1.0,"[122, 556, 1104, 234, 924]"
4,10,are real-gas transport properties for air avai...,"[real, gas, transport, property, air, availabl...","[405, 302, 436, 616, 583]","[259, 405, 302, 436, 437]",1.0,"[302, 185, 616, 1009, 1313]",1.0,"[302, 332, 405, 1009, 583]"


In [None]:
queries['idf_ap'] = queries.apply(lambda x: average_precision(x['idf_rel'], x['ground_truth']), axis = 1)
queries.head()

Unnamed: 0,qid,query,tokens,jaccard_rel,ground_truth,jaccard_ap,tf_rel,tf_ap,idf_rel,idf_ap
0,1,what similarity laws must be obeyed when const...,"[similarity, law, obey, construct, aeroelastic...","[51, 875, 12, 184, 1111]","[184, 29, 31, 57, 378]",0.25,"[51, 12, 184, 876, 252]",0.333333,"[51, 184, 12, 601, 62]",0.5
1,2,what are the structural and aeroelastic proble...,"[structural, aeroelastic, problem, associate, ...","[12, 51, 700, 746, 875]","[12, 746, 15, 184, 858]",0.75,"[12, 172, 51, 746, 798]",0.75,"[12, 172, 51, 746, 364]",0.75
2,3,what problems of heat conduction in composite ...,"[problem, heat, conduction, composite, slab, s...","[5, 584, 6, 145, 582]","[5, 6, 90, 91, 119]",0.833333,"[5, 584, 625, 980, 91]",0.7,"[5, 91, 625, 584, 90]",0.866667
3,8,what methods -dash exact or approximate -dash ...,"[method, dash, exact, approximate, dash, prese...","[122, 1306, 639, 655, 988]","[48, 122, 354, 360, 1005]",1.0,"[122, 234, 1104, 924, 556]",1.0,"[122, 556, 1104, 234, 924]",1.0
4,10,are real-gas transport properties for air avai...,"[real, gas, transport, property, air, availabl...","[405, 302, 436, 616, 583]","[259, 405, 302, 436, 437]",1.0,"[302, 185, 616, 1009, 1313]",1.0,"[302, 332, 405, 1009, 583]",0.833333


In [None]:
print('Mean Average Precision =', queries['idf_ap'].mean())

Mean Average Precision = 0.6256372549019611


### 3.4 EVALUATION ON VALIDATION SET

In [None]:
queries_val['idf_rel'] = queries_val['tokens'].apply(lambda x: idf_rank(x))
queries_val.head()

Unnamed: 0,qid,query,tokens,ground_truth,jaccard_rel,jaccard_ap,tf_rel,tf_ap,idf_rel
0,189,is there a design method for calculating therm...,"[design, method, calculate, thermal, fatigue, ...","[395, 866, 869, 865, 868]","[868, 1306, 833, 906, 909]",1.0,"[987, 395, 980, 924, 727]",0.5,"[987, 727, 212, 213, 779]"
1,190,will an analysis of panel flutter based on arb...,"[analysis, panel, flutter, base, arbitrarily, ...","[15, 391, 285, 390, 864]","[390, 1008, 285, 21, 391]",0.755556,"[390, 122, 391, 1008, 766]",0.833333,"[390, 122, 391, 1122, 15]"
2,191,"what is the criterion for true panel flutter, ...","[criterion, true, panel, flutter, oppose, smal...","[914, 915, 285, 857, 858]","[285, 31, 864, 728, 914]",0.7,"[766, 15, 285, 914, 859]",0.416667,"[766, 15, 914, 285, 739]"
3,194,how can the analytical solution of the bucklin...,"[analytical, solution, buckle, strength, unifo...","[739, 740, 742, 743, 744]","[932, 1050, 1172, 744, 1171]",0.25,"[932, 1172, 740, 1173, 744]",0.366667,"[1172, 932, 839, 740, 1173]"
4,196,the problem of similarity for representative i...,"[problem, similarity, representative, investig...","[51, 185, 874, 875, 876]","[875, 1008, 184, 655, 180]",1.0,"[662, 184, 966, 572, 1319]",0.0,"[662, 184, 966, 746, 1008]"


In [None]:
queries_val['idf_ap'] = queries_val.apply(lambda x: average_precision(x['idf_rel'], x['ground_truth']), axis = 1)
queries_val.head()

Unnamed: 0,qid,query,tokens,ground_truth,jaccard_rel,jaccard_ap,tf_rel,tf_ap,idf_rel,idf_ap
0,189,is there a design method for calculating therm...,"[design, method, calculate, thermal, fatigue, ...","[395, 866, 869, 865, 868]","[868, 1306, 833, 906, 909]",1.0,"[987, 395, 980, 924, 727]",0.5,"[987, 727, 212, 213, 779]",0.0
1,190,will an analysis of panel flutter based on arb...,"[analysis, panel, flutter, base, arbitrarily, ...","[15, 391, 285, 390, 864]","[390, 1008, 285, 21, 391]",0.755556,"[390, 122, 391, 1008, 766]",0.833333,"[390, 122, 391, 1122, 15]",0.755556
2,191,"what is the criterion for true panel flutter, ...","[criterion, true, panel, flutter, oppose, smal...","[914, 915, 285, 857, 858]","[285, 31, 864, 728, 914]",0.7,"[766, 15, 285, 914, 859]",0.416667,"[766, 15, 914, 285, 739]",0.416667
3,194,how can the analytical solution of the bucklin...,"[analytical, solution, buckle, strength, unifo...","[739, 740, 742, 743, 744]","[932, 1050, 1172, 744, 1171]",0.25,"[932, 1172, 740, 1173, 744]",0.366667,"[1172, 932, 839, 740, 1173]",0.25
4,196,the problem of similarity for representative i...,"[problem, similarity, representative, investig...","[51, 185, 874, 875, 876]","[875, 1008, 184, 655, 180]",1.0,"[662, 184, 966, 572, 1319]",0.0,"[662, 184, 966, 746, 1008]",0.0


In [None]:
print('Mean Average Precision on Validation Set =', queries_val['idf_ap'].mean())

Mean Average Precision on Validation Set = 0.353030303030303


## 4. TF-IDF

### 4.1 COMPUTING TF-IDF

In [None]:
# Weight every term column by its IDF. Wrapping the weights in a Series makes
# pandas match them to the TF columns by term label instead of by position,
# so the product does not rely on idf_dict and the columns sharing one order.
documents_tfidf = documents_tf.iloc[:, 1:].mul(pd.Series(idf_dict), axis = 'columns')
documents_tfidf.head()

Unnamed: 0,ab,abbreviate,ability,ablating,ablation,able,abrupt,abruptly,absence,absolute,absorb,absorption,academic,accelerate,accelerated,acceleration,accept,acceptability,acceptance,accessible,accidental,accommodate,accommodation,accompany,accompanying,accomplish,accord,accordance,accordingly,account,accumulation,accuracy,accurate,accurately,achieve,ackeret,acoustic,acoustical,acquisition,acr,...,width,will,williams,wind,window,windward,wing,winglike,wire,withstand,wkb,womersley,wood,word,work,worker,working,worth,worthy,wrinkle,write,writer,x,xenon,xiii,y,yaw,yawed,yawing,year,yield,york,young,youngs,z,zbrozek,zero,zeroth,zone,zuk
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
documents_tfidf = pd.concat([documents['docid'], documents_tfidf], axis = 1)
documents_tfidf.head()

Unnamed: 0,docid,ab,abbreviate,ability,ablating,ablation,able,abrupt,abruptly,absence,absolute,absorb,absorption,academic,accelerate,accelerated,acceleration,accept,acceptability,acceptance,accessible,accidental,accommodate,accommodation,accompany,accompanying,accomplish,accord,accordance,accordingly,account,accumulation,accuracy,accurate,accurately,achieve,ackeret,acoustic,acoustical,acquisition,...,width,will,williams,wind,window,windward,wing,winglike,wire,withstand,wkb,womersley,wood,word,work,worker,working,worth,worthy,wrinkle,write,writer,x,xenon,xiii,y,yaw,yawed,yawing,year,yield,york,young,youngs,z,zbrozek,zero,zeroth,zone,zuk
0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 4.2 RANKING

In [None]:
def tf_idf_rank(qtokens, top_k = 5):
  """Rank documents by the summed TF-IDF weight of the query terms.

  Reads the notebook-level `vocabulary` and `documents_tfidf` (a frame with a
  `docid` column plus one column per vocabulary term, as used below).

  Args:
    qtokens: iterable of query tokens. Duplicates and tokens that are not in
      `vocabulary` are ignored.
    top_k: number of best-scoring documents to return. Defaults to 5, the
      cut-off that was previously hard-coded through `.head()`.

  Returns:
    numpy array with the `docid` of up to `top_k` documents, best score first.
  """
  # Keep every known query term once, in first-occurrence order, so the column
  # order (and therefore the float summation order) does not depend on the
  # iteration order of a set.
  known_terms = set(qtokens).intersection(vocabulary)
  query_terms = [term for term in dict.fromkeys(qtokens) if term in known_terms]

  # Score of a document = sum of its TF-IDF weights over the query terms
  scored_docs = documents_tfidf.loc[:, ['docid']].copy()
  scored_docs['tfidf_sum'] = documents_tfidf[query_terms].sum(axis = 1)

  # Sorting by score (highest first) and keeping the requested number of docs
  ranked_docs = scored_docs.sort_values(by = 'tfidf_sum', ascending = False)

  return ranked_docs.head(top_k)['docid'].values

### 4.3 EVALUATION ON TRAIN SET

In [None]:
# Top-ranked documents returned by tf_idf_rank for every training query
queries['tfidf_rel'] = queries['tokens'].apply(tf_idf_rank)
queries.head(5)

Unnamed: 0,qid,query,tokens,jaccard_rel,ground_truth,jaccard_ap,tf_rel,tf_ap,idf_rel,idf_ap,tfidf_rel
0,1,what similarity laws must be obeyed when const...,"[similarity, law, obey, construct, aeroelastic...","[51, 875, 12, 184, 1111]","[184, 29, 31, 57, 378]",0.25,"[51, 12, 184, 876, 252]",0.333333,"[51, 184, 12, 601, 62]",0.5,"[51, 184, 12, 601, 746]"
1,2,what are the structural and aeroelastic proble...,"[structural, aeroelastic, problem, associate, ...","[12, 51, 700, 746, 875]","[12, 746, 15, 184, 858]",0.75,"[12, 172, 51, 746, 798]",0.75,"[12, 172, 51, 746, 364]",0.75,"[12, 51, 172, 746, 364]"
2,3,what problems of heat conduction in composite ...,"[problem, heat, conduction, composite, slab, s...","[5, 584, 6, 145, 582]","[5, 6, 90, 91, 119]",0.833333,"[5, 584, 625, 980, 91]",0.7,"[5, 91, 625, 584, 90]",0.866667,"[5, 91, 90, 625, 584]"
3,8,what methods -dash exact or approximate -dash ...,"[method, dash, exact, approximate, dash, prese...","[122, 1306, 639, 655, 988]","[48, 122, 354, 360, 1005]",1.0,"[122, 234, 1104, 924, 556]",1.0,"[122, 556, 1104, 234, 924]",1.0,"[122, 556, 1104, 234, 924]"
4,10,are real-gas transport properties for air avai...,"[real, gas, transport, property, air, availabl...","[405, 302, 436, 616, 583]","[259, 405, 302, 436, 437]",1.0,"[302, 185, 616, 1009, 1313]",1.0,"[302, 332, 405, 1009, 583]",0.833333,"[302, 1009, 332, 405, 583]"


In [None]:
# Scoring each training query's TF-IDF ranking against its ground truth with average_precision
queries['tfidf_ap'] = queries.apply(
  lambda row: average_precision(row['tfidf_rel'], row['ground_truth']),
  axis = 1
)
queries.head(5)

Unnamed: 0,qid,query,tokens,jaccard_rel,ground_truth,jaccard_ap,tf_rel,tf_ap,idf_rel,idf_ap,tfidf_rel,tfidf_ap
0,1,what similarity laws must be obeyed when const...,"[similarity, law, obey, construct, aeroelastic...","[51, 875, 12, 184, 1111]","[184, 29, 31, 57, 378]",0.25,"[51, 12, 184, 876, 252]",0.333333,"[51, 184, 12, 601, 62]",0.5,"[51, 184, 12, 601, 746]",0.5
1,2,what are the structural and aeroelastic proble...,"[structural, aeroelastic, problem, associate, ...","[12, 51, 700, 746, 875]","[12, 746, 15, 184, 858]",0.75,"[12, 172, 51, 746, 798]",0.75,"[12, 172, 51, 746, 364]",0.75,"[12, 51, 172, 746, 364]",0.75
2,3,what problems of heat conduction in composite ...,"[problem, heat, conduction, composite, slab, s...","[5, 584, 6, 145, 582]","[5, 6, 90, 91, 119]",0.833333,"[5, 584, 625, 980, 91]",0.7,"[5, 91, 625, 584, 90]",0.866667,"[5, 91, 90, 625, 584]",1.0
3,8,what methods -dash exact or approximate -dash ...,"[method, dash, exact, approximate, dash, prese...","[122, 1306, 639, 655, 988]","[48, 122, 354, 360, 1005]",1.0,"[122, 234, 1104, 924, 556]",1.0,"[122, 556, 1104, 234, 924]",1.0,"[122, 556, 1104, 234, 924]",1.0
4,10,are real-gas transport properties for air avai...,"[real, gas, transport, property, air, availabl...","[405, 302, 436, 616, 583]","[259, 405, 302, 436, 437]",1.0,"[302, 185, 616, 1009, 1313]",1.0,"[302, 332, 405, 1009, 583]",0.833333,"[302, 1009, 332, 405, 583]",0.75


In [None]:
# Mean of the per-query average precision over the training queries
print('Mean Average Precision =', queries.loc[:, 'tfidf_ap'].mean())

Mean Average Precision = 0.6456372549019611


### 4.4 EVALUATION ON VALIDATION SET

In [None]:
# Top-ranked documents returned by tf_idf_rank for every validation query
queries_val['tfidf_rel'] = queries_val['tokens'].apply(tf_idf_rank)
queries_val.head(5)

Unnamed: 0,qid,query,tokens,ground_truth,jaccard_rel,jaccard_ap,tf_rel,tf_ap,idf_rel,idf_ap,tfidf_rel
0,189,is there a design method for calculating therm...,"[design, method, calculate, thermal, fatigue, ...","[395, 866, 869, 865, 868]","[868, 1306, 833, 906, 909]",1.0,"[987, 395, 980, 924, 727]",0.5,"[987, 727, 212, 213, 779]",0.0,"[727, 987, 212, 213, 726]"
1,190,will an analysis of panel flutter based on arb...,"[analysis, panel, flutter, base, arbitrarily, ...","[15, 391, 285, 390, 864]","[390, 1008, 285, 21, 391]",0.755556,"[390, 122, 391, 1008, 766]",0.833333,"[390, 122, 391, 1122, 15]",0.755556,"[390, 122, 391, 766, 1008]"
2,191,"what is the criterion for true panel flutter, ...","[criterion, true, panel, flutter, oppose, smal...","[914, 915, 285, 857, 858]","[285, 31, 864, 728, 914]",0.7,"[766, 15, 285, 914, 859]",0.416667,"[766, 15, 914, 285, 739]",0.416667,"[766, 15, 914, 285, 75]"
3,194,how can the analytical solution of the bucklin...,"[analytical, solution, buckle, strength, unifo...","[739, 740, 742, 743, 744]","[932, 1050, 1172, 744, 1171]",0.25,"[932, 1172, 740, 1173, 744]",0.366667,"[1172, 932, 839, 740, 1173]",0.25,"[1172, 932, 839, 740, 1173]"
4,196,the problem of similarity for representative i...,"[problem, similarity, representative, investig...","[51, 185, 874, 875, 876]","[875, 1008, 184, 655, 180]",1.0,"[662, 184, 966, 572, 1319]",0.0,"[662, 184, 966, 746, 1008]",0.0,"[184, 662, 746, 966, 1008]"


In [None]:
# Scoring each validation query's TF-IDF ranking against its ground truth with average_precision
queries_val['tfidf_ap'] = queries_val.apply(
  lambda row: average_precision(row['tfidf_rel'], row['ground_truth']),
  axis = 1
)
queries_val.head(5)

Unnamed: 0,qid,query,tokens,ground_truth,jaccard_rel,jaccard_ap,tf_rel,tf_ap,idf_rel,idf_ap,tfidf_rel,tfidf_ap
0,189,is there a design method for calculating therm...,"[design, method, calculate, thermal, fatigue, ...","[395, 866, 869, 865, 868]","[868, 1306, 833, 906, 909]",1.0,"[987, 395, 980, 924, 727]",0.5,"[987, 727, 212, 213, 779]",0.0,"[727, 987, 212, 213, 726]",0.0
1,190,will an analysis of panel flutter based on arb...,"[analysis, panel, flutter, base, arbitrarily, ...","[15, 391, 285, 390, 864]","[390, 1008, 285, 21, 391]",0.755556,"[390, 122, 391, 1008, 766]",0.833333,"[390, 122, 391, 1122, 15]",0.755556,"[390, 122, 391, 766, 1008]",0.833333
2,191,"what is the criterion for true panel flutter, ...","[criterion, true, panel, flutter, oppose, smal...","[914, 915, 285, 857, 858]","[285, 31, 864, 728, 914]",0.7,"[766, 15, 285, 914, 859]",0.416667,"[766, 15, 914, 285, 739]",0.416667,"[766, 15, 914, 285, 75]",0.416667
3,194,how can the analytical solution of the bucklin...,"[analytical, solution, buckle, strength, unifo...","[739, 740, 742, 743, 744]","[932, 1050, 1172, 744, 1171]",0.25,"[932, 1172, 740, 1173, 744]",0.366667,"[1172, 932, 839, 740, 1173]",0.25,"[1172, 932, 839, 740, 1173]",0.25
4,196,the problem of similarity for representative i...,"[problem, similarity, representative, investig...","[51, 185, 874, 875, 876]","[875, 1008, 184, 655, 180]",1.0,"[662, 184, 966, 572, 1319]",0.0,"[662, 184, 966, 746, 1008]",0.0,"[184, 662, 746, 966, 1008]",0.0


In [None]:
# Mean of the per-query average precision over the validation queries
print('Mean Average Precision on Validation Set =', queries_val.loc[:, 'tfidf_ap'].mean())

Mean Average Precision on Validation Set = 0.413510101010101


## 5 TF-IDF BASED VECTOR SPACE MODEL

### 5.1 CREATING TF-IDF VECTORS FOR QUERIES

In [None]:
def gen_tfidf_queries(queries_data):
  """Build one TF-IDF vector over `vocabulary` for every query.

  Relies on the notebook-level `vocabulary`, `idf_dict` (term -> IDF weight)
  and `log_normalize` (applied element-wise to the raw term counts).

  Args:
    queries_data: DataFrame with a `qid` column and a `tokens` column holding
      the token list of each query.

  Returns:
    DataFrame indexed like `queries_data`, with `qid` as first column followed
    by one TF-IDF column per vocabulary term.

  Raises:
    ValueError: if a vocabulary term has no entry in `idf_dict`.
  """
  # Fixing the column order once (duplicates removed, first occurrence kept)
  vocab_terms = list(dict.fromkeys(vocabulary))

  missing_idf = [term for term in vocab_terms if term not in idf_dict]
  if missing_idf:
    raise ValueError('idf_dict has no entry for vocabulary terms: ' + ', '.join(map(str, missing_idf[:10])))

  # Getting Term frequencies
  tf_rows = []
  for tokens in queries_data['tokens']:
    term_counts = dict.fromkeys(vocab_terms, 0)
    # Counting every occurrence: iterating over set(tokens) would cap each
    # count at 1 and lose the term frequency of repeated query terms
    for term in tokens:
      if term in term_counts:
        term_counts[term] += 1
    tf_rows.append(term_counts)

  # Re-using the index of queries_data so the concat with 'qid' below pairs
  # every vector with its own query even when that index is not 0..n-1
  queries_tf = pd.DataFrame(tf_rows, index = queries_data.index, columns = vocab_terms)

  # Element-wise normalisation; Series.map is used because DataFrame.applymap
  # is deprecated in recent pandas releases
  queries_tf = queries_tf.apply(lambda column: column.map(log_normalize))

  # Weighting by IDF, matched by term label instead of by dict position
  idf_weights = pd.Series([idf_dict[term] for term in vocab_terms], index = vocab_terms)
  queries_tfidf = queries_tf.mul(idf_weights, axis = 'columns')

  queries_tfidf = pd.concat([queries_data['qid'], queries_tfidf], axis = 1)

  return queries_tfidf

In [None]:
# TF-IDF vectors of the training queries
queries_tfidf = gen_tfidf_queries(queries_data = queries)
queries_tfidf.head(5)

Unnamed: 0,qid,ab,abbreviate,ability,ablating,ablation,able,abrupt,abruptly,absence,absolute,absorb,absorption,academic,accelerate,accelerated,acceleration,accept,acceptability,acceptance,accessible,accidental,accommodate,accommodation,accompany,accompanying,accomplish,accord,accordance,accordingly,account,accumulation,accuracy,accurate,accurately,achieve,ackeret,acoustic,acoustical,acquisition,...,width,will,williams,wind,window,windward,wing,winglike,wire,withstand,wkb,womersley,wood,word,work,worker,working,worth,worthy,wrinkle,write,writer,x,xenon,xiii,y,yaw,yawed,yawing,year,yield,york,young,youngs,z,zbrozek,zero,zeroth,zone,zuk
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# TF-IDF vectors of the validation queries
queries_val_tfidf = gen_tfidf_queries(queries_data = queries_val)
queries_val_tfidf.head(5)

Unnamed: 0,qid,ab,abbreviate,ability,ablating,ablation,able,abrupt,abruptly,absence,absolute,absorb,absorption,academic,accelerate,accelerated,acceleration,accept,acceptability,acceptance,accessible,accidental,accommodate,accommodation,accompany,accompanying,accomplish,accord,accordance,accordingly,account,accumulation,accuracy,accurate,accurately,achieve,ackeret,acoustic,acoustical,acquisition,...,width,will,williams,wind,window,windward,wing,winglike,wire,withstand,wkb,womersley,wood,word,work,worker,working,worth,worthy,wrinkle,write,writer,x,xenon,xiii,y,yaw,yawed,yawing,year,yield,york,young,youngs,z,zbrozek,zero,zeroth,zone,zuk
0,189,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,190,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,191,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.888741,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,194,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.286681,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 5.2 RANKING

In [None]:
# Separate copy of the document TF-IDF matrix for the vector space ranker,
# so that documents_tfidf itself is left untouched
temp_doc_tfidf = documents_tfidf.copy(deep = True)

In [None]:
def tfidf_vsm_rank(queries_data, top_k = 5):
  """Rank documents by cosine similarity between TF-IDF vectors.

  Reads the notebook-level `temp_doc_tfidf` without modifying it. The first
  column of that frame and the first entry of `queries_data` are skipped as
  identifiers; the remaining entries are compared position by position, so
  both are assumed to list the terms in the same order.

  Args:
    queries_data: one row of a query TF-IDF frame (id first, then the term
      weights).
    top_k: number of most similar documents to return. Defaults to 5, the
      cut-off that was previously hard-coded through `.head()`.

  Returns:
    numpy array with the `docid` of up to `top_k` documents, most similar first.
  """
  # Dropping the leading id of both operands, leaving only the term weights
  doc_vectors = temp_doc_tfidf.iloc[:, 1:].values
  query_vector = queries_data.iloc[1:].values.reshape(1, -1)

  # One batched call for all documents instead of one call per row
  similarities = cosine_similarity(doc_vectors, query_vector).ravel()

  # Scores live in a throw-away frame: the shared temp_doc_tfidf never gets a
  # temporary column that a failed call could leave behind
  ranked_docs = temp_doc_tfidf[['docid']].assign(tfidf_vsm = similarities).sort_values(by = 'tfidf_vsm', ascending = False)

  return ranked_docs.head(top_k)['docid'].values

### 5.3 EVALUATION ON TRAIN SET

In [None]:
# Top-ranked documents returned by tfidf_vsm_rank for every training query vector
queries['tfidf_vsm_rel'] = queries_tfidf.apply(tfidf_vsm_rank, axis = 1)
queries.head(5)

Unnamed: 0,qid,query,tokens,jaccard_rel,ground_truth,jaccard_ap,tf_rel,tf_ap,idf_rel,idf_ap,tfidf_rel,tfidf_ap,tfidf_vsm_rel
0,1,what similarity laws must be obeyed when const...,"[similarity, law, obey, construct, aeroelastic...","[51, 875, 12, 184, 1111]","[184, 29, 31, 57, 378]",0.25,"[51, 12, 184, 876, 252]",0.333333,"[51, 184, 12, 601, 62]",0.5,"[51, 184, 12, 601, 746]",0.5,"[51, 184, 12, 875, 332]"
1,2,what are the structural and aeroelastic proble...,"[structural, aeroelastic, problem, associate, ...","[12, 51, 700, 746, 875]","[12, 746, 15, 184, 858]",0.75,"[12, 172, 51, 746, 798]",0.75,"[12, 172, 51, 746, 364]",0.75,"[12, 51, 172, 746, 364]",0.75,"[12, 875, 51, 746, 184]"
2,3,what problems of heat conduction in composite ...,"[problem, heat, conduction, composite, slab, s...","[5, 584, 6, 145, 582]","[5, 6, 90, 91, 119]",0.833333,"[5, 584, 625, 980, 91]",0.7,"[5, 91, 625, 584, 90]",0.866667,"[5, 91, 90, 625, 584]",1.0,"[5, 90, 91, 584, 582]"
3,8,what methods -dash exact or approximate -dash ...,"[method, dash, exact, approximate, dash, prese...","[122, 1306, 639, 655, 988]","[48, 122, 354, 360, 1005]",1.0,"[122, 234, 1104, 924, 556]",1.0,"[122, 556, 1104, 234, 924]",1.0,"[122, 556, 1104, 234, 924]",1.0,"[556, 122, 569, 608, 639]"
4,10,are real-gas transport properties for air avai...,"[real, gas, transport, property, air, availabl...","[405, 302, 436, 616, 583]","[259, 405, 302, 436, 437]",1.0,"[302, 185, 616, 1009, 1313]",1.0,"[302, 332, 405, 1009, 583]",0.833333,"[302, 1009, 332, 405, 583]",0.75,"[405, 302, 436, 1009, 583]"


In [None]:
# Scoring each training query's vector space ranking against its ground truth with average_precision
queries['tfidf_vsm_ap'] = queries.apply(
  lambda row: average_precision(row['tfidf_vsm_rel'], row['ground_truth']),
  axis = 1
)
queries.head(5)

Unnamed: 0,qid,query,tokens,jaccard_rel,ground_truth,jaccard_ap,tf_rel,tf_ap,idf_rel,idf_ap,tfidf_rel,tfidf_ap,tfidf_vsm_rel,tfidf_vsm_ap
0,1,what similarity laws must be obeyed when const...,"[similarity, law, obey, construct, aeroelastic...","[51, 875, 12, 184, 1111]","[184, 29, 31, 57, 378]",0.25,"[51, 12, 184, 876, 252]",0.333333,"[51, 184, 12, 601, 62]",0.5,"[51, 184, 12, 601, 746]",0.5,"[51, 184, 12, 875, 332]",0.5
1,2,what are the structural and aeroelastic proble...,"[structural, aeroelastic, problem, associate, ...","[12, 51, 700, 746, 875]","[12, 746, 15, 184, 858]",0.75,"[12, 172, 51, 746, 798]",0.75,"[12, 172, 51, 746, 364]",0.75,"[12, 51, 172, 746, 364]",0.75,"[12, 875, 51, 746, 184]",0.7
2,3,what problems of heat conduction in composite ...,"[problem, heat, conduction, composite, slab, s...","[5, 584, 6, 145, 582]","[5, 6, 90, 91, 119]",0.833333,"[5, 584, 625, 980, 91]",0.7,"[5, 91, 625, 584, 90]",0.866667,"[5, 91, 90, 625, 584]",1.0,"[5, 90, 91, 584, 582]",1.0
3,8,what methods -dash exact or approximate -dash ...,"[method, dash, exact, approximate, dash, prese...","[122, 1306, 639, 655, 988]","[48, 122, 354, 360, 1005]",1.0,"[122, 234, 1104, 924, 556]",1.0,"[122, 556, 1104, 234, 924]",1.0,"[122, 556, 1104, 234, 924]",1.0,"[556, 122, 569, 608, 639]",0.5
4,10,are real-gas transport properties for air avai...,"[real, gas, transport, property, air, availabl...","[405, 302, 436, 616, 583]","[259, 405, 302, 436, 437]",1.0,"[302, 185, 616, 1009, 1313]",1.0,"[302, 332, 405, 1009, 583]",0.833333,"[302, 1009, 332, 405, 583]",0.75,"[405, 302, 436, 1009, 583]",1.0


In [None]:
# Mean of the per-query average precision over the training queries
print('Mean Average Precision =', queries.loc[:, 'tfidf_vsm_ap'].mean())

Mean Average Precision = 0.6386764705882354


### 5.4 EVALUATION ON VALIDATION SET

In [None]:
# Top-ranked documents returned by tfidf_vsm_rank for every validation query vector
queries_val['tfidf_vsm_rel'] = queries_val_tfidf.apply(tfidf_vsm_rank, axis = 1)
queries_val.head(5)

Unnamed: 0,qid,query,tokens,ground_truth,jaccard_rel,jaccard_ap,tf_rel,tf_ap,idf_rel,idf_ap,tfidf_rel,tfidf_ap,tfidf_vsm_rel
0,189,is there a design method for calculating therm...,"[design, method, calculate, thermal, fatigue, ...","[395, 866, 869, 865, 868]","[868, 1306, 833, 906, 909]",1.0,"[987, 395, 980, 924, 727]",0.5,"[987, 727, 212, 213, 779]",0.0,"[727, 987, 212, 213, 726]",0.0,"[909, 215, 726, 304, 727]"
1,190,will an analysis of panel flutter based on arb...,"[analysis, panel, flutter, base, arbitrarily, ...","[15, 391, 285, 390, 864]","[390, 1008, 285, 21, 391]",0.755556,"[390, 122, 391, 1008, 766]",0.833333,"[390, 122, 391, 1122, 15]",0.755556,"[390, 122, 391, 766, 1008]",0.833333,"[390, 122, 391, 1008, 864]"
2,191,"what is the criterion for true panel flutter, ...","[criterion, true, panel, flutter, oppose, smal...","[914, 915, 285, 857, 858]","[285, 31, 864, 728, 914]",0.7,"[766, 15, 285, 914, 859]",0.416667,"[766, 15, 914, 285, 739]",0.416667,"[766, 15, 914, 285, 75]",0.416667,"[31, 285, 766, 15, 864]"
3,194,how can the analytical solution of the bucklin...,"[analytical, solution, buckle, strength, unifo...","[739, 740, 742, 743, 744]","[932, 1050, 1172, 744, 1171]",0.25,"[932, 1172, 740, 1173, 744]",0.366667,"[1172, 932, 839, 740, 1173]",0.25,"[1172, 932, 839, 740, 1173]",0.25,"[932, 1172, 1171, 938, 1050]"
4,196,the problem of similarity for representative i...,"[problem, similarity, representative, investig...","[51, 185, 874, 875, 876]","[875, 1008, 184, 655, 180]",1.0,"[662, 184, 966, 572, 1319]",0.0,"[662, 184, 966, 746, 1008]",0.0,"[184, 662, 746, 966, 1008]",0.0,"[1008, 184, 864, 966, 875]"


In [None]:
# Scoring each validation query's vector space ranking against its ground truth with average_precision
queries_val['tfidf_vsm_ap'] = queries_val.apply(
  lambda row: average_precision(row['tfidf_vsm_rel'], row['ground_truth']),
  axis = 1
)
queries_val.head(5)

Unnamed: 0,qid,query,tokens,ground_truth,jaccard_rel,jaccard_ap,tf_rel,tf_ap,idf_rel,idf_ap,tfidf_rel,tfidf_ap,tfidf_vsm_rel,tfidf_vsm_ap
0,189,is there a design method for calculating therm...,"[design, method, calculate, thermal, fatigue, ...","[395, 866, 869, 865, 868]","[868, 1306, 833, 906, 909]",1.0,"[987, 395, 980, 924, 727]",0.5,"[987, 727, 212, 213, 779]",0.0,"[727, 987, 212, 213, 726]",0.0,"[909, 215, 726, 304, 727]",0.0
1,190,will an analysis of panel flutter based on arb...,"[analysis, panel, flutter, base, arbitrarily, ...","[15, 391, 285, 390, 864]","[390, 1008, 285, 21, 391]",0.755556,"[390, 122, 391, 1008, 766]",0.833333,"[390, 122, 391, 1122, 15]",0.755556,"[390, 122, 391, 766, 1008]",0.833333,"[390, 122, 391, 1008, 864]",0.755556
2,191,"what is the criterion for true panel flutter, ...","[criterion, true, panel, flutter, oppose, smal...","[914, 915, 285, 857, 858]","[285, 31, 864, 728, 914]",0.7,"[766, 15, 285, 914, 859]",0.416667,"[766, 15, 914, 285, 739]",0.416667,"[766, 15, 914, 285, 75]",0.416667,"[31, 285, 766, 15, 864]",0.5
3,194,how can the analytical solution of the bucklin...,"[analytical, solution, buckle, strength, unifo...","[739, 740, 742, 743, 744]","[932, 1050, 1172, 744, 1171]",0.25,"[932, 1172, 740, 1173, 744]",0.366667,"[1172, 932, 839, 740, 1173]",0.25,"[1172, 932, 839, 740, 1173]",0.25,"[932, 1172, 1171, 938, 1050]",0.0
4,196,the problem of similarity for representative i...,"[problem, similarity, representative, investig...","[51, 185, 874, 875, 876]","[875, 1008, 184, 655, 180]",1.0,"[662, 184, 966, 572, 1319]",0.0,"[662, 184, 966, 746, 1008]",0.0,"[184, 662, 746, 966, 1008]",0.0,"[1008, 184, 864, 966, 875]",0.2


In [None]:
# Validation-set figure: labelled like the validation print of section 4.4 so it
# cannot be mistaken for the train-set MAP printed in section 5.3
print('Mean Average Precision on Validation Set =', queries_val['tfidf_vsm_ap'].mean())

Mean Average Precision = 0.3527777777777778
