In [1]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pandas as pd

In [2]:
documents = []
queries = []
relevances = []

for d in range(1400):
    f = open("./d/"+str(d+1)+".txt")
    documents.append(f.read())

for q in range(225):
    f = open("./q/"+str(q+1)+".txt")
    queries.append(f.read())
    
for r in range(225):
    f = open("./r/"+str(r+1)+".txt")
    relevances.append(f.read().split("\n"))

In [3]:
for i in range(225):
    del relevances[i][-1]
    relevances[i] = [int(x) for x in relevances[i]]

In [4]:
def calc_cos_relevant(queries, documents):
    result = []
    for i in range(225):
        sim = np.array(cosine_similarity(tfidf_queries_matrix[i], tfidf_documents_matrix)[0])
        topRelevant = sim.argsort()[-10:][::-1]+1
        result.append(topRelevant)
    return result

def calc_euc_relevant(queries, documents):
    result = []
    for i in range(225):
        sim = np.array(euclidean_distances(tfidf_queries_matrix[i], tfidf_documents_matrix)[0])
        topRelevant = sim.argsort()[:10][::-1]+1
        result.append(topRelevant)
    return result

In [5]:
def evaluate_results(relevances, possible_relevances):
    tp, fp, tn, fn = 0, 0, 0, 0
    for i in range(1400):
        if i in relevances and i in possible_relevances:
            tp += 1
        if i not in relevances and i in possible_relevances:
            fn += 1
        if i in relevances and i not in possible_relevances:
            fp += 1
        if i not in relevances and i not in possible_relevances:
            tn += 1
    return {'tp':tp, 'fp':fp,'tn':tn, 'fn':fn }

In [6]:
def measure(results):
    precision = results['tp']/(results['tp']+results['fp'])
    recall = results['tp']/(results['tp']+results['fn'])
    if precision+recall != 0: 
        f = 2*(precision*recall)/(precision+recall)
    else:
        f = np.nan
    return {'precision':precision, 'recall':recall,'f':f }

In [7]:
def average_measure_on_all_queries(relevances, possible_relevances):
    result = {'precision':0, 'recall':0, 'f':0 }
    counter = 0
    for i in range(225):
        tmp = measure(evaluate_results(relevances[i], my_relevances[i]))
        result['precision'] += tmp['precision']
        result['recall'] += tmp['recall']
        if not np.isnan(tmp['f']):
            result['f'] += tmp['f']
        else:
            counter += 1
    return {'precision':result['precision']/225, 'recall':result['recall']/225, 'f':result['f']/(225-counter) }

In [8]:
result = []

# TF-IDF Cosine similarity

In [9]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_documents_matrix = tfidf_vectorizer.fit_transform(documents)
tfidf_queries_matrix = tfidf_vectorizer.transform(queries)
my_relevances = calc_cos_relevant(tfidf_queries_matrix, tfidf_documents_matrix)
r = average_measure_on_all_queries(relevances, my_relevances)
r['alg'] = "TF-IDF Cosine similarity"
result.append(r)

# TF-IDF Euclidean distance

In [10]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_documents_matrix = tfidf_vectorizer.fit_transform(documents)
tfidf_queries_matrix = tfidf_vectorizer.transform(queries)
my_relevances = calc_euc_relevant(tfidf_queries_matrix, tfidf_documents_matrix)
r = average_measure_on_all_queries(relevances, my_relevances)
r['alg'] = "TF-IDF Euclidean distance"
result.append(r)

# Pure Term Frequency Cosine similarity

In [11]:
count_vectorizer = CountVectorizer()
count_documents_matrix = count_vectorizer.fit_transform(documents)
count_queries_matrix = count_vectorizer.transform(queries)
my_relevances = calc_cos_relevant(count_queries_matrix, count_documents_matrix)
r = average_measure_on_all_queries(relevances, my_relevances)
r['alg'] = "Pure Term Frequency Cosine similarity"
result.append(r)

# Pure Term Frequency Euclidean distance

In [12]:
tfidf_vectorizer = CountVectorizer()
tfidf_documents_matrix = tfidf_vectorizer.fit_transform(documents)
tfidf_queries_matrix = tfidf_vectorizer.transform(queries)
my_relevances = calc_euc_relevant(tfidf_queries_matrix, tfidf_documents_matrix)
r = average_measure_on_all_queries(relevances, my_relevances)
r['alg'] = "Pure Term Frequency Euclidean distance"
result.append(r)

# Binary representation Cosine similarity

In [13]:
count_vectorizer = CountVectorizer(binary = True)
count_documents_matrix = count_vectorizer.fit_transform(documents)
count_queries_matrix = count_vectorizer.transform(queries)
my_relevances = calc_cos_relevant(count_queries_matrix, count_documents_matrix)
r = average_measure_on_all_queries(relevances, my_relevances)
r['alg'] = "Binary representation Cosine similarity"
result.append(r)

# Binary representation Euclidean distance

In [14]:
tfidf_vectorizer = CountVectorizer(binary = True)
tfidf_documents_matrix = tfidf_vectorizer.fit_transform(documents)
tfidf_queries_matrix = tfidf_vectorizer.transform(queries)
my_relevances = calc_euc_relevant(tfidf_queries_matrix, tfidf_documents_matrix)
r = average_measure_on_all_queries(relevances, my_relevances)
r['alg'] = "Binary representation Euclidean distance"
result.append(r)

# Summary

In [15]:
pd.DataFrame(result)

Unnamed: 0,alg,f,precision,recall
0,TF-IDF Cosine similarity,0.311172,0.348287,0.241086
1,TF-IDF Euclidean distance,0.289466,0.321927,0.219556
2,Pure Term Frequency Cosine similarity,0.311172,0.348287,0.241086
3,Pure Term Frequency Euclidean distance,0.109029,0.01591,0.012
4,Binary representation Cosine similarity,0.217311,0.177538,0.128593
5,Binary representation Euclidean distance,0.125864,0.021538,0.015111
