In [240]:
import pandas as pd
import numpy as np
import os
import pickle
import nltk
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string


In [241]:
preprocessed_data_path = "data/preprocessed_data/"
stopword_list = stopwords.words('english')

In [242]:
def dump_saves(obj, file_name):
    with open("Saves/" + file_name, 'wb') as f:
        pickle.dump(obj, f)

def load_saves(file_name):
    with open("Saves/" + file_name, 'rb') as f:
        return pickle.load(f)

In [243]:
def preprocess_query(text):
    lowercased_text = text.lower()
    # Tokenization
    tokenized_text = word_tokenize(lowercased_text)
    # Remove Stopwords
    filtered_text = [word for word in tokenized_text if word not in stopword_list]
    # Remove Punctuations
    temp = []
    for word in filtered_text:
        temp_word = word
        for punc in string.punctuation:
            if punc == "-":
                continue
            temp_word = temp_word.replace(punc, '')
        temp.append(temp_word)
    filtered_text = temp
    # Remove Blank Space Tokens
    filtered_text = ' '.join(filtered_text).split()
    return filtered_text

In [244]:
class TFIDF:
    def __init__(self, documents, weighting = "binary"):
        self.documents = documents
        self.weighting = weighting
        
        self.vocab = None

        self.N = len(documents)
        self.tf = self.compute_tf()
        self.idf = self.compute_idf()
        self.tf_idf = self.compute_tf_idf()

    def compute_tf(self):
        tf = {}
        for i, document in enumerate(self.documents):
            tf[i] = {}
            for term in document.split():
                if term not in tf[i]:
                    tf[i][term] = 0
                tf[i][term] += 1

        for i in tf:
            for term in tf[i]:
                if self.weighting == "binary":
                    if tf[i][term] > 0:
                        tf[i][term] = 1
                elif self.weighting == "raw":
                    tf[i][term] = tf[i][term]
                elif self.weighting == "term_frequency":
                    tf[i][term] = tf[i][term] / sum([tf[i][t] for t in tf[i]])
                elif self.weighting == "log_normalization":
                    tf[i][term] = (1 + np.log(tf[i][term]))
                elif self.weighting == "double_normalization":
                    tf[i][term] = (0.5 + 0.5 * (tf[i][term] / max([tf[i][t] for t in tf[i]])))
        return tf

    def compute_idf(self):
        idf = {}
        for document in self.documents:
            for term in document.split():
                if term not in idf:
                    idf[term] = 0
                idf[term] += 1
        for term in idf:
            idf[term] = np.log(self.N / idf[term])
        return idf
    
    def compute_tf_idf(self):
        tf_idf = np.zeros((self.N, len(self.idf)))
        self.vocab = {}
        c = 0
        for i, document in enumerate(self.documents):
            for j, term in enumerate(set(document.split())):
                if term not in self.vocab:
                    self.vocab[term] = c
                    c+=1
                term_loc = self.vocab[term]
                tf_idf[i][term_loc] += self.tf[i][term] * self.idf[term]
        return tf_idf
    
    def query_processing(self, query):
        query = preprocess_query(query)
        query_tf = {}
        for term in query:
            if term not in query_tf:
                query_tf[term] = 0
            query_tf[term] += 1
        for term in query_tf:
            if self.weighting == "binary":
                if query_tf[term] > 0:
                    query_tf[term] = 1
            elif self.weighting == "raw":
                query_tf[term] = query_tf[term]
            elif self.weighting == "term_frequency":
                query_tf[term] = query_tf[term] / sum([query_tf[t] for t in query_tf])
            elif self.weighting == "log_normalization":
                query_tf[term] = (1 + np.log(query_tf[term]))
            elif self.weighting == "double_normalization":
                query_tf[term] = (0.5 + 0.5 * (query_tf[term] / max([query_tf[t] for t in query_tf])))

        # create a vector

        query_vector = np.zeros(len(self.vocab))
        for term in query_tf:
            if term in self.vocab:
                term_loc = self.vocab[term]
                query_vector[term_loc] = query_tf[term] * self.idf[term]

        return query_vector
    
    def get_score(self, query):
        query_vector = self.query_processing(query)
        scores = np.dot(self.tf_idf, query_vector)
        return scores
    
    def get_top_k(self, query, k):
        scores = self.get_score(query)
        top_k = np.argsort(scores)[::-1][:k]
        return top_k

    def get_tf(self):
        return self.tf

    def get_idf(self):
        return self.idf

    def get_tf_idf(self):
        return self.tf_idf

    def get_vocab(self):
        return self.vocab

In [245]:
weighting_metrics = ["binary", "raw", "term_frequency", "log_normalization", "double_normalization"]

In [246]:
corpus = []
filenames = []

for filename in os.listdir(preprocessed_data_path):
    filenames.append(filename)
    with open(preprocessed_data_path + filename, "r") as f:
        corpus.append(f.read())
    f.close()

In [247]:
binary_tfidf = TFIDF(corpus, weighting = "binary")
binary_tfidf_matrix = binary_tfidf.get_tf_idf()

In [248]:
raw_tfidf = TFIDF(corpus, weighting = "raw")
raw_tfidf_matrix = raw_tfidf.get_tf_idf()

In [249]:
term_frequency_tfidf = TFIDF(corpus, weighting = "term_frequency")
term_frequency_tfidf_matrix = term_frequency_tfidf.get_tf_idf()

In [250]:
log_normalization_tfidf = TFIDF(corpus, weighting = "log_normalization")
log_normalization_tfidf_matrix = log_normalization_tfidf.get_tf_idf()

In [251]:
double_normalization_tfidf = TFIDF(corpus, weighting = "double_normalization")
double_normalization_tfidf_matrix = double_normalization_tfidf.get_tf_idf()

In [252]:
dump_saves(binary_tfidf_matrix, "binary_tfidf_matrix.pkl")
dump_saves(raw_tfidf_matrix, "raw_tfidf_matrix.pkl")
dump_saves(term_frequency_tfidf_matrix, "term_frequency_tfidf_matrix.pkl")
dump_saves(log_normalization_tfidf_matrix, "log_normalization_tfidf_matrix.pkl")
dump_saves(double_normalization_tfidf_matrix, "double_normalization_tfidf_matrix.pkl")

In [253]:
vocab = binary_tfidf.get_vocab()

In [254]:
# binary_tfidf_matrix = load_saves("data/binary_tfidf_matrix.pkl")
# raw_tfidf_matrix = load_saves("data/raw_tfidf_matrix.pkl")
# term_frequency_tfidf_matrix = load_saves("data/term_frequency_tfidf_matrix.pkl")
# log_normalization_tfidf_matrix = load_saves("data/log_normalization_tfidf_matrix.pkl")
# double_normalization_tfidf_matrix = load_saves("data/double_normalization_tfidf_matrix.pkl")

In [255]:
query_1 = "turbulent incompressible laminar peripheral jets proximity"
query_2 = "reynolds number and potential shear"

In [256]:
binary_query_1_documents = binary_tfidf.get_top_k(query_1, 5)
binary_query_1_score = binary_tfidf.get_score(query_1)
binary_query_1_documents_fullname = [filenames[i] for i in binary_query_1_documents]

print("Top 5 most similar documents for query 1 using binary weighting: ", binary_query_1_documents_fullname)
print("Scores for the top 5 documents using binary weighting: ", binary_query_1_score[binary_query_1_documents])

print("-------------")

binary_query_2_documents = binary_tfidf.get_top_k(query_2, 5)
binary_query_2_score = binary_tfidf.get_score(query_2)
binary_query_2_documents_fullname = [filenames[i] for i in binary_query_2_documents]

print("Top 5 most similar documents for query 2 using binary weighting: ", binary_query_2_documents_fullname)
print("Scores for the top 5 documents using binary weighting: ", binary_query_2_score[binary_query_2_documents])

Top 5 most similar documents for query 1 using binary weighting:  ['cranfield0086', 'cranfield1223', 'cranfield0354', 'cranfield0650', 'cranfield0792']
Scores for the top 5 documents using binary weighting:  [72.3430423  38.02766146 35.81756028 34.31538084 24.41982983]
-------------
Top 5 most similar documents for query 2 using binary weighting:  ['cranfield1037', 'cranfield1251', 'cranfield0530', 'cranfield0964', 'cranfield1188']
Scores for the top 5 documents using binary weighting:  [12.76539403 12.76539403 10.33512113 10.33512113 10.33512113]


In [257]:
raw_query_documents = raw_tfidf.get_top_k(query_1, 5)
raw_query_score = raw_tfidf.get_score(query_1)
raw_query_documents_fullname = [filenames[i] for i in raw_query_documents]

print("Top 5 most similar documents for query 1 using raw weighting: ", raw_query_documents_fullname)
print("Scores for the top 5 documents using raw weighting: ", raw_query_score[raw_query_documents])

print("-------------")

raw_query_2_documents = raw_tfidf.get_top_k(query_2, 5)
raw_query_2_score = raw_tfidf.get_score(query_2)
raw_query_2_documents_fullname = [filenames[i] for i in raw_query_2_documents]

print("Top 5 most similar documents for query 2 using raw weighting: ", raw_query_2_documents_fullname)
print("Scores for the top 5 documents using raw weighting: ", raw_query_2_score[raw_query_2_documents])

Top 5 most similar documents for query 1 using raw weighting:  ['cranfield0086', 'cranfield0997', 'cranfield1223', 'cranfield0696', 'cranfield1164']
Scores for the top 5 documents using raw weighting:  [120.26625477  81.64698976  65.24332471  54.43132651  48.83965967]
-------------
Top 5 most similar documents for query 2 using raw weighting:  ['cranfield1244', 'cranfield0814', 'cranfield0484', 'cranfield1098', 'cranfield1271']
Scores for the top 5 documents using raw weighting:  [43.61483799 42.72070283 38.49251145 34.23225452 33.95379324]


In [258]:
term_frequency_query_documents = term_frequency_tfidf.get_top_k(query_1, 5)
term_frequency_query_score = term_frequency_tfidf.get_score(query_1)
term_frequency_query_documents_fullname = [filenames[i] for i in term_frequency_query_documents]

print("Top 5 most similar documents for query 1 using term frequency weighting: ", term_frequency_query_documents_fullname)
print("Scores for the top 5 documents using term frequency weighting: ", term_frequency_query_score[term_frequency_query_documents])

print("-------------")

term_frequency_query_2_documents = term_frequency_tfidf.get_top_k(query_2, 5)
term_frequency_query_2_score = term_frequency_tfidf.get_score(query_2)
term_frequency_query_2_documents_fullname = [filenames[i] for i in term_frequency_query_2_documents]

print("Top 5 most similar documents for query 2 using term frequency weighting: ", term_frequency_query_2_documents_fullname)
print("Scores for the top 5 documents using term frequency weighting: ", term_frequency_query_2_score[term_frequency_query_2_documents])

Top 5 most similar documents for query 1 using term frequency weighting:  ['cranfield0243', 'cranfield1380', 'cranfield0086', 'cranfield0354', 'cranfield1223']
Scores for the top 5 documents using term frequency weighting:  [1.37284835 0.67580791 0.65471535 0.62071943 0.6171313 ]
-------------
Top 5 most similar documents for query 2 using term frequency weighting:  ['cranfield0920', 'cranfield0854', 'cranfield1121', 'cranfield0171', 'cranfield1188']
Scores for the top 5 documents using term frequency weighting:  [0.70277903 0.55289559 0.55206214 0.51732895 0.50448343]


In [259]:
log_normalization_query_documents = log_normalization_tfidf.get_top_k(query_1, 5)
log_normalization_query_score = log_normalization_tfidf.get_score(query_1)
log_normalization_query_documents_fullname = [filenames[i] for i in log_normalization_query_documents]

print("Top 5 most similar documents for query 1 using log normalization weighting: ", log_normalization_query_documents_fullname)
print("Scores for the top 5 documents using log normalization weighting: ", log_normalization_query_score[log_normalization_query_documents])

print("-------------")

log_normalization_query_2_documents = log_normalization_tfidf.get_top_k(query_2, 5)
log_normalization_query_2_score = log_normalization_tfidf.get_score(query_2)
log_normalization_query_2_documents_fullname = [filenames[i] for i in log_normalization_query_2_documents]

print("Top 5 most similar documents for query 2 using log normalization weighting: ", log_normalization_query_2_documents_fullname)
print("Scores for the top 5 documents using log normalization weighting: ", log_normalization_query_2_score[log_normalization_query_2_documents])

Top 5 most similar documents for query 1 using log normalization weighting:  ['cranfield0086', 'cranfield1223', 'cranfield1094', 'cranfield1164', 'cranfield0997']
Scores for the top 5 documents using log normalization weighting:  [105.56088191  52.97739251  41.34636603  41.34636603  37.9897928 ]
-------------
Top 5 most similar documents for query 2 using log normalization weighting:  ['cranfield0814', 'cranfield1098', 'cranfield1271', 'cranfield0682', 'cranfield1383']
Scores for the top 5 documents using log normalization weighting:  [22.3858168  20.4916743  20.25593634 20.25593634 20.25593634]


In [260]:
double_normalization_query_documents = double_normalization_tfidf.get_top_k(query_1, 5)
double_normalization_query_score = double_normalization_tfidf.get_score(query_1)
double_normalization_query_documents_fullname = [filenames[i] for i in double_normalization_query_documents]

print("Top 5 most similar documents for query 1 using double normalization weighting: ", double_normalization_query_documents_fullname)
print("Scores for the top 5 documents using double normalization weighting: ", double_normalization_query_score[double_normalization_query_documents])

print("-------------")

double_normalization_query_2_documents = double_normalization_tfidf.get_top_k(query_2, 5)
double_normalization_query_2_score = double_normalization_tfidf.get_score(query_2)
double_normalization_query_2_documents_fullname = [filenames[i] for i in double_normalization_query_2_documents]

print("Top 5 most similar documents for query 2 using double normalization weighting: ", double_normalization_query_2_documents_fullname)
print("Scores for the top 5 documents using double normalization weighting: ", double_normalization_query_2_score[double_normalization_query_2_documents])

Top 5 most similar documents for query 1 using double normalization weighting:  ['cranfield0086', 'cranfield0354', 'cranfield0650', 'cranfield1223', 'cranfield1380']
Scores for the top 5 documents using double normalization weighting:  [56.21589695 26.67539778 25.73653563 25.5381632  24.41982983]
-------------
Top 5 most similar documents for query 2 using double normalization weighting:  ['cranfield1037', 'cranfield0530', 'cranfield1251', 'cranfield1188', 'cranfield0814']
Scores for the top 5 documents using double normalization weighting:  [12.76539403 10.33512113 10.10866373  9.90826058  8.62767895]


----

In [265]:
class Jaccard:
    def __init__(self, documents, vocab):
        self.documents = documents
        self.vocab = vocab

    def compute_jaccard(self, query):
        jaccard_coeff = np.zeros(len(self.documents))
        for i, document1 in enumerate(self.documents):
            jaccard_coeff[i] = len(set(document1.split()) & set(query.split())) / len(set(document1.split()) | set(query.split()))
        return jaccard_coeff

    def get_top_k(self, query, k):
        jaccard_coeff = self.compute_jaccard(query)
        top_k = np.argsort(jaccard_coeff)[::-1][:k]
        return top_k

In [266]:
jaccard = Jaccard(corpus, vocab)

In [267]:
jaccard_query_1 = jaccard.compute_jaccard(query_1)
top_10_jaccard_query_1 = jaccard.get_top_k(query_1, 10)
top_10_jaccard_query_1_fullname = [filenames[i] for i in top_10_jaccard_query_1]

print("Top 10 most similar documents for query 1 according to jaccard similarity: ", top_10_jaccard_query_1_fullname)
print("Scores of top 10 most similar documents for query 1 according to jaccard similarity: ", jaccard_query_1[top_10_jaccard_query_1])

Top 10 most similar documents for query 1 according to jaccard similarity:  ['cranfield0382', 'cranfield0376', 'cranfield0243', 'cranfield0254', 'cranfield1141', 'cranfield0387', 'cranfield0242', 'cranfield0258', 'cranfield0418', 'cranfield0664']
Scores of top 10 most similar documents for query 1 according to jaccard similarity:  [0.08695652 0.08108108 0.07894737 0.06666667 0.05882353 0.05882353
 0.05714286 0.05714286 0.05714286 0.05405405]


In [268]:
jaccard_query_2 = jaccard.compute_jaccard(query_2)
top_10_jaccard_query_2 = jaccard.get_top_k(query_2, 10)
top_10_jaccard_query_2_fullname = [filenames[i] for i in top_10_jaccard_query_2]

print("Top 10 most similar documents for query 2 according to jaccard similarity: ", top_10_jaccard_query_2_fullname)
print("Scores of top 10 most similar documents for query 2 according to jaccard similarity: ", jaccard_query_2[top_10_jaccard_query_2])

Top 10 most similar documents for query 2 according to jaccard similarity:  ['cranfield0389', 'cranfield0670', 'cranfield0254', 'cranfield1085', 'cranfield0491', 'cranfield0530', 'cranfield0669', 'cranfield0003', 'cranfield0361', 'cranfield0965']
Scores of top 10 most similar documents for query 2 according to jaccard similarity:  [0.07692308 0.07692308 0.06896552 0.06060606 0.06       0.05555556
 0.05405405 0.05263158 0.05263158 0.05263158]
