In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Directory containing the preprocessed corpus files; each file in it is read
# later in this notebook and becomes one entry of `corpus`.
preprocessed_data_path = "data/preprocessed_data/"

In [3]:
class TFIDF:
    """TF-IDF model over whitespace-tokenised documents.

    Parameters
    ----------
    documents : sequence of str
        One string per document; tokens are obtained with ``str.split()``.
    weighting : str, default "raw"
        Term-frequency weighting scheme, one of ``TFIDF.WEIGHTINGS``:

        * ``"binary"``               -- 1 if the term occurs in the document
        * ``"raw"``                  -- raw count of the term
        * ``"term_frequency"``       -- count / total tokens in the document
        * ``"log_normalization"``    -- 1 + ln(count)
        * ``"double_normalization"`` -- 0.5 + 0.5 * count / max count in doc

    Attributes
    ----------
    N : int
        Number of documents.
    tf : dict[int, dict[str, int]]
        Raw term counts per document index.
    idf : dict[str, float]
        ``ln(N / df)`` per term, where ``df`` is the number of documents that
        contain the term.  Key order (first appearance in the corpus) defines
        the column order of ``tf_idf``.
    tf_idf : numpy.ndarray of shape (N, len(idf))
        Weighted document-term matrix.

    Raises
    ------
    ValueError
        If ``weighting`` is not one of ``TFIDF.WEIGHTINGS``.
    """

    # Supported values for the `weighting` argument.
    WEIGHTINGS = (
        "binary",
        "raw",
        "term_frequency",
        "log_normalization",
        "double_normalization",
    )

    def __init__(self, documents, weighting = "raw"):
        self.documents = documents
        self.weighting = weighting

        self.N = len(documents)
        self.tf = self.compute_tf()
        self.idf = self.compute_idf()
        self.tf_idf = self.compute_tf_idf()

    def compute_tf(self):
        """Return ``{doc_index: {term: raw_count}}`` for every document."""
        tf = {}
        for i, document in enumerate(self.documents):
            counts = {}
            for term in document.split():
                counts[term] = counts.get(term, 0) + 1
            tf[i] = counts
        return tf

    def compute_idf(self):
        """Return ``{term: ln(N / df)}`` using true document frequencies.

        Each document contributes at most 1 to a term's frequency, so
        ``df <= N`` and every IDF value is non-negative.
        """
        doc_freq = {}
        for document in self.documents:
            # dict.fromkeys de-duplicates while keeping first-seen order, so a
            # term repeated inside one document is counted once and the
            # vocabulary order stays deterministic (a set would not be).
            for term in dict.fromkeys(document.split()):
                doc_freq[term] = doc_freq.get(term, 0) + 1
        return {term: np.log(self.N / count) for term, count in doc_freq.items()}

    def compute_tf_idf(self):
        """Build the ``(N, vocabulary_size)`` TF-IDF matrix.

        Column ``k`` always refers to the k-th key of ``self.idf``, so the
        same term lands in the same column for every document.

        Raises
        ------
        ValueError
            If ``self.weighting`` is not a supported scheme.
        """
        if self.weighting not in self.WEIGHTINGS:
            raise ValueError(
                f"Unknown weighting {self.weighting!r}; expected one of {self.WEIGHTINGS}"
            )

        # Global term -> column mapping shared by all rows.
        column_of = {term: col for col, term in enumerate(self.idf)}
        tf_idf = np.zeros((self.N, len(column_of)))

        for i in range(self.N):
            counts = self.tf[i]
            if not counts:
                # Empty document: its row stays all zeros.
                continue
            # Per-document normalisers include the term itself, so they are
            # never zero/empty for a non-empty document.
            total_tokens = sum(counts.values())
            max_count = max(counts.values())

            for term, count in counts.items():
                if self.weighting == "binary":
                    weight = 1.0
                elif self.weighting == "raw":
                    weight = count
                elif self.weighting == "term_frequency":
                    weight = count / total_tokens
                elif self.weighting == "log_normalization":
                    weight = 1 + np.log(count)
                else:  # "double_normalization"
                    weight = 0.5 + 0.5 * (count / max_count)
                tf_idf[i, column_of[term]] = weight * self.idf[term]
        return tf_idf

    def get_tf(self):
        """Return the per-document raw term counts."""
        return self.tf

    def get_idf(self):
        """Return the term -> inverse document frequency mapping."""
        return self.idf

    def get_tf_idf(self):
        """Return the TF-IDF document-term matrix."""
        return self.tf_idf

In [5]:
# Names of the term-frequency weighting schemes handled by TFIDF.compute_tf_idf.
weighting_metrics = ["binary", "raw", "term_frequency", "log_normalization", "double_normalization"]

In [6]:
# Load every preprocessed document into memory, one `corpus` entry per file.
corpus = []

# os.listdir returns names in arbitrary order; sorting keeps the document
# indices (and therefore every downstream matrix row) stable across runs.
for filename in sorted(os.listdir(preprocessed_data_path)):
    file_path = os.path.join(preprocessed_data_path, filename)
    # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints), which open()
    # cannot read as text files.
    if not os.path.isfile(file_path):
        continue
    # The with-block closes the file on exit, so no explicit close() is needed.
    with open(file_path, "r") as f:
        corpus.append(f.read())

In [7]:
# Sanity check: number of documents loaded into `corpus`.
len(corpus)

1400

In [8]:
# Build the TF-IDF model over the whole corpus with the default "raw" weighting.
tfidf = TFIDF(corpus)

In [30]:
class Jaccard:
    """Jaccard coefficients between documents and a collection of queries.

    Parameters
    ----------
    documents : sequence of str
        Whitespace-tokenised documents.
    vocab : iterable of str
        Queries to score against each document.  Each entry is tokenised with
        ``str.split()``, so it may be a single term or a multi-word query.
    """

    def __init__(self, documents, vocab):
        self.documents = documents
        self.vocab = vocab

    def compute_jaccard(self):
        """Return the ``(len(documents), len(vocab))`` Jaccard matrix.

        Entry ``[i, j]`` is ``|D_i & Q_j| / |D_i | Q_j|`` where ``D_i`` and
        ``Q_j`` are the token sets of document ``i`` and query ``j``.  When
        both token sets are empty the ratio is undefined and the entry is
        left at 0.0 instead of raising ``ZeroDivisionError``.
        """
        # Tokenise each query once instead of once per (document, query) pair.
        query_sets = [set(query.split()) for query in self.vocab]
        jaccard = np.zeros((len(self.documents), len(query_sets)))

        for i, document in enumerate(self.documents):
            # Loop-invariant for the inner loop: tokenise the document once.
            doc_terms = set(document.split())
            for j, query_terms in enumerate(query_sets):
                union_size = len(doc_terms | query_terms)
                if union_size:
                    jaccard[i, j] = len(doc_terms & query_terms) / union_size
        return jaccard

In [31]:
# Use the TF-IDF vocabulary (the keys of the IDF table) as the set of
# single-term queries to compare against every document.
jaccard = Jaccard(corpus, tfidf.idf.keys())

In [32]:
# compute_jaccard takes no arguments: it scores every document against every
# vocabulary term and returns a (documents x vocabulary) matrix.
jaccard_coeff = jaccard.compute_jaccard()