# Import Necessary Libraries

In [8]:
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

from collections import Counter
from num2words import num2words

import os
import string
import re
import copy
import math

from sklearn.feature_extraction.text import TfidfVectorizer

# Initializing Parameters

In [2]:
# Root directory that contains the `DATASETS/` folder. It is combined with the
# dataset name by plain string concatenation, so it must end with a separator.
# Set the TFIDF_DATA_ROOT environment variable to run the notebook on another
# machine without editing this cell; the original location stays the default.
path = os.environ.get('TFIDF_DATA_ROOT', 'C:/Users/prash/Downloads/ML ALGORITHMS/')

# Name of the dataset folder under `DATASETS/`.
title = "stories_dataset"
# Factor applied to the body-text Tf-Idf values when they are merged with the
# title values.
alpha = 0.3

# Importing Files and Creating Dataset

In [3]:
# Getting the names of all the folders and storing it
# (os.walk yields the starting directory first, then everything below it).
folders = [x[0] for x in os.walk(path + 'DATASETS/' + title + '/')]
# Drop the last character of the root entry (the trailing '/') so that the
# joins below do not produce a doubled separator.
folders[0] = folders[0][:len(folders[0])-1]

# Form the Dataset: a list of (file path, title) pairs parsed from the
# index.html listing found in every folder.
dataset = []
for folder_number, folder in enumerate(folders):
    # The context manager closes the file even if reading raises.
    with open(folder + "/index.html", 'r') as index_file:
        text = index_file.read().strip()

    file_name = re.findall('><A HREF="(.*)">', text)
    file_title = re.findall('<BR><TD> (.*)\n', text)

    # Only in the root index are the first two links dropped while the titles
    # are kept as-is, so names and titles line up only if those two links have
    # no title row (presumably they are sub-folder links -- verify on the data).
    if folder_number == 0:
        file_name = file_name[2:]

    for j in range(len(file_name)):
        dataset.append((str(folder) + "/" + str(file_name[j]), file_title[j]))

N = len(dataset)
print("Length of the dataset is: ",N)

Length of the dataset is:  467


# PreProcessing the Text Dataset

In [4]:
def convert_lower_case(data):
    """Lower-case `data` element-wise with numpy's string routines.

    The result is a numpy string array (zero-dimensional when a single
    string is passed in), not a plain `str`.
    """
    lowered = np.char.lower(data)
    return lowered

def remove_stop_words(data):
    """Remove English stop words and single-character tokens from `data`.

    `data` is converted with `str()` and tokenised with NLTK's
    `word_tokenize`.

    Returns:
        str: the surviving tokens, each preceded by one space (so a non-empty
        result starts with a space), or "" when no token survives.
    """
    # A set makes every membership test O(1); the NLTK stop-word list would
    # otherwise be scanned linearly for each token.
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(str(data))
    return "".join(" " + w for w in words if w not in stop_words and len(w) > 1)

def remove_punctuation(data):
    """Replace punctuation characters in `data` with spaces and delete commas.

    Each symbol is replaced by a single space, followed by one pass that turns
    double spaces into single ones. Commas are removed outright (no space is
    inserted) after all other symbols have been handled.

    Returns:
        A numpy string array (zero-dimensional for a single string input).
    """
    # The backslash is spelled as an explicit "\\" so the literal contains no
    # invalid escape sequence; the set of characters is: ! " # $ % & ( ) * + -
    # . / : ; < = > ? @ [ \ ] ^ _ ` { | } ~ and the newline.
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
    for symbol in symbols:
        data = np.char.replace(data, symbol, ' ')
        # Collapse double spaces the substitution may have produced.
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data

def remove_apostrophe(data):
    """Delete every apostrophe from `data`; nothing is inserted in its place.

    Returns a numpy string array (zero-dimensional for a single string).
    """
    apostrophe, nothing = "'", ""
    return np.char.replace(data, apostrophe, nothing)

def stemming(data):
    """Porter-stem every token of `data`.

    `data` is converted with `str()` and tokenised with `word_tokenize`.

    Returns:
        str: the stems, each preceded by one space; "" when there are no tokens.
    """
    porter = PorterStemmer()
    stems = [porter.stem(token) for token in word_tokenize(str(data))]
    return "".join(" " + stem for stem in stems)

def convert_numbers(data):
    """Spell out integer tokens of `data` in words using `num2words`.

    Tokens that cannot be converted are kept unchanged. Hyphens in the result
    are replaced by spaces afterwards.

    Returns:
        A zero-dimensional numpy string array holding the tokens, each
        preceded by one space.
    """
    tokens = word_tokenize(str(data))
    pieces = []
    for w in tokens:
        try:
            w = num2words(int(w))
        except (ValueError, OverflowError):
            # Not an integer literal, or a number too large to spell out:
            # keep the token as it is.
            pass
        pieces.append(" " + w)
    new_text = np.char.replace("".join(pieces), "-", " ")
    return new_text

def preprocess(data):
    """Run the complete text-cleaning pipeline on `data` and return the result.

    The steps are applied strictly in the order listed below; several of them
    are repeated on purpose because number conversion re-introduces material
    that earlier passes had already cleaned.
    """
    pipeline = (
        convert_lower_case,
        remove_punctuation,   # commas are removed separately inside this step
        remove_apostrophe,
        remove_stop_words,
        convert_numbers,
        stemming,
        remove_punctuation,
        convert_numbers,
        stemming,             # needed again as we need to stem the words
        remove_punctuation,   # num2words yields a few hyphens and commas, e.g. forty-one
        remove_stop_words,    # num2words yields stop words: 101 -> one hundred and one
    )
    for step in pipeline:
        data = step(data)
    return data

def print_doc(id):
    """Print the dataset entry at position `id`, then the contents of its file.

    The file path is the first element of `dataset[id]`; the file is decoded
    as cp1250 and surrounding whitespace is stripped before printing.
    """
    print(dataset[id])
    # The context manager closes the file even if reading or decoding raises.
    with open(dataset[id][0], 'r', encoding='cp1250') as file:
        text = file.read().strip()
    print(text)
    
    

# Token lists per document, index-aligned with `dataset`.
processed_text = []
processed_title = []

for count, (doc_path, doc_title) in enumerate(dataset[:N], start=1):

    # Get the Text from the file; undecodable bytes are dropped, and the
    # context manager closes the file even if reading raises.
    with open(doc_path, 'r', encoding="utf8", errors='ignore') as file:
        text = file.read().strip()

    # PreProcess the text
    processed_text.append(word_tokenize(str(preprocess(text))))
    # PreProcess the title
    processed_title.append(word_tokenize(str(preprocess(doc_title))))

    # Progress report every 50 documents.
    if count % 50 == 0:
        print(str(count) + " Documents PreProcessed!!")

50 Documents PreProcessed!!
100 Documents PreProcessed!!
150 Documents PreProcessed!!
200 Documents PreProcessed!!
250 Documents PreProcessed!!
300 Documents PreProcessed!!
350 Documents PreProcessed!!
400 Documents PreProcessed!!
450 Documents PreProcessed!!


# Implementation From Scratch

In [5]:
# Get the Vocabulary

# Map every word to the set of document indexes whose text or title contains it.
postings = {}
for i in range(N):
    for w in processed_text[i]:
        postings.setdefault(w, set()).add(i)
    for w in processed_title[i]:
        postings.setdefault(w, set()).add(i)

# Calculate the DF score: the number of distinct documents containing the word.
DF = {w: len(doc_ids) for w, doc_ids in postings.items()}

total_vocab_size = len(DF)
print("The Total Vocab Size is: ",total_vocab_size)

# List of words in Vocabulary (same order as the keys of DF)
total_vocab = [x for x in DF]


def doc_freq(word):
    """Return the document frequency stored in `DF` for `word`.

    Words that are not a key of `DF` yield 0. Other failures (for example
    `DF` not having been built yet) are no longer silently turned into 0.
    """
    return DF.get(word, 0)

# Tf-Idf Values for Text

# Maps (document index, token) -> tf * idf for every distinct body-text token.
tf_idf_text = {}
for doc in range(N):
    body_tokens = processed_text[doc]
    # Term counts and length are taken over the combined Text and Title.
    combined = body_tokens + processed_title[doc]
    term_counts = Counter(combined)
    total_terms = len(combined)

    for token in np.unique(body_tokens):
        # Term Frequency: share of the combined tokens that equal this token.
        term_frequency = term_counts[token] / total_terms
        # Inverse Document Frequency, with 1 added to numerator and denominator.
        inverse_doc_frequency = np.log((N + 1) / (doc_freq(token) + 1))
        tf_idf_text[doc, token] = term_frequency * inverse_doc_frequency

print("\nTf-Idf Value of the word 'go' in Text is: ",tf_idf_text[(0,"go")])
    
# Tf-Idf Values for Title

# Maps (document index, token) -> tf * idf for every distinct title token.
tf_idf_title = {}
for doc in range(N):
    title_tokens = processed_title[doc]
    # Counts and length again cover title and body text together.
    merged = title_tokens + processed_text[doc]
    occurrences = Counter(merged)
    merged_length = len(merged)

    for token in np.unique(title_tokens):
        share = occurrences[token] / merged_length
        # 1 is added to numerator and denominator, so the ratio stays >= 1
        # (and the logarithm non-negative) whenever the document frequency
        # does not exceed N.
        rarity = np.log((N + 1) / (doc_freq(token) + 1))
        tf_idf_title[doc, token] = share * rarity

print("\nTf-Idf Value of the word 'go' in Title is: ",tf_idf_title[(0,"go")])


# Combining Both Tf-Idf Values into one

# Start from the body-text scores, each scaled by alpha.
tf_idf = {key: score * alpha for key, score in tf_idf_text.items()}

# Refer to Reference Link for more understanding on this step
# Title scores are copied unscaled; where a (document, token) key exists in
# both dictionaries the title score replaces the scaled body-text score.
tf_idf.update(tf_idf_title)

print("\nFinal Length of Tf-Idf Values Are:",len(tf_idf))

The Total Vocab Size is:  32350

Tf-Idf Value of the word 'go' in Text is:  0.0002906893990853149

Tf-Idf Value of the word 'go' in Title is:  0.0002906893990853149

Final Length of Tf-Idf Values Are: 344378


# Ranking Methods

## Matching Score

In [6]:
def matching_score(k, query):
    """Rank documents by the summed Tf-Idf weight of the query terms and print the top `k`.

    The query is preprocessed and tokenised like the documents. For each
    document, the `tf_idf` weights of all (document, term) entries whose term
    occurs in the query are added up; documents without any matching term are
    not ranked.

    Args:
        k: number of document indexes to print.
        query: raw query string.

    Returns:
        None. The query, its tokens and the ranking are printed.
    """
    preprocessed_query = preprocess(query)
    tokens = word_tokenize(str(preprocessed_query))

    print("Matching Score")
    print("\nQuery:", query)
    print("\n",tokens)

    # Set membership keeps the scan over tf_idf linear in its size.
    query_terms = set(tokens)
    query_weights = {}
    for (doc_id, term), weight in tf_idf.items():
        if term in query_terms:
            query_weights[doc_id] = query_weights.get(doc_id, 0) + weight

    # Sort the documents in the order of highest similarity
    ranked = sorted(query_weights.items(), key=lambda x: x[1], reverse=True)
    # Keep the first k documents (the cut-off follows the argument rather
    # than a fixed 10).
    l = [doc_id for doc_id, _ in ranked[:k]]

    print("\nThe top " + str(k) + " Documents with the Highest Similarity are:\n\n",l)
    

# The two sentences are joined with an explicit space so the printed query
# does not run "momentum." and "She" together.
matching_score(10, "Without the drive of Rebeccah's insistence, Kate lost her momentum. "
                + "She stood next a slatted oak bench, canisters still clutched, surveying.")

Matching Score

Query: Without the drive of Rebeccah's insistence, Kate lost her momentum.She stood next a slatted oak bench, canisters still clutched, surveying.

 ['without', 'drive', 'rebeccah', 'insist', 'kate', 'lost', 'momentum', 'stood', 'next', 'slat', 'oak', 'bench', 'canist', 'still', 'clutch', 'survey']

The top 10 Documents with the Highest Similarity are:

 [166, 200, 352, 433, 211, 350, 175, 187, 188, 294]


## Cosine Similarity

In [9]:
def cosine_sim(a, b):
    """Return the cosine of the angle between vectors `a` and `b`.

    When either vector has zero norm the similarity is undefined; 0.0 is
    returned in that case instead of NaN, because NaN values are placed last
    by numpy's sort and would therefore appear as the *best* matches in a
    ranking built with `argsort`.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator


def gen_vector(tokens):
    """Build the Tf-Idf vector of a token list over the corpus vocabulary.

    Args:
        tokens: preprocessed tokens (for example those of a query).

    Returns:
        numpy array of length `len(total_vocab)`; the position of each known
        token holds tf * idf, every other position is 0. Tokens that are not
        in `total_vocab` are ignored.
    """
    Q = np.zeros((len(total_vocab)))

    counter = Counter(tokens)
    words_count = len(tokens)

    for token in np.unique(tokens):
        try:
            ind = total_vocab.index(token)
        except ValueError:
            # Token is not part of the vocabulary: it has no coordinate.
            continue

        tf = counter[token]/words_count
        df = doc_freq(token)
        idf = math.log((N+1)/(df+1))
        Q[ind] = tf*idf
    return Q


def cosine_similarity(k, query):
    """Print the indexes of the `k` documents most cosine-similar to `query`.

    The query is preprocessed and tokenised, turned into a Tf-Idf vector with
    `gen_vector`, and compared against every row of the document matrix `D`.
    Nothing is returned; the query, its tokens and the ranking are printed.
    """
    tokens = word_tokenize(str(preprocess(query)))

    print("Cosine Similarity")
    print("\nQuery:", query)
    print("\n",tokens)

    # Finding Tf-Idf of Tokens and Vectorising it
    query_vector = gen_vector(tokens)
    d_cosines = [cosine_sim(query_vector, doc_vector) for doc_vector in D]

    # Ascending argsort; the last k positions reversed give the best matches first.
    ascending = np.argsort(np.array(d_cosines))
    out = ascending[-k:][::-1]
    print("\nThe top " + str(k) + " Documents with the Highest Similarity are:\n\n",out)


# Vectorising Tf-Idf
print("Vectorising Tf-Idf Values...")
# One row per document, one column per vocabulary word.
D = np.zeros((N, total_vocab_size))
# Column of every vocabulary word, so each Tf-Idf entry costs one dictionary
# lookup instead of a linear scan of the vocabulary list.
vocab_index = {word: position for position, word in enumerate(total_vocab)}
for (doc_id, word), weight in tf_idf.items():
    position = vocab_index.get(word)
    # Entries whose word is not in the vocabulary are skipped.
    if position is not None:
        D[doc_id][position] = weight


# cosine_similarity only prints its ranking and has no return statement, so Q
# ends up as None. The two sentences are joined with an explicit space so the
# printed query does not run "momentum." and "She" together.
Q = cosine_similarity(10, "Without the drive of Rebeccah's insistence, Kate lost her momentum. "
                     + "She stood next a slatted oak bench, canisters still clutched, surveying")

Vectorising Tf-Idf Values...
Cosine Similarity

Query: Without the drive of Rebeccah's insistence, Kate lost her momentum.She stood next a slatted oak bench, canisters still clutched, surveying

 ['without', 'drive', 'rebeccah', 'insist', 'kate', 'lost', 'momentum', 'stood', 'next', 'slat', 'oak', 'bench', 'canist', 'still', 'clutch', 'survey']

The top 10 Documents with the Highest Similarity are:

 [200 166 433 175 169 402 211  87 151 369]


# Scikit Learn Implementation

In [10]:
# This is just an example of Scikit Learn Implementation. The dataset to be passed has to be preprocessed first
# (here the raw titles, column 1 of the dataset pairs, are vectorised as they are).

vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(pd.DataFrame(dataset)[1])

# Convert to a DataFrame
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# the installed version provides it and fall back to the old name otherwise.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)

### Reference:
https://towardsdatascience.com/tf-idf-for-document-ranking-from-scratch-in-python-on-real-world-dataset-796d339a4089 <br>
https://towardsdatascience.com/natural-language-processing-feature-engineering-using-tf-idf-e8b9d00e7e76