### 1. Importing Libraries

In [1]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter
from num2words import num2words

import nltk
import os
import string
import numpy as np
import copy
import pandas as pd
import pickle
import re
import math
import json
from googletrans import Translator

In [2]:
# Scaling factor applied to the TF-IDF weights of the article body in the merge
# step; title weights are copied over unscaled, so alpha < 1 makes terms that
# only occur in the body count less than terms that occur in the title.
alpha = 0.3

### 2. Reading the entire dataset

In [3]:
# Load every JSON article in `data_folder` into `dataset`, keyed by the file
# name without its extension. Each entry stores the stripped title and text
# plus a running integer `index` that later identifies the document's row.
data_folder = "../Data/Articles/English/"
dataset = {}
doc_index = 0

# Only regular *.json files are articles. Sorting makes the document indices
# reproducible across machines (raw directory order is filesystem-dependent).
# os.listdir raises FileNotFoundError with the offending path if the folder is
# missing, instead of an opaque IndexError.
file_names = sorted(
    name for name in os.listdir(data_folder)
    if name.lower().endswith(".json")
    and os.path.isfile(os.path.join(data_folder, name))
)

for i in file_names:
    # Explicit UTF-8: the articles contain non-ASCII characters (curly quotes),
    # which would be mis-decoded under a non-UTF-8 platform default encoding.
    with open(os.path.join(data_folder, i), encoding="utf-8") as json_file:
        article = json.load(json_file)

    curr_article = {
        'title': article['title'].strip(),  # strip leading/trailing whitespace
        'text': article['text'].strip(),
        'index': doc_index,
    }
    # Drop the ".json" extension from the dictionary key
    dataset[os.path.splitext(i)[0]] = curr_article
    doc_index = doc_index + 1

In [4]:
# Number of loaded documents; used as the corpus size in the IDF formula and as
# the number of rows of the document-term matrix.
N = len(dataset)

### 3. Pre-processing

#### 3.1 Converts upper case letters to lower case

In [5]:
def convert_lower_case(data):
    """Return `data` lower-cased via ``np.char.lower``.

    The result is a NumPy string array (0-d when `data` is a single string),
    so callers wrap it in ``str(...)`` when they need a plain string.
    """
    lowered = np.char.lower(data)
    return lowered

####  3.2 Removes stop words like and, are, is, etc.

In [6]:
def remove_stop_words(data):
    """Drop English stop words and single-character tokens from `data`.

    `data` is converted with ``str()`` and tokenized with ``word_tokenize``.
    Every kept token is prefixed with one space, so a non-empty result starts
    with a space; an input with no surviving tokens yields ``""``.

    Returns
    -------
    str
        The surviving tokens, each preceded by a single space.
    """
    # A set gives O(1) membership tests instead of scanning the stop-word list
    # once per token.
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(str(data))
    # Single join instead of repeated string concatenation in a loop.
    return "".join(
        " " + w for w in words
        if w not in stop_words and len(w) > 1
    )

#### 3.3 Removes punctuation

In [7]:
def remove_punctuation(data):
    """Replace punctuation symbols and newlines with spaces, and delete commas.

    Each symbol is replaced by a single space, followed by one pass that
    turns double spaces into single ones. Commas are removed last without
    inserting a space (presumably so digit groups such as ``1,000`` stay one
    token -- confirm against the intended tokenization).

    Parameters
    ----------
    data : str or NumPy string array
        Text to clean.

    Returns
    -------
    numpy.ndarray
        String array (0-d for a single input string) with the symbols removed.
    """
    # The backslash is spelled "\\" explicitly: the previous "\]" was an
    # invalid escape sequence, which newer Python versions warn about. The
    # resulting string value is unchanged.
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
    for symbol in symbols:
        data = np.char.replace(data, symbol, ' ')
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data

#### 3.4 Removes apostrophe punctuation

In [8]:
def remove_apostrophe(data):
    """Delete every ASCII apostrophe (') from `data`.

    Returns a NumPy string array (0-d for a single input string). Only the
    straight apostrophe is handled; typographic quotes are left untouched.
    """
    without_apostrophes = np.char.replace(data, "'", "")
    return without_apostrophes

#### 3.5 Stems English words by stripping inflectional suffixes, e.g. jumped => jump

In [9]:
def stemming(data):
    """Stem every token of `data` with NLTK's ``PorterStemmer``.

    `data` is converted with ``str()`` and tokenized with ``word_tokenize``.
    Each stem is prefixed with one space, so a non-empty result starts with a
    space and an input without tokens yields ``""``.
    """
    porter = PorterStemmer()
    stems = [porter.stem(token) for token in word_tokenize(str(data))]
    return "".join(" " + stem for stem in stems)

#### 3.6 Converts numerals to words like 19 => nineteen

In [10]:
def convert_numbers(data):
    """Spell out integer tokens in `data` as English words (19 => nineteen).

    `data` is converted with ``str()`` and tokenized with ``word_tokenize``.
    Tokens that parse as integers are replaced by ``num2words`` output; all
    other tokens are kept unchanged. Hyphens produced by ``num2words``
    (e.g. "forty-one") are replaced by spaces.

    Returns
    -------
    numpy.ndarray
        0-d string array; every token is preceded by a single space.
    """
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        try:
            w = num2words(int(w))
        except (ValueError, OverflowError):
            # ValueError: the token is not an integer literal.
            # OverflowError: the number is too large for num2words.
            # In both cases the token is kept as-is. A bare `except:` would
            # also have swallowed KeyboardInterrupt and genuine bugs.
            pass
        new_text = new_text + " " + w
    new_text = np.char.replace(new_text, "-", " ")
    return new_text

#### 3.7 Calls all of the above functions to preprocess the text

In [11]:
def preprocess(data):
    """Run the full text-cleaning pipeline on `data` and return the result.

    The steps are applied strictly in the order listed below. Several steps
    are repeated on purpose because ``convert_numbers`` can re-introduce
    hyphens, commas and stop words (e.g. 101 -> "one hundred and one").
    Stemming is currently not part of the pipeline.
    """
    pipeline = (
        convert_lower_case,
        remove_punctuation,   # commas are removed separately inside this step
        remove_apostrophe,
        remove_stop_words,
        convert_numbers,
        remove_punctuation,
        convert_numbers,
        remove_punctuation,   # num2words output may contain hyphens and commas
        remove_stop_words,    # num2words output may contain stop words ("and")
    )
    for step in pipeline:
        data = step(data)
    return data

### 4. Preprocessing the extracted data (title and text)

In [12]:
# Attach the token lists of the cleaned body text and cleaned title to every
# article record (stored under 'processed_text' and 'processed_title').
for article in dataset.values():
    cleaned_text = str(preprocess(article['text']))
    article['processed_text'] = word_tokenize(cleaned_text)
    cleaned_title = str(preprocess(article['title']))
    article['processed_title'] = word_tokenize(cleaned_title)

### 5. Calculating DF for all words

In [13]:
# Document frequency: DF[word] = number of documents whose processed text or
# processed title contains `word`.
DF = {}

for i in dataset.keys():
    # Text tokens first, then title tokens, so the vocabulary keeps its
    # first-occurrence order.
    for w in dataset[i]['processed_text'] + dataset[i]['processed_title']:
        # setdefault replaces the bare try/except previously used to create the
        # per-word set, which would also have hidden unrelated errors.
        DF.setdefault(w, set()).add(i)

# Collapse each set of document keys into its size.
for i in DF:
    DF[i] = len(DF[i])

In [14]:
# Vocabulary as a list in DF insertion order; a word's position in this list
# is its column in the document-term matrix.
total_vocab = list(DF)
total_vocab_size = len(total_vocab)

#### 5.1 Returns the document frequency of the input word, i.e. the number of documents (text or title) that contain it

In [15]:
def doc_freq(word):
    """Return the document frequency of `word`, or 0 if it is not in `DF`.

    Uses ``dict.get`` rather than a bare ``except:`` so that only a missing
    key yields 0 and any other error still surfaces.
    """
    return DF.get(word, 0)

### 6. Calculating TF-IDF for the Text of the Article
#### Plain TF-IDF is computed here; it is combined with the Title (of the Article) weights in section 8

In [16]:
# TF-IDF weight of every distinct body-text term, keyed by (document index, term).
# Term frequency is counted over body + title tokens together and normalised by
# their combined length.
tf_idf = {}

for article in dataset.values():
    body_tokens = article['processed_text']
    combined_tokens = body_tokens + article['processed_title']

    term_counts = Counter(combined_tokens)
    total_terms = len(combined_tokens)

    for term in np.unique(body_tokens):
        term_frequency = term_counts[term] / total_terms
        # +1 in numerator and denominator smooths the ratio
        inverse_doc_frequency = np.log((N + 1) / (doc_freq(term) + 1))

        tf_idf[article['index'], term] = term_frequency * inverse_doc_frequency

### 7. Calculating TF-IDF for the Title of the Article.

In [17]:
# TF-IDF weight of every distinct title term, keyed by (document index, term).
# Term frequency is counted over title + body tokens together and normalised by
# their combined length.
tf_idf_title = {}

for article in dataset.values():
    title_tokens = article['processed_title']
    combined_tokens = title_tokens + article['processed_text']

    term_counts = Counter(combined_tokens)
    total_terms = len(combined_tokens)

    for term in np.unique(title_tokens):
        term_frequency = term_counts[term] / total_terms
        # 1 is added to the numerator to avoid negative values
        inverse_doc_frequency = np.log((N + 1) / (doc_freq(term) + 1))

        tf_idf_title[article['index'], term] = term_frequency * inverse_doc_frequency

### 8. Merging TF-IDF of Text and Title of the Article

In [18]:
# Scale every body-text weight by alpha, then let the title weights overwrite
# the entries of terms that also occur in the title.
# NOTE: this cell mutates tf_idf in place, so running it twice scales the body
# weights by alpha twice; re-run the TF-IDF cell for the text first.
for key in tf_idf:
    tf_idf[key] = tf_idf[key] * alpha

tf_idf.update(tf_idf_title)

### 9. TF-IDF Cosine Similarity Ranking

In [19]:
def cosine_sim(a, b):
    """Return the cosine similarity of vectors `a` and `b`.

    If either vector has zero norm the similarity is undefined; 0.0 is
    returned in that case instead of NaN (0/0), because NaN scores would make
    the subsequent argsort-based ranking meaningless.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

#### 9.1 Vectorising TF-IDF

In [20]:
# Dense document-term matrix: row = document index, column = position of the
# term in total_vocab, value = merged TF-IDF weight.
D = np.zeros((N, total_vocab_size))

# One-off term -> column lookup table. total_vocab.index() is a linear scan, so
# calling it for every (document, term) entry made this cell quadratic.
vocab_position = {term: position for position, term in enumerate(total_vocab)}

for (doc_id, term), weight in tf_idf.items():
    column = vocab_position.get(term)
    if column is not None:  # skip terms outside the vocabulary
        D[doc_id][column] = weight

In [21]:
def gen_vector(tokens):
    """Build the TF-IDF vector of a tokenized query.

    Parameters
    ----------
    tokens : list of str
        Pre-processed query tokens.

    Returns
    -------
    numpy.ndarray
        Vector of length ``len(total_vocab)``. The entry of every query term
        is its relative frequency within the query times its smoothed IDF;
        terms that are not in the vocabulary are ignored, so an empty or fully
        out-of-vocabulary query yields an all-zero vector.
    """
    Q = np.zeros((len(total_vocab)))

    words_count = len(tokens)

    for token, count in Counter(tokens).items():
        try:
            # Linear scan; acceptable because queries are short.
            ind = total_vocab.index(token)
        except ValueError:
            # Out-of-vocabulary term: it has no column in the matrix.
            continue

        tf = count / words_count
        idf = math.log((N + 1) / (doc_freq(token) + 1))
        Q[ind] = tf * idf
    return Q

#### 9.2 Search and Retrieve using Cosine Similarity

In [22]:
def cosine_similarity(k, query):
    """Rank all documents against `query` and return the top `k` indices.

    The query is pre-processed and tokenized like the documents, turned into a
    TF-IDF vector and compared with every row of the document matrix ``D``
    using cosine similarity. The query, its tokens and the resulting indices
    are printed.

    Parameters
    ----------
    k : int
        Number of results to return. Values <= 0 yield an empty result.
    query : str
        Free-text search query.

    Returns
    -------
    numpy.ndarray
        Document indices (row numbers of ``D``), best match first.
    """
    print("Cosine Similarity")
    preprocessed_query = preprocess(query)
    tokens = word_tokenize(str(preprocessed_query))

    print("\nQuery:", query)
    print("")
    print(tokens)

    query_vector = gen_vector(tokens)
    d_cosines = [cosine_sim(query_vector, d) for d in D]

    # Reverse first, then slice: identical to argsort()[-k:][::-1] for k > 0,
    # but k == 0 now returns no documents (``[-0:]`` selected every document).
    ranking = np.array(d_cosines).argsort()[::-1]
    out = ranking[:max(k, 0)]

    print("")

    print(out)

    return out

In [23]:
# Indices of the 10 documents most similar to the query, best match first.
Q = cosine_similarity(10, "Salman Khan")

Cosine Similarity

Query: Salman Khan

['salman', 'khan']

[129 145  35 120   8 114 171  17  75 105]


In [24]:
def _show_article_with_index(doc_index, heading):
    """Print `heading`, the title and the text of the article whose 'index' is `doc_index`."""
    for article in dataset.values():
        if article['index'] == doc_index:
            print(heading)
            print('Title : %s'%article['title'])
            print("")
            print(article['text'])
            print("\n\n\n")


# Show the best match and the last entry of the ranked result list.
_show_article_with_index(Q[0], 'Top Ranked Article is as follows:\n')
_show_article_with_index(Q[-1], "%sth ranked Article is as follows:\n"%len(Q))

Top Ranked Article is as follows:

Title : Amitabh Bachchan:Did You Know Amitabh Bachchan Was Once Mistaken For Salman Khan? His Response Was Epic!

Did You Know Amitabh Bachchan Was Once Mistaken For Salman Khan? His Response To It Was Quite Cool!
Did You Know Amitabh Bachchan Was Once Mistaken For Salman Khan? His Response To It Was Quite Cool!
Have you ever been mistaken for somebody else? For us, it might be a usual thing. But can you imagine celebrities being in a situation where they are confused for other stars?
In the west, it has happened quite a lot of times. For instance, once a fan asked This Is Us star Justin Hartley about his wife Blake. If you didn’t get the drift, he was mistaken for Ryan Reynolds. Can you imagine?
Recently, Amitabh Bachchan also revealed about one such incident that happened with him. Big B was once confused as Salman Khan. I know, even I want to know who that person was?
“We were shooting on the streets in Glasgow and then I had to walk on the footpat