---
# Scoring and Term-Weighting
---

In [1]:
# Importing relevant modules

import re
import time
import pickle
import numpy as np
import pandas as pd

from math import log
from tqdm import tqdm
from collections import OrderedDict

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [2]:
### Execution Mode ###
# Selects which of the `Flag`-guarded cells in this notebook run:
# 0 : Build Matrices & Setup (build and pickle the TF-IDF index, vocabulary and score matrices)
# 1 : Testing (run every scoring scheme on a fixed sample query)
# 2 : Interactive (prompt for a scoring scheme, a weighting scheme and a query)

Flag = 1

In [3]:
# Loading Preprocessed Stories Data
# NOTE: unpickling can execute arbitrary code; only load dumps produced by this project.
# The files are opened in `with` blocks so the handles are closed right after loading.

with open('../Dumps/stories_data.pkl','rb') as data_file:
    Data = pickle.load(data_file)
with open('../Dumps/stories_docs.pkl','rb') as docs_file:
    Docs = pickle.load(docs_file)

In [4]:
# Function to clean text

def clean(text):
    """Normalise raw text into a space-separated string of content words.

    Steps, in order: lowercase the text, tokenise it with NLTK's
    ``word_tokenize``, delete every character that is not a-z or whitespace
    from each token, drop English stopwords, trim surrounding whitespace and
    discard tokens that ended up empty.
    """
    # Lowercase, tokenise and strip punctuation / digits from every token
    tokens = [re.sub(r'[^a-z\s]','',token) for token in word_tokenize(text.lower())]

    # Stopwords are filtered after the punctuation has been removed
    english_stopwords = set(stopwords.words("english"))
    tokens = [token.strip() for token in tokens if token not in english_stopwords]

    # Tokens made only of removed characters are now empty and are skipped
    return ' '.join(token for token in tokens if token != '')

### TF-IDF Index & Vocabulary

In [5]:
# Creating Index for TF-IDF

if Flag == 0:
    # Inverted index: term -> {document id -> number of occurrences in that document}
    Index = {}
    for i in tqdm(range(len(Data))):
        for j in Data[i]['cleaned_text'].split():
            postings = Index.setdefault(j, {})
            postings[i] = postings.get(i, 0) + 1

    # Sorted vocabulary fixes the column order of the score matrices
    Vocabulary = sorted(Index)

    # `with` guarantees the dumps are flushed and the handles closed
    with open('../Dumps/tf_idf_vocabulary.pkl','wb') as vocabulary_file:
        pickle.dump(Vocabulary,vocabulary_file)
    with open('../Dumps/tf_idf_index.pkl','wb') as index_file:
        pickle.dump(Index,index_file)

### TF-IDF Weighting Schemes

In [6]:
# Binary Weighting Scheme

def tf_idf_binary(word,doc):
    """TF-IDF score of `word` for document `doc` with a binary term frequency.

    tf is 1 when `doc` appears in the postings of `word` (``Index[word]``)
    and 0 otherwise; idf is log10(len(Docs)/len(Index[word]) + 1).
    Raises KeyError when `word` is not in ``Index``.
    """
    postings = Index[word]
    presence = 1 if doc in postings else 0
    inverse_doc_freq = log(len(Docs)/len(postings)+1,10)
    return presence*inverse_doc_freq

In [7]:
# Raw count Weighting Scheme

def tf_idf_raw(word,doc):
    """TF-IDF score of `word` for document `doc` with a raw-count term frequency.

    tf is the occurrence count stored in ``Index[word][doc]`` (0 when `doc`
    is not in the postings); idf is log10(len(Docs)/len(Index[word]) + 1).
    Raises KeyError when `word` is not in ``Index``.
    """
    postings = Index[word]
    occurrences = postings.get(doc, 0)
    inverse_doc_freq = log(len(Docs)/len(postings)+1,10)
    return occurrences*inverse_doc_freq

In [8]:
# Term Frequency Weighting Scheme

def tf_idf_frequency(word,doc):
    """TF-IDF score of `word` for document `doc` with a length-normalised tf.

    tf is ``Index[word][doc] / Docs[doc]['size']`` (0 when `doc` is not in
    the postings); idf is log10(len(Docs)/len(Index[word]) + 1).
    Raises KeyError when `word` is not in ``Index``.
    """
    postings = Index[word]
    # The document record is only consulted when the term occurs in it
    relative_freq = postings[doc]/Docs[doc]['size'] if doc in postings else 0
    inverse_doc_freq = log(len(Docs)/len(postings)+1,10)
    return relative_freq*inverse_doc_freq

In [9]:
# Log Normalization Weighting Scheme

def tf_idf_log(word,doc):
    """TF-IDF score of `word` for document `doc` with a log-normalised tf.

    tf is the natural logarithm of ``1 + Index[word][doc]`` (0 when `doc` is
    not in the postings); idf is log10(len(Docs)/len(Index[word]) + 1).
    Note that tf uses base e here while the idf uses base 10.
    Raises KeyError when `word` is not in ``Index``.
    """
    postings = Index[word]
    damped_count = log(1+postings[doc]) if doc in postings else 0
    inverse_doc_freq = log(len(Docs)/len(postings)+1,10)
    return damped_count*inverse_doc_freq

In [10]:
# Double Normalization Weighting Scheme

def tf_idf_double(word,doc):
    """TF-IDF score of `word` for document `doc` with double-normalised tf.

    tf is ``0.5 + 0.5 * Index[word][doc] / Docs[doc]['max_frequency']`` when
    `doc` is in the postings of `word` and 0 otherwise; idf is
    log10(len(Docs)/len(Index[word]) + 1).
    Raises KeyError when `word` is not in ``Index``.
    """
    postings = Index[word]
    if doc in postings:
        scaled_freq = 0.5 + 0.5 * (postings[doc]/Docs[doc]['max_frequency'])
    else:
        scaled_freq = 0
    inverse_doc_freq = log(len(Docs)/len(postings)+1,10)
    return scaled_freq*inverse_doc_freq

### Building TF-IDF Score Matrices

In [11]:
# Building Matrix

if Flag == 0:
    n_docs = len(Docs)
    n_terms = len(Vocabulary)

    # One (documents x vocabulary) float matrix per weighting scheme.
    # Preallocated as numpy arrays instead of nested Python lists.
    binary_matrix = np.zeros((n_docs, n_terms))
    raw_count_matrix = np.zeros((n_docs, n_terms))
    term_frequency_matrix = np.zeros((n_docs, n_terms))
    log_normalization_matrix = np.zeros((n_docs, n_terms))
    double_normalization_matrix = np.zeros((n_docs, n_terms))

    # Every scheme scores a (term, document) pair as 0 when the document is
    # not in the term's postings, so only the postings need to be visited;
    # all remaining cells keep their initial 0.0.
    for y in tqdm(range(n_terms)):
        term = Vocabulary[y]
        for x in Index[term]:
            if not 0 <= x < n_docs:
                # Postings outside the matrix rows were never scored before either
                continue
            binary_matrix[x][y] = tf_idf_binary(term,x)
            raw_count_matrix[x][y] = tf_idf_raw(term,x)
            term_frequency_matrix[x][y] = tf_idf_frequency(term,x)
            log_normalization_matrix[x][y] = tf_idf_log(term,x)
            double_normalization_matrix[x][y] = tf_idf_double(term,x)

    # `with` guarantees each dump is flushed and its handle closed
    for matrix, path in (
        (binary_matrix, '../Dumps/binary_matrix.pkl'),
        (raw_count_matrix, '../Dumps/raw_count_matrix.pkl'),
        (term_frequency_matrix, '../Dumps/term_frequency_matrix.pkl'),
        (log_normalization_matrix, '../Dumps/log_normalization_matrix.pkl'),
        (double_normalization_matrix, '../Dumps/double_normalization_matrix.pkl'),
    ):
        with open(path,'wb') as dump_file:
            pickle.dump(matrix,dump_file)

---
## Query System
---

In [12]:
# Loading Pickle Files

def _load_dump(name):
    """Unpickle ``../Dumps/<name>.pkl`` and close the file afterwards.

    NOTE: unpickling can execute arbitrary code; only load dumps produced by
    this project.
    """
    with open('../Dumps/'+name+'.pkl','rb') as dump_file:
        return pickle.load(dump_file)

Index = _load_dump('tf_idf_index')
Vocabulary = _load_dump('tf_idf_vocabulary')
binary = _load_dump('binary_matrix')
raw_count = _load_dump('raw_count_matrix')
term_frequency = _load_dump('term_frequency_matrix')
log_normalization = _load_dump('log_normalization_matrix')
double_normalization = _load_dump('double_normalization_matrix')

# Weighting scheme number -> document score matrix / display name
Weight_Modes = {1:binary,2:raw_count,3:term_frequency,4:log_normalization,5:double_normalization}
Weight_Names = {1:'Binary',2:'Raw Count',3:'Term Frequency',4:'Log Normalization',5:'Double Normalization'}

### TF-IDF Scoring
---

In [13]:
# TF_IDF of Query

def tf_idf_query(query,mode):
    """Build the TF-IDF column vector of a free-text query.

    Parameters
    ----------
    query : str
        Raw query text; it is passed through ``clean`` before counting.
    mode : int
        Term-frequency weighting applied to the query counts:
        1 binary, 2 raw count, 3 count / number of cleaned query words,
        4 log10(1 + count), 5 0.5 + 0.5 * count / largest count.
        Any other value leaves the raw counts untouched.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(Vocabulary), 1)``; entry i is the weighted
        tf of ``Vocabulary[i]`` times log10(len(Docs)/len(Index[term]) + 1).
        A query with no word in the vocabulary yields an all-zero vector
        (previously modes 3 and 5 could raise ZeroDivisionError).
    """
    query_words = clean(query).split()

    # Constant-time term -> column lookup instead of scanning the list per word
    column_of = {term: col for col, term in enumerate(Vocabulary)}

    # Occurrences of each vocabulary term in the query, keyed by column
    counts = {}
    for word in query_words:
        col = column_of.get(word)
        if col is not None:
            counts[col] = counts.get(col, 0) + 1

    query_vec = np.zeros((len(Vocabulary), 1))
    if not counts:
        # Nothing to weight; also avoids dividing by a zero length / maximum
        return query_vec

    total_words = len(query_words)
    largest_count = max(counts.values())
    total_docs = len(Docs)

    # Terms absent from the query have tf 0 under every mode, so only the
    # matched columns need a weight
    for col, count in counts.items():
        if mode == 1:
            tf = 1
        elif mode == 3:
            tf = count/total_words
        elif mode == 4:
            tf = log(1+count,10)
        elif mode == 5:
            tf = 0.5 + 0.5*(count/largest_count)
        else:
            tf = count

        idf = log((total_docs/len(Index[Vocabulary[col]])+1),10)
        query_vec[col, 0] = tf*idf

    return query_vec

In [14]:
# Function to handle TF_IDF Query

def tf_idf_scoring(query,mode,qw=0):
    """Print the five documents with the highest TF-IDF dot-product score.

    Parameters
    ----------
    query : str
        Raw query text.
    mode : int
        Key into ``Weight_Modes`` selecting the document score matrix.
    qw : int, optional
        Weighting scheme for the query vector. When falsy (default 0) the
        query is weighted with `mode` as well.

    Prints one ``<name> : <score>`` line per result, best first; returns None.
    """
    matrix = Weight_Modes[mode]

    # Build the query vector once, with whichever scheme applies
    query_vec = tf_idf_query(query, qw if qw else mode)

    scores = matrix.dot(query_vec)
    ranked = sorted(((scores[i][0], i) for i in range(len(scores))), reverse=True)[:5]

    for score, doc_id in ranked:
        print(Docs[doc_id]['name']+' : '+str(score))

### Cosine Similarity
---

In [15]:
# Function to handle Cosine Similarity Query

def cosine_similarity_scoring(query,mode,qw=0): 
    """Print the five documents most similar to the query by cosine similarity.

    Parameters
    ----------
    query : str
        Raw query text.
    mode : int
        Key into ``Weight_Modes`` selecting the document score matrix.
    qw : int, optional
        Weighting scheme for the query vector. When falsy (default 0) the
        query is weighted with `mode` as well.

    A document whose vector has zero norm, or any document when the query
    vector has zero norm, gets similarity 0.0 instead of NaN.
    Prints one ``<name> : <score>`` line per result, best first; returns None.
    """
    matrix = Weight_Modes[mode]

    # Build the query vector once, with whichever scheme applies
    query_vec = tf_idf_query(query, qw if qw else mode)

    dots = np.asarray(matrix.dot(query_vec), dtype=float)[:, 0]
    query_norm = np.linalg.norm(query_vec)
    doc_norms = np.array([np.linalg.norm(matrix[i]) for i in range(len(matrix))], dtype=float)

    # Only divide where both norms are non-zero; everything else stays 0.0
    cosines = np.zeros(len(matrix))
    if query_norm > 0:
        comparable = doc_norms > 0
        cosines[comparable] = dots[comparable]/query_norm/doc_norms[comparable]

    ranked = sorted(((cosines[i], i) for i in range(len(cosines))), reverse=True)[:5]

    for score, doc_id in ranked:
        print(Docs[doc_id]['name']+' : '+str(score))

### Jaccard Coefficient
---

In [16]:
# Function to calculate Jaccard Coefficient

def jaccard_coeff(doc,query):
    """Jaccard coefficient between the word sets of a document and a query.

    Both arguments are whitespace-separated strings. Splitting on any run of
    whitespace keeps empty strings out of the word sets, so repeated spaces
    or an empty string no longer count as a shared term.

    Returns ``|doc ∩ query| / |doc ∪ query|`` as a float, or 0.0 when both
    strings contain no words.
    """
    query_terms = set(query.split())
    doc_terms = set(doc.split())

    union = query_terms | doc_terms
    if not union:
        # Neither side has any word; define the similarity as 0
        return 0.0

    return len(query_terms & doc_terms)/len(union)

In [17]:
# Function to handle Jaccard Coefficient Query

def jaccard_scoring(query):
    """Print the five documents with the highest Jaccard coefficient.

    The query is cleaned with ``clean`` and compared against the
    ``'cleaned_text'`` of every document. Documents are addressed by integer
    position (``range(len(Docs))``), the same way the TF-IDF code indexes
    ``Docs``.

    Prints one ``<name> : <score>`` line per result, best first; returns None.
    """
    query = clean(query)

    scores = [(jaccard_coeff(Data[i]['cleaned_text'],query), i) for i in range(len(Docs))]

    for score, doc_id in sorted(scores,reverse=True)[:5]:
        print(Docs[doc_id]['name']+' : '+str(score))

### Driver

In [18]:
# Testing

if Flag == 1:

    # Fixed sample query run through every scoring / weighting combination
    query = "The crab and the heron"
    divider = "-"*75

    # TF-IDF and cosine similarity are reported once per weighting scheme
    for heading, scorer in (("TF-IDF", tf_idf_scoring),
                            ("Cosine Similarity", cosine_similarity_scoring)):
        print(divider)
        print(heading)
        print(divider)

        for i in range(1,6):
            print('>> '+Weight_Names[i])
            scorer(query,i)
            print(divider)

    # The Jaccard coefficient has no weighting scheme
    print(divider)
    print("Jaccard Coefficient")
    print(divider)

    jaccard_scoring(query)

---------------------------------------------------------------------------
TF-IDF
---------------------------------------------------------------------------
>> Binary
crabhern.txt : 9.518091493689194
aesop11.txt : 9.518091493689194
timem.hac : 3.900514378261692
long1-3.txt : 3.900514378261692
fgoose.txt : 3.900514378261692
---------------------------------------------------------------------------
>> Raw Count
crabhern.txt : 89.56333782146444
aesop11.txt : 29.020663384997654
timem.hac : 7.801028756523384
long1-3.txt : 7.801028756523384
fgoose.txt : 3.900514378261692
---------------------------------------------------------------------------
>> Term Frequency
crabhern.txt : 0.2344589995326294
aesop11.txt : 0.0007679049371559497
long1-3.txt : 0.0006011890225434175
fgoose.txt : 0.00028828635463870595
timem.hac : 0.00025341179692448624
---------------------------------------------------------------------------
>> Log Normalization
crabhern.txt : 6.709348811055275
aesop11.txt : 3.45698580

### User Interface

In [19]:
# Query System

if Flag == 2:

    divider = "-"*75

    print(divider)
    print("Choose a Scoring Scheme:-")
    print(divider)
    print("1: TF-IDF")
    print("2: Cosine Similarity")
    print("3: Jaccard Coefficient")

    n = int(input("Choice:"))

    # The weighting scheme only applies to TF-IDF and cosine similarity
    m = None
    if n == 1 or n == 2:
        print(divider)
        print("Choose a Weighting Scheme:-")
        print(divider)
        print("1: Binary")
        print("2: Raw Count")
        print("3: Term Frequency")
        print("4: Log Normalization")
        print("5: Double Normalization")
        print(divider)
        m = int(input("Choice:"))

    print(divider)

    if n not in (1, 2, 3):
        # Report the bad choice instead of asking for a query and doing nothing
        print("Invalid scoring scheme: "+str(n))
    elif m is not None and m not in Weight_Modes:
        # Report the bad choice instead of failing later with a KeyError
        print("Invalid weighting scheme: "+str(m))
    else:
        q = input("Enter Query: ")
        print(divider)

        if n == 1:
            tf_idf_scoring(q,m)
        elif n == 2:
            cosine_similarity_scoring(q,m)
        else:
            jaccard_scoring(q)

    print(divider)

---