## TF-IDF Scoring

In [1]:
import re
import time
import json
import pickle
import textdistance
import numpy as np

from math import log
from tqdm import tqdm

from collections import OrderedDict
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [3]:
# Load the pre-built artefacts from ../Dumps. Every file is opened in a `with`
# block so its handle is closed as soon as the payload has been read.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load dumps that were produced by this project.
with open('../Dumps/index.pkl', 'rb') as fh:
    Index = pickle.load(fh)  # used below as word -> {doc: count}
with open('../Dumps/docs.pkl', 'rb') as fh:
    Docs = pickle.load(fh)  # Docs[doc][1] is used below as the tf denominator
with open('../Dumps/updated_Docs.pkl', 'rb') as fh:
    company = pickle.load(fh)
with open('../Dumps/company_index.pkl', 'rb') as fh:
    company_index = pickle.load(fh)
# Fixed column order for the TF-IDF matrix: one column per indexed word.
Vocabulary = list(Index.keys())
with open('../Dumps/corpus.json', 'r') as fh:
    sentences = json.load(fh)

In [15]:
# Peek at the list of sentences stored under key '0' of the corpus.
sentences['0']

['Inline XBRL document created by Certent Disclosure Management 1.0.0.0',
 'Created on: 02/05/2021 21:28:37 PM',
 'SECURITIES AND EXCHANGE COMMISSION',
 'ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE',
 'SECURITIES EXCHANGE ACT OF 1934',
 'For the fiscal year ended ',
 '     TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE',
 'SECURITIES EXCHANGE ACT OF 1934',
 'For the transition period from __________ to ____________',
 '(Exact name of registrant as specified in its charter)',
 '(State or other jurisdiction of',
 '(Address of principal executive offices)',
 '(Registrant’s telephone number, including area code)',
 'Securities registered pursuant to Section 12(b) of the Act:',
 'Title of each Class',
 'Name of each exchange on which registered',
 'Common Stock (Par Value $2.50 per share)',
 'New York Stock Exchange',
 'Indicate by check mark if the registrant is a well-known seasoned issuer, as defined in Rule 405 of the Securities Act',
 'Indicate by check mark if the r

In [None]:
# Lazily-built (lemmatizer, stop-word set) pair shared by every clean() call.
_CLEAN_RESOURCES = None


def _clean_resources():
    """Return the cached (WordNetLemmatizer, English stop-word set) pair."""
    global _CLEAN_RESOURCES
    if _CLEAN_RESOURCES is None:
        # Built on first use only: reloading the stop-word list and creating a
        # new lemmatizer on every call is wasted work when clean() is invoked
        # once per sentence or per document.
        _CLEAN_RESOURCES = (
            WordNetLemmatizer(),
            frozenset(stopwords.words("english")),
        )
    return _CLEAN_RESOURCES


def clean(text):
    """Normalise *text* into a space-separated string of lemmatised words.

    Steps: lower-case and strip the text, delete every character that is not
    a-z or whitespace, tokenise with ``word_tokenize``, drop English stop
    words, lemmatise each remaining token and join the result with single
    spaces.

    Args:
        text: Raw query, sentence or document text.

    Returns:
        The cleaned text; an empty string when nothing survives filtering.
    """
    lemmatizer, stop_words = _clean_resources()

    text = text.lower().strip()
    text = re.sub(r'[^a-z\s]', '', text)

    # One tokenise-and-filter pass. The previous version re-tokenised and
    # re-filtered its own output, which only saw words that were already
    # tokenised and already stop-word free.
    tokens = [w for w in word_tokenize(text) if w not in stop_words]
    return ' '.join(lemmatizer.lemmatize(w) for w in tokens)

In [None]:
def TF_IDF_Score_doc(query,doc):
    """Sum the TF-IDF weights of the cleaned query terms for one document.

    A term contributes only when it is present in ``Index`` and ``doc`` is one
    of its postings. Its weight is ``(count / Docs[int(doc)][1])`` multiplied
    by ``log10(1 + len(Docs) / len(Index[term]))``.

    Args:
        query: Raw query text; it is normalised with ``clean``.
        doc: Document key as stored in the postings of ``Index``.

    Returns:
        The accumulated score, or 0 when no query term matches the document.
    """
    total = 0
    for term in clean(query).split():
        # Guard clauses: skip terms that are unknown or absent from this doc.
        if term not in Index:
            continue
        postings = Index[term]
        if doc not in postings:
            continue
        term_frequency = postings[doc] / Docs[int(doc)][1]
        inverse_doc_frequency = log(1 + (len(Docs) / len(postings)), 10)
        total += term_frequency * inverse_doc_frequency
    return total

In [None]:
def tf_idf(word,doc):
    """Return the TF-IDF weight of ``word`` in document ``doc``.

    Args:
        word: A term; ``Index[word]`` is subscripted unconditionally, so the
            term must be present in ``Index``.
        doc: Document key, used to look up both the postings of ``word`` and
            ``Docs``.

    Returns:
        ``tf * idf`` where tf is ``count / Docs[doc][1]`` (0 when the document
        is not in the word's postings) and idf is
        ``log10(len(Docs) / len(Index[word]) + 1)``.
    """
    postings = Index[word]
    term_frequency = postings[doc] / Docs[doc][1] if doc in postings else 0
    inverse_doc_frequency = log((len(Docs) / len(postings) + 1), 10)
    return term_frequency * inverse_doc_frequency

In [None]:
# Build the dense document-by-term TF-IDF matrix: row x is document x and
# column y is Vocabulary[y]. Each non-zero cell holds
# (count / Docs[x][1]) * log10(len(Docs) / len(Index[word]) + 1).
doc_count = len(Docs)

# IDF depends only on the term, so compute it once per vocabulary word rather
# than once per (document, word) cell.
idf_by_column = [
    log((doc_count / len(Index[word]) + 1), 10) for word in Vocabulary
]

# Allocate the matrix directly as a float array; cells whose document is not
# in the word's postings keep their initial 0.0.
matrix = np.zeros((doc_count, len(Vocabulary)))
for x in tqdm(range(doc_count)):
    for y, word in enumerate(Vocabulary):
        postings = Index[word]
        if x in postings:
            matrix[x, y] = (postings[x] / Docs[x][1]) * idf_by_column[y]

In [None]:
# Persist the TF-IDF matrix as a NumPy array. The `with` block guarantees the
# file is flushed and closed even if pickling raises.
with open('../Dumps/matrix.pkl', 'wb') as fh:
    pickle.dump(np.array(matrix), fh)

In [None]:
# Reload the previously dumped TF-IDF matrix; the handle is closed as soon as
# the array has been read.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted dumps.
with open('../Dumps/matrix.pkl', 'rb') as fh:
    matrix = pickle.load(fh)

In [None]:
def tf_idf_scoring(query):
    """Rank documents against ``query`` using the precomputed TF-IDF matrix.

    The query becomes a column vector over ``Vocabulary`` whose entry for each
    known term is ``occurrences * log10(len(Docs) / len(Index[term]) + 1)``.
    Document scores are the dot product of ``matrix`` with that vector.

    Args:
        query: Raw query text; it is normalised with ``clean``.

    Returns:
        Up to five ``(score, row_index)`` tuples sorted by descending score,
        with ties broken by descending row index.
    """
    # Count the query terms the index knows about. Vocabulary is built from
    # Index's keys, so the dict lookup replaces an O(len(Vocabulary)) list
    # membership scan per query word.
    term_counts = {}
    for term in clean(query).split():
        if term in Index:
            term_counts[term] = term_counts.get(term, 0) + 1

    # Only the columns of query terms are non-zero, so compute IDF for those
    # alone instead of evaluating log() for every vocabulary column.
    query_vec = np.zeros((len(Vocabulary), 1))
    doc_count = len(Docs)
    for term, count in term_counts.items():
        column = Vocabulary.index(term)
        query_vec[column, 0] = count * log((doc_count / len(Index[term]) + 1), 10)

    scores = matrix.dot(query_vec)
    result = [(row[0], doc_row) for doc_row, row in enumerate(scores)]
    return sorted(result, reverse=True)[:5]

In [None]:
# Try the matrix-based scorer with a single-word query.
x = tf_idf_scoring('iPhone')

In [None]:
# Show the (score, row index) pairs returned by tf_idf_scoring('iPhone').
x

In [None]:
# Inspect the Docs entry for document 399. Elsewhere in this notebook element
# [0] is split on '.' and element [1] is used as the term-frequency denominator.
Docs[399]

In [None]:
# Display `company` (loaded from updated_Docs.pkl); the ranking cell indexes it
# with the integer parsed from the part of Docs[...][0] before the first '.'.
company

In [None]:
# Inspect entry 461 of company_index (loaded from company_index.pkl).
company_index[461]

In [None]:
def TF_IDF_Score_sentence(query,sentence):
    """Score one sentence against a query with sentence-level TF and corpus IDF.

    Both inputs are normalised with ``clean``. Each query term present in
    ``Index`` adds ``(occurrences in sentence / sentence length)`` multiplied
    by ``log10(1 + len(Docs) / len(Index[term]))``.

    Args:
        query: Raw query text.
        sentence: Raw sentence text.

    Returns:
        The accumulated score; 0 when the cleaned sentence has no words or no
        query term is in ``Index``.
    """
    query_terms = clean(query).split()
    sentence_terms = clean(sentence).split()

    sentence_length = len(sentence_terms)
    if sentence_length == 0:
        # Nothing left after cleaning: avoid dividing by zero below.
        return 0

    total = 0
    for term in query_terms:
        if term not in Index:
            continue
        term_frequency = sentence_terms.count(term) / sentence_length
        inverse_doc_frequency = log(1 + (len(Docs) / len(Index[term])), 10)
        total += term_frequency * inverse_doc_frequency
    return total

In [None]:
def cosine_similarity(query,sentence):
    """Return textdistance's normalised cosine similarity of the two inputs.

    Args:
        query: First sequence, passed through unchanged.
        sentence: Second sequence, passed through unchanged.

    Returns:
        Whatever ``textdistance.cosine.normalized_similarity`` returns for the
        pair.
    """
    measure = textdistance.cosine
    similarity = measure.normalized_similarity(query, sentence)
    return similarity

In [None]:
# Finding Best Doc: score every document against the user's query, then print
# the `company` entries of the ten highest-scoring ones.

Query = input("Enter Your Query: ")

# NOTE(review): only ids in range(len(Docs)) that are also keys of Docs are
# scored; confirm Docs has no ids at or beyond len(Docs).
Ranking = [
    (TF_IDF_Score_doc(Query, doc_id), doc_id)
    for doc_id in range(len(Docs))
    if doc_id in Docs
]

# Highest score first; equal scores are ordered by descending document id.
Ranking.sort(reverse=True)

# Docs[doc_id][0] is split on '.' and the leading part is parsed as an integer
# that is then used to index `company`.
Best_Docs = [int(Docs[doc_id][0].split('.')[0]) for _, doc_id in Ranking]
for company_id in Best_Docs[:10]:
    print(company[company_id])

In [None]:
# Reload the sentence corpus. errors='ignore' silently drops bytes that are
# not valid UTF-8. The `with` block closes the handle once parsing is done.
with open('../Dumps/corpus.json', 'r', encoding='utf-8', errors='ignore') as fh:
    Data = json.load(fh)

In [None]:
# Finding Best Sentence: score each sentence of the top-ranked document
# against the query and show the ten best (score, sentence) pairs.

top_doc_sentences = Data[str(Best_Docs[0])]
Best = [
    (TF_IDF_Score_sentence(Query, sentence), sentence)
    for sentence in top_doc_sentences
]
# Alternative scorer: use cosine_similarity(Query, sentence) as the first
# tuple element instead of TF_IDF_Score_sentence.

# Highest score first; equal scores fall back to comparing the sentence text.
Best.sort(reverse=True)
Best[:10]