# Information retrival
this notebook is based on the Scott's great [tutorial](https://towardsdatascience.com/tf-idf-for-document-ranking-from-scratch-in-python-on-real-world-dataset-796d339a4089). 

In [1]:
import os
import pprint
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string
import numpy as np
import num2words
import pandas as pd
from collections import Counter

## Loading the files

In [2]:
folders = [x[0] for x in os.walk(str(os.getcwd())+ '/stories/')]
dataset = []
for folder in folders:
    text = ""
    with open(folder + '/index.html', 'r') as file:
        text = file.read().strip()
    urls = re.findall(r'><A HREF="(.*)">.*</A> ', text)
    titles = re.findall(r'<BR><TD> (.*)\n', text)
    assert len(urls) == len(titles)
    dataset += list(zip(titles,  [folder + '\\' + u for u in urls] ))

In [3]:
print('Total items in the dataset:', len(dataset))

Total items in the dataset: 467


## Preprocessing
1. Convert to lowercase
2. Remove punctuations
3. Remove stopwords
4. Convert numbers into words
5. Remove signle charachters
6. Stemming/Lemmitzing 

In [4]:
def to_lower_case(text):
    return str(np.char.lower(text))

In [5]:
def remove_punctuations(text):
    for punc in string.punctuation:            
        text = str(np.char.replace(text, punc, ''))
    return text

In [6]:
def remove_stopwords(text_tokenized):
    stop_words = stopwords.words('english')
    return [word for word in text_tokenized if word not in stop_words]

In [7]:
def remove_single_char(text_tokenized):
    return [word for word in text_tokenized if len(word) > 1]

In [8]:
def num_2_str(text_tokenized):
    result = []
    for word in text_tokenized:
        try:
            value = num2words.num2words(float(word))
        except:
            value = word
        result.append(value)
    return result

In [9]:
def stemmify(text_tokenized):
    ps = PorterStemmer()
    return [ps.stem(word) for word in text_tokenized]

In [10]:
def preprocessing_pipleline(text):
    text = to_lower_case(text)
    text_tokenized = word_tokenize(text)
    text_tokenized = num_2_str(text_tokenized)
    text_tokenized = word_tokenize(remove_punctuations(" ".join(text_tokenized)))
    text_tokenized = stemmify(text_tokenized)
    text_tokenized = remove_single_char(text_tokenized)
    text_tokenized = remove_stopwords(text_tokenized)
    text = remove_punctuations(" ".join(text_tokenized))
    return word_tokenize(text) 
    

## Reading the data

In [11]:
#stories_df = pd.DataFrame(columns=['title', 'body'])
titles = []
bodies = []
for title, fp in dataset:
    with open(fp, 'r', encoding='utf8', errors='ignore') as file:
        text = file.read().strip()
        titles.append(preprocessing_pipleline(title))
        bodies.append(preprocessing_pipleline(text))
    #stories_df.loc[len(stories_df)] = [title, text]

In [12]:
DF = {}
for i,body in enumerate(bodies):
    for word in body:
        if word in DF:
            DF[word].add(i)
        else:
            DF[word] = {i}

In [13]:
for i in DF:
    DF[i] = len(DF[i])

In [37]:
vocab = list(DF.keys())
N = len(vocab)

In [47]:
vocab[0:-1:2500]

['thi',
 'scan',
 'calmli',
 'samo',
 'leybu\x17',
 'jersey',
 'alvin',
 'prep',
 'connoisseur',
 'groeg',
 'fibe',
 'megabang',
 'pansi',
 'bouillabaiss',
 'kirk\x00',
 '9d618']

In [15]:
print('Total vocabulary:', N)

Total vocabulary: 38847


## TF-IDF

In [22]:
DF['one']

443

In [23]:
tf_idf_bag = {} # word -> docs -> score
alpha = 0.6
for idx, (title, body) in enumerate(zip(titles, bodies)):
    word_count = Counter(title + body)
    total_words = sum(word_count.values())
    
    for word in np.unique(body + title):
        tf = word_count[word] / total_words
        df = DF.get(word, 0)
        idf = np.log(N / (df + 1) )
        tf_idf_score = tf * idf
        word_docs = tf_idf_bag.get(word, dict())
        if word in title:
            word_docs[idx] = tf_idf_score * alpha
        else:
            word_docs[idx] = tf_idf_score * (1 - alpha)
        tf_idf_bag[word] = word_docs

            
        

In [24]:
def matching_score(qry):
    collected_weights = Counter()
    collected_words = set()
    ppsed_qry = preprocessing_pipleline(qry)
    for word in ppsed_qry:
        if word in tf_idf_bag:
            collected_words.add(word)
            available_docs_ids =  tf_idf_bag.get(word, {})
            collected_weights += Counter(available_docs_ids)
    
    return sorted(collected_weights.items(), key=lambda x:x[1], reverse=True)

In [30]:
qry = """i never dreamed before"""
dict(matching_score(qry))

{317: 0.2568779007093256,
 292: 0.10946345606801229,
 339: 0.07928447569670667,
 325: 0.07444006869152872,
 293: 0.0490823519675465,
 415: 0.04233596939864691,
 437: 0.039798066037310606,
 128: 0.03938245209110707,
 397: 0.03837874938579918,
 108: 0.03594763806073828,
 327: 0.03537396907829314,
 51: 0.03501682541026412,
 116: 0.03496109793852692,
 192: 0.03281771337158144,
 344: 0.03209875662607545,
 161: 0.03167997573607129,
 261: 0.031196764290793672,
 453: 0.030430795943521695,
 288: 0.029661592015634813,
 201: 0.029093431758534245,
 146: 0.028445460179991413,
 167: 0.02829555652958431,
 198: 0.027947483695013573,
 186: 0.02738282555778948,
 3: 0.027278174464750695,
 204: 0.026985788027008507,
 409: 0.026325423406821626,
 250: 0.026222078448415252,
 32: 0.026181165906937157,
 423: 0.024699506151410495,
 401: 0.02469135125082727,
 217: 0.02371901147200545,
 125: 0.023661166183955015,
 364: 0.022907025429087922,
 314: 0.02247744081915893,
 257: 0.022161047868689776,
 213: 0.0221370735

In [31]:
with open(dataset[317][1], 'r') as file:
    print(file.read())

                   by Cindy Joy
                      West Augusta, VA
A Peace Song

Last nite I had the strangest
 dream I never dreamed before.
I dreamed the world had all
 agreed To put an end to war.
I dreamed I saw a mighty room
 All filled with women + men, And
the paper they were signing said
 They'd never fight again.
  And when the paper was all
   signed and a million copies
   made.  They all joined hands
   and bowed their heads
  And grateful prayers were
   prayed.
  And the people in the
   streets below
   were dancing round and round
  While guns and swords
   and uniforms
  Were scattered on the ground.



## Cosine Similarity

In [48]:
D = np.zeros((len(dataset), len(vocab)))
for k,v in tf_idf_bag.items():
    ind = vocab.index(v)
    D[k][ind] = v

ValueError: {0: 0.002204071194682367} is not in list