# Information retrival
this notebook is based on the Scott's great [tutorial](https://towardsdatascience.com/tf-idf-for-document-ranking-from-scratch-in-python-on-real-world-dataset-796d339a4089). 

In [1]:
import os
import pprint
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string
import numpy as np
import num2words
import pandas as pd
from collections import Counter

## Loading the files

In [2]:
folders = [x[0] for x in os.walk(str(os.getcwd())+ '/stories/')]
dataset = []
for folder in folders:
    text = ""
    with open(folder + '/index.html', 'r') as file:
        text = file.read().strip()
    urls = re.findall(r'><A HREF="(.*)">.*</A> ', text)
    titles = re.findall(r'<BR><TD> (.*)\n', text)
    assert len(urls) == len(titles)
    dataset += list(zip(titles,  [folder + '\\' + u for u in urls] ))

In [3]:
print('Total items in the dataset:', len(dataset))

Total items in the dataset: 467


## Preprocessing
1. Convert to lowercase
2. Remove punctuations
3. Remove stopwords
4. Convert numbers into words
5. Remove signle charachters
6. Stemming/Lemmitzing 

In [4]:
def to_lower_case(text):
    return str(np.char.lower(text))

In [5]:
def remove_punctuations(text):
    for punc in string.punctuation:            
        text = str(np.char.replace(text, punc, ''))
    return text

In [6]:
def remove_stopwords(text_tokenized):
    stop_words = stopwords.words('english')
    return [word for word in text_tokenized if word not in stop_words]

In [7]:
def remove_single_char(text_tokenized):
    return [word for word in text_tokenized if len(word) > 1]

In [8]:
def num_2_str(text_tokenized):
    result = []
    for word in text_tokenized:
        try:
            value = num2words.num2words(float(word))
        except:
            value = word
        result.append(value)
    return result

In [9]:
def stemmify(text_tokenized):
    ps = PorterStemmer()
    return [ps.stem(word) for word in text_tokenized]

In [10]:
def preprocessing_pipleline(text):
    text = to_lower_case(text)
    text_tokenized = word_tokenize(text)
    text_tokenized = num_2_str(text_tokenized)
    text_tokenized = word_tokenize(remove_punctuations(" ".join(text_tokenized)))
    text_tokenized = stemmify(text_tokenized)
    text_tokenized = remove_single_char(text_tokenized)
    text_tokenized = remove_stopwords(text_tokenized)
    text = remove_punctuations(" ".join(text_tokenized))
    return word_tokenize(text) 
    

## Reading the data

In [11]:
#stories_df = pd.DataFrame(columns=['title', 'body'])
titles = []
bodies = []
for title, fp in dataset:
    with open(fp, 'r', encoding='utf8', errors='ignore') as file:
        text = file.read().strip()
        titles.append(preprocessing_pipleline(title))
        bodies.append(preprocessing_pipleline(text))
    #stories_df.loc[len(stories_df)] = [title, text]

In [12]:
DF = {}
for i,body in enumerate(bodies):
    for word in body:
        if word in DF:
            DF[word].add(i)
        else:
            DF[word] = {i}

In [13]:
for i in DF:
    DF[i] = len(DF[i])

In [14]:
vocab = DF.keys()
N = len(vocab)

In [15]:
print('Total vocabulary:', N)

Total vocabulary: 38847


## TF-IDF

In [43]:
DF['']

136

In [45]:
tf_idf_bag = {} # word -> docs -> score
alpha = 0.6
for idx, (title, body) in enumerate(zip(titles, bodies)):
    word_count = Counter(title + body)
    total_words = sum(word_count.values())
    
    for word in np.unique(body + title):
        tf = word_count[word] / total_words
        df = DF.get(word, 0)
        idf = np.log(N / (df + 1) )
        tf_idf_score = tf * idf
        word_docs = tf_idf_bag.get(word, dict())
        if word in title:
            word_docs[idx] = tf_idf_score * alpha
        else:
            word_docs[idx] = tf_idf_score * (1 - alpha)
        tf_idf_bag[word] = word_docs

            
        

In [59]:
def matching_score(qry):
    collected_weights = Counter()
    collected_words = set()
    ppsed_qry = preprocessing_pipleline(qry)
    for word in ppsed_qry:
        if word in tf_idf_bag:
            collected_words.add(word)
            available_docs_ids =  tf_idf_bag.get(word, {})
            collected_weights += Counter(available_docs_ids)
    
    return sorted(collected_weights.items(), key=lambda x:x[1], reverse=True)

In [74]:
qry = """twitter"""
dict(matching_score(qry))

{212: 0.002232097342053498,
 320: 0.0010257556197122708,
 394: 0.0006081097502155977,
 310: 0.00017746052557364558,
 26: 0.00016731795611433966,
 225: 0.00011716747845862313}

In [75]:
dataset[212]

('The Great Learning: The Text of Confucious',
 'E:\\Programming\\workbench\\python\\nlp\\stories/stories/\\greatlrn.leg')

In [49]:
Counter({'a': 1, 'b':0, 'c':3, 'd':94}) + Counter({}) 

Counter({'a': 1, 'c': 3, 'd': 94})