# Probabilistic Model

### Preprocessing

In [None]:
# Sanity-check cell: prints a fixed message (presumably to confirm the kernel is executing cells).
print("kernal running...")


In [None]:
# accent removal
import sys
# reload(sys)
# sys.setdefaultencoding('utf8')

In [1]:
import nltk
# nltk.download()

In [2]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize



In [3]:
import pandas as pd
import numpy as np




In [4]:
import os,math,re


In [5]:
def get_corpus(fname):
    """Read a file and return its contents as text.

    The file is read as raw bytes and decoded with the default codec of
    ``bytes.decode`` (UTF-8). Bytes that cannot be decoded are replaced with
    U+FFFD instead of raising, so arbitrary input files never abort the run.

    Args:
        fname: Path of the file to read.

    Returns:
        str: The decoded contents of the file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() fails; the
    # previous version left the file open until garbage collection.
    with open(fname, "rb") as myf:
        return myf.read().decode(errors='replace')

In [6]:
def get_word_tokens(sentence):
    """Lower-case the text and split it into tokens with NLTK's word tokenizer."""
    lowered = sentence.lower()
    tokens = nltk.word_tokenize(lowered)
    return tokens

In [7]:
def sw_remove(word_tokens):
    """Return the tokens that are not in NLTK's English stop-word list, in order."""
    english_stops = set(stopwords.words('english'))
    kept = []
    for token in word_tokens:
        if token in english_stops:
            continue
        kept.append(token)
    return kept

In [8]:
def stem_tokens(new_tokens):
    """Return the Porter stem of every token, preserving order."""
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in new_tokens]

In [9]:
# Data-Preprocessing
# Runs the whole pipeline (read -> tokenize -> stop-word removal -> stemming)
# on one sample document and prints size statistics after each stage.

filename = "mydata/Text1.txt"
text = get_corpus(filename)

print("\n\n----stats---------")
# Number of characters in the decoded text.
print("text: ", len(text))

# Token count before any filtering.
tokens = get_word_tokens(text)
print("tokens: ", len(tokens))

# `tokens` is rebound in place: first stop words are dropped, then each
# remaining token is stemmed.
tokens = sw_remove(tokens)
tokens = stem_tokens(tokens)

print("tokens after preprocessing:", len(tokens))
print("------------------")

# Preview of the first 15 processed tokens.
print(pd.DataFrame(tokens).head(15))



----stats---------
text:  29387
tokens:  3853
tokens after preprocessing: 2701
------------------
              0
0       nebular
1        region
2         south
3          zeta
4        orioni
5            37
6         photo
7             :
8         mount
9        wilson
10  observatori
11            .
12         star
13      cluster
14       hercul


### Inverted Index to get Indexes of tokens

In [14]:
# Paths of the documents that make up the collection to index.
files = [r'mydata/Inverted Index/T1.txt',
         r'mydata/Inverted Index/T2.txt',
         r'mydata/Inverted Index/T3.txt',
         r'mydata/Inverted Index/T4.txt',
         r'mydata/Inverted Index/T5.txt',
         r'mydata/Inverted Index/T6.txt',
         r'mydata/Inverted Index/T7.txt',
         r'mydata/Inverted Index/T8.txt',
         r'mydata/Inverted Index/T9.txt',
         r'mydata/Inverted Index/T10.txt',]
# Bare file names, used as document identifiers in the inverted index and in
# the ranking output. Derived from `files` so the two lists cannot drift apart
# when a document is added or removed.
xfiles = [os.path.basename(path) for path in files]
print(xfiles)

['T1.txt', 'T2.txt', 'T3.txt', 'T4.txt', 'T5.txt', 'T6.txt', 'T7.txt', 'T8.txt', 'T9.txt', 'T10.txt']


In [15]:
# Helper functions for inv_ind()
class CreateInvDict:
    """Accumulator that maps each key to the list of values recorded for it."""

    def __init__(self):
        # key -> values, in the order they were added
        self.myd = dict()

    def checkf(self, x, i):
        """Append ``x`` to the list stored under key ``i``, creating it on first use."""
        self.myd.setdefault(i, []).append(x)
 
def freq_list(str,word):
    """Return the indices at which ``word`` is found in the sequence ``str``.

    As many positions are collected as ``str.count(word)`` reports; each
    search resumes one position after the previous hit.
    """
    positions = []
    cursor = 0
    for _ in range(str.count(word)):
        cursor = str.index(word, cursor)
        positions.append(cursor)
        cursor += 1  # continue just past the hit that was recorded
    return positions
 
def freq_count(text, word):
    """Return how many times ``word`` occurs in ``text`` according to ``text.count``."""
    occurrences = text.count(word)
    return occurrences
 
#Inverted index, return dictionary
def inv_ind(stemmed_docs, doc_sizes, n, names=None):
    """Build an inverted index over documents stored back to back in one list.

    Args:
        stemmed_docs: Tokens of all documents, concatenated in document order.
        doc_sizes: ``doc_sizes[j]`` is the number of tokens of document ``j``.
        n: Number of documents to index (the first ``n`` entries of
            ``doc_sizes``).
        names: Optional document identifiers, one per document. Defaults to
            the module-level ``xfiles`` list, which is what the function
            always used before this parameter existed.

    Returns:
        dict: token -> list of ``(document name, occurrences)`` tuples, one
        tuple per document containing the token, in document order.
    """
    doc_names = xfiles if names is None else names

    # Count every document's tokens exactly once. The previous version
    # re-sliced and linearly scanned each document twice for every unique
    # token, which made index construction needlessly slow on real corpora.
    doc_counts = []
    start = 0
    for j in range(n):
        size = doc_sizes[j]
        counts = {}
        for tok in stemmed_docs[start:start + size]:
            counts[tok] = counts.get(tok, 0) + 1
        doc_counts.append(counts)
        start += size

    inv_table = {}
    for tok in set(stemmed_docs):
        for j, counts in enumerate(doc_counts):
            if tok in counts:
                inv_table.setdefault(tok, []).append((doc_names[j], counts[tok]))
    return inv_table

In [16]:
# Build the inverted index over every document in `files` and save it as CSV.
n = len(files)
doc_sizes = []   # processed-token count of each document, in `files` order
stemmed = []     # processed tokens of all documents, concatenated
for i in files:
    text = get_corpus(i)
    tokens = get_word_tokens(text)
    tokens = sw_remove(tokens)
    tokens = stem_tokens(tokens)
    doc_sizes.append(len(tokens))
    stemmed.extend(tokens)

table = inv_ind(stemmed,doc_sizes,n)
df = pd.DataFrame(table.items(), columns=['Tokens','Occurences'])
# Remove a stale index from an earlier run. Only "file does not exist" is
# expected here; a bare `except:` would also hide real problems (and even
# swallow KeyboardInterrupt).
try:
    os.remove(r'Inverted.csv')
except FileNotFoundError:
    pass
df.to_csv(r'Inverted.csv')

### Model

In [22]:
# Path of the inverted-index CSV that the model cells read back with pandas.
inv_file = r'Inverted.csv'

In [23]:
def get_relevance(n,nw):
  """Return the smoothed ratio (n - nw + 0.5) / (nw + 0.5).

  The caller passes the collection size as ``n`` and the number of documents
  containing the token as ``nw``.
  """
  numerator = n - nw + 0.5
  denominator = nw + 0.5
  return numerator / denominator

In [24]:
def get_prob_matrix(n, df, toks):
  """Map each token to ``[document frequency, relevance weight]``.

  ``df`` must be indexed by token and hold the postings in its 'Occurences'
  column. The document frequency is the number of ')' characters in that
  cell, i.e. one per ``(file, count)`` tuple in the stored text.
  """
  matrix = {}
  for token in toks:
    postings = df.loc[token, 'Occurences']
    doc_freq = postings.count(')')
    matrix[token] = [doc_freq, get_relevance(n, doc_freq)]
  return matrix

In [25]:
def get_query_tokens(query):
  """Run a query through the document pipeline: tokenize, drop stop words, stem."""
  raw_tokens = get_word_tokens(query.lower())
  return stem_tokens(sw_remove(raw_tokens))

In [26]:
def get_cond_probability(qtok, inv_file):
  """Score every document in ``xfiles`` against the query tokens.

  A document's score is the product of the relevance weights of the query
  tokens it contains. A document containing none of them scores 0. Query
  tokens missing from the index are ignored.

  Args:
    qtok: Pre-processed (stop-word filtered, stemmed) query tokens.
    inv_file: Path of the inverted-index CSV with 'Tokens' and 'Occurences'
      columns.

  Returns:
    dict: file name -> score, in ``xfiles`` order.
  """
  # keep_default_na=False: with pandas' default NA parsing, tokens such as
  # "nan" or "null" are read back as NaN and can no longer be looked up.
  df = pd.read_csv(inv_file, keep_default_na=False).set_index('Tokens')
  toks = list(df.index)
  word_matrix = get_prob_matrix(len(xfiles), df, toks)

  # Resolve each query token once instead of once per document. A set makes
  # the membership test O(1). Duplicates in qtok are kept on purpose, so a
  # repeated query token still multiplies its weight in again.
  known = set(toks)
  matched = [(word_matrix[t][1], df.loc[t, 'Occurences']) for t in qtok if t in known]

  prob_matrix = {}
  for fname in xfiles:
    # The postings cell is the text of a list of (file, count) tuples, so a
    # file name appears in it quoted. Matching the quoted form avoids the
    # false positives of a raw substring test (e.g. '1.txt' inside 'T1.txt').
    needle = repr(fname)
    flag = False
    val = 1
    for relevance, postings in matched:
      if needle in postings:
        flag = True
        val *= relevance   # tokens are combined as AND, so weights multiply
    # val stays 1 when no query token hit this document; report that as 0.
    prob_matrix[fname] = val if flag else 0
  return prob_matrix

In [27]:
# Interactive query cell: reads one query from stdin, scores every document and
# prints the ranking.
print("Enter the query, spaces are considered as |and| because of tokensizing and searching the relavant words in document, because of and the probability of each query token is multiplied")
vect = get_cond_probability(get_query_tokens(input()),inv_file)
# Sort by score, highest first, then render each score as a fixed-point string
# with 40 decimals. After this line the values in `vect` are strings, not floats.
vect = {k: "{0:.40f}".format(v) for k, v in sorted(vect.items(), key=lambda item: item[1], reverse=True)}
print(pd.DataFrame(vect.items(),columns=['File','Relevance']))

Enter the query, spaces are considered as |and| because of tokensizing and searching the relavant words in document, because of and the probability of each query token is multiplied
moon sun
      File                                   Relevance
0   T2.txt  0.2941176470588235392078502172807930037379
1   T6.txt  0.2941176470588235392078502172807930037379
2   T1.txt  0.2036199095022624416806422686931909993291
3   T3.txt  0.2036199095022624416806422686931909993291
4   T4.txt  0.2036199095022624416806422686931909993291
5   T5.txt  0.2036199095022624416806422686931909993291
6   T9.txt  0.2036199095022624416806422686931909993291
7  T10.txt  0.2036199095022624416806422686931909993291
8   T7.txt  0.0000000000000000000000000000000000000000
9   T8.txt  0.0000000000000000000000000000000000000000
