# 1. PREPROCESSING THE DATA

## IMPORTING REQUIRED LIBRARIES

In [1]:
import pandas as pd
import numpy as np

In [2]:
dataframe = pd.read_csv("ratings.csv",index_col = "Unnamed: 0")

  mask |= (ar1 == a)


In [3]:
dataframe.shape

(1319968, 3)

## DROPPING THE RATING COLUMN

In [4]:
dataframe = dataframe.drop(['rating'], axis=1)

## TREATING SAME RESTAURANT AT DIFFERENT LOCATIONS AS ONE ENTITY TO REDUCE SIZE OF POSTING LIST IN LATER IMPLEMENTATIONS

In [5]:
dataframe['review'] = dataframe[['name','review']].groupby(['name'])['review'].transform(lambda x: ''.join(str(x)))

In [6]:
dataframe.shape

(1319968, 2)

## DELETING ALL DUPLICATE COLUMNS

In [7]:
dataframe = dataframe[['name','review']].drop_duplicates()

In [8]:
dataframe.shape

(7041, 2)

In [9]:
dataframe

Unnamed: 0,name,review
0,Jalsa,0 A beautiful place to dine inThe int...
12,Spice Elephant,12 Had been here for dinner with family ...
26,San Churro Cafe,26 Ambience is not that good enough and...
46,Addhuri Udupi Bhojana,46 Great food and proper Karnataka style...
81,Grand Village,81 Very good restaurant in neighbourhood...
...,...,...
1315206,Calcutta North Indian Meals,1315206 This center probably famous for nam...
1315268,Chime - Sheraton Grand Bengaluru Whitefield Ho...,1315268 Nice and friendly place and staff i...
1315289,The Nest - The Den Bengaluru,1315289 Great ambience looking nice good s...
1315306,Nawabs Empire,1315306 This place is not at all good We ha...


## MAKING ALL STRINGS LOWER CASE

In [10]:
dataframe['review'] = dataframe['review'].str.lower()

## REMOVING ERRONEOUS NUMBERS FROM THE REVIEWS

In [11]:
dataframe.review = dataframe.review.str.replace('\d+', '')

  dataframe.review = dataframe.review.str.replace('\d+', '')


## REMOVING STOP WORDS

In [12]:
import nltk.corpus
stop = nltk.corpus.stopwords.words('english')

In [13]:
dataframe['review'] = dataframe['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

## ONLY KEEPING WORDS PRESENT IN THE ENGLISH DICTIONARY

In [14]:
words = set(nltk.corpus.words.words())

In [15]:
dataframe['review'] = dataframe['review'].apply(lambda x: ' '.join([word for word in x.split() if word in (words)]))

In [16]:
dataframe

Unnamed: 0,name,review
0,Jalsa,beautiful place dine dinner family restaurant ...
12,Spice Elephant,dinner family turned ambience really nice staf...
26,San Churro Cafe,ambience good enough went quick bite first big...
46,Addhuri Udupi Bhojana,great food proper style full place half good f...
81,Grand Village,good restaurant buffet great service overwhelm...
...,...,...
1315206,Calcutta North Indian Meals,center probably famous north object
1315268,Chime - Sheraton Grand Bengaluru Whitefield Ho...,nice friendly place staff awesome service bad ...
1315289,The Nest - The Den Bengaluru,great ambience looking nice good selection nes...
1315306,Nawabs Empire,place good ordered negative review would object


## RESETING INDEX

In [17]:
dataframe.reset_index(drop=True, inplace = True)

In [18]:
import nltk

## LEMMETIZING WITHOUT POS INDEX

In [19]:
# import nltk

# w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
# lemmatizer = nltk.stem.WordNetLemmatizer()

# def lemmatize_text(text):
#     return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]


# dataframe['review'] = dataframe.review.apply(lemmatize_text)

In [20]:
dataframe

Unnamed: 0,name,review
0,Jalsa,beautiful place dine dinner family restaurant ...
1,Spice Elephant,dinner family turned ambience really nice staf...
2,San Churro Cafe,ambience good enough went quick bite first big...
3,Addhuri Udupi Bhojana,great food proper style full place half good f...
4,Grand Village,good restaurant buffet great service overwhelm...
...,...,...
7036,Calcutta North Indian Meals,center probably famous north object
7037,Chime - Sheraton Grand Bengaluru Whitefield Ho...,nice friendly place staff awesome service bad ...
7038,The Nest - The Den Bengaluru,great ambience looking nice good selection nes...
7039,Nawabs Empire,place good ordered negative review would object


## Words like friendly do not get converted so lemmetizing with POS tag

## LEMMETIZING WITH POS TAG

SIZE OF DICTIONARY BEFORE LEMMETIZATION

In [21]:
d = set()
for words in dataframe.review.str.findall(r"\w+").map(set):
    for word in words:
        d.add(word)
print(len(d))

4889


In [22]:
# WORDNET LEMMATIZER (with appropriate pos tags)

import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet

lemmatizer = WordNetLemmatizer()

def pos_tagger(nltk_tag):
	if nltk_tag.startswith('J'):
		return wordnet.ADJ
	elif nltk_tag.startswith('V'):
		return wordnet.VERB
	elif nltk_tag.startswith('N'):
		return wordnet.NOUN
	elif nltk_tag.startswith('R'):
		return wordnet.ADV
	else:		
		return None

def pos_tagged(sentence):
    return nltk.pos_tag(nltk.word_tokenize(sentence))

def wordnet_tagged(sentence):
    return list(map(lambda x: (x[0], pos_tagger(x[1])), sentence))

def lem(sentence):
    lemmatized_sentence = []
    for word, tag in sentence:
        if tag is None:
            # if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:	
            # else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)


def final(sentence):
    sentence = pos_tagged(sentence)
    sentence = wordnet_tagged(sentence)
    sentence = lem(sentence)
    return sentence
    
dataframe['review'] = dataframe['review'].apply(final)


[nltk_data] Error loading averaged_perceptron_tagger: <urlopen error
[nltk_data]     [WinError 10054] An existing connection was forcibly
[nltk_data]     closed by the remote host>


In [23]:
dataframe

Unnamed: 0,name,review
0,Jalsa,beautiful place dine dinner family restaurant ...
1,Spice Elephant,dinner family turn ambience really nice staff ...
2,San Churro Cafe,ambience good enough go quick bite first big t...
3,Addhuri Udupi Bhojana,great food proper style full place half good f...
4,Grand Village,good restaurant buffet great service overwhelm...
...,...,...
7036,Calcutta North Indian Meals,center probably famous north object
7037,Chime - Sheraton Grand Bengaluru Whitefield Ho...,nice friendly place staff awesome service bad ...
7038,The Nest - The Den Bengaluru,great ambience look nice good selection nest o...
7039,Nawabs Empire,place good order negative review would object


In [24]:
#estimating the size of the dictionary AFTER LEMMETIZATION

d = set()
for words in dataframe.review.str.findall(r"\w+").map(set):
    for word in words:
        d.add(word)
print(len(d))

4633


In [25]:
#4889 to 4633 is the length of the dictionary after lemmetization

In [26]:
#trying to compress posting list by using docid instead

## 2.  CREATING INVERTED INDEX

In [27]:

new_list = []
for i in range(dataframe.shape[0]):
    for j in dataframe.iloc[i,1].split():
        new_list.append([j,i])
new_list = sorted(new_list)
dict_index = {}
words = []
for i in new_list:
    if i[0] not in words:
        words.append(i[0])
        dict_index[i[0]] = [1,[i[1]]]
    else:  
        if(i[1] not in dict_index[i[0]][1]):    
            dict_index[i[0]][0]+=1
            dict_index[i[0]][1].append(i[1])

In [28]:
dict_index

{'aa': [3, [3925, 5024, 6230]],
 'abandon': [1, [1241]],
 'able': [12,
  [736, 815, 1645, 1902, 2228, 4302, 5089, 6007, 6016, 6474, 6657, 6737]],
 'absolute': [33,
  [234,
   503,
   560,
   767,
   768,
   1018,
   1020,
   1080,
   1508,
   1581,
   1673,
   1774,
   1983,
   2068,
   2206,
   2225,
   2601,
   2682,
   2783,
   3085,
   3647,
   3915,
   3991,
   4172,
   4849,
   5228,
   5251,
   5383,
   5426,
   5974,
   6029,
   6100,
   6604]],
 'absolutely': [166,
  [51,
   66,
   109,
   145,
   147,
   155,
   191,
   204,
   436,
   498,
   506,
   610,
   658,
   660,
   697,
   706,
   739,
   763,
   775,
   859,
   936,
   953,
   1019,
   1031,
   1191,
   1288,
   1331,
   1466,
   1555,
   1575,
   1662,
   1677,
   1728,
   1825,
   1855,
   1876,
   1976,
   1983,
   1996,
   2020,
   2039,
   2053,
   2058,
   2133,
   2153,
   2168,
   2189,
   2192,
   2193,
   2199,
   2250,
   2253,
   2302,
   2318,
   2332,
   2339,
   2442,
   2489,
   2526,
   2528,
   25

## CREATING BIGRAM INVERTED INDEX

In [31]:

bigrams = {}
words = []
for i in range(dataframe.shape[0]):
    for word in dataframe.iloc[i,1].split():
        if word not in words:
            words.append(word)
            new = '$'+word+'$'
            for i in range(len(word)):
                if new[i:i+2] not in bigrams:
                    bigrams[new[i:i+2]] = [word]
                else:
                    bigrams[new[i:i+2]].append(word)

                    

In [32]:
bigrams

{'$b': ['beautiful',
  'best',
  'bad',
  'bite',
  'big',
  'buffet',
  'bar',
  'back',
  'bit',
  'busy',
  'book',
  'bunch',
  'barbecue',
  'break',
  'base',
  'bath',
  'branch',
  'board',
  'become',
  'bake',
  'butter',
  'box',
  'bone',
  'benne',
  'basket',
  'basically',
  'bakery',
  'beside',
  'bread',
  'belong',
  'breakfast',
  'beer',
  'bear',
  'baker',
  'bowl',
  'brilliant',
  'behind',
  'boy',
  'bull',
  'bottle',
  'blend',
  'budget',
  'beat',
  'brownie',
  'blow',
  'brain',
  'blue',
  'buy',
  'block',
  'broken',
  'brightly',
  'bugle',
  'bot',
  'brunch',
  'bed',
  'belief',
  'birthday',
  'bought',
  'bring',
  'black',
  'blessing',
  'buzz',
  'belt',
  'boil',
  'begin',
  'bold',
  'behavior',
  'button',
  'brand',
  'bun',
  'believe',
  'bright',
  'blast',
  'boneless',
  'baby',
  'blueberry',
  'bum',
  'brigade',
  'butterscotch',
  'brown',
  'brew',
  'brewery',
  'bel',
  'birth',
  'baa',
  'better',
  'bustle',
  'bullet',
 

## INVERTED INDEX WITH POSITIONAL INFORMATION

In [None]:
new_list = []
for i in range(dataframe.shape[0]):
    count = 0
    for j in dataframe.iloc[i,1].split():
        new_list.append([j,i,count])
        count+=1
new_list = sorted(new_list)
dict_index = {}
words = []
for i in new_list:
    if i[0] not in words:
        words.append(i[0])
        dict_index[i[0]] = [1,{i[1]:[i[2]]}]
    else:
        if i[1] not in dict_index[i[0]][1]:            
            dict_index[i[0]][0]+=1
            dict_index[i[0]][1][i[1]] = [i[2]]
        else:
            dict_index[i[0]][1][i[1]].append(i[2])
print(dict_index)


In [None]:
#when positional information is included
#dict_index is a dictionary structure
#each word is a key
#The value of each word is a list where first element is document frequency and 
#second element is a dictionary with doc id as key and positional info as a list