# Text Processing 

In [165]:
import pandas as pd

In [166]:
# Load the hotel-review dataset and preview its first rows.
reviews_csv = 'tripadvisor_hotel_reviews.csv'
df = pd.read_csv(reviews_csv)
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,2
1,ok nothing special charge diamond member hilto...,0
2,nice rooms not experience hotel monaco seattle...,1
3,unique great stay wonderful time hotel monaco ...,2
4,great stay great stay went seahawk game awesom...,2


In [167]:
import nltk
# Fetch the 'punkt' resource; NLTK reports it as up to date when it is
# already installed. NOTE(review): presumably needed by word_tokenize — confirm.
nltk.download('punkt')

from nltk.corpus import stopwords
# Fetch the stopword corpus that stopwords.words('english') reads later on.
nltk.download('stopwords')

from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Tokenization

In [168]:
def word_tokenization(paragraph):
    """Split ``paragraph`` into a list of tokens using NLTK's ``word_tokenize``."""
    return word_tokenize(paragraph)

In [169]:
# Tokenize every review; the function is passed directly, no wrapper lambda needed.
df['Review_Tokenized'] = df['Review'].apply(word_tokenization)

## Removing Stopwords

In [170]:
# English stopwords from the NLTK corpus; consulted by remove_stopwords().
# NOTE(review): the list contains negations such as "not" (row 2 loses it in
# the output further down) — confirm this is acceptable if the cleaned text
# is later used to predict Rating.
stopWords = stopwords.words('english')

In [171]:
def remove_stopwords(tokenized_list, stop_words=None):
    """Return the tokens of ``tokenized_list`` that are not stopwords.

    Parameters
    ----------
    tokenized_list : iterable of str
        Tokens of a single review, in their original order.
    stop_words : iterable of str, optional
        Words to drop. When omitted, the notebook-level ``stopWords``
        collection is used, so existing one-argument calls behave as before.

    Returns
    -------
    list of str
        The surviving tokens, order and duplicates preserved. Matching is
        exact (case-sensitive), as in the original loop.
    """
    if stop_words is None:
        stop_words = stopWords
    # A set gives constant-time membership tests; scanning the raw list for
    # every token of every review is needlessly slow.
    blocked = set(stop_words)
    return [token for token in tokenized_list if token not in blocked]

In [172]:
# Filter the stopwords out of each review's token list.
df['Review_stopWord_removed'] = df['Review_Tokenized'].apply(remove_stopwords)

In [173]:
# Inspect the frame with the tokenized and stopword-filtered columns side by side.
df

Unnamed: 0,Review,Rating,Review_Tokenized,Review_stopWord_removed
0,nice hotel expensive parking got good deal sta...,2,"[nice, hotel, expensive, parking, got, good, d...","[nice, hotel, expensive, parking, got, good, d..."
1,ok nothing special charge diamond member hilto...,0,"[ok, nothing, special, charge, diamond, member...","[ok, nothing, special, charge, diamond, member..."
2,nice rooms not experience hotel monaco seattle...,1,"[nice, rooms, not, experience, hotel, monaco, ...","[nice, rooms, experience, hotel, monaco, seatt..."
3,unique great stay wonderful time hotel monaco ...,2,"[unique, great, stay, wonderful, time, hotel, ...","[unique, great, stay, wonderful, time, hotel, ..."
4,great stay great stay went seahawk game awesom...,2,"[great, stay, great, stay, went, seahawk, game...","[great, stay, great, stay, went, seahawk, game..."
...,...,...,...,...
20486,best kept secret rd time staying charm not sta...,2,"[best, kept, secret, rd, time, staying, charm,...","[best, kept, secret, rd, time, staying, charm,..."
20487,great location price view hotel great quick pl...,2,"[great, location, price, view, hotel, great, q...","[great, location, price, view, hotel, great, q..."
20488,ok just looks nice modern outside desk staff n...,0,"[ok, just, looks, nice, modern, outside, desk,...","[ok, looks, nice, modern, outside, desk, staff..."
20489,hotel theft ruined vacation hotel opened sept ...,0,"[hotel, theft, ruined, vacation, hotel, opened...","[hotel, theft, ruined, vacation, hotel, opened..."


## Stemming All the Words

In [174]:
from nltk.stem import PorterStemmer
# Single stemmer instance, reused by Stemmer() for every token.
porter = PorterStemmer()

In [175]:
def Stemmer(txt_list):
    """Stem each token of ``txt_list`` with the module-level ``porter`` stemmer.

    Returns a new list holding one stem per input token, in the same order.
    """
    return [porter.stem(token) for token in txt_list]

In [176]:
# Stem the stopword-filtered tokens of every review.
df['Review_stemmed'] = df['Review_stopWord_removed'].apply(Stemmer)

In [177]:
# Inspect the frame now that the stemmed-token column has been added.
df

Unnamed: 0,Review,Rating,Review_Tokenized,Review_stopWord_removed,Review_stemmed
0,nice hotel expensive parking got good deal sta...,2,"[nice, hotel, expensive, parking, got, good, d...","[nice, hotel, expensive, parking, got, good, d...","[nice, hotel, expens, park, got, good, deal, s..."
1,ok nothing special charge diamond member hilto...,0,"[ok, nothing, special, charge, diamond, member...","[ok, nothing, special, charge, diamond, member...","[ok, noth, special, charg, diamond, member, hi..."
2,nice rooms not experience hotel monaco seattle...,1,"[nice, rooms, not, experience, hotel, monaco, ...","[nice, rooms, experience, hotel, monaco, seatt...","[nice, room, experi, hotel, monaco, seattl, go..."
3,unique great stay wonderful time hotel monaco ...,2,"[unique, great, stay, wonderful, time, hotel, ...","[unique, great, stay, wonderful, time, hotel, ...","[uniqu, great, stay, wonder, time, hotel, mona..."
4,great stay great stay went seahawk game awesom...,2,"[great, stay, great, stay, went, seahawk, game...","[great, stay, great, stay, went, seahawk, game...","[great, stay, great, stay, went, seahawk, game..."
...,...,...,...,...,...
20486,best kept secret rd time staying charm not sta...,2,"[best, kept, secret, rd, time, staying, charm,...","[best, kept, secret, rd, time, staying, charm,...","[best, kept, secret, rd, time, stay, charm, st..."
20487,great location price view hotel great quick pl...,2,"[great, location, price, view, hotel, great, q...","[great, location, price, view, hotel, great, q...","[great, locat, price, view, hotel, great, quic..."
20488,ok just looks nice modern outside desk staff n...,0,"[ok, just, looks, nice, modern, outside, desk,...","[ok, looks, nice, modern, outside, desk, staff...","[ok, look, nice, modern, outsid, desk, staff, ..."
20489,hotel theft ruined vacation hotel opened sept ...,0,"[hotel, theft, ruined, vacation, hotel, opened...","[hotel, theft, ruined, vacation, hotel, opened...","[hotel, theft, ruin, vacat, hotel, open, sept,..."


In [178]:
# Discard the raw text and the intermediate token columns; only the stemmed
# tokens are needed from here on.
# `columns=` is used instead of passing the axis positionally (`1`): the
# positional form was deprecated in pandas 1.3 and raises TypeError in 2.x.
df.drop(columns=['Review_stopWord_removed', 'Review_Tokenized', 'Review'], inplace=True)

## Merging Resultant Tokens

In [179]:
def FinalReviewMaker(stemmed_list):
    """Join stemmed tokens back into a single space-separated review string.

    Parameters
    ----------
    stemmed_list : iterable of str
        Tokens of one review, in order.

    Returns
    -------
    str
        The tokens separated by single spaces, with no leading or trailing
        whitespace. An empty input yields an empty string.
    """
    # str.join avoids rebuilding the string on every iteration and does not
    # leave a dangling separator after the last token.
    return " ".join(stemmed_list)

In [180]:
# Rebuild the Review text column from the stemmed tokens.
df['Review'] = df['Review_stemmed'].apply(FinalReviewMaker)

In [181]:
# The token list has been merged back into text, so drop it. `columns=` is
# used because newer pandas rejects the positional axis argument (`1`).
df.drop(columns=['Review_stemmed'], inplace=True)

In [182]:
# Final frame: the Rating column plus the cleaned, stemmed Review text.
df

Unnamed: 0,Rating,Review
0,2,nice hotel expens park got good deal stay hote...
1,0,ok noth special charg diamond member hilton de...
2,1,nice room experi hotel monaco seattl good hote...
3,2,uniqu great stay wonder time hotel monaco loca...
4,2,great stay great stay went seahawk game awesom...
...,...,...
20486,2,best kept secret rd time stay charm star ca n ...
20487,2,great locat price view hotel great quick place...
20488,0,ok look nice modern outsid desk staff n partic...
20489,0,hotel theft ruin vacat hotel open sept guest w...
