In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import re
import pandas as pd

In [2]:
data = pd.read_csv('tripadvisor_hotel_reviews.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109 entries, 0 to 108
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  109 non-null    object
 1   Rating  109 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 1.8+ KB


In [4]:
data.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [6]:
data['review_lowercase'] = data['Review'].str.lower()
data.head()

Unnamed: 0,Review,Rating,review_lowercase
0,nice hotel expensive parking got good deal sta...,4,nice hotel expensive parking got good deal sta...
1,ok nothing special charge diamond member hilto...,2,ok nothing special charge diamond member hilto...
2,nice rooms not 4* experience hotel monaco seat...,3,nice rooms not 4* experience hotel monaco seat...
3,"unique, great stay, wonderful time hotel monac...",5,"unique, great stay, wonderful time hotel monac..."
4,"great stay great stay, went seahawk game aweso...",5,"great stay great stay, went seahawk game aweso..."


In [7]:
en_stopwords = stopwords.words('english')
en_stopwords.remove('not')

In [8]:
data['review_no_stopwords'] = data['review_lowercase'].apply(lambda x: ' '.join([w for w in x.split() if w not in en_stopwords]))

In [9]:
data['review_no_stopwords'].head()

0    nice hotel expensive parking got good deal sta...
1    ok nothing special charge diamond member hilto...
2    nice rooms not 4* experience hotel monaco seat...
3    unique, great stay, wonderful time hotel monac...
4    great stay great stay, went seahawk game aweso...
Name: review_no_stopwords, dtype: object

In [10]:
data['review_no_stopwords_no_punct'] = data.apply(lambda x: re.sub(r"[*]", "star", x['review_no_stopwords']), axis=1)

In [11]:
data.head()

Unnamed: 0,Review,Rating,review_lowercase,review_no_stopwords,review_no_stopwords_no_punct
0,nice hotel expensive parking got good deal sta...,4,nice hotel expensive parking got good deal sta...,nice hotel expensive parking got good deal sta...,nice hotel expensive parking got good deal sta...
1,ok nothing special charge diamond member hilto...,2,ok nothing special charge diamond member hilto...,ok nothing special charge diamond member hilto...,ok nothing special charge diamond member hilto...
2,nice rooms not 4* experience hotel monaco seat...,3,nice rooms not 4* experience hotel monaco seat...,nice rooms not 4* experience hotel monaco seat...,nice rooms not 4star experience hotel monaco s...
3,"unique, great stay, wonderful time hotel monac...",5,"unique, great stay, wonderful time hotel monac...","unique, great stay, wonderful time hotel monac...","unique, great stay, wonderful time hotel monac..."
4,"great stay great stay, went seahawk game aweso...",5,"great stay great stay, went seahawk game aweso...","great stay great stay, went seahawk game aweso...","great stay great stay, went seahawk game aweso..."


In [12]:
data['review_no_stopwords_no_punct'] = data.apply(lambda x: re.sub(r"[^\w\s]", "star", x['review_no_stopwords']), axis=1)

In [13]:
data.head()

Unnamed: 0,Review,Rating,review_lowercase,review_no_stopwords,review_no_stopwords_no_punct
0,nice hotel expensive parking got good deal sta...,4,nice hotel expensive parking got good deal sta...,nice hotel expensive parking got good deal sta...,nice hotel expensive parking got good deal sta...
1,ok nothing special charge diamond member hilto...,2,ok nothing special charge diamond member hilto...,ok nothing special charge diamond member hilto...,ok nothing special charge diamond member hilto...
2,nice rooms not 4* experience hotel monaco seat...,3,nice rooms not 4* experience hotel monaco seat...,nice rooms not 4* experience hotel monaco seat...,nice rooms not 4star experience hotel monaco s...
3,"unique, great stay, wonderful time hotel monac...",5,"unique, great stay, wonderful time hotel monac...","unique, great stay, wonderful time hotel monac...",uniquestar great staystar wonderful time hotel...
4,"great stay great stay, went seahawk game aweso...",5,"great stay great stay, went seahawk game aweso...","great stay great stay, went seahawk game aweso...",great stay great staystar went seahawk game aw...


In [16]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /Users/alex/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [18]:
data['tokenized'] = data.apply(lambda x: word_tokenize(x['review_no_stopwords_no_punct']), axis=1)

In [19]:
data['tokenized'][0]

['nice',
 'hotel',
 'expensive',
 'parking',
 'got',
 'good',
 'deal',
 'stay',
 'hotel',
 'anniversarystar',
 'arrived',
 'late',
 'evening',
 'took',
 'advice',
 'previous',
 'reviews',
 'valet',
 'parkingstar',
 'check',
 'quick',
 'easystar',
 'little',
 'disappointed',
 'nonstarexistent',
 'view',
 'room',
 'room',
 'clean',
 'nice',
 'sizestar',
 'bed',
 'comfortable',
 'woke',
 'stiff',
 'neck',
 'high',
 'pillowsstar',
 'not',
 'soundproof',
 'like',
 'heard',
 'music',
 'room',
 'night',
 'morning',
 'loud',
 'bangs',
 'doors',
 'opening',
 'closing',
 'hear',
 'people',
 'talking',
 'hallwaystar',
 'maybe',
 'noisy',
 'neighborsstar',
 'aveda',
 'bath',
 'products',
 'nicestar',
 'not',
 'goldfish',
 'stay',
 'nice',
 'touch',
 'taken',
 'advantage',
 'staying',
 'longerstar',
 'location',
 'great',
 'walking',
 'distance',
 'shoppingstar',
 'overall',
 'nice',
 'experience',
 'pay',
 '40',
 'parking',
 'nightstar']

In [20]:
p = PorterStemmer()

In [21]:
data['stemmed'] = data['tokenized'].apply(lambda tokens: [p.stem(t) for t in tokens])

In [22]:
data['stemmed'][0]

['nice',
 'hotel',
 'expens',
 'park',
 'got',
 'good',
 'deal',
 'stay',
 'hotel',
 'anniversarystar',
 'arriv',
 'late',
 'even',
 'took',
 'advic',
 'previou',
 'review',
 'valet',
 'parkingstar',
 'check',
 'quick',
 'easystar',
 'littl',
 'disappoint',
 'nonstarexist',
 'view',
 'room',
 'room',
 'clean',
 'nice',
 'sizestar',
 'bed',
 'comfort',
 'woke',
 'stiff',
 'neck',
 'high',
 'pillowsstar',
 'not',
 'soundproof',
 'like',
 'heard',
 'music',
 'room',
 'night',
 'morn',
 'loud',
 'bang',
 'door',
 'open',
 'close',
 'hear',
 'peopl',
 'talk',
 'hallwaystar',
 'mayb',
 'noisi',
 'neighborsstar',
 'aveda',
 'bath',
 'product',
 'nicestar',
 'not',
 'goldfish',
 'stay',
 'nice',
 'touch',
 'taken',
 'advantag',
 'stay',
 'longerstar',
 'locat',
 'great',
 'walk',
 'distanc',
 'shoppingstar',
 'overal',
 'nice',
 'experi',
 'pay',
 '40',
 'park',
 'nightstar']

In [26]:
lemmatizer = WordNetLemmatizer()
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/alex/nltk_data...


True

In [33]:
data['lemmatized']=data['tokenized'].apply(lambda tokens: [lemmatizer.lemmatize(t) for t in tokens])

In [34]:
data['lemmatized'][0]

['nice',
 'hotel',
 'expensive',
 'parking',
 'got',
 'good',
 'deal',
 'stay',
 'hotel',
 'anniversarystar',
 'arrived',
 'late',
 'evening',
 'took',
 'advice',
 'previous',
 'review',
 'valet',
 'parkingstar',
 'check',
 'quick',
 'easystar',
 'little',
 'disappointed',
 'nonstarexistent',
 'view',
 'room',
 'room',
 'clean',
 'nice',
 'sizestar',
 'bed',
 'comfortable',
 'woke',
 'stiff',
 'neck',
 'high',
 'pillowsstar',
 'not',
 'soundproof',
 'like',
 'heard',
 'music',
 'room',
 'night',
 'morning',
 'loud',
 'bang',
 'door',
 'opening',
 'closing',
 'hear',
 'people',
 'talking',
 'hallwaystar',
 'maybe',
 'noisy',
 'neighborsstar',
 'aveda',
 'bath',
 'product',
 'nicestar',
 'not',
 'goldfish',
 'stay',
 'nice',
 'touch',
 'taken',
 'advantage',
 'staying',
 'longerstar',
 'location',
 'great',
 'walking',
 'distance',
 'shoppingstar',
 'overall',
 'nice',
 'experience',
 'pay',
 '40',
 'parking',
 'nightstar']

In [35]:
tokens_clean = sum(data['lemmatized'], [])

In [42]:
unigrams = (pd.Series(nltk.ngrams(tokens_clean,1)).value_counts())

In [43]:
print(unigrams)

(hotel,)                251
(room,)                 238
(great,)                116
(not,)                  115
(staff,)                 85
                       ... 
(considerationstar,)      1
(handstarheld,)           1
(finished,)               1
(appointed,)              1
(connected,)              1
Name: count, Length: 3053, dtype: int64


In [45]:
bigrams = (pd.Series(nltk.ngrams(tokens_clean,2)).value_counts())

In [46]:
print(bigrams)

(great, location)         21
(space, needle)           18
(hotel, monaco)           14
(pike, place)             12
(location, great)          9
                          ..
(guest, like)              1
(like, guest)              1
(didnstart, make)          1
(personnel, didnstart)     1
(right, streetstar)        1
Name: count, Length: 8512, dtype: int64
