In [1]:
import nltk
# Make sure the NLTK stop-word corpus is available locally; it is read by
# stopwords.words('english') further down.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Albert\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

In [34]:
import gensim
from gensim.utils import simple_preprocess

In [36]:
import spacy

### Prepare Stopwords

In [3]:
from nltk.corpus import stopwords
# English stop-word list from NLTK; remove_stopwords() filters review tokens against it.
stop_words = stopwords.words('english')
# To add domain-specific stop words, extend the list, e.g.:
#stop_words.extend(['another_word'])

### Import Data

In [5]:
# Load the hotel-review feature table into a DataFrame
# (presumably a gzip-compressed CSV, given the .gz suffix).
df = pd.read_csv('./data/df_features.gz')

In [6]:
# List the available columns; the free-text fields processed below are
# 'Negative_Review' and 'Positive_Review'.
df.columns

Index(['Hotel_Address', 'Additional_Number_of_Scoring', 'Review_Date',
       'Average_Score', 'Hotel_Name', 'Reviewer_Nationality',
       'Negative_Review', 'Review_Total_Negative_Word_Counts',
       'Total_Number_of_Reviews', 'Positive_Review',
       'Review_Total_Positive_Word_Counts',
       'Total_Number_of_Reviews_Reviewer_Has_Given', 'Reviewer_Score',
       'days_since_review', 'lat', 'lng', 'Diff', 'Review_Month',
       'Review_Year', 'Country', 'City', 'Pet', 'Purpose', 'Whom', 'Room',
       'Length', 'Device', 'Room_Recode', 'Nationality_Recode',
       'Length_Recode', 'Length_N', 'Close_Landmarks', 'Dist_Center',
       'Dist_Airport', 'Dist_Train', 'Price', 'Stars'],
      dtype='object')

### Negative Reviews

In [81]:
# Pull the negative review texts out of the frame as a plain Python list,
# then preview the first three entries.
data = df['Negative_Review'].tolist()
data[:3]

[' I am so angry that i made this post available via all possible sites i use when planing my trips so no one will make the mistake of booking this place I made my booking via booking com We stayed for 6 nights in this hotel from 11 to 17 July Upon arrival we were placed in a small room on the 2nd floor of the hotel It turned out that this was not the room we booked I had specially reserved the 2 level duplex room so that we would have a big windows and high ceilings The room itself was ok if you don t mind the broken window that can not be closed hello rain and a mini fridge that contained some sort of a bio weapon at least i guessed so by the smell of it I intimately asked to change the room and after explaining 2 times that i booked a duplex btw it costs the same as a simple double but got way more volume due to the high ceiling was offered a room but only the next day SO i had to check out the next day before 11 o clock in order to get the room i waned to Not the best way to begin 

### Remove emails, new lines and distracting characters

In [82]:
# Remove e-mail addresses: any whitespace-free run containing "@", plus one
# trailing whitespace character. Raw strings keep "\S" / "\s" as regex escapes
# instead of (invalid) Python string escapes.
data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]

# Collapse newlines and any other run of whitespace into a single space
data = [re.sub(r'\s+', ' ', sent) for sent in data]

# Remove distracting single quotes (a literal character, so no regex is needed)
data = [sent.replace("'", "") for sent in data]
data[:3]

[' I am so angry that i made this post available via all possible sites i use when planing my trips so no one will make the mistake of booking this place I made my booking via booking com We stayed for 6 nights in this hotel from 11 to 17 July Upon arrival we were placed in a small room on the 2nd floor of the hotel It turned out that this was not the room we booked I had specially reserved the 2 level duplex room so that we would have a big windows and high ceilings The room itself was ok if you don t mind the broken window that can not be closed hello rain and a mini fridge that contained some sort of a bio weapon at least i guessed so by the smell of it I intimately asked to change the room and after explaining 2 times that i booked a duplex btw it costs the same as a simple double but got way more volume due to the high ceiling was offered a room but only the next day SO i had to check out the next day before 11 o clock in order to get the room i waned to Not the best way to begin 

### Tokenize words and Clean-up text

In [83]:
def sent_to_words(sentences):
    """Lazily tokenise each item of ``sentences`` with gensim.

    Every item is coerced to ``str`` and handed to
    ``gensim.utils.simple_preprocess``; one token list is yielded per input
    item, in input order.
    """
    for raw_text in sentences:
        # deacc=True asks gensim to strip accent marks from the tokens
        tokens = gensim.utils.simple_preprocess(str(raw_text), deacc=True)
        yield tokens

# Materialise the generator so the token lists can be reused by later cells.
data_words = list(sent_to_words(data))

# Preview the first three tokenised reviews.
print(data_words[:3])

[['am', 'so', 'angry', 'that', 'made', 'this', 'post', 'available', 'via', 'all', 'possible', 'sites', 'use', 'when', 'planing', 'my', 'trips', 'so', 'no', 'one', 'will', 'make', 'the', 'mistake', 'of', 'booking', 'this', 'place', 'made', 'my', 'booking', 'via', 'booking', 'com', 'we', 'stayed', 'for', 'nights', 'in', 'this', 'hotel', 'from', 'to', 'july', 'upon', 'arrival', 'we', 'were', 'placed', 'in', 'small', 'room', 'on', 'the', 'nd', 'floor', 'of', 'the', 'hotel', 'it', 'turned', 'out', 'that', 'this', 'was', 'not', 'the', 'room', 'we', 'booked', 'had', 'specially', 'reserved', 'the', 'level', 'duplex', 'room', 'so', 'that', 'we', 'would', 'have', 'big', 'windows', 'and', 'high', 'ceilings', 'the', 'room', 'itself', 'was', 'ok', 'if', 'you', 'don', 'mind', 'the', 'broken', 'window', 'that', 'can', 'not', 'be', 'closed', 'hello', 'rain', 'and', 'mini', 'fridge', 'that', 'contained', 'some', 'sort', 'of', 'bio', 'weapon', 'at', 'least', 'guessed', 'so', 'by', 'the', 'smell', 'of', 

### Remove Stopwords and Lemmatize

In [84]:
def remove_stopwords(texts):
    """Drop stop words from every document in ``texts``.

    Each document is converted with ``str()`` and tokenised by
    ``simple_preprocess`` before filtering, so both raw strings and
    already-tokenised lists are accepted.

    Returns a list containing one list of kept tokens per input document.
    """
    # Build the lookup once: set membership is O(1), whereas scanning the
    # module-level stop-word list for every token is O(len(stop_words)).
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenised documents, keeping only tokens with an allowed POS tag.

    See https://spacy.io/api/annotation for the tag set.

    Args:
        texts: iterable of token lists; each list is joined with spaces and
            run through the module-level spaCy pipeline ``nlp``.
        allowed_postags: coarse POS tags (``token.pos_``) whose lemmas are kept.

    Returns:
        A list with one list of lemmas per input document, in input order.
    """
    joined_docs = (" ".join(tokens) for tokens in texts)
    # nlp.pipe processes the texts in batches and yields the Docs in input
    # order, which is much faster than calling nlp() once per document.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined_docs)
    ]

In [85]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Load the small English spaCy model without the parser and NER components,
# which are not needed for POS tagging / lemmatization (for efficiency).
# Install with: python -m spacy download en_core_web_sm
# The full package name is used because the 'en' shortcut was removed in spaCy 3.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only nouns, verbs and adjectives (adverbs are dropped)
data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'VERB', 'ADJ'])

data_lemmatized[:5]

[['make',
  'post',
  'site',
  'use',
  'plan',
  'trip',
  'make',
  'mistake',
  'book',
  'place',
  'make',
  'booking',
  'book',
  'com',
  'stay',
  'night',
  'hotel',
  'arrival',
  'place',
  'room',
  'floor',
  'hotel',
  'turn',
  'room',
  'book',
  'reserve',
  'level',
  'duplex',
  'room',
  'would',
  'window',
  'ceiling',
  'room',
  'mind',
  'break',
  'window',
  'close',
  'rain',
  'fridge',
  'contain',
  'weapon',
  'guess',
  'smell',
  'ask',
  'change',
  'room',
  'explain',
  'time',
  'book',
  'duplex',
  'cost',
  'get',
  'way',
  'volume',
  'ceiling',
  'offer',
  'room',
  'day',
  'check',
  'day',
  'clock',
  'order',
  'get',
  'room',
  'wan',
  'way',
  'begin',
  'holiday',
  'wait',
  'order',
  'check',
  'room',
  'waist',
  'time',
  'room',
  'get',
  'want',
  'garden',
  'view',
  'window',
  'waiting',
  'room',
  'place',
  'belonging',
  'rush',
  'city',
  'evening',
  'turn',
  'noise',
  'room',
  'guess',
  'make',
  'vibrate

### Word Ranking

In [86]:
# Flatten the per-review lemma lists into one flat list of all lemmas.
words = [item for sublist in data_lemmatized for item in sublist]

In [87]:
from collections import Counter
# Count how often each lemma occurs across all reviews.
count_dict = Counter(words)

In [102]:
# Number of distinct lemmas.
len(count_dict)

33306

In [91]:
# Top 50 lemmas by frequency. Counter.most_common ranks the counts directly,
# so no intermediate DataFrame is needed; pd.Index keeps the displayed form.
pd.Index([word for word, _ in count_dict.most_common(50)])

Index(['room', 'hotel', 'breakfast', 'staff', 'nothing', 'bed', 'get', 'would',
       'could', 'night', 'bathroom', 'stay', 'bit', 'time', 'work', 'shower',
       'check', 'service', 'day', 'bar', 'need', 'price', 'make', 'pay', 'go',
       'book', 'ask', 'reception', 'floor', 'door', 'window', 'take', 'use',
       'water', 'air', 'coffee', 'give', 'noise', 'restaurant', 'area', 'wifi',
       'tell', 'say', 'find', 'food', 'charge', 'location', 'come', 'look',
       'view'],
      dtype='object')

In [99]:
# Store the lemmatised negative reviews as a new column (one lemma list per row).
df['Negative_Lemma'] = data_lemmatized

### Positive Reviews

In [104]:
# Convert to list
data = df['Positive_Review'].values.tolist()
data[:3]

[' Only the park outside of the hotel was beautiful ',
 ' No real complaints the hotel was great great location surroundings rooms amenities and service Two recommendations however firstly the staff upon check in are very confusing regarding deposit payments and the staff offer you upon checkout to refund your original payment and you can make a new one Bit confusing Secondly the on site restaurant is a bit lacking very well thought out and excellent quality food for anyone of a vegetarian or vegan background but even a wrap or toasted sandwich option would be great Aside from those minor minor things fantastic spot and will be back when i return to Amsterdam ',
 ' Location was good and staff were ok It is cute hotel the breakfast range is nice Will go back ']

### Remove emails, new lines and distracting characters

In [105]:
# Remove e-mail addresses: any whitespace-free run containing "@", plus one
# trailing whitespace character. Raw strings keep "\S" / "\s" as regex escapes
# instead of (invalid) Python string escapes.
data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]

# Collapse newlines and any other run of whitespace into a single space
data = [re.sub(r'\s+', ' ', sent) for sent in data]

# Remove distracting single quotes (a literal character, so no regex is needed)
data = [sent.replace("'", "") for sent in data]
data[:3]

[' Only the park outside of the hotel was beautiful ',
 ' No real complaints the hotel was great great location surroundings rooms amenities and service Two recommendations however firstly the staff upon check in are very confusing regarding deposit payments and the staff offer you upon checkout to refund your original payment and you can make a new one Bit confusing Secondly the on site restaurant is a bit lacking very well thought out and excellent quality food for anyone of a vegetarian or vegan background but even a wrap or toasted sandwich option would be great Aside from those minor minor things fantastic spot and will be back when i return to Amsterdam ',
 ' Location was good and staff were ok It is cute hotel the breakfast range is nice Will go back ']

### Tokenize words and Clean-up text

In [106]:
def sent_to_words(sentences):
    """Lazily tokenise each item of ``sentences`` with gensim.

    Every item is coerced to ``str`` and handed to
    ``gensim.utils.simple_preprocess``; one token list is yielded per input
    item, in input order.
    """
    for raw_text in sentences:
        # deacc=True asks gensim to strip accent marks from the tokens
        tokens = gensim.utils.simple_preprocess(str(raw_text), deacc=True)
        yield tokens

# Materialise the generator so the token lists can be reused by later cells.
data_words = list(sent_to_words(data))

# Preview the first three tokenised reviews.
print(data_words[:3])

[['only', 'the', 'park', 'outside', 'of', 'the', 'hotel', 'was', 'beautiful'], ['no', 'real', 'complaints', 'the', 'hotel', 'was', 'great', 'great', 'location', 'surroundings', 'rooms', 'amenities', 'and', 'service', 'two', 'recommendations', 'however', 'firstly', 'the', 'staff', 'upon', 'check', 'in', 'are', 'very', 'confusing', 'regarding', 'deposit', 'payments', 'and', 'the', 'staff', 'offer', 'you', 'upon', 'checkout', 'to', 'refund', 'your', 'original', 'payment', 'and', 'you', 'can', 'make', 'new', 'one', 'bit', 'confusing', 'secondly', 'the', 'on', 'site', 'restaurant', 'is', 'bit', 'lacking', 'very', 'well', 'thought', 'out', 'and', 'excellent', 'quality', 'food', 'for', 'anyone', 'of', 'vegetarian', 'or', 'vegan', 'background', 'but', 'even', 'wrap', 'or', 'toasted', 'sandwich', 'option', 'would', 'be', 'great', 'aside', 'from', 'those', 'minor', 'minor', 'things', 'fantastic', 'spot', 'and', 'will', 'be', 'back', 'when', 'return', 'to', 'amsterdam'], ['location', 'was', 'good

### Remove Stopwords and Lemmatize

In [107]:
def remove_stopwords(texts):
    """Drop stop words from every document in ``texts``.

    Each document is converted with ``str()`` and tokenised by
    ``simple_preprocess`` before filtering, so both raw strings and
    already-tokenised lists are accepted.

    Returns a list containing one list of kept tokens per input document.
    """
    # Build the lookup once: set membership is O(1), whereas scanning the
    # module-level stop-word list for every token is O(len(stop_words)).
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenised documents, keeping only tokens with an allowed POS tag.

    See https://spacy.io/api/annotation for the tag set.

    Args:
        texts: iterable of token lists; each list is joined with spaces and
            run through the module-level spaCy pipeline ``nlp``.
        allowed_postags: coarse POS tags (``token.pos_``) whose lemmas are kept.

    Returns:
        A list with one list of lemmas per input document, in input order.
    """
    joined_docs = (" ".join(tokens) for tokens in texts)
    # nlp.pipe processes the texts in batches and yields the Docs in input
    # order, which is much faster than calling nlp() once per document.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined_docs)
    ]

In [108]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Load the small English spaCy model without the parser and NER components,
# which are not needed for POS tagging / lemmatization (for efficiency).
# Install with: python -m spacy download en_core_web_sm
# The full package name is used because the 'en' shortcut was removed in spaCy 3.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only nouns, verbs and adjectives (adverbs are dropped)
data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'VERB', 'ADJ'])

data_lemmatized[:5]

[['park', 'hotel'],
 ['complaint',
  'hotel',
  'location',
  'surrounding',
  'room',
  'amenity',
  'service',
  'recommendation',
  'staff',
  'check',
  'regard',
  'deposit',
  'payment',
  'staff',
  'offer',
  'checkout',
  'refund',
  'payment',
  'make',
  'bit',
  'site',
  'restaurant',
  'bit',
  'lack',
  'think',
  'quality',
  'food',
  'anyone',
  'vegan',
  'background',
  'wrap',
  'toast',
  'sandwich',
  'option',
  'would',
  'great',
  'thing',
  'spot',
  'return'],
 ['location', 'staff', 'hotel', 'breakfast', 'range', 'go'],
 ['location',
  'surrounding',
  'bar',
  'restaurant',
  'area',
  'building',
  'character'],
 ['location', 'build', 'setting']]

### Word Ranking

In [109]:
# Flatten the per-review lemma lists into one flat list of all lemmas.
words = [item for sublist in data_lemmatized for item in sublist]

In [110]:
from collections import Counter
# Count how often each lemma occurs across all reviews.
count_dict = Counter(words)

In [111]:
# Number of distinct lemmas.
len(count_dict)

32957

In [112]:
# Top 50 lemmas by frequency. Counter.most_common ranks the counts directly,
# so no intermediate DataFrame is needed; pd.Index keeps the displayed form.
pd.Index([word for word, _ in count_dict.most_common(50)])

Index(['staff', 'location', 'room', 'hotel', 'breakfast', 'bed', 'stay',
       'station', 'walk', 'service', 'everything', 'restaurant', 'view', 'bar',
       'bathroom', 'area', 'would', 'facility', 'get', 'love', 'food', 'metro',
       'reception', 'make', 'city', 'place', 'minute', 'london', 'shower',
       'check', 'go', 'time', 'tube', 'night', 'value', 'day', 'price', 'need',
       'size', 'train', 'coffee', 'could', 'distance', 'welcome', 'feel',
       'lot', 'min', 'street', 'clean', 'recommend'],
      dtype='object')

In [113]:
# Store the lemmatised positive reviews as a new column (one lemma list per row).
df['Positive_Lemma'] = data_lemmatized

In [114]:
# Keep only the raw review texts and their lemmatised token lists.
df_NLP = df[['Negative_Review','Positive_Review','Negative_Lemma','Positive_Lemma']]

In [118]:
# Persist the NLP frame as a gzip-compressed CSV.
# NOTE(review): the lemma columns hold Python lists, which CSV presumably stores
# as their string form — confirm how downstream readers parse them back.
df_NLP.to_csv("./data/NLP.gz", index_label=False, compression="gzip")

In [119]:
# Preview the first rows of the exported frame.
df_NLP.head()

Unnamed: 0,Negative_Review,Positive_Review,Negative_Lemma,Positive_Lemma
0,I am so angry that i made this post available...,Only the park outside of the hotel was beauti...,"[make, post, site, use, plan, trip, make, mist...","[park, hotel]"
1,No Negative,No real complaints the hotel was great great ...,[],"[complaint, hotel, location, surrounding, room..."
2,Rooms are nice but for elderly a bit difficul...,Location was good and staff were ok It is cut...,"[room, bit, room, story, step, ask, level, roo...","[location, staff, hotel, breakfast, range, go]"
3,My room was dirty and I was afraid to walk ba...,Great location in nice surroundings the bar a...,"[room, walk, barefoot, floor, look, clean, wee...","[location, surrounding, bar, restaurant, area,..."
4,You When I booked with your company on line y...,Amazing location and building Romantic setting,"[book, company, line, show, picture, room, thi...","[location, build, setting]"


In [120]:
# (rows, columns) of the exported frame.
df_NLP.shape

(515738, 4)