In [1]:
import json
from langdetect import detect

import nltk

from nltk.corpus import wordnet as wn

from gensim import corpora, models

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import re

Slow version of gensim.models.doc2vec is being used


## Step 1: Preprocessing 
- Removing ***unnecessary information*** (title, author, date_stayed etc.) and saving **ratings** (numerical) and **text** (textual comment/review)
- ***Lowercasing***
- Removing reviews written in a ***language other than English*** (we might have to work on this part a bit more)
- Removing reviews with ***incomplete numerical ratings*** (`len(review['ratings']) != 7`)
- Collecting a ***list of numerical ratings*** of all the reviews in the dataset 
- Collecting a ***list of textual reviews*** of all the reviews in the dataset 

In [2]:
# parses the data: the file is read as one JSON-encoded review per line
reviews = []
# explicit UTF-8 so that non-ASCII characters in the reviews (e.g. curly quotes)
# are decoded the same way on every platform instead of using the locale default
with open('review_copy.txt', 'r', encoding='utf-8') as file:
    for line in file:
        # blank lines (such as a trailing empty line) are not valid JSON, so skip them
        if not line.strip():
            continue
        reviews.append(json.loads(line))

In [3]:
# shows the parsed reviews as a list of dicts
# NOTE(review): this prints every review; on a larger file consider printing only a slice
print(reviews)

[{'ratings': {'service': 5.0, 'cleanliness': 5.0, 'overall': 5.0, 'value': 5.0, 'location': 5.0, 'sleep_quality': 5.0, 'rooms': 5.0}, 'title': '“Truly is "Jewel of the Upper Wets Side"”', 'text': 'Stayed in a king suite for 11 nights and yes it cots us a bit but we were happy with the standard of room, the location and the friendliness of the staff. Our room was on the 20th floor overlooking Broadway and the madhouse of the Fairway Market. Room was quite with no noise evident from the hallway or adjoining rooms. It was great to be able to open windows when we craved fresh rather than heated air. The beds, including the fold out sofa bed, were comfortable and the rooms were cleaned well. Wi-fi access worked like a dream with only one connectivity issue on our first night and this was promptly responded to with a call from the service provider to ensure that all was well. The location close to the 72nd Street subway station is great and the complimentary umbrellas on the drizzly days wer

In [4]:
# removes the unnecessary information from the data like title, author, date_stayed etc.
# lowercases the data
# keeps only the reviews that are written in English and carry all 7 numerical ratings
#
# The surviving reviews are collected into a new list instead of being removed
# from `reviews` while looping over it: removing during iteration makes the loop
# skip the element that follows each removal, and a review failing both checks
# would be removed twice (the second remove() raises ValueError).
unused_keys = ('title', 'author', 'date_stayed', 'offering_id', 'num_helpful_votes', 'date', 'id', 'via_mobile')
kept_reviews = []
for review in reviews:
    for key in unused_keys:
        review.pop(key, None)  # no error if the key is absent
    review['text'] = review['text'].lower()
    # cheap check first, so language detection only runs on reviews that can still be kept
    if len(review['ratings']) != 7:
        continue
    if detect(review['text']) != 'en':
        continue
    kept_reviews.append(review)
reviews = kept_reviews

In [5]:
# shows the first review after cleaning: only 'ratings' and the lowercased 'text' remain
print(reviews[0])

{'ratings': {'service': 5.0, 'cleanliness': 5.0, 'overall': 5.0, 'value': 5.0, 'location': 5.0, 'sleep_quality': 5.0, 'rooms': 5.0}, 'text': 'stayed in a king suite for 11 nights and yes it cots us a bit but we were happy with the standard of room, the location and the friendliness of the staff. our room was on the 20th floor overlooking broadway and the madhouse of the fairway market. room was quite with no noise evident from the hallway or adjoining rooms. it was great to be able to open windows when we craved fresh rather than heated air. the beds, including the fold out sofa bed, were comfortable and the rooms were cleaned well. wi-fi access worked like a dream with only one connectivity issue on our first night and this was promptly responded to with a call from the service provider to ensure that all was well. the location close to the 72nd street subway station is great and the complimentary umbrellas on the drizzly days were greatly appreciated. it is fabulous to have the kitch

In [6]:
# gathers the numerical ratings (one dict per review), in review order
all_ratings = [entry['ratings'] for entry in reviews]

In [7]:
# gathers the textual part of every review, in review order
all_reviews = [entry['text'] for entry in reviews]

## Step 2: Transforming to structured format
- ***Tokenization***
- Filtering out ***stopwords*** (a few stopwords such as *no* and *not* are kept, because removing them would distort the emotion analysis)
- Removing ***non-alphabetic characters***
- ***POS tagging***
- ***Lemmatization***
- ***Bag of Words***

In [8]:
# English stopword list minus a hand-picked set of words (chosen after looking at
# some papers) whose removal could affect the emotion analysis
stop_words = set(stopwords.words('english')).difference(
    ('no', 'not', 'didn', 'doesn', 'don', 'down', 'hasn', 'haven')
)
# one token list per review
word_tokens = [word_tokenize(text) for text in all_reviews]

In [9]:
# drops every stopword token from each tokenized review
filtered_sentences = [
    [token for token in tokens if token not in stop_words]
    for tokens in word_tokens
]

In [10]:
# keeps only the tokens made up entirely of alphabetic characters,
# so tokens containing digits or punctuation are discarded
isalpha_filtered_sentences = [
    [token for token in sentence if token.isalpha()]
    for sentence in filtered_sentences
]

In [11]:
# from here on `all_reviews` refers to the filtered token lists (one list per review),
# no longer to the raw review strings collected earlier
all_reviews=isalpha_filtered_sentences

In [12]:
# remove punctuation after pos tagging
# too short words in the next cell

In [21]:
# POS-Tagging and Lemmatization
# Maps universal POS tags to WordNet POS constants. Tags without an entry are
# lemmatized as nouns, which is also what lemmatize() does when no pos is given.
#un2wn_mapping = {"VERB" : wn.VERB, "NOUN" : wn.NOUN}
un2wn_mapping = {"NOUN" : wn.NOUN}

# a single lemmatizer is shared by all tokens instead of building a new one per word
lemmatizer = nltk.WordNetLemmatizer()

# docs[i] is the list of lowercased lemmas of review i
docs = []
for review in all_reviews:
    tagged_tokens = nltk.pos_tag(review, tagset="universal")
    docs.append([
        lemmatizer.lemmatize(word, pos=un2wn_mapping.get(tag, wn.NOUN)).lower()  # case insensitive
        for word, tag in tagged_tokens
    ])

In [22]:
# shows the lemmatized token lists, one list per review
# NOTE(review): this prints every document; on a larger dataset consider printing only a slice
print(docs)

[['stayed', 'king', 'suite', 'night', 'yes', 'cot', 'u', 'bit', 'happy', 'standard', 'room', 'location', 'friendliness', 'staff', 'room', 'floor', 'overlooking', 'broadway', 'madhouse', 'fairway', 'market', 'room', 'quite', 'no', 'noise', 'evident', 'hallway', 'adjoining', 'room', 'great', 'able', 'open', 'window', 'craved', 'fresh', 'rather', 'heated', 'air', 'bed', 'including', 'fold', 'sofa', 'bed', 'comfortable', 'room', 'cleaned', 'well', 'access', 'worked', 'like', 'dream', 'one', 'connectivity', 'issue', 'first', 'night', 'promptly', 'responded', 'call', 'service', 'provider', 'ensure', 'well', 'location', 'close', 'street', 'subway', 'station', 'great', 'complimentary', 'umbrella', 'drizzly', 'day', 'greatly', 'appreciated', 'fabulous', 'kitchen', 'cooking', 'facility', 'access', 'whole', 'range', 'fresh', 'food', 'directly', 'across', 'road', 'fairway', 'second', 'time', 'member', 'party', 'stayed', 'beacon', 'certainly', 'hotel', 'choice', 'future', 'visit'], ['every', 'visit

In [23]:
# builds the gensim dictionary (token <-> integer id mapping) over all lemmatized reviews
dictionary = corpora.Dictionary(docs)
print('Number of unique tokens:', len(dictionary))

Number of unique tokens: 224


In [24]:
# let's check each token's unique id
#print (dictionary.token2id.items())

We will do the following filtering once we work with a bigger portion of the dataset. If we run it now, everything is going to be filtered out because there is not enough data.

In [25]:
# Filter out words that occur in less than 10 documents, or more than 50% of the documents.
# dictionary.filter_extremes(no_below=10, no_above=0.5)
# print ('Number of unique tokens:', len(dictionary))

In [26]:
# Bag-of-words representation: every review becomes a list of (word_id, word_frequency) 2-tuples
bow_corpus = list(map(dictionary.doc2bow, docs))

## Step 3: Topic Modelling
- Training an ***LDA model***
- Finding the most ***representative words for each topic***
- Finding the ***topics of each review*** in the dataset

In [27]:
# Trains an LDA model with 5 topics, making 50 passes over the bag-of-words corpus.
# A fixed random_state makes training reproducible: without it the topic ids and
# their top words can differ every time this cell is re-run, so the outputs and
# the written discussion drift apart.
ldamodel = models.ldamodel.LdaModel(bow_corpus, num_topics=5, id2word=dictionary, passes=50, random_state=42)

In [28]:
# the 10 most representative words for each topic;
# formatted=False yields (topic_id, [(word, weight), ...]) pairs rather than formatted strings
ldamodel.show_topics(formatted=False, num_words=10)

[(0,
  [('room', 0.032666034802558003),
   ('great', 0.02212957018889777),
   ('bed', 0.016860113112863964),
   ('fairway', 0.016860112797365507),
   ('every', 0.016860110074789929),
   ('location', 0.011591506827808655),
   ('comfortable', 0.011591343095029063),
   ('access', 0.011591323931938973),
   ('night', 0.011591322672021438),
   ('fresh', 0.011591321990934005)]),
 (1,
  [('hotel', 0.036302700288964304),
   ('one', 0.024592849762854595),
   ('restaurant', 0.018738622860007537),
   ('well', 0.01873588147885917),
   ('nice', 0.018735206696161098),
   ('lobby', 0.018735204094963057),
   ('complimentary', 0.012881054754778786),
   ('get', 0.012880858240358955),
   ('beverage', 0.012880448773767548),
   ('decent', 0.01288044830001764)]),
 (2,
  [('room', 0.0044644058654565062),
   ('restaurant', 0.0044643866945919689),
   ('go', 0.0044643856152372322),
   ('stay', 0.0044643853635704354),
   ('one', 0.0044643549002090612),
   ('great', 0.0044643490399696494),
   ('suite', 0.004464348

In [29]:
# the topics of each review in the dataset
# I set the minimum probability to 0.5 because usually there was 1 topic with prob >0.9 and the rest were <0.01
review_topics = []
for i, bow in enumerate(bow_corpus):
    topics = ldamodel.get_document_topics(bow, minimum_probability=0.5)
    review_topics.append(topics)
    if not topics:
        # no topic reaches the 0.5 threshold for this review; indexing topics[0]
        # would raise IndexError, so report it and move on
        print('Review', i+1, 'has no topic with probability >= 0.5')
        continue
    # pick the most probable topic explicitly rather than relying on list order
    main_topic, probability = max(topics, key=lambda pair: pair[1])
    print('The main topic of review', i+1, 'is', main_topic, 'and its probability is ', probability)

The main topic of review 1 is 0 and its probability is  0.991973817594
The main topic of review 2 is 0 and its probability is  0.982919546983
The main topic of review 3 is 4 and its probability is  0.983627561502
The main topic of review 4 is 1 and its probability is  0.993679926203


### NEXT STEP: EMOTION ANALYSIS
**Can we compute the emotion of each *review* by computing the emotion of its *topic* with highest probability?**

For instance, the emotion of the **1st review** would be the emotion of **topic 0** (because its probability is 0.991973817594). To compute the emotion we would find the emotion of the *10* words representing that topic, so of ***room, great, bed, fairway, every, location, comfortable, access, night, fresh***.

**Shall we perform a review-based, sentence-based or phrase-based analysis?**

Because there are not only *mixed reviews*, but also *mixed sentences*. 