# Necessary Libraries

In [2]:
import pandas as pd
import numpy as np
import re
from re import sub
import multiprocessing
from unidecode import unidecode

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors


from time import time 
from collections import defaultdict

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [3]:
df = pd.read_csv(r'C:\Users\Boon Kong\Desktop\DSAI\Project\data\reviews.csv')
df.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,7202016,38917982,2015-07-19,28943674,Bianca,Cute and cozy place. Perfect location to every...
1,7202016,39087409,2015-07-20,32440555,Frank,Kelly has a great room in a very central locat...
2,7202016,39820030,2015-07-26,37722850,Ian,"Very spacious apartment, and in a great neighb..."
3,7202016,40813543,2015-08-02,33671805,George,Close to Seattle Center and all it has to offe...
4,7202016,41986501,2015-08-10,34959538,Ming,Kelly was a great host and very accommodating ...


# 1) Data Cleaning
- Remove punctuations
- Tokenize words (split review into list of words)
- Remove stopwords (common words that do not add meaning to the context of the text)
- Lemmatize words (reduce words to their root form to reduce vocabulary size and group words with same meaning together)

In [4]:
import string
import re
import nltk
from nltk.corpus import stopwords
stopwords = nltk.corpus.stopwords.words('english')

In [5]:
## functions for cleaning tasks

def remove_punct(text):
    no_punct = ''.join(char for char in text if char not in string.punctuation)
    return no_punct

# creates a list of words
def tokenize(text):
    tokens = re.split('\W+', text)
    return tokens

# remove common words with no meaning e.g. connectors
def remove_stopwords(token_list):
    text = [word for word in token_list if word not in stopwords]
    return text

wn = nltk.WordNetLemmatizer()

# convert words into their root forms
def lemmatize(text):
    lemmatized_text = [wn.lemmatize(word) for word in text]
    return lemmatized_text

In [6]:
df['comments'] = df['comments'].astype(str)
df['clean_comment'] = df['comments'].apply(lambda x: remove_punct(x))
df['tokenized'] = df['clean_comment'].apply(lambda x: tokenize(x.lower()))
df['no_stopwords'] = df['tokenized'].apply(lambda x: remove_stopwords(x))
df['lemmatized'] = df['no_stopwords'].apply(lambda x: lemmatize(x))

df.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,clean_comment,tokenized,no_stopwords,lemmatized
0,7202016,38917982,2015-07-19,28943674,Bianca,Cute and cozy place. Perfect location to every...,Cute and cozy place Perfect location to everyt...,"[cute, and, cozy, place, perfect, location, to...","[cute, cozy, place, perfect, location, everyth...","[cute, cozy, place, perfect, location, everyth..."
1,7202016,39087409,2015-07-20,32440555,Frank,Kelly has a great room in a very central locat...,Kelly has a great room in a very central locat...,"[kelly, has, a, great, room, in, a, very, cent...","[kelly, great, room, central, location, beauti...","[kelly, great, room, central, location, beauti..."
2,7202016,39820030,2015-07-26,37722850,Ian,"Very spacious apartment, and in a great neighb...",Very spacious apartment and in a great neighbo...,"[very, spacious, apartment, and, in, a, great,...","[spacious, apartment, great, neighborhood, kin...","[spacious, apartment, great, neighborhood, kin..."
3,7202016,40813543,2015-08-02,33671805,George,Close to Seattle Center and all it has to offe...,Close to Seattle Center and all it has to offe...,"[close, to, seattle, center, and, all, it, has...","[close, seattle, center, offer, ballet, theate...","[close, seattle, center, offer, ballet, theate..."
4,7202016,41986501,2015-08-10,34959538,Ming,Kelly was a great host and very accommodating ...,Kelly was a great host and very accommodating ...,"[kelly, was, a, great, host, and, very, accomm...","[kelly, great, host, accommodating, great, nei...","[kelly, great, host, accommodating, great, nei..."


In [7]:
df['lemmatized'].isna().sum()

0

# 2) Gensim Word2Vec 
Gensim is an algorithm based on neural networks, using large amounts of unannotated plain text to learn relationship between words. This gives us a spatial dimension with words of the same meaning close to each other. For example, 'strong' and 'powerful' will be close to each other.


## create word embeddings
- 300 dimensional embeddings
- lookup window = 4 (learn to predict word using 4 words from left and right)

In [11]:
# put all words into a list
sent = [row for row in df['lemmatized']]

# create phrases from sentences
phrases = Phrases(sent, min_count=1, progress_per=50000)
bigram = Phraser(phrases)
sentences = bigram[sent]
sentences[1]

INFO - 21:31:13: collecting all words and their counts
INFO - 21:31:13: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 21:31:16: PROGRESS: at sentence #50000, processed 1811173 words and 602816 word types
INFO - 21:31:19: collected 885212 word types from a corpus of 3085563 words (unigram + bigrams) and 84849 sentences
INFO - 21:31:19: using 885212 counts as vocab in Phrases<0 vocab, min_count=1, threshold=10.0, max_vocab_size=40000000>
INFO - 21:31:19: source_vocab length 885212
INFO - 21:31:29: Phraser built with 32536 phrasegrams


['kelly',
 'great',
 'room',
 'central',
 'location',
 'beautiful',
 'building',
 'architecture',
 'style',
 'really',
 'like',
 'felt',
 'guite',
 'home',
 'wish',
 'spent',
 'time',
 'went',
 'walk',
 'found',
 'seattle',
 'center',
 'major',
 'food',
 'festival',
 'progress',
 'treat',
 'visited',
 'space_needle',
 'chihuly_glass',
 'exhibit',
 'pike_place',
 'market',
 'wow',
 'thanks',
 'great',
 'stay']

In [12]:
model = Word2Vec(min_count=3,
                     window=4,
                     size=300,
                     sample=1e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=multiprocessing.cpu_count()-1)

start = time()

model.build_vocab(sentences, progress_per=50000)

print('Time to build vocab: {} mins'.format(round((time() - start) / 60, 2)))

INFO - 21:31:54: collecting all words and their counts
INFO - 21:31:54: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 21:32:00: PROGRESS: at sentence #50000, processed 1542326 words, keeping 61718 word types
INFO - 21:32:03: collected 79994 word types from a corpus of 2627642 raw words and 84849 sentences
INFO - 21:32:03: Loading a fresh vocabulary
INFO - 21:32:03: effective_min_count=3 retains 30260 unique words (37% of original 79994, drops 49734)
INFO - 21:32:03: effective_min_count=3 leaves 2561531 word corpus (97% of original 2627642, drops 66111)
INFO - 21:32:03: deleting the raw counts dictionary of 79994 items
INFO - 21:32:03: sample=1e-05 downsamples 3054 most-common words
INFO - 21:32:03: downsampling leaves estimated 677102 word corpus (26.4% of prior 2561531)
INFO - 21:32:04: estimated required memory for 30260 words and 300 dimensions: 87754000 bytes
INFO - 21:32:04: resetting layer weights


Time to build vocab: 0.28 mins


In [13]:
start = time()

model.train(sentences, total_examples=model.corpus_count, epochs=30)

print('Time to train the model: {} mins'.format(round((time() - start) / 60, 2)))

model.init_sims(replace=True)

INFO - 21:33:00: training model with 11 workers on 30260 vocabulary and 300 features, using sg=0 hs=0 sample=1e-05 negative=20 window=4
INFO - 21:33:01: EPOCH 1 - PROGRESS: at 10.44% examples, 68541 words/s, in_qsize 0, out_qsize 0
INFO - 21:33:02: EPOCH 1 - PROGRESS: at 20.48% examples, 68606 words/s, in_qsize 1, out_qsize 0
INFO - 21:33:03: EPOCH 1 - PROGRESS: at 30.62% examples, 68200 words/s, in_qsize 0, out_qsize 3
INFO - 21:33:04: EPOCH 1 - PROGRESS: at 34.89% examples, 58239 words/s, in_qsize 9, out_qsize 0
INFO - 21:33:05: EPOCH 1 - PROGRESS: at 41.69% examples, 55308 words/s, in_qsize 20, out_qsize 0
INFO - 21:33:06: EPOCH 1 - PROGRESS: at 51.77% examples, 56798 words/s, in_qsize 17, out_qsize 2
INFO - 21:33:07: EPOCH 1 - PROGRESS: at 62.13% examples, 58030 words/s, in_qsize 21, out_qsize 0
INFO - 21:33:08: EPOCH 1 - PROGRESS: at 73.55% examples, 60094 words/s, in_qsize 18, out_qsize 0
INFO - 21:33:10: EPOCH 1 - PROGRESS: at 84.01% examples, 60062 words/s, in_qsize 21, out_qsi

INFO - 21:33:51: EPOCH 5 - PROGRESS: at 88.76% examples, 57355 words/s, in_qsize 21, out_qsize 0
INFO - 21:33:51: worker thread finished; awaiting finish of 10 more threads
INFO - 21:33:51: worker thread finished; awaiting finish of 9 more threads
INFO - 21:33:51: worker thread finished; awaiting finish of 8 more threads
INFO - 21:33:51: worker thread finished; awaiting finish of 7 more threads
INFO - 21:33:51: worker thread finished; awaiting finish of 6 more threads
INFO - 21:33:51: worker thread finished; awaiting finish of 5 more threads
INFO - 21:33:51: worker thread finished; awaiting finish of 4 more threads
INFO - 21:33:51: worker thread finished; awaiting finish of 3 more threads
INFO - 21:33:51: worker thread finished; awaiting finish of 2 more threads
INFO - 21:33:51: worker thread finished; awaiting finish of 1 more threads
INFO - 21:33:51: worker thread finished; awaiting finish of 0 more threads
INFO - 21:33:51: EPOCH - 5 : training on 2627642 raw words (677390 effective 

INFO - 21:34:32: EPOCH 10 - PROGRESS: at 10.44% examples, 64940 words/s, in_qsize 0, out_qsize 0
INFO - 21:34:33: EPOCH 10 - PROGRESS: at 20.89% examples, 67647 words/s, in_qsize 0, out_qsize 0
INFO - 21:34:34: EPOCH 10 - PROGRESS: at 30.29% examples, 66198 words/s, in_qsize 0, out_qsize 2
INFO - 21:34:35: EPOCH 10 - PROGRESS: at 35.30% examples, 54825 words/s, in_qsize 7, out_qsize 0
INFO - 21:34:37: EPOCH 10 - PROGRESS: at 42.44% examples, 52077 words/s, in_qsize 21, out_qsize 0
INFO - 21:34:38: EPOCH 10 - PROGRESS: at 52.55% examples, 53869 words/s, in_qsize 21, out_qsize 0
INFO - 21:34:39: EPOCH 10 - PROGRESS: at 62.77% examples, 55502 words/s, in_qsize 19, out_qsize 2
INFO - 21:34:40: EPOCH 10 - PROGRESS: at 73.14% examples, 56932 words/s, in_qsize 21, out_qsize 1
INFO - 21:34:41: EPOCH 10 - PROGRESS: at 84.00% examples, 58655 words/s, in_qsize 22, out_qsize 0
INFO - 21:34:41: worker thread finished; awaiting finish of 10 more threads
INFO - 21:34:41: worker thread finished; await

INFO - 21:35:21: worker thread finished; awaiting finish of 9 more threads
INFO - 21:35:21: worker thread finished; awaiting finish of 8 more threads
INFO - 21:35:21: worker thread finished; awaiting finish of 7 more threads
INFO - 21:35:21: worker thread finished; awaiting finish of 6 more threads
INFO - 21:35:21: worker thread finished; awaiting finish of 5 more threads
INFO - 21:35:21: worker thread finished; awaiting finish of 4 more threads
INFO - 21:35:21: worker thread finished; awaiting finish of 3 more threads
INFO - 21:35:21: worker thread finished; awaiting finish of 2 more threads
INFO - 21:35:21: worker thread finished; awaiting finish of 1 more threads
INFO - 21:35:21: worker thread finished; awaiting finish of 0 more threads
INFO - 21:35:21: EPOCH - 14 : training on 2627642 raw words (677495 effective words) took 9.7s, 69534 effective words/s
INFO - 21:35:22: EPOCH 15 - PROGRESS: at 9.60% examples, 61058 words/s, in_qsize 0, out_qsize 0
INFO - 21:35:23: EPOCH 15 - PROGRE

INFO - 21:36:02: EPOCH 19 - PROGRESS: at 10.02% examples, 66091 words/s, in_qsize 1, out_qsize 0
INFO - 21:36:04: EPOCH 19 - PROGRESS: at 14.29% examples, 39972 words/s, in_qsize 18, out_qsize 2
INFO - 21:36:05: EPOCH 19 - PROGRESS: at 23.87% examples, 47900 words/s, in_qsize 21, out_qsize 0
INFO - 21:36:06: EPOCH 19 - PROGRESS: at 33.30% examples, 51557 words/s, in_qsize 19, out_qsize 3
INFO - 21:36:07: EPOCH 19 - PROGRESS: at 43.53% examples, 54813 words/s, in_qsize 19, out_qsize 2
INFO - 21:36:08: EPOCH 19 - PROGRESS: at 54.88% examples, 57261 words/s, in_qsize 14, out_qsize 5
INFO - 21:36:09: EPOCH 19 - PROGRESS: at 70.39% examples, 61495 words/s, in_qsize 10, out_qsize 1
INFO - 21:36:10: EPOCH 19 - PROGRESS: at 78.33% examples, 59723 words/s, in_qsize 21, out_qsize 0
INFO - 21:36:11: EPOCH 19 - PROGRESS: at 89.54% examples, 61452 words/s, in_qsize 17, out_qsize 4
INFO - 21:36:11: worker thread finished; awaiting finish of 10 more threads
INFO - 21:36:11: worker thread finished; aw

INFO - 21:36:52: worker thread finished; awaiting finish of 9 more threads
INFO - 21:36:52: worker thread finished; awaiting finish of 8 more threads
INFO - 21:36:52: worker thread finished; awaiting finish of 7 more threads
INFO - 21:36:52: worker thread finished; awaiting finish of 6 more threads
INFO - 21:36:52: worker thread finished; awaiting finish of 5 more threads
INFO - 21:36:52: worker thread finished; awaiting finish of 4 more threads
INFO - 21:36:52: worker thread finished; awaiting finish of 3 more threads
INFO - 21:36:52: worker thread finished; awaiting finish of 2 more threads
INFO - 21:36:52: worker thread finished; awaiting finish of 1 more threads
INFO - 21:36:52: worker thread finished; awaiting finish of 0 more threads
INFO - 21:36:52: EPOCH - 23 : training on 2627642 raw words (678308 effective words) took 10.1s, 67463 effective words/s
INFO - 21:36:53: EPOCH 24 - PROGRESS: at 9.60% examples, 62174 words/s, in_qsize 0, out_qsize 3
INFO - 21:36:54: EPOCH 24 - PROGR

INFO - 21:37:34: EPOCH 28 - PROGRESS: at 20.86% examples, 68413 words/s, in_qsize 0, out_qsize 0
INFO - 21:37:35: EPOCH 28 - PROGRESS: at 28.03% examples, 58406 words/s, in_qsize 5, out_qsize 0
INFO - 21:37:36: EPOCH 28 - PROGRESS: at 32.96% examples, 52469 words/s, in_qsize 19, out_qsize 1
INFO - 21:37:37: EPOCH 28 - PROGRESS: at 44.65% examples, 55285 words/s, in_qsize 22, out_qsize 0
INFO - 21:37:38: EPOCH 28 - PROGRESS: at 56.41% examples, 58097 words/s, in_qsize 18, out_qsize 1
INFO - 21:37:40: EPOCH 28 - PROGRESS: at 66.55% examples, 58607 words/s, in_qsize 21, out_qsize 0
INFO - 21:37:41: EPOCH 28 - PROGRESS: at 76.84% examples, 59324 words/s, in_qsize 18, out_qsize 5
INFO - 21:37:42: EPOCH 28 - PROGRESS: at 89.55% examples, 62178 words/s, in_qsize 21, out_qsize 0
INFO - 21:37:42: worker thread finished; awaiting finish of 10 more threads
INFO - 21:37:42: worker thread finished; awaiting finish of 9 more threads
INFO - 21:37:42: worker thread finished; awaiting finish of 8 more 

Time to train the model: 5.02 mins


In [14]:
model.save("word2vec.model")

INFO - 21:38:02: saving Word2Vec object under word2vec.model, separately None
INFO - 21:38:02: not storing attribute vectors_norm
INFO - 21:38:02: not storing attribute cum_table
INFO - 21:38:02: saved word2vec.model


In [19]:
cols = ['date','reviewer_id','reviewer_name','clean_comment','tokenized','no_stopwords']

df.drop(columns=cols,inplace=True)
df.head()

Unnamed: 0,listing_id,id,comments,lemmatized
0,7202016,38917982,Cute and cozy place. Perfect location to every...,"[cute, cozy, place, perfect, location, everyth..."
1,7202016,39087409,Kelly has a great room in a very central locat...,"[kelly, great, room, central, location, beauti..."
2,7202016,39820030,"Very spacious apartment, and in a great neighb...","[spacious, apartment, great, neighborhood, kin..."
3,7202016,40813543,Close to Seattle Center and all it has to offe...,"[close, seattle, center, offer, ballet, theate..."
4,7202016,41986501,Kelly was a great host and very accommodating ...,"[kelly, great, host, accommodating, great, nei..."


In [21]:
df['clean'] = df['lemmatized'].apply(lambda x: ' '.join(bigram[x]))
df.head()

Unnamed: 0,listing_id,id,comments,lemmatized,clean
0,7202016,38917982,Cute and cozy place. Perfect location to every...,"[cute, cozy, place, perfect, location, everyth...",cute cozy place perfect location everything
1,7202016,39087409,Kelly has a great room in a very central locat...,"[kelly, great, room, central, location, beauti...",kelly great room central location beautiful bu...
2,7202016,39820030,"Very spacious apartment, and in a great neighb...","[spacious, apartment, great, neighborhood, kin...",spacious apartment great neighborhood kind apa...
3,7202016,40813543,Close to Seattle Center and all it has to offe...,"[close, seattle, center, offer, ballet, theate...",close seattle center offer ballet theater_muse...
4,7202016,41986501,Kelly was a great host and very accommodating ...,"[kelly, great, host, accommodating, great, nei...",kelly great host accommodating great neighborh...


# 3) K-means Clustering
- basic technique for unsupervised clustering
- set n=2 (we want positive vs negative)
- 50 starting points to prevent wrong choice of centroids

In [22]:
from sklearn.cluster import KMeans

In [23]:
word_vectors = Word2Vec.load("word2vec.model").wv

INFO - 21:47:54: loading Word2Vec object from word2vec.model
INFO - 21:47:55: loading wv recursively from word2vec.model.wv.* with mmap=None
INFO - 21:47:55: setting ignored attribute vectors_norm to None
INFO - 21:47:55: loading vocabulary recursively from word2vec.model.vocabulary.* with mmap=None
INFO - 21:47:55: loading trainables recursively from word2vec.model.trainables.* with mmap=None
INFO - 21:47:55: setting ignored attribute cum_table to None
INFO - 21:47:55: loaded word2vec.model


In [24]:
k_model = KMeans(n_clusters=2, max_iter=1000, random_state=True, n_init=50).fit(X=word_vectors.vectors.astype('double'))

## see closest words to each cluster centroid

In [26]:
# cluster 1 is words with negative sentiments
word_vectors.similar_by_vector(k_model.cluster_centers_[1], topn=20, restrict_vocab=None)

[('wifi_unstable', 0.9956074953079224),
 ('biggest_complaint', 0.9948632717132568),
 ('old_nasty', 0.9943544864654541),
 ('might_appeal', 0.9931325912475586),
 ('insect_repellent', 0.9929763674736023),
 ('noggin', 0.9908583164215088),
 ('wiring', 0.9907512664794922),
 ('kept_banging', 0.9907026290893555),
 ('goo', 0.9895851016044617),
 ('flashing', 0.9888520836830139),
 ('lever', 0.9885170459747314),
 ('somebody_el', 0.9883679151535034),
 ('unsanitary', 0.9881523847579956),
 ('litter', 0.988092303276062),
 ('hinge', 0.9875819087028503),
 ('need_updating', 0.987217366695404),
 ('plastic_bag', 0.9865788221359253),
 ('wash_hand', 0.9864225387573242),
 ('investment', 0.9863881468772888),
 ('luke_warm', 0.9862217903137207)]

In [27]:
# cluster 0 is words with positive sentiments
word_vectors.similar_by_vector(k_model.cluster_centers_[0], topn=20, restrict_vocab=None)

[('kay_denis', 0.9945439100265503),
 ('perfect_launchpad', 0.9936453700065613),
 ('give_pointer', 0.99309241771698),
 ('ruby_dog', 0.9930784702301025),
 ('become_favorite', 0.9929834008216858),
 ('saul_megans', 0.992965579032898),
 ('tinyhouse', 0.9926290512084961),
 ('sooooooo', 0.9922561645507812),
 ('qa_hill', 0.9921590089797974),
 ('erickas', 0.9920530915260315),
 ('aug_2013', 0.9920216798782349),
 ('quiet_neighborhoody', 0.9919600486755371),
 ('nc', 0.9918854236602783),
 ('without_invasive', 0.9918511509895325),
 ('uw_village', 0.9917807579040527),
 ('lifetime_experience', 0.9917296171188354),
 ('georgeous', 0.9916488528251648),
 ('unbelievable_hospitality', 0.9916225671768188),
 ('child_hospital', 0.9915664196014404),
 ('vaibhav_heidi', 0.9910863637924194)]

## Comparing results from above
- cluster for negative sentiment is pretty accurate, with most phrases linking to bad experiences
- cluster for positive sentiment might not be that accurate, with some phrases having no relation to good experiences
- but this might be because most of the sentences revolving around these phrases have been from good reviews

In [28]:
pos_cluster = k_model.cluster_centers_[0]
neg_cluster = k_model.cluster_centers_[1]

## Assign sentiment scores
- based on how close the words are to centroid
- (-1 to 1)

In [32]:
word_df = pd.DataFrame(word_vectors.vocab.keys())
word_df.columns = ['words']
word_df['vectors'] = word_df['words'].apply(lambda x: word_vectors[f'{x}'])
word_df['cluster'] = word_df['vectors'].apply(lambda x: k_model.predict([np.array(x)]))
word_df['cluster'] = word_df['cluster'].apply(lambda x: x[0])
word_df.head(15)

Unnamed: 0,words,vectors,cluster
0,cute,"[-0.024901101, -0.066744104, 0.010962429, -0.1...",0
1,cozy,"[0.01864702, -0.023149328, 0.041681483, -0.129...",0
2,place,"[-0.010262076, -0.023641733, 0.113858655, -0.0...",0
3,perfect,"[-0.020732332, -0.02959141, 0.11434515, -0.107...",0
4,location,"[-0.032389443, -0.059886683, 0.09468089, -0.03...",0
5,everything,"[0.041884515, 0.026371881, 0.045442346, -0.070...",0
6,kelly,"[0.010439747, 0.121025056, 0.054996, -0.085330...",0
7,great,"[-0.04261457, -0.0873449, 0.048071723, -0.0137...",0
8,room,"[0.06447234, -0.060898483, 0.0531486, -0.00776...",1
9,central,"[-0.019869706, -0.090317436, 0.085564695, 0.01...",0


In [33]:
word_df['label'] = [1 if i==0 else -1 for i in word_df.cluster]
word_df['distance'] = word_df.apply(lambda x: 1/(k_model.transform([x.vectors]).min()), axis=1)

# sentiment score = distance from centroid x centroid value
word_df['sentiment'] = word_df['distance'] * word_df['label']
word_df.head(15)

INFO - 22:16:11: Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO - 22:16:11: NumExpr defaulting to 8 threads.


Unnamed: 0,words,vectors,cluster,label,distance,sentiment
0,cute,"[-0.024901101, -0.066744104, 0.010962429, -0.1...",0,1,1.08079,1.08079
1,cozy,"[0.01864702, -0.023149328, 0.041681483, -0.129...",0,1,1.09247,1.09247
2,place,"[-0.010262076, -0.023641733, 0.113858655, -0.0...",0,1,1.165105,1.165105
3,perfect,"[-0.020732332, -0.02959141, 0.11434515, -0.107...",0,1,1.168722,1.168722
4,location,"[-0.032389443, -0.059886683, 0.09468089, -0.03...",0,1,1.09034,1.09034
5,everything,"[0.041884515, 0.026371881, 0.045442346, -0.070...",0,1,1.051214,1.051214
6,kelly,"[0.010439747, 0.121025056, 0.054996, -0.085330...",0,1,1.175903,1.175903
7,great,"[-0.04261457, -0.0873449, 0.048071723, -0.0137...",0,1,1.126342,1.126342
8,room,"[0.06447234, -0.060898483, 0.0531486, -0.00776...",1,-1,0.982031,-0.982031
9,central,"[-0.019869706, -0.090317436, 0.085564695, 0.01...",0,1,1.074237,1.074237


## next steps
- now that we have sentiment scores for each word, we can use it to predict sentiment score of sentences
- possible method of improvement - 3rd cluster for neutral words

# 4) Predict Sentiment Scores of Sentences

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

In [35]:
main_df = df
df.head()

Unnamed: 0,listing_id,id,comments,lemmatized,clean
0,7202016,38917982,Cute and cozy place. Perfect location to every...,"[cute, cozy, place, perfect, location, everyth...",cute cozy place perfect location everything
1,7202016,39087409,Kelly has a great room in a very central locat...,"[kelly, great, room, central, location, beauti...",kelly great room central location beautiful bu...
2,7202016,39820030,"Very spacious apartment, and in a great neighb...","[spacious, apartment, great, neighborhood, kin...",spacious apartment great neighborhood kind apa...
3,7202016,40813543,Close to Seattle Center and all it has to offe...,"[close, seattle, center, offer, ballet, theate...",close seattle center offer ballet theater_muse...
4,7202016,41986501,Kelly was a great host and very accommodating ...,"[kelly, great, host, accommodating, great, nei...",kelly great host accommodating great neighborh...


In [36]:
sentiment_df = word_df
sentiment_df.head()

Unnamed: 0,words,vectors,cluster,label,distance,sentiment
0,cute,"[-0.024901101, -0.066744104, 0.010962429, -0.1...",0,1,1.08079,1.08079
1,cozy,"[0.01864702, -0.023149328, 0.041681483, -0.129...",0,1,1.09247,1.09247
2,place,"[-0.010262076, -0.023641733, 0.113858655, -0.0...",0,1,1.165105,1.165105
3,perfect,"[-0.020732332, -0.02959141, 0.11434515, -0.107...",0,1,1.168722,1.168722
4,location,"[-0.032389443, -0.059886683, 0.09468089, -0.03...",0,1,1.09034,1.09034


In [37]:
sent_map = dict(zip(sentiment_df['words'].values, sentiment_df['sentiment'].values))

## Creating tf-idf scores for words in each sentence
- tf-idf weightage is based on uniqueness of word
- more unique the word across corpus = higher score
- hence, more unique words will contribute a higher weightage to the sentiment

In [38]:
tfidf = TfidfVectorizer(tokenizer=lambda y: y.split(), norm=None)
tfidf.fit(main_df['clean'])
features = pd.Series(tfidf.get_feature_names())
transformed = tfidf.transform(main_df['clean'])



In [46]:
# dictionary of key (word) to value (tf-idf score)
def create_tfidf(x, transformed_file, features):
    vector = transformed_file[x.name].tocoo() # matrix to coordinates
    vector.col = features.iloc[vector.col].values
    coo_dict = dict(zip(vector.col, vector.data))
    return coo_dict

# replace word with tf-idf score
def word_to_tfidf(x, transformed_file, features):
    dictionary = create_tfidf(x, transformed_file, features)   
    return list(map(lambda y:dictionary[f'{y}'], x['clean'].split()))

In [47]:
%%time
# may take up to 5mins
tfidf_scores = main_df.apply(lambda x: word_to_tfidf(x, transformed, features), axis=1)

Wall time: 33.3 s


## Implementing sentiment scores into words in each sentence
- these scores were created during K-means clustering, now you are just putting them into sentence format

In [51]:
def word_to_sentiment(word, sentiment_dict):
    try:
        output = sentiment_dict[word]
    except KeyError:
        output = 0
    return output

In [52]:
sentiment_scores = main_df['clean'].apply(lambda x: list(map(lambda y: word_to_sentiment(y, sent_map), x.split())))

In [60]:
final_df = pd.DataFrame(data=[sentiment_scores, tfidf_scores, main_df['clean']]).T
final_df.columns = ['sentiment_score', 'tfidf_score', 'sentence']
final_df.head()

Unnamed: 0,sentiment_score,tfidf_score,sentence
0,"[1.0807900676706315, 1.0924704996364492, 1.165...","[4.334304533362949, 3.4299899925355093, 1.9144...",cute cozy place perfect location everything
1,"[1.1759034646294253, 1.1263419456038901, -0.98...","[6.897601817096672, 3.384974820083783, 2.66101...",kelly great room central location beautiful bu...
2,"[1.0205340292759995, 1.0306948059770953, 1.126...","[3.89758688175068, 5.134640704200904, 1.692487...",spacious apartment great neighborhood kind apa...
3,"[1.04017413501294, 1.1063623947429364, 0.97384...","[3.0789937927887823, 2.04520279070619, 5.17544...",close seattle center offer ballet theater_muse...
4,"[1.1759034646294253, 1.1263419456038901, 1.164...","[6.897601817096672, 6.769949640167566, 2.21166...",kelly great host accommodating great neighborh...


## Merge scores 
- dot product of sentiment and tf-idf scores = overall sentiment
- positive (>0), negative (<0)

In [62]:
final_df = pd.DataFrame(data=[sentiment_scores, tfidf_scores, main_df['clean']]).T
final_df.columns = ['sentiment_score', 'tfidf_score', 'sentence']

# dot product
final_df['sentiment_rate'] = final_df.apply(lambda x: np.array(x.loc['sentiment_score']) @ np.array(x.loc['tfidf_score']), axis=1)
final_df['prediction'] = (final_df['sentiment_rate']>0).astype('int8')
final_df.head(15)
# final_df['sentiment'] = [1 if i==1 else 0 for i in final_df['sentiment']]

Unnamed: 0,sentiment_score,tfidf_score,sentence,sentiment_rate,prediction
0,"[1.0807900676706315, 1.0924704996364492, 1.165...","[4.334304533362949, 3.4299899925355093, 1.9144...",cute cozy place perfect location everything,19.245867,1
1,"[1.1759034646294253, 1.1263419456038901, -0.98...","[6.897601817096672, 3.384974820083783, 2.66101...",kelly great room central location beautiful bu...,108.777278,1
2,"[1.0205340292759995, 1.0306948059770953, 1.126...","[3.89758688175068, 5.134640704200904, 1.692487...",spacious apartment great neighborhood kind apa...,56.740638,1
3,"[1.04017413501294, 1.1063623947429364, 0.97384...","[3.0789937927887823, 2.04520279070619, 5.17544...",close seattle center offer ballet theater_muse...,131.140522,1
4,"[1.1759034646294253, 1.1263419456038901, 1.164...","[6.897601817096672, 6.769949640167566, 2.21166...",kelly great host accommodating great neighborh...,121.286061,1
5,"[1.1759034646294253, 1.1263419456038901, 1.165...","[6.897601817096672, 3.384974820083783, 3.82893...",kelly great place great looking clean simple w...,39.370889,1
6,"[1.1759034646294253, 1.1263419456038901, 1.020...","[6.897601817096672, 3.384974820083783, 2.49734...",kelly great nice neighborhood place stay expec...,25.369374,1
7,"[0, 1.144072044336585, 0, -0.9620808091281017,...","[11.250027981994263, 7.122893596949171, 11.655...",hola bnb erz left seattle simply fantastic tim...,13.834402,1
8,"[1.1759034646294253, 1.165105090604758, 1.1086...","[13.795203634193344, 1.914465645264698, 5.2338...",kelly place conveniently_located quiet street ...,73.895394,1
9,"[1.165105090604758, 1.1079019359329356, 1.0201...","[1.914465645264698, 3.1014859255634732, 2.4973...",place really nice clean important_aspect close...,-117.702295,0


In [63]:
final_df.to_csv('final.csv')

# 5) Performance Metrics

In [57]:
y_pred = final_df['prediction']
y_test = final_df['sentiment']

In [58]:
matrix = pd.DataFrame(confusion_matrix(y_test, y_pred))
print('This is the Confusion Matrix:')
display(matrix)

This is the Confusion Matrix:


Unnamed: 0,0,1
0,9384,75465
1,0,0


## Next Steps

As this is an unsupervised method, there is no labels for checking the accuracy, precision or recall of the model. Upon study of the model output, there is a strong dominance of positive reviews. Many reviews with a slight hint of negativity is still labelled as positive, hence this model is not good enough for sentiment classification.


Next, we will build a supervised model with the state-of-the-art BERT transformer, which will give us a better prediction of the sentiment. Please refer to the BERT_sentiment notebook.