In [1]:
import gzip
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models import Word2Vec, LdaMulticore

from gensim.corpora import Dictionary

import numpy as np
import pyLDAvis
from pyLDAvis import gensim

In [3]:
ls -lh data

 Volume in drive C is Acer
 Volume Serial Number is 0823-6B7C

 Directory of C:\Python Scripts\MLChallenge


 Directory of C:\Python Scripts\MLChallenge\data

04.09.2019  21:14    <DIR>          .
04.09.2019  21:14    <DIR>          ..
03.09.2019  10:30       240˙342˙787 reviews_data.txt
               1 File(s)    240˙342˙787 bytes
               2 Dir(s)  342˙697˙648˙128 bytes free


File Not Found


In [2]:
with open('data/reviews_data.txt', 'rb') as f:
    for i, line in enumerate(f):
        print(i, line)
        break

0 b"Oct 12 2009 \tNice trendy hotel location not too bad.\tI stayed in this hotel for one night. As this is a fairly new place some of the taxi drivers did not know where it was and/or did not want to drive there. Once I have eventually arrived at the hotel, I was very pleasantly surprised with the decor of the lobby/ground floor area. It was very stylish and modern. I found the reception's staff geeting me with 'Aloha' a bit out of place, but I guess they are briefed to say that to keep up the coroporate image.As I have a Starwood Preferred Guest member, I was given a small gift upon-check in. It was only a couple of fridge magnets in a gift box, but nevertheless a nice gesture.My room was nice and roomy, there are tea and coffee facilities in each room and you get two complimentary bottles of water plus some toiletries by 'bliss'.The location is not great. It is at the last metro stop and you then need to take a taxi, but if you are not planning on going to see the historic sites in 

In [3]:
def preprocessing(sentence):
    return [word for word in simple_preprocess(sentence) \
           if word not in STOPWORDS]

def read_sentences(filename):
    with open(filename, 'rb') as f:
        for line in f:
            yield preprocessing(line)

In [4]:
%time sentences = list(read_sentences('data/reviews_data.txt'))

Wall time: 1min 59s


In [5]:
len(sentences)

255404

## Model

In [6]:
model = Word2Vec.load('models/hotel_model_size100_window10_mincount2.model')

## Reprezentacja wektorowa

In [7]:
model.wv.most_similar('good')

[('decent', 0.7995321750640869),
 ('great', 0.7496006488800049),
 ('excellent', 0.7420488595962524),
 ('fair', 0.6252300143241882),
 ('ok', 0.6243418455123901),
 ('average', 0.592189371585846),
 ('reasonable', 0.5821888446807861),
 ('nice', 0.5542484521865845),
 ('wise', 0.5451887845993042),
 ('fine', 0.5380234718322754)]

In [8]:
model.wv.most_similar('bad')

[('terrible', 0.7091025710105896),
 ('horrible', 0.6725283861160278),
 ('awful', 0.6613409519195557),
 ('okay', 0.6073172092437744),
 ('sucked', 0.6060258746147156),
 ('poor', 0.6045094132423401),
 ('crappy', 0.6011784076690674),
 ('ok', 0.5935438871383667),
 ('complain', 0.5927608609199524),
 ('worse', 0.5925099849700928)]

In [9]:
model.wv.most_similar(positive=['bad', 'terrible'], negative=['good'])

[('horrible', 0.8423726558685303),
 ('awful', 0.8038562536239624),
 ('horrific', 0.7360600233078003),
 ('horrid', 0.7299700975418091),
 ('dreadful', 0.7213677167892456),
 ('shocking', 0.6904001235961914),
 ('nasty', 0.687792956829071),
 ('disaster', 0.6822035312652588),
 ('appalling', 0.681570827960968),
 ('horrendous', 0.6786438822746277)]

## Topic modeling - LDA

In [10]:
sentences_light = np.random.permutation(sentences)

In [11]:
sentences_light = sentences_light[:1000]

In [12]:
%time dictionary = Dictionary(sentences_light)

Wall time: 1.62 s


In [13]:
%time bow_corpus = [dictionary.doc2bow(sent) for sent in sentences_light]

Wall time: 906 ms


In [14]:
dictionary.doc2bow(['car'])

[(570, 1)]

In [15]:
dictionary.token2id['car']

570

In [16]:
%time lda_model = LdaMulticore(bow_corpus, id2word=dictionary, num_topics=100, passes=20, workers=8)

Wall time: 5min 51s


In [17]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.025*"hotel" + 0.022*"room" + 0.013*"good" + 0.012*"stay" + 0.010*"nice" + 0.009*"staff" + 0.009*"clean" + 0.009*"location" + 0.008*"great" + 0.007*"like"
Topic: 1 
Words: 0.022*"room" + 0.015*"strip" + 0.009*"mar" + 0.007*"customer" + 0.007*"rooms" + 0.007*"service" + 0.007*"hotel" + 0.007*"asked" + 0.007*"mini" + 0.007*"poor"
Topic: 2 
Words: 0.027*"hotel" + 0.015*"jan" + 0.015*"room" + 0.015*"stay" + 0.011*"place" + 0.010*"london" + 0.010*"nice" + 0.009*"minutes" + 0.008*"good" + 0.008*"spa"
Topic: 3 
Words: 0.027*"hotel" + 0.020*"room" + 0.011*"quot" + 0.011*"good" + 0.011*"location" + 0.010*"great" + 0.010*"staff" + 0.009*"clean" + 0.008*"nice" + 0.008*"stay"
Topic: 4 
Words: 0.018*"room" + 0.018*"great" + 0.013*"stay" + 0.012*"hotel" + 0.010*"free" + 0.010*"time" + 0.009*"breakfast" + 0.009*"good" + 0.009*"clean" + 0.009*"place"
Topic: 5 
Words: 0.026*"room" + 0.021*"hotel" + 0.017*"great" + 0.011*"staff" + 0.011*"location" + 0.011*"friendly" + 0.008*"stay" + 0.

## Wizualizacja tematów

In [18]:
lda_vis = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary)
pyLDAvis.display(lda_vis)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
