In [1]:
import pandas as pd
pd.set_option("display.max_colwidth", 40)
import numpy as np
import re
#import spacy
import nltk
from nltk import FreqDist
import gensim
from gensim import corpora

# libraries for visualization
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

## Load  Data

In [2]:
df = pd.read_csv('AllReviewsCleanTokens.csv')

In [3]:
df.count()

Unnamed: 0    11324
date          11324
review        11324
rating        11324
title         11324
language      11324
trip_date     11324
hotel         11324
source        11324
tokens        11324
dtype: int64

In [4]:
#df['lemmantized_reviews'] = df['review'].apply(str.split)
df['tokens'] = df['tokens'].str.replace(r"[\[\]\']", "").str.split(", ")

In [5]:
data = df[['tokens','language']]

## Remove words

In [6]:
# As we can see, the clusters contain too many of these uninformative words so we remove them.
list_stop_words_en = ["one", "time", "could", "would", "get", "center","longleat","park","parks"]
list_stop_words_fr = ["tre", "plus", "si", "tous",'parcs','parc']
list_stop_words = list_stop_words_en + list_stop_words_fr

In [7]:
def remove_stop_word(x, list_stop_words=list_stop_words):
    clean_x = [l for l in x if l not in list_stop_words]
    return clean_x

In [8]:
data['tokens'] = data['tokens'].apply(remove_stop_word)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


## Building an LDA model

In [9]:
tokens_en = data[data.language == 'en'].tokens
tokens_fr = data[data.language == 'fr'].tokens

In [10]:
dictionary_en = corpora.Dictionary(tokens_en)
dictionary_fr = corpora.Dictionary(tokens_fr)

In [11]:
print('en:',len(dictionary_en),'\nfr:',len(dictionary_fr))

en: 18412 
fr: 5037


In [12]:
doc_term_matrix_en = [dictionary_en.doc2bow(rev) for rev in tokens_en]
doc_term_matrix_fr = [dictionary_fr.doc2bow(rev) for rev in tokens_fr]

In [13]:
import warnings
warnings.filterwarnings('ignore')

# Creating the object for LDA model using gensim library
LDA = gensim.models.ldamodel.LdaModel

# Build LDA model
num_topics = 5

lda_model_en = LDA(corpus=doc_term_matrix_en, id2word=dictionary_en, 
                num_topics=num_topics, 
                alpha=[0.0001] * num_topics, 
                eta=[0.0001] * len(dictionary_en),
                chunksize=2000,
                passes=4,
                random_state=100,
               )
lda_model_fr = LDA(corpus=doc_term_matrix_fr, id2word=dictionary_fr, 
                num_topics=num_topics, 
                alpha=[0.0001] * num_topics, 
                eta=[0.0001] * len(dictionary_fr),
                chunksize=2000,
                passes=4,
                random_state=100,
               )

In [14]:
lda_model_en.print_topics(num_words=8)

[(0,
  '0.017*"car" + 0.010*"go" + 0.009*"day" + 0.009*"take" + 0.009*"book" + 0.009*"dog" + 0.009*"train" + 0.008*"child"'),
 (1,
  '0.016*"go" + 0.012*"good" + 0.010*"food" + 0.010*"price" + 0.008*"activity" + 0.008*"book" + 0.007*"pool" + 0.007*"expensive"'),
 (2,
  '0.018*"great" + 0.015*"go" + 0.015*"year" + 0.013*"love" + 0.011*"family" + 0.011*"staff" + 0.011*"stay" + 0.010*"amp"'),
 (3,
  '0.011*"service" + 0.011*"clean" + 0.010*"villa" + 0.008*"stay" + 0.008*"go" + 0.008*"staff" + 0.007*"lodge" + 0.006*"quot"'),
 (4,
  '0.015*"good" + 0.012*"pool" + 0.010*"great" + 0.010*"well" + 0.010*"area" + 0.009*"lodge" + 0.009*"stay" + 0.008*"activity"')]

In [15]:
lda_model_fr.print_topics(num_words=8)

[(0,
  '0.020*"tre" + 0.015*"fair" + 0.012*"pass" + 0.011*"bien" + 0.011*"cent" + 0.010*"chos" + 0.010*"vrai" + 0.009*"sup"'),
 (1,
  '0.018*"tre" + 0.010*"bien" + 0.010*"beaucoup" + 0.009*"cher" + 0.008*"bon" + 0.008*"activit" + 0.008*"piscin" + 0.008*"pass"'),
 (2,
  '0.026*"tre" + 0.013*"enfant" + 0.012*"bien" + 0.011*"bon" + 0.009*"piscin" + 0.009*"pass" + 0.008*"activit" + 0.008*"vrai"'),
 (3,
  '0.034*"tre" + 0.014*"bien" + 0.013*"sejour" + 0.010*"piscin" + 0.009*"personnel" + 0.008*"cher" + 0.008*"propr" + 0.008*"hotel"'),
 (4,
  '0.016*"tre" + 0.011*"sall" + 0.011*"piscin" + 0.010*"bain" + 0.009*"bon" + 0.009*"chambr" + 0.009*"vill" + 0.008*"etre"')]

## Topic visualization

In [16]:
# Visualize the topics
pd.options.display.max_colwidth = 2000
viz = pyLDAvis.gensim.prepare(lda_model_en, doc_term_matrix_en, dictionary_en, mds='tsne')

In [17]:
pyLDAvis.enable_notebook()
viz
#pyLDAvis.show(viz)

In [18]:
# Visualize the topics
pd.options.display.max_colwidth = 2000
viz = pyLDAvis.gensim.prepare(lda_model_fr, doc_term_matrix_fr, dictionary_fr, mds='tsne')

In [19]:
pyLDAvis.enable_notebook()
viz
#pyLDAvis.show(viz)