References:
https://github.com/bmabey/pyLDAvis
https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

pyLDAvis is designed to help users interpret the topics in a topic model that has been fit to a corpus of text data. The
package extracts information from a fitted LDA topic model to inform an interactive web-based visualization.
The visualization is intended to be used within an IPython notebook but can also be saved to a stand-alone HTML file
for easy sharing.

Reference: https://buildmedia.readthedocs.org/media/pdf/pyldavis/latest/pyldavis.pdf

Python libraries needed:
- pandas
- numpy
- nltk, with the following corpora downloaded via nltk.download():
  - stopwords
  - wordnet
- gensim
- spacy (with the `en_core_web_sm` model)
- matplotlib
- pyldavis

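Import the Asheville Airbnb dataset
About the dataset
The file loaded below (`listings1.csv`) contains the Airbnb listings for Asheville as scraped in April 2018; the free-text `description` column of each listing is the text used for topic modelling.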

In [2]:
# Load the Asheville Airbnb listings and the spaCy pipeline that the
# lemmatization step further down relies on (`nlp`).
import pandas as pd
import spacy

# `listings1.csv` is resolved relative to the notebook's working directory.
# The preview of the frame is shown by the next cell; a bare `listings.head()`
# in the middle of this cell would be evaluated and silently discarded.
listings = pd.read_csv("listings1.csv")

# Ask spaCy to run on a GPU if one is available, then load the small English
# pipeline used by `lemmatization`.
spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm")

In [3]:
# Preview the first five rows to check that the CSV loaded with the expected columns.
listings.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,requires_license,license,jurisdiction_names,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,38585,https://www.airbnb.com/rooms/38585,20180418164707,2018-04-18,Charming Victorian twin BR for 2,Per the City Council of Asheville. Due to the ...,"Charming room with 2 twin size beds, furnished...",Per the City Council of Asheville. Due to the ...,none,Our North Asheville neighborhood stretches alo...,...,f,,"{""NORTH CAROLINA"","" NORTH CAROLINA"","" BUNCOMBE""}",f,f,moderate,t,t,1,1.2
1,67870,https://www.airbnb.com/rooms/67870,20180418164707,2018-04-18,Your Own Floor! 1 bdrm W. Asheville,Your own floor and entrance! Lots of room with...,"Finished basement with living room, bedroom, ...",Your own floor and entrance! Lots of room with...,none,Our neighborhood is a quiet one on the west si...,...,f,,"{""NORTH CAROLINA"","" NORTH CAROLINA"","" BUNCOMBE""}",t,f,moderate,f,f,2,2.16
2,80713,https://www.airbnb.com/rooms/80713,20180418164707,2018-04-18,"[DOWNTOWN] Eco~Goddess Sanctuary, Monthly Rentals","Our beautiful, natural Sanctuary is a 1.3 acre...","FEATURED in WNC Magazine, Mountain Xpress, The...","Our beautiful, natural Sanctuary is a 1.3 acre...",none,I've lived in this wonderfully diverse & safe ...,...,f,,"{""NORTH CAROLINA"","" NORTH CAROLINA"","" BUNCOMBE""}",f,f,strict,f,f,2,0.49
3,80905,https://www.airbnb.com/rooms/80905,20180418164707,2018-04-18,French Industrial Chic Loft,Let yourself melt into the delectable décor of...,Have you ever gazed at dreamy photos in a maga...,Let yourself melt into the delectable décor of...,none,"Quiet, tucked away from the hustle and bustle ...",...,f,,"{""NORTH CAROLINA"","" NORTH CAROLINA"","" BUNCOMBE""}",f,f,super_strict_60,f,f,13,0.96
4,86980,https://www.airbnb.com/rooms/86980,20180418164707,2018-04-18,[Downtown] Goddess Sanctuary & Eco-Urban Homes...,"Our beautiful, natural guesthouse is part of a...","FEATURED in WNC Magazine, Mountain Xpress, The...","Our beautiful, natural guesthouse is part of a...",none,I've lived in this wonderfully diverse & safe ...,...,f,,"{""NORTH CAROLINA"","" NORTH CAROLINA"","" BUNCOMBE""}",f,f,strict,f,f,2,0.55


## Business Problem

Perform topic modelling on the text of the Asheville Airbnb data.

We need to remove stopwords and perform lemmatization on our text, so we first import the required libraries.

In [4]:
# Fetch NLTK's stop-word corpus; when it is already present locally the call
# just reports that the package is up to date.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kanum\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
import re
import numpy as np
import pandas as pd
from pprint import pprint


In [6]:

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel


In [7]:

# spacy for lemmatization
import spacy

In [8]:


# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline


In [9]:

# Logging and warning configuration (optional)
import logging
import warnings

# Only records at ERROR level or above are emitted, so lower-level progress
# messages from the libraries stay out of the notebook output.
logging.basicConfig(
    level=logging.ERROR,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Hide DeprecationWarning messages.
warnings.filterwarnings(action="ignore", category=DeprecationWarning)

In [10]:
# Stop-word list: NLTK's English stop words followed by a few extra tokens.
from nltk.corpus import stopwords

# Append any additional words that should be removed from the dataset to the
# second list.
stop_words = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use']

In [11]:
# Show the first five rows again before the `description` column is extracted.
listings.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,requires_license,license,jurisdiction_names,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,38585,https://www.airbnb.com/rooms/38585,20180418164707,2018-04-18,Charming Victorian twin BR for 2,Per the City Council of Asheville. Due to the ...,"Charming room with 2 twin size beds, furnished...",Per the City Council of Asheville. Due to the ...,none,Our North Asheville neighborhood stretches alo...,...,f,,"{""NORTH CAROLINA"","" NORTH CAROLINA"","" BUNCOMBE""}",f,f,moderate,t,t,1,1.2
1,67870,https://www.airbnb.com/rooms/67870,20180418164707,2018-04-18,Your Own Floor! 1 bdrm W. Asheville,Your own floor and entrance! Lots of room with...,"Finished basement with living room, bedroom, ...",Your own floor and entrance! Lots of room with...,none,Our neighborhood is a quiet one on the west si...,...,f,,"{""NORTH CAROLINA"","" NORTH CAROLINA"","" BUNCOMBE""}",t,f,moderate,f,f,2,2.16
2,80713,https://www.airbnb.com/rooms/80713,20180418164707,2018-04-18,"[DOWNTOWN] Eco~Goddess Sanctuary, Monthly Rentals","Our beautiful, natural Sanctuary is a 1.3 acre...","FEATURED in WNC Magazine, Mountain Xpress, The...","Our beautiful, natural Sanctuary is a 1.3 acre...",none,I've lived in this wonderfully diverse & safe ...,...,f,,"{""NORTH CAROLINA"","" NORTH CAROLINA"","" BUNCOMBE""}",f,f,strict,f,f,2,0.49
3,80905,https://www.airbnb.com/rooms/80905,20180418164707,2018-04-18,French Industrial Chic Loft,Let yourself melt into the delectable décor of...,Have you ever gazed at dreamy photos in a maga...,Let yourself melt into the delectable décor of...,none,"Quiet, tucked away from the hustle and bustle ...",...,f,,"{""NORTH CAROLINA"","" NORTH CAROLINA"","" BUNCOMBE""}",f,f,super_strict_60,f,f,13,0.96
4,86980,https://www.airbnb.com/rooms/86980,20180418164707,2018-04-18,[Downtown] Goddess Sanctuary & Eco-Urban Homes...,"Our beautiful, natural guesthouse is part of a...","FEATURED in WNC Magazine, Mountain Xpress, The...","Our beautiful, natural guesthouse is part of a...",none,I've lived in this wonderfully diverse & safe ...,...,f,,"{""NORTH CAROLINA"","" NORTH CAROLINA"","" BUNCOMBE""}",f,f,strict,f,f,2,0.55


In [13]:
# Pull the free-text listing descriptions out of the frame as a plain Python
# list, then pretty-print the first one as a sanity check.
data = listings["description"].tolist()
pprint(data[:1])

['Per the City Council of Asheville. Due to the definite ban of short-term '
 'vacation rentals and accessory dwelling units in residentially zoned areas '
 'since January 9, 2018 in Asheville, which applies to downtown and several '
 'areas outside the city center, and carries hefty fines, I wish to assure my '
 'prospective guests that my "homestay" is totally legal as this is my primary '
 'residence, which I share with you, and I am not located in any of the '
 'aforementioned zoning areas. Charming room with 2 twin size beds, furnished '
 'with beautiful Victorian antiques on the 2nd floor with view of a lush '
 'forest and seasonally the Blue Ridge Mountains ... and the sunrise if you '
 'are an early bird ! There is a closet with hangers, a dresser and a '
 'nightstand to store your clothes and belongings. Your own private bathroom '
 'with tub/shower combination is adjacent to your room . We have central air '
 'conditioning and heating (HVAC). The house exudes old fashioned ch

In [14]:
#Tokenize words and Clean-up text
def sent_to_words(sentences):
    """Lazily tokenize each document with ``gensim.utils.simple_preprocess``.

    Args:
        sentences: Iterable of documents. Each item is converted with
            ``str()`` before tokenizing, except missing values (see below).

    Yields:
        list[str]: the tokens of one document, in input order. Exactly one
        list is yielded per input item, so positions stay aligned with
        ``sentences``.

    Missing values (``None`` or a float NaN, which is how pandas represents an
    empty text cell) yield an empty token list. Without this guard ``str()``
    would turn them into the literal words ``'none'`` / ``'nan'`` and those
    would enter the vocabulary as if they were real text.
    """
    for sentence in sentences:
        # NaN is the only float that is not equal to itself.
        is_missing = sentence is None or (
            isinstance(sentence, float) and sentence != sentence
        )
        if is_missing:
            yield []
            continue
        # deacc=True strips accent marks (e.g. "décor" -> "decor"); punctuation
        # is already discarded by the tokenizer itself.
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)

# Materialise the generator: one list of tokens per listing description.
data_words = list(sent_to_words(data))

# Preview the tokens of the first document.
print(data_words[:1])

[['per', 'the', 'city', 'council', 'of', 'asheville', 'due', 'to', 'the', 'definite', 'ban', 'of', 'short', 'term', 'vacation', 'rentals', 'and', 'accessory', 'dwelling', 'units', 'in', 'residentially', 'zoned', 'areas', 'since', 'january', 'in', 'asheville', 'which', 'applies', 'to', 'downtown', 'and', 'several', 'areas', 'outside', 'the', 'city', 'center', 'and', 'carries', 'hefty', 'fines', 'wish', 'to', 'assure', 'my', 'prospective', 'guests', 'that', 'my', 'homestay', 'is', 'totally', 'legal', 'as', 'this', 'is', 'my', 'primary', 'residence', 'which', 'share', 'with', 'you', 'and', 'am', 'not', 'located', 'in', 'any', 'of', 'the', 'aforementioned', 'zoning', 'areas', 'charming', 'room', 'with', 'twin', 'size', 'beds', 'furnished', 'with', 'beautiful', 'victorian', 'antiques', 'on', 'the', 'nd', 'floor', 'with', 'view', 'of', 'lush', 'forest', 'and', 'seasonally', 'the', 'blue', 'ridge', 'mountains', 'and', 'the', 'sunrise', 'if', 'you', 'are', 'an', 'early', 'bird', 'there', 'is',

In [15]:
# Build the bigram and trigram phrase models from the tokenized descriptions.
# The trigram model is trained on the bigram-transformed corpus, so it can join
# an already merged pair with a neighbouring token.
# NOTE(review): both models are fit on `data_words`, which still contains stop
# words, while `make_bigrams` is later applied to the stop-word-filtered tokens
# - confirm this mismatch is intended.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # a higher threshold yields fewer phrases
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Wrap the trained models in Phraser objects, a faster way to apply them to a
# tokenized document via `model[tokens]`.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Example: apply the bigram model, then the trigram model, to the first document.
print(trigram_mod[bigram_mod[data_words[0]]])

['per', 'the', 'city', 'council', 'of', 'asheville', 'due', 'to', 'the', 'definite', 'ban', 'of', 'short_term', 'vacation_rentals', 'and', 'accessory', 'dwelling', 'units', 'in', 'residentially', 'zoned', 'areas', 'since', 'january', 'in', 'asheville', 'which', 'applies', 'to', 'downtown', 'and', 'several', 'areas', 'outside', 'the', 'city', 'center', 'and', 'carries', 'hefty', 'fines', 'wish', 'to', 'assure', 'my', 'prospective', 'guests', 'that', 'my', 'homestay', 'is', 'totally', 'legal', 'as', 'this', 'is', 'my', 'primary_residence', 'which', 'share', 'with', 'you', 'and', 'am', 'not', 'located', 'in', 'any', 'of', 'the', 'aforementioned', 'zoning', 'areas', 'charming', 'room', 'with', 'twin', 'size', 'beds', 'furnished', 'with', 'beautiful', 'victorian', 'antiques', 'on', 'the', 'nd', 'floor', 'with', 'view', 'of', 'lush', 'forest', 'and', 'seasonally', 'the', 'blue_ridge', 'mountains', 'and', 'the', 'sunrise', 'if', 'you', 'are', 'an', 'early', 'bird', 'there', 'is', 'closet', 'w

In [16]:
#Remove Stopwords, Make Bigrams and Lemmatize
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Drop every token that appears in the module-level ``stop_words``.

    Each document is passed through ``simple_preprocess(str(doc))`` before
    filtering, so ``doc`` may be a raw string or an already tokenized list
    (whose string form is simply tokenized again).

    Args:
        texts: Iterable of documents.

    Returns:
        list[list[str]]: one filtered token list per document, in input order.
    """
    # Build the lookup once per call: set membership is constant time, whereas
    # testing against the stop_words list rescans it for every single token.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

def make_bigrams(texts):
    """Run every tokenized document through the module-level ``bigram_mod``.

    Returns a list holding ``bigram_mod[doc]`` for each ``doc`` in ``texts``,
    in input order.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every tokenized document.

    Returns a list holding ``trigram_mod[bigram_mod[doc]]`` for each ``doc`` in
    ``texts``, in input order.
    """
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only tokens with an allowed POS tag.

    Each document's tokens are joined with single spaces, run through the
    module-level spaCy pipeline ``nlp`` and reduced to the ``lemma_`` of every
    token whose ``pos_`` tag is in ``allowed_postags``.

    Args:
        texts: Iterable of documents, each an iterable of string tokens.
        allowed_postags: Collection of POS tags to keep. The default is an
            immutable tuple so it cannot be altered between calls.

    Returns:
        list[list[str]]: one list of lemmas per input document, in input order.
    """
    joined_docs = (" ".join(sent) for sent in texts)
    # nlp.pipe streams the texts through the pipeline in batches, which avoids
    # the per-call overhead of invoking nlp() once for every document.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined_docs)
    ]

In [18]:
# Step 1: drop stop words from every tokenized description.
data_words_nostops = remove_stopwords(data_words)

# Step 2: merge frequent word pairs into single bigram tokens.
data_words_bigrams = make_bigrams(data_words_nostops)

# `lemmatization` uses the spaCy pipeline `nlp` that was loaded in the first
# code cell (`en_core_web_sm`); it is not re-initialised here.

# Step 3: lemmatize, keeping only nouns, adjectives, verbs and adverbs.
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Preview the lemmas kept for the first document.
print(data_lemmatized[:1])

[['asheville', 'due', 'vacation', 'rental', 'accessory', 'dwell', 'unit', 'residentially', 'zone', 'area', 'apply', 'downtown', 'several', 'area', 'city', 'center', 'carry', 'hefty', 'fine', 'wish', 'assure', 'prospective', 'guest', 'homestay', 'totally', 'legal', 'primary_residence', 'share', 'locate', 'aforementioned', 'zoning', 'area', 'charming', 'room', 'size', 'bed', 'furnish', 'beautiful', 'victorian', 'antique', 'floor', 'view', 'forest', 'seasonally', 'blue_ridge', 'mountain', 'sunrise', 'early', 'bird', 'closet', 'hanger', 'dresser', 'store', 'clothe', 'belonging', 'private', 'bathroom', 'shower', 'combination', 'adjacent', 'room', 'central', 'heating', 'exude', 'old', 'fashioned', 'charm', 'warmth', 'coziness', 'even_though', 'new']]


In [19]:
# Build the gensim dictionary from the lemmatized documents.
id2word = corpora.Dictionary(data_lemmatized)

# The lemmatized documents are the texts to be encoded.
texts = data_lemmatized

# Bag-of-words corpus: each document is encoded with `id2word.doc2bow`.
corpus = [id2word.doc2bow(text) for text in texts]

# Preview the encoding of the first document.
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 3), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 2), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1)]]


In [20]:
# Look up the token string stored under id 0 in the dictionary.
id2word[0]

'accessory'

In [21]:
# Show the first document's bag-of-words with token strings in place of integer ids
[[(id2word[token_id], count) for token_id, count in bow] for bow in corpus[:1]]

[[('accessory', 1),
  ('adjacent', 1),
  ('aforementioned', 1),
  ('antique', 1),
  ('apply', 1),
  ('area', 3),
  ('asheville', 1),
  ('assure', 1),
  ('bathroom', 1),
  ('beautiful', 1),
  ('bed', 1),
  ('belonging', 1),
  ('bird', 1),
  ('blue_ridge', 1),
  ('carry', 1),
  ('center', 1),
  ('central', 1),
  ('charm', 1),
  ('charming', 1),
  ('city', 1),
  ('closet', 1),
  ('clothe', 1),
  ('combination', 1),
  ('coziness', 1),
  ('downtown', 1),
  ('dresser', 1),
  ('due', 1),
  ('dwell', 1),
  ('early', 1),
  ('even_though', 1),
  ('exude', 1),
  ('fashioned', 1),
  ('fine', 1),
  ('floor', 1),
  ('forest', 1),
  ('furnish', 1),
  ('guest', 1),
  ('hanger', 1),
  ('heating', 1),
  ('hefty', 1),
  ('homestay', 1),
  ('legal', 1),
  ('locate', 1),
  ('mountain', 1),
  ('new', 1),
  ('old', 1),
  ('primary_residence', 1),
  ('private', 1),
  ('prospective', 1),
  ('rental', 1),
  ('residentially', 1),
  ('room', 2),
  ('seasonally', 1),
  ('several', 1),
  ('share', 1),
  ('shower', 

In [22]:
# Build the LDA model on the bag-of-words corpus.
#   num_topics=3       number of topics to extract
#   random_state=100   fixed seed so repeated runs give the same topics
#   update_every=1     update the model after every chunk (online training)
#   chunksize=100      number of documents per training chunk
#   passes=10          number of passes over the whole corpus
#   alpha='auto'       learn the document-topic prior from the data
#   per_word_topics    also compute the most likely topics for each word
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=3, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [23]:
# Print the top keywords (with their weights) for each of the 3 topics
pprint(lda_model.print_topics())
# Apply the trained model to the corpus to obtain per-document topic assignments
doc_lda = lda_model[corpus]

[(0,
  '0.034*"downtown" + 0.022*"minute" + 0.014*"walk" + 0.014*"restaurant" + '
  '0.014*"neighborhood" + 0.012*"home" + 0.011*"mile" + 0.011*"asheville" + '
  '0.009*"locate" + 0.009*"close"'),
 (1,
  '0.027*"private" + 0.026*"room" + 0.024*"bedroom" + 0.014*"guest" + '
  '0.013*"home" + 0.013*"full" + 0.012*"bed" + 0.012*"bathroom" + '
  '0.012*"kitchen" + 0.011*"space"'),
 (2,
  '0.020*"home" + 0.016*"mountain" + 0.010*"modern" + 0.010*"make" + '
  '0.010*"experience" + 0.009*"view" + 0.008*"feel" + 0.007*"property" + '
  '0.007*"luxurious" + 0.006*"tiny"')]


In [24]:
# Visualize the topics: enable inline rendering in the notebook, prepare the
# visualization from the fitted model, the corpus and the dictionary, and
# display it as the cell's output.
# NOTE(review): newer pyLDAvis releases expose this module as
# `pyLDAvis.gensim_models` - confirm against the installed version.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
