In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
%matplotlib inline

import matplotlib.style as style
style.use('seaborn-whitegrid')

import os
import pprint
# import googlemaps
# import time
import pickle
from random import randint
from collections import defaultdict

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import RidgeCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

In [78]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.naive_bayes import MultinomialNB

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize, TreebankWordTokenizer, wordpunct_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from nltk.util import ngrams
from nltk.corpus import stopwords

from textblob import TextBlob

import string



In [176]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [3]:
# Set pandas options
pd.set_option('max_rows', 10)
# pd.set_option('max_colwidth', -1)
# pd.set_option('display.width', 150)
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)

In [13]:
df = pd.read_csv('data/listing_descriptions.csv', index_col=0)

In [103]:
summaries[0]

"Located on Williams bike corridor, some of Portland's best restaurants, bars and coffee shops are within a 5min drive and downtown is only 10min away.  Also features a wrap around patio with a covered awning to stay dry during the rainy season."

In [77]:
df.head(2)

Unnamed: 0,id,host_id,summary,space,description,neighborhood_overview,notes,host_about
0,7893,21489,"Located on Williams bike corridor, some of Por...",This NE Portland 2 bedroom home is located jus...,"Located on Williams bike corridor, some of Por...",,,"I love to cook, sail and play the mandolin ver..."
1,12899,49682,Settle in to a space of your own. You'll have ...,We live in a neighborhood of beautiful gardens...,Settle in to a space of your own. You'll have ...,We're within walking distance of a grocery and...,We welcome musicians and even have a guitar fo...,"We enjoy cooking, singing, hiking, bicycles, t..."


In [81]:
df.columns

Index(['id', 'host_id', 'summary', 'space', 'description',
       'neighborhood_overview', 'notes', 'host_about'],
      dtype='object')

In [92]:
# Fill NaNs with empty strings
for col in ['summary', 'space', 'description', 'neighborhood_overview', 'notes', 'host_about']:
    df[col].fillna('', inplace=True)


# Make sure there are not any null values remaining
df.head(2)

Unnamed: 0,id,host_id,summary,space,description,neighborhood_overview,notes,host_about
0,7893,21489,"Located on Williams bike corridor, some of Por...",This NE Portland 2 bedroom home is located jus...,"Located on Williams bike corridor, some of Por...",,,"I love to cook, sail and play the mandolin ver..."
1,12899,49682,Settle in to a space of your own. You'll have ...,We live in a neighborhood of beautiful gardens...,Settle in to a space of your own. You'll have ...,We're within walking distance of a grocery and...,We welcome musicians and even have a guitar fo...,"We enjoy cooking, singing, hiking, bicycles, t..."


In [244]:
descriptions = df['description']
about_host_text = df['host_about']
summaries = df['summary']
type(summaries)

pandas.core.series.Series

#### Remove punctuation, make lowercase, etc.

In [163]:
# Make lower case
summaries = summaries.apply(lambda x: x.lower())



In [None]:
# for text in texts:
#     sentences = nltk.sent_tokenize(text)
#     for sentence in sentences:
#         words = nltk.word_tokenize(sentence)
#         tagged_words = nltk.pos_tag(words)
#         ne_tagged_words = nltk.ne_chunk(tagged_words)
#         print ne_tagged_words

In [164]:
summaries = pd.DataFrame(summaries)

In [165]:
# Remove punctuation
summaries['nopunc'] = summaries['summary'].str.replace('[^\w\s]','')

In [166]:
summaries

Unnamed: 0,summary,nopunc
0,"located on williams bike corridor, some of por...",located on williams bike corridor some of port...
1,settle in to a space of your own. you'll have ...,settle in to a space of your own youll have 2 ...
2,"newly remodeled 2 bedroom (3 queen beds), 2 fu...",newly remodeled 2 bedroom 3 queen beds 2 full ...
3,"if you are an urban farming pioneer, enthusias...",if you are an urban farming pioneer enthusiast...
4,"if you are an urban farming pioneer, enthusias...",if you are an urban farming pioneer enthusiast...
...,...,...
5400,moderate sized room with north garden and east...,moderate sized room with north garden and east...
5401,modern portland home with mid-century charm! t...,modern portland home with midcentury charm thi...
5402,1 bed 1 bath apartment living in ultra posh do...,1 bed 1 bath apartment living in ultra posh do...
5403,this is a cozy basement studio nestled on the ...,this is a cozy basement studio nestled on the ...


In [167]:
# Tokenize by sentence
summaries['sent_tokenize'] = summaries['nopunc'].apply(lambda x: sent_tokenize(x))

In [153]:
# Tokenize by word
tokenizer = TreebankWordTokenizer()
summaries['word_tok'] = summaries['sent_tok'].apply(lambda x: [tokenizer.tokenize(y) for y in x])

In [155]:
summaries['no

Unnamed: 0,summary,nopunc,no_punc,word_tok
0,"located on williams bike corridor, some of por...",located on williams bike corridor some of port...,[located on williams bike corridor some of por...,"[[located, on, williams, bike, corridor, some,..."
1,settle in to a space of your own. you'll have ...,settle in to a space of your own youll have 2 ...,[settle in to a space of your own youll have 2...,"[[settle, in, to, a, space, of, your, own, you..."
2,"newly remodeled 2 bedroom (3 queen beds), 2 fu...",newly remodeled 2 bedroom 3 queen beds 2 full ...,[newly remodeled 2 bedroom 3 queen beds 2 full...,"[[newly, remodeled, 2, bedroom, 3, queen, beds..."
3,"if you are an urban farming pioneer, enthusias...",if you are an urban farming pioneer enthusiast...,[if you are an urban farming pioneer enthusias...,"[[if, you, are, an, urban, farming, pioneer, e..."
4,"if you are an urban farming pioneer, enthusias...",if you are an urban farming pioneer enthusiast...,[if you are an urban farming pioneer enthusias...,"[[if, you, are, an, urban, farming, pioneer, e..."
...,...,...,...,...
5400,moderate sized room with north garden and east...,moderate sized room with north garden and east...,[moderate sized room with north garden and eas...,"[[moderate, sized, room, with, north, garden, ..."
5401,modern portland home with mid-century charm! t...,modern portland home with midcentury charm thi...,[modern portland home with midcentury charm th...,"[[modern, portland, home, with, midcentury, ch..."
5402,1 bed 1 bath apartment living in ultra posh do...,1 bed 1 bath apartment living in ultra posh do...,[1 bed 1 bath apartment living in ultra posh d...,"[[1, bed, 1, bath, apartment, living, in, ultr..."
5403,this is a cozy basement studio nestled on the ...,this is a cozy basement studio nestled on the ...,[this is a cozy basement studio nestled on the...,"[[this, is, a, cozy, basement, studio, nestled..."


In [221]:
tf_vectorizer = CountVectorizer(max_df=0.5,
                                min_df=2,
                                lowercase=True,
                                strip_accents='unicode',
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                stop_words='english', ngram_range=(1,2)
                               )
dtm_tf = tf_vectorizer.fit_transform(summaries)

In [207]:
dtm_tf.shape

(5405, 4706)

In [208]:
# for TF DTM
lda_tf = LatentDirichletAllocation(n_topics=10, random_state=0)
lda_tf.fit(dtm_tf)

# # for TFIDF DTM
# lda_tfidf = LatentDirichletAllocation(n_topics=20, random_state=0)
# lda_tfidf.fit(dtm_tfidf)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=10, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [209]:
pyLDAvis.sklearn.prepare(lda, dtm_tf, tf_vectorizer)

AssertionError: Topic-term distributions and document-term matrix have different number of columns, 1741 != 4706.

In [62]:
tf.shape

(5405, 5058)

In [66]:
vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1,2))
doc_vectors = vectorizer.fit_transform(summaries)

In [70]:
doc_vectors

<5405x89207 sparse matrix of type '<class 'numpy.float64'>'
	with 359635 stored elements in Compressed Sparse Row format>

In [201]:
lda_tf = LatentDirichletAllocation(n_components=10, max_iter=5,
                                learning_method='online',
                                learning_offset=10.0, batch_size=128,
                                random_state=49, evaluate_every=-1)
lda_tf.fit(tf)

In [220]:
vocab = tf_vectorizer.get_feature_names()

for topic in range(10):
    print(f"TOPIC {topic}")
    for j in np.argsort(-lda.components_,1)[topic,:10]:
        print(vocab[j])
    print()

TOPIC 0
place
great
business
travelers
close
couples
good
solo
adventurers
neighborhood

TOPIC 1
tabor
fun
outdoor
covered
backyard
scene
patio
fred
providence
serene

TOPIC 2
district
pearl
score
ceilings
loft
high
heart
steps
stay
bar

TOPIC 3
new
brand
experience
seasons
home
modern
designed
visit
school
guesthouse

TOPIC 4
north
friendly
people
welcome
live
kenton
light
friends
avenue
pets

TOPIC 5
room
private
bed
bedroom
queen
bathroom
kitchen
living
bath
space

TOPIC 6
coffee
fridge
tea
microwave
breakfast
mini
available
access
maker
use

TOPIC 7
alberta
apartment
arts
district
basement
located
house
home
blocks
studio

TOPIC 8
dryer
washer
house
tub
kitchen
unit
hot
enjoy
space
cozy

TOPIC 9
cute
square
sleeping
feet
nook
smoking
transport
backyard
cottage
inside



In [258]:
def run_lda_with_defaults(corpus, num_topics):
    tf_vectorizer = CountVectorizer(lowercase=True,
                                    strip_accents='unicode',
                                    token_pattern = r'\b[a-zA-Z]{3,}\b',
                                    stop_words='english')
    
    dtm_tf = tf_vectorizer.fit_transform(corpus)
    
    lda_tf = LatentDirichletAllocation(n_components=num_topics, random_state=0, n_jobs=-1)
    lda_tf.fit(dtm_tf)
    
    return pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)

In [256]:
def run_lda(corpus, num_topics):
    tf_vectorizer = CountVectorizer(max_df=0.5,
                                    min_df=2,
                                    lowercase=True,
                                    strip_accents='unicode',
                                    token_pattern = r'\b[a-zA-Z]{3,}\b',
                                    stop_words='english',
                                    ngram_range=(1,3))
    
    dtm_tf = tf_vectorizer.fit_transform(corpus)
    
    lda_tf = LatentDirichletAllocation(n_components=num_topics, random_state=0, learning_method='online', n_jobs=-1)
    lda_tf.fit(dtm_tf)
    
    return pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)

In [255]:
def run_tfidf_lda(corpus, num_topics):
    tfidf_vectorizer = TfidfVectorizer(max_df=0.5,
                                    min_df=2,
                                    lowercase=True,
                                    strip_accents='unicode',
                                    token_pattern = r'\b[a-zA-Z]{3,}\b',
                                    stop_words='english')
    
    dtm_tfidf = tfidf_vectorizer.fit_transform(corpus)
    
    lda_tfidf = LatentDirichletAllocation(n_components=num_topics, random_state=0, learning_method='online', n_jobs=-1)
    lda_tfidf.fit(dtm_tfidf)
    
    return pyLDAvis.sklearn.prepare(lda_tfidf, dtm_tfidf, tfidf_vectorizer)

In [222]:
tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                stop_words = 'english',
                                lowercase = True,
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_df = 0.5, 
                                min_df = 10, ngram_range=(1,2))
dtm_tf = tf_vectorizer.fit_transform(summaries)
print(dtm_tf.shape)

(5405, 3766)


In [223]:
lda_tf = LatentDirichletAllocation(n_topics=10, random_state=0, learning_method='online', batch_size=1028)
lda_tf.fit(dtm_tf)



LatentDirichletAllocation(batch_size=1028, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=10, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [None]:
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)

In [259]:
run_lda_with_defaults(descriptions, 10)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
