# NLP: Review Summary & Topic Modelling

## Imports

#### Imports

In [239]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import dask.dataframe as dd
import pandas as pd
import numpy as np
import math
import os
import pickle
import timeit
from collections import Counter

# Graphics
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format='svg'
import seaborn as sns
sns.set(font_scale=1.2)
plt.style.use('seaborn')
# from mlxtend.plotting import plot_decision_regions
# from mpl_toolkits.mplot3d import Axes3D

In [2]:
from __future__ import print_function
import nltk
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import WordNetLemmatizer
from operator import itemgetter
import contractions
from langdetect import detect

from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
#Plain text parsers since we are parsing through text
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lex_rank import LexRankSummarizer 
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer

In [4]:
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.preprocessing import StandardScaler

In [5]:
# %run '/Users/bellepeng/Desktop/Metis/Projects/Project_AirBNB/notebooks/helper_functions.py' 
# os.chdir('/Users/bellepeng/Desktop/Metis/Projects/Project_AirBNB/notebooks')
os.chdir('/home/ubuntu/notebooks')
import helper_functions
from helper_functions import clean_dolla, plot_bar, cluster_inertia, display_topics

#### Load Data

In [6]:
with open('/home/ubuntu/data/reviews.pkl', 'rb') as file:
    reviews = pickle.load(file)
print(reviews.shape)
reviews[96:105]

(278884, 4)


Unnamed: 0,listing_id,id,date,comments
96,958,172021239,2017-07-19,"We, a family of four with two little kids, sta..."
97,958,174401034,2017-07-25,非常温馨的家，房东十分友好，小花园安静。
98,958,177060286,2017-08-01,"Great place, very well located! Well furnished..."
99,958,178027423,2017-08-04,My husband and I had a wonderful stay at Holly...
100,958,180175634,2017-08-09,Une super jolie maison dans le plus beau quart...
101,958,180981797,2017-08-11,Very comfortable apartment with loads of priva...
102,958,182419319,2017-08-14,Holly was a terrific host - quite attentive an...
103,958,183230677,2017-08-16,Very comfortable and clean. Holly was attentiv...
104,958,185107964,2017-08-20,The neighborhood was great. Walking distance t...


In [8]:
!ls ../data/

listings_sf_cleaned.pkl        reviews.pkl
reviews_cleaned.pkl	       summary_df.pkl
reviews_noForeignLang_all.pkl  summaryLexRank_all.pkl
reviews_noForeignLang.pkl      summaryLSA_all.pkl


In [9]:
# os.path.getsize('/home/ubuntu/data/reviews_cleaned3.pkl')
with open('/home/ubuntu/data/reviews_cleaned.pkl', 'rb') as file:
    reviews_cleaned = pickle.load(file)

print(len(reviews_cleaned))
reviews_cleaned[96:105]

74109


['make stress free although could meet make sure leave detail note house extremely responsive time need speak house location many restaurants grocery store nearby quick drive away downtown highly',
 'host cancel reservation 170 days arrival automate post',
 'garage back yard easy find pet friendly accommodations like price',
 'house conveniently locate public transportation plenty eat pick essentials within walk distance neighborhood seem safe quiet enough complaint front bedroom light sleepers street noise come upstairs neighbor hear move around kitchen nicely remodel greet warm helpful woman owner town reachable make sure everything need',
 'wonderful time family family nice defintely come back',
 'house perfect family five public transportation conveniences super close bed incredibly comfortable definitely',
 'house awesome confortable need backyard plus tipycal house feel local neigbour always helpfull friendly check check also ok thank',
 'love comfortable clean lovely',
 'lovely 

In [141]:
with open('/home/ubuntu/data/reviews_noForeignLang.pkl', 'rb') as file:
    reviews_noForeignLang = pickle.load(file)

print(reviews_noForeignLang.shape)
reviews_noForeignLang[96:102]

(278884, 4)


Unnamed: 0,listing_id,id,date,comments
96,958,172021239,2017-07-19,"We, a family of four with two little kids, sta..."
97,958,174401034,2017-07-25,
98,958,177060286,2017-08-01,"Great place, very well located! Well furnished..."
99,958,178027423,2017-08-04,My husband and I had a wonderful stay at Holly...
100,958,180175634,2017-08-09,
101,958,180981797,2017-08-11,Very comfortable apartment with loads of priva...


## Preprocess:
- Remove the Foreign Language Comments
- Reduce the dataset to only the 20 most recent reviews for each listing  
Later in the notebook:
- Fix Contractions
- Remove proper nouns (the code drops tokens tagged NNP/NNPS)
- Remove stop words
- Lemmatize

#### Empty Out Foreign Language comments

In [None]:
# Do not re-run this unless needed, takes a long time
# NOTE(review): if this is re-enabled, an empty comment is appended twice: once by
# the `if not comment` branch and once more by the try/except below, because
# nothing skips to the next iteration. The result would then be longer than
# `reviews`. The bare `except:` also hides every error raised inside the `try`.
# reviews_noForeignLang=[]
# for comment in reviews['comments']: 
#     if not comment:
#         reviews_noForeignLang.append(" ")
#     try: 
#         if detect(comment) == 'en':
#             reviews_noForeignLang.append(comment)
#         else:
#             reviews_noForeignLang.append(" ")
#     except:
#         reviews_noForeignLang.append(" ")

# print(len(reviews), len(reviews_noForeignLang))
# print()
# print(reviews['comments'][95:103])
# print()
# print(reviews_noForeignLang[95:103])

In [142]:
# Re-attach listing_id / id / date to the language-filtered comments by joining
# the two frames on their shared row index.
reviews_temp = reviews.drop(columns=['comments'])
comments_only = pd.DataFrame(reviews_noForeignLang, columns=['comments'])
reviews_noForeignLang = reviews_temp.merge(comments_only, left_index=True, right_index=True)
reviews_noForeignLang.iloc[96:102]

Unnamed: 0,listing_id,id,date,comments
96,958,172021239,2017-07-19,"We, a family of four with two little kids, sta..."
97,958,174401034,2017-07-25,
98,958,177060286,2017-08-01,"Great place, very well located! Well furnished..."
99,958,178027423,2017-08-04,My husband and I had a wonderful stay at Holly...
100,958,180175634,2017-08-09,
101,958,180981797,2017-08-11,Very comfortable apartment with loads of priva...


In [None]:
# NOTE(review): as written, the dump below stores `reviews_cleaned` in
# reviews_noForeignLang.pkl. Presumably `reviews_noForeignLang` was intended;
# confirm before re-enabling this cell.
# import pickle
# os.chdir('/home/ubuntu/data')
# with open('reviews_noForeignLang.pkl', 'wb') as file: # reviews_cleaned2 is the one with the pronouns removed
#     pickle.dump(reviews_cleaned, file)
# with open('reviews_noForeignLang.pkl', 'rb') as file:
#     reviews_noForeignLang = pickle.load(file)
# scp -i ~/.ssh/aws_key.pem ubuntu@18.191.195.42:/home/ubuntu/data/reviews_cleaned3.pkl bellepeng@127.0.0.0: ~/Desktop

#### Reduce the size of my dataset: delete ones with no comment, take only most recent 20 reviews

In [143]:
# Foreign-language reviews were blanked to a single space earlier; drop those
# rows (278884 -> 260217, roughly 7% removed).
has_comment = reviews_noForeignLang['comments'].ne(' ')
reviews_noForeignLang = reviews_noForeignLang.loc[has_comment]
print(reviews_noForeignLang.shape)
reviews_noForeignLang.iloc[96:102]

(260217, 4)


Unnamed: 0,listing_id,id,date,comments
103,958,183230677,2017-08-16,Very comfortable and clean. Holly was attentiv...
104,958,185107964,2017-08-20,The neighborhood was great. Walking distance t...
106,958,187813439,2017-08-27,We had a fantastic stay at Holly's place. The ...
108,958,192900392,2017-09-11,The neighborhood is lovely. It was super easy ...
109,958,196324178,2017-09-22,The apartment was perfect. In a lovely neighbo...
110,958,198789221,2017-09-30,"If you want to fell the concept "" Being home ..."


In [144]:
# Keep only the 20 most recent reviews per listing (260217 -> 74109 rows).
# groupby(...).tail(20) simply takes the last 20 rows of each group in frame
# order, so sort by date within each listing first instead of relying on the
# row order the pickle happened to have.
reviews_recent20 = (
    reviews_noForeignLang
    .sort_values(['listing_id', 'date'])
    .groupby(['listing_id'])
    .tail(20)
)
print(reviews_noForeignLang.shape, reviews_recent20.shape)
reviews_recent20.head()

(260217, 4) (74109, 4)


Unnamed: 0,listing_id,id,date,comments
131,958,249775050,2018-04-02,A perfect spot for a mid-to-long break. Near s...
132,958,252914991,2018-04-12,"We had a fantastic stay with Holly, her place ..."
133,958,256377297,2018-04-22,Great Air BnB in a handy position in v nice su...
134,958,267480372,2018-05-21,Great place to stay and explore San Francisco....
135,958,268720981,2018-05-25,"Holly's place was great. Clean, sunny and well..."


In [14]:
# Get all the IDs of the listings
listing_ids=reviews_noForeignLang['listing_id'].unique()
print(len(listing_ids))
listing_ids[:5]

5457


array([ 958, 5858, 7918, 8142, 8339])

## Run Summarizer

In [None]:
def review_summarizer(text, num_sentences=3, printToScreen=False):
    """Summarize `text` with the LexRank and LSA summarizers.

    Parameters
    ----------
    text : str
        Plain text to summarize; it is parsed with the 'english' tokenizer.
    num_sentences : int
        Sentence count handed to each summarizer.
    printToScreen : bool
        When equal to True, each summary is printed under its own heading
        right after it is computed.

    Returns
    -------
    tuple
        (LexRank summary, LSA summary), each exactly as the summarizer returned it.
    """
    document = PlaintextParser(text, Tokenizer('english')).document
    verbose = printToScreen == True

    # Luhn and TextRank were also tried here but are currently disabled.
    summaries = []
    for heading, summarizer_factory in (("LexRank Summarizer:", LexRankSummarizer),
                                        ("LSA Summarizer:", LsaSummarizer)):
        summary = summarizer_factory()(document, num_sentences)
        if verbose:
            print(heading)
            for sentence in summary:
                print(sentence)
            print()
        summaries.append(summary)

    return tuple(summaries)

In [None]:
# Implement for all
def _sentences_to_string(sentences):
    """Concatenate summary sentences, each followed by a single space."""
    return "".join(str(sentence) + " " for sentence in sentences)


def summarize_all_reviews(reviews, listing_ids, num_sentences=3, printToScreen=False):
    """Build one LexRank and one LSA summary string per listing.

    Parameters
    ----------
    reviews : pandas.DataFrame
        Must contain a 'listing_id' column and a 'comments' column.
    listing_ids : iterable
        Listing ids to summarize; the output lists follow this order.
    num_sentences : int
        Sentence count requested from each summarizer.
    printToScreen : bool
        Forwarded to `review_summarizer`.

    Returns
    -------
    (list of str, list of str)
        LexRank summaries and LSA summaries, one entry per listing id. Each
        entry is the summary's sentences, each followed by a space.
    """
    summaryLexRank = []
    summaryLSA = []

    for listing_id in listing_ids:
        print("listing id:", listing_id)

        # Boolean-mask .loc replaces the removed `.ix` indexer and fetches the
        # listing's comments in one step instead of one label lookup per row.
        comments = reviews.loc[reviews['listing_id'] == listing_id, 'comments']

        # Non-string entries (e.g. NaN for missing comments) are skipped.
        combined_text = " ".join(item for item in comments if isinstance(item, str))

        lexrank_sentences, lsa_sentences = review_summarizer(
            combined_text, num_sentences=num_sentences, printToScreen=printToScreen)

        # Join each summary on its own length: the two summarizers may return
        # different numbers of sentences, and indexing one by the other's
        # length raised IndexError.
        summaryLexRank.append(_sentences_to_string(lexrank_sentences))
        summaryLSA.append(_sentences_to_string(lsa_sentences))

    return summaryLexRank, summaryLSA

In [None]:
summaryLexRank, summaryLSA = \
    summarize_all_reviews(reviews=reviews_recent20, listing_ids=listing_ids, num_sentences=2, printToScreen=False)

In [None]:
print(len(summaryLexRank), len(summaryLSA))
print(summaryLexRank[:2])
print()
print(summaryLSA[:2])

In [None]:
with open('/home/ubuntu/data/summaryLexRank_all.pkl', 'wb') as file: 
    pickle.dump(summaryLexRank, file)
with open('/home/ubuntu/data/summaryLSA_all.pkl', 'wb') as file: 
    pickle.dump(summaryLSA, file)
# with open('/home/ubuntu/data/summaryLuhn400.pkl', 'wb') as file: 
#     pickle.dump(summaryLuhn3, file)
# with open('/home/ubuntu/data/summaryTextRank400.pkl', 'wb') as file: 
#     pickle.dump(summaryTextRank3, file)

In [None]:
len(listing_ids)

__Diagnose errors__

In [None]:
print(list(listing_ids).index(1471683))
print(np.argwhere(listing_ids==1471683))
listing_ids[783]

In [None]:
# Ad-hoc debugging: look for the word 'close' in the string form of `a`.
# NOTE(review): `a` is not assigned in any visible cell, so this presumably
# relies on leftover kernel state; confirm or remove.
import re
# re.search(a, 'place')
re.search('close', str(list(a)))
# str(list(a))[201:205]

#### Create a summarized Dataset of Listing_ID, summaryLexRank, and summaryLSA

In [None]:
print(len(summaryLexRank), len(summaryLSA), len(listing_ids))
# Build the frame from a dict so each column keeps its own dtype.
# np.column_stack packs the integer listing ids and the summary strings into a
# single array, which turns the ids into strings.
summary_df = pd.DataFrame({'listing_id': listing_ids,
                           'summaryLexRank': summaryLexRank,
                           'summaryLSA': summaryLSA},
                          columns=['listing_id', 'summaryLexRank', 'summaryLSA'])
print(summary_df.shape)
# summary_df.head(2)
# summary_df['summaryLexRank'].iloc[0]

In [None]:
# with open('/home/ubuntu/data/summary_df.pkl', 'wb') as file:
#     pickle.dump(summary_df, file)

## Topic Modeling

### Preprocess & Vectorize for Topic Modelling (Vectorize and clean only the top 20 reviews for each listing)

In [12]:
# Stop list = NLTK English stop words + punctuation tokens + a few markup-like
# tokens + words that carry little signal in Airbnb reviews.
# The running length is printed after every extension.
stop = stopwords.words('english')
print(len(stop))

punctuations = ['.', ',', '(', ')', "!", "?", "'", '"', "<", ">", "-", ":", ";"]
# NOTE(review): "/n" and "/t" are presumably meant to be newline/tab escapes; TODO confirm.
more = ["/n", "/t", "'s"]
airbnb = ['me', 'us', 'our', 'you', 'your', 'yours', 'her', 'his', 'him', 'her', 'was', 'is', 'them', 'there',
          'great', 'place', 'stay', 'recommend',  'would', 'even', 'good', 'wait', 'still']

for extra_words in (punctuations, more, airbnb):
    stop += extra_words
    print(len(stop))

179
192
195
218


In [13]:
def clean_text(text):
    """Normalize one review into a space-joined string of lemmatized tokens.

    Steps: expand contractions, tokenize, drop tokens tagged as proper nouns
    (NNP / NNPS), lowercase and verb-lemmatize what is left, then remove every
    token found in the module-level `stop` list.

    Parameters
    ----------
    text : str or missing value
        Raw review text. Any non-string value (float NaN, None, ...) is
        treated as a missing review.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces; "" for a missing review.
    """
    # Missing reviews arrive as float NaN; also guard None and other non-strings
    # so they never reach the tokenizer.
    if not isinstance(text, str):
        return ""

    tokens = word_tokenize(contractions.fix(text))

    # Drop proper nouns. Tagging runs on the original casing, before lowercasing.
    kept_tokens = [token for token, tag in nltk.pos_tag(tokens)
                   if tag not in ("NNP", "NNPS")]

    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(token.lower().strip(), pos='v') for token in kept_tokens]

    # Set membership is constant-time per token; `stop` itself is a list.
    stop_words = set(stop)
    return ' '.join(lemma for lemma in lemmas if lemma not in stop_words)

In [19]:
# Clean just the top 20 reviews for each listing
reviews_cleaned = [clean_text(r) for r in reviews_recent20['comments']]
print(len(reviews_cleaned))

74109


In [14]:
# Clean all of the reviews
# NOTE(review): the recorded output (278884) equals the row count before the
# blank-comment filter (260217 rows after it), so this cell was presumably run
# before that filter. Confirm which frame reviews_cleaned_all.pkl should reflect.
reviews_cleaned_all = [clean_text(r) for r in reviews_noForeignLang['comments']]
print(len(reviews_cleaned_all))

278884


In [15]:
os.chdir('/home/ubuntu/data')
with open('reviews_cleaned_all.pkl', 'wb') as file:
    pickle.dump(reviews_cleaned_all, file)

In [23]:
os.chdir('/home/ubuntu/data')
with open('reviews_cleaned.pkl', 'wb') as file:
    pickle.dump(reviews_cleaned, file)

### Topic Modeling
For the 20 most recent reviews of each listing:  
1. Count Vectorize
1. NMF to get the topics
1. Get the highest topic for each review
1. For each listing, get the most frequent topics
1. Label the topics manually

In [16]:
# Two vectorizers with identical settings (unigrams + bigrams, the custom stop
# list, lowercasing): one produces raw counts, the other TF-IDF weights.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
vectorizer_settings = dict(ngram_range=(1,2), stop_words=stop, lowercase=True)
count_vectorizer = CountVectorizer(**vectorizer_settings)
tfidf_vectorizer = TfidfVectorizer(**vectorizer_settings)

### Find the topic of each review, then aggregate up to the listing level

In [17]:
# Display the topics and save the topic words
def display_topics(model, fit, feature_names, no_top_words, topic_names=None):
    """Print every topic's share of the total weight plus its top words.

    Parameters
    ----------
    model : object
        Fitted decomposition exposing `components_`, one row per topic.
    fit : numpy.ndarray
        2-D array whose column `ix` holds the weights for topic `ix`.
    feature_names : sequence
        Maps a column index of `components_` to its term.
    no_top_words : int
        Number of highest-weight terms listed per topic.
    topic_names : sequence, optional
        Labels to print instead of the topic index. A missing, empty or
        too-short entry falls back to the index.

    Returns
    -------
    list of list
        The top terms of each topic, in topic order.
    """
    total_weight = fit.sum()   # loop-invariant, so computed once
    topic_words = []

    for ix, topic in enumerate(model.components_):
        # Fall back to the numeric index when no usable label exists, including
        # when topic_names has fewer entries than there are topics.
        has_label = bool(topic_names) and ix < len(topic_names) and bool(topic_names[ix])
        label = topic_names[ix] if has_label else ix

        # Vectorized column sum instead of Python-level sum() over the array.
        share = 100 * fit[:, ix].sum() / total_weight
        print("\nTopic {a:} score: {b: 6.1f}%".format(a=label, b=share))

        # argsort is ascending; the reversed slice takes the top `no_top_words`.
        single_topic = [feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]
        print(", ".join(single_topic))
        topic_words.append(single_topic)

    return topic_words

In [154]:
# NMF
n_comp=20
n_words=20

cv = count_vectorizer.fit_transform(reviews_cleaned)
tfidf = tfidf_vectorizer.fit_transform(reviews_cleaned)
print("Count Vectorizer Shape", cv.shape)
print("TFIDF Vectorizer Shape", tfidf.shape)
print()

print("NMF Decomposition with Count Vectorizer")
# Seed the fit: the hand-written topic labels used later are tied to topic
# order, so the decomposition has to be repeatable between runs.
nmf = NMF(n_components=n_comp, random_state=42)
nmf_data = nmf.fit_transform(cv)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back to the old method on older installs.
nmf_feature_names = (count_vectorizer.get_feature_names_out()
                     if hasattr(count_vectorizer, 'get_feature_names_out')
                     else count_vectorizer.get_feature_names())
topics_nmf = display_topics(nmf, nmf_data, nmf_feature_names, n_words)

Count Vectorizer Shape (74109, 458854)
TFIDF Vectorizer Shape (74109, 458854)

NMF Decomposition with Count Vectorizer

Topic 0 score:    3.2%
room, bathroom, live, live room, kitchen, share, private, bed, one, room clean, large, hotel, small, two, guests, floor, bedroom, spacious, door, use

Topic 1 score:    5.2%
apartment, apartment clean, spacious, locate, lovely, live, beautiful, apartment location, two, kitchen, build, apartment perfect, responsive, apartment spacious, highly, helpful, apartment locate, street, lot, apartment beautiful

Topic 2 score:    5.8%
home, beautiful, like, felt, feel, away, love, family, lovely, wonderful, like home, away home, home away, welcome, perfect, felt like, beautiful home, enjoy, home beautiful, neighborhood

Topic 3 score:    7.7%
host, wonderful, arrival, days, friendly, wonderful host, reservation, post, cancel, helpful, amaze, automate, arrival automate, automate post, host cancel, cancel reservation, days arrival, excellent, responsive, ac

In [355]:
# Hand-written labels, one per NMF topic, in topic-index order: entry k is the
# label used for topic k when reviews are mapped to words further down.
# NOTE(review): these were presumably written against one particular NMF fit;
# re-check them against the printed topic words whenever the model is re-fitted.
topic_labels = ['Bathroom, Livingroom, Kitchen, Bedroom', 'Lovely apartment', 'Home feels like home away from home',
                'Cancellation / Reservation / Arrival', 'Within walking distance to shops/restaurants/cafe',
                'Quiet neighborhood, close to public transportation', 'Clean', 'Generic thank you, nothing specific', 'Really enjoyed it',
                'Beautiful house', 'Good area for work', 'Comfort', 'Feels welcoming', 'Communication and access to downtown/transportation',
                'Nice host', 'Perfect location', 'Well equipt', 'Beautiful view / city view', 'Parking', 
                'Definitely will come back next time!'
               ]

In [356]:
# testing 
len(topics_nmf)
print(topics_nmf[0])
print(nmf_data.shape)
print(nmf_data[0].argsort())
print(nmf_data[0].argsort()[::-1])
print(nmf_data[0].argsort()[::-1][0])

['room', 'bathroom', 'live', 'live room', 'kitchen', 'share', 'private', 'bed', 'one', 'room clean', 'large', 'hotel', 'small', 'two', 'guests', 'floor', 'bedroom', 'spacious', 'door', 'use']
(74109, 20)
[ 0 16 14 13 12 11 10  8  9  3  2  1 19  6 15 17  5 18  4  7]
[ 7  4 18  5 17 15  6 19  1  2  3  9  8 10 11 12 13 14 16  0]
7


In [357]:
# Dominant topic per review: sort each row's topic weights in ascending order
# and keep the column index that ends up last (the largest weight).
topics = list(np.argsort(nmf_data, axis=1)[:, -1])
print(len(topics))
topics[:5]

74109


[7, 6, 13, 6, 7]

In [358]:
# Map to the labels
topics_in_words = [topic_labels[x] for x in topics]
print(len(topics_in_words))
topics_in_words[:10]

74109


['Generic thank you, nothing specific',
 'Clean',
 'Communication and access to downtown/transportation',
 'Clean',
 'Generic thank you, nothing specific',
 'Clean',
 'Well equipt',
 'Within walking distance to shops/restaurants/cafe',
 'Good area for work',
 'Cancellation / Reservation / Arrival']

In [372]:
# merge back to data
# `topics` is positional: one entry per row of reviews_recent20, in row order.
# reviews_recent20 still carries the row labels of the full reviews frame, so
# joining on the index would pair topics with the wrong reviews and drop rows
# unless the index had been reset in an earlier, unrecorded step.
# Align by position instead, on a fresh 0..n-1 index.
assert len(topics) == len(reviews_recent20), "topics must have one entry per review"
reviews_recent20_with_topics = (
    reviews_recent20
    .reset_index(drop=True)
    .assign(review_topic=topics)
)
reviews_recent20_with_topics.shape

(74109, 5)

In [373]:
# Add the human-readable topic label as a new column, matched on the row index.
topic_words_column = pd.Series(topics_in_words, name='review_topic_in_words')
reviews_recent20_with_topics2 = reviews_recent20_with_topics.join(topic_words_column, how='inner')
reviews_recent20_with_topics2.shape


(74109, 6)

In [374]:
reviews_recent20_with_topics2.head()

Unnamed: 0,listing_id,id,date,comments,review_topic,review_topic_in_words
0,958,249775050,2018-04-02,A perfect spot for a mid-to-long break. Near s...,7,"Generic thank you, nothing specific"
1,958,252914991,2018-04-12,"We had a fantastic stay with Holly, her place ...",6,Clean
2,958,256377297,2018-04-22,Great Air BnB in a handy position in v nice su...,13,Communication and access to downtown/transport...
3,958,267480372,2018-05-21,Great place to stay and explore San Francisco....,6,Clean
4,958,268720981,2018-05-25,"Holly's place was great. Clean, sunny and well...",7,"Generic thank you, nothing specific"


In [375]:
# Count how often each topic label occurs within every listing.
listing_review_topic_counts = (
    reviews_recent20_with_topics2
    .groupby(['listing_id'])['review_topic_in_words']
    .value_counts()
)

# Keep the first three rows of each listing's counts and reshape into a frame
# indexed by listing_id with the label and its frequency as columns.
listing_top_topic = (
    listing_review_topic_counts
    .groupby(['listing_id'])
    .head(3)
    .to_frame(name='topic_frequency')
    .reset_index()
    .set_index('listing_id')
)

print(listing_top_topic.shape)
listing_top_topic.head()

(14987, 2)


Unnamed: 0_level_0,review_topic_in_words,topic_frequency
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1
958,Clean,3
958,"Generic thank you, nothing specific",2
958,Home feels like home away from home,2
5858,Beautiful house,3
5858,Clean,3


In [377]:
topic1 = listing_top_topic.groupby('listing_id').nth(0)
topic1.columns=['Topic1', 'Topic1_freq']
topic1.head()

Unnamed: 0_level_0,Topic1,Topic1_freq
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1
958,Clean,3
5858,Beautiful house,3
7918,Feels welcoming,3
8142,Clean,2
8339,Beautiful house,3


In [378]:
topic2 = listing_top_topic.groupby('listing_id').nth(1)
topic2.columns=['Topic2', 'Topic2_freq']
topic2.head()

Unnamed: 0_level_0,Topic2,Topic2_freq
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1
958,"Generic thank you, nothing specific",2
5858,Clean,3
7918,"Bathroom, Livingroom, Kitchen, Bedroom",2
8142,Feels welcoming,2
8339,Feels welcoming,3


In [379]:
topic3 = listing_top_topic.groupby('listing_id').nth(2)
topic3.columns=['Topic3', 'Topic3_freq']
topic3.head()

Unnamed: 0_level_0,Topic3,Topic3_freq
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1
958,Home feels like home away from home,2
5858,Home feels like home away from home,3
7918,Cancellation / Reservation / Arrival,2
8142,Nice host,1
8339,Cancellation / Reservation / Arrival,2


In [380]:
print(len(topic1), len(topic2), len(topic3))

5457 4951 4579


In [381]:
listing_top_topic = pd.merge(topic1, topic2, left_index=True, right_index=True, how='outer')
listing_top_topic.shape

(5457, 4)

In [382]:
listing_top_topic = pd.merge(listing_top_topic, topic3, left_index=True, right_index=True, how='outer')
listing_top_topic.shape

(5457, 6)

In [383]:
listing_top_topic.head()

Unnamed: 0_level_0,Topic1,Topic1_freq,Topic2,Topic2_freq,Topic3,Topic3_freq
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
958,Clean,3,"Generic thank you, nothing specific",2.0,Home feels like home away from home,2.0
5858,Beautiful house,3,Clean,3.0,Home feels like home away from home,3.0
7918,Feels welcoming,3,"Bathroom, Livingroom, Kitchen, Bedroom",2.0,Cancellation / Reservation / Arrival,2.0
8142,Clean,2,Feels welcoming,2.0,Nice host,1.0
8339,Beautiful house,3,Feels welcoming,3.0,Cancellation / Reservation / Arrival,2.0


In [384]:
listing_top_topic.to_csv('/home/ubuntu/data/listing_top_topic.csv')

In [354]:
# ! scp -i ~/.ssh/aws_key.pem ubuntu@18.188.92.105:/home/ubuntu/data/listing_top_topic.csv bellepeng@127.0.0.1: ~/Desktop/Metis/Projects/Project_AirBNB/data
# ! scp -i ~/.ssh/aws_key.pem ubuntu@18.188.92.105: /home/ubuntu/Airbnb/notebooks/Part\ II\ Topic\ Modeling\ \(AWS\).ipynb bellepeng@127.0.0.1: ~/Desktop/Metis/Projects/Project_AirBNB/notebook
        

#### Attempts not chosen:

In [32]:
# This cell decomposes the TF-IDF matrix, so the heading says so (it previously
# claimed "Count Vectorizer").
print("LSA Decomposition with TFIDF Vectorizer")
# Seeded so the randomized SVD returns the same components on every run.
lsa_tfidf = TruncatedSVD(n_components=n_comp, random_state=42)
lsa_tfidf_data = lsa_tfidf.fit_transform(tfidf)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back to the old method on older installs.
lsa_tfidf_feature_names = (tfidf_vectorizer.get_feature_names_out()
                           if hasattr(tfidf_vectorizer, 'get_feature_names_out')
                           else tfidf_vectorizer.get_feature_names())
# The recorded output for this cell shows negative scores and scores above 100%.
topics_lsa_tfidf = display_topics(lsa_tfidf, lsa_tfidf_data, lsa_tfidf_feature_names, n_words)

LSA Decomposition with Count Vectorizer

Topic 0 score:  101.2%
location, host, clean, nice, apartment, comfortable, room, everything, home, really, house, perfect, definitely, easy, need, make, time, walk, wonderful, neighborhood

Topic 1 score:    2.8%
automate post, arrival automate, automate, cancel, post, reservation, days arrival, host cancel, cancel reservation, arrival, days, host, reservation days, reservation cancel, day arrival, cancel days, reservation day, 10 days, day, reservation 10

Topic 2 score:   -4.9%
location, location host, host, host location, location clean, perfect location, clean location, apartment location, nice location, excellent location, convenient location, location perfect, location nice, excellent, location close, amaze location, location definitely, perfect, value, room location

Topic 3 score:   -2.0%
host, wonderful, wonderful host, location host, friendly, friendly host, nice host, amaze, host location, amaze host, excellent host, excellent, fanta

In [34]:
print("LSA Decomposition with Count Vectorizer")
# Seeded so the randomized SVD returns the same components on every run.
lsa_cv = TruncatedSVD(n_components=n_comp, random_state=42)
lsa_cv_data = lsa_cv.fit_transform(cv)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back to the old method on older installs.
lsa_cv_feature_names = (count_vectorizer.get_feature_names_out()
                        if hasattr(count_vectorizer, 'get_feature_names_out')
                        else count_vectorizer.get_feature_names())
topics_lsa_cv = display_topics(lsa_cv, lsa_cv_data, lsa_cv_feature_names, n_words)

LSA Decomposition with Count Vectorizer

Topic 0 score:  126.3%
host, room, clean, location, apartment, home, comfortable, walk, nice, make, get, house, need, time, really, everything, also, well, easy, city

Topic 1 score:   -3.1%
room, house, nice, bathroom, bed, home, clean, like, share, room clean, private, live, night, live room, one, really, use, time, guests, get

Topic 2 score:    0.1%
home, make, host, feel, welcome, make feel, beautiful, wonderful, house, like, amaze, experience, time, felt, love, perfect, city, lovely, sure, make sure

Topic 3 score:  -13.1%
walk, get, park, also, home, like, house, around, away, time, city, one, use, street, take, find, distance, area, day, walk distance

Topic 4 score:   -4.6%
apartment, room, make, really, feel, home, everything, need, like, time, bed, make feel, comfortable, welcome, bathroom, could, one, sure, kitchen, make sure

Topic 5 score:    2.4%
nice, really, house, host, everything, really nice, time, enjoy, really enjoy, helpfu