<a href="https://colab.research.google.com/github/andrew66882011/qss20_slides_activities/blob/main/activities/06_textasdata_partII_topicmodeling_solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Notebook for topic modeling 

# Imports

In [None]:
## load packages 
import pandas as pd
import re
import numpy as np

## nltk imports
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

## sklearn imports
from sklearn.feature_extraction.text import CountVectorizer

## lda 
from gensim import corpora
import gensim

## viz
import pyLDAvis.gensim as gensimvis
import pyLDAvis

## print mult things
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## random
import random

# Load data

In [None]:
## read the full listings file and peek at the first rows
ab = pd.read_csv("../public_data/airbnb_text.zip")
ab.head()

## build ab_small in one chain:
## - keep listings whose name is not missing
## - keep the columns used downstream, renaming price to price_rawdata
## - draw a reproducible random sample of 1000 listings
## - add a lowercase copy of the listing name
ab_small = (
    ab.loc[ab['name'].notnull(), ['id', 'neighbourhood_group', 'price', 'name']]
    .copy()
    .rename(columns = {'price': 'price_rawdata'})
    .sample(n = 1000, random_state = 478)
    .assign(name_lower = lambda sampled: sampled['name'].str.lower())
)
ab_small.head()


Unnamed: 0,id,name,name_upper,neighbourhood_group,price
0,2539,Clean & quiet apt home by the park,CLEAN & QUIET APT HOME BY THE PARK,Brooklyn,149
1,2595,Skylit Midtown Castle,SKYLIT MIDTOWN CASTLE,Manhattan,225
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,THE VILLAGE OF HARLEM....NEW YORK !,Manhattan,150
3,3831,Cozy Entire Floor of Brownstone,COZY ENTIRE FLOOR OF BROWNSTONE,Brooklyn,89
4,5022,Entire Apt: Spacious Studio/Loft by central park,ENTIRE APT: SPACIOUS STUDIO/LOFT BY CENTRAL PARK,Manhattan,80


Unnamed: 0,id,neighbourhood_group,price_rawdata,name,name_lower
39160,30553339,Brooklyn,50,Spacious Brooklyn Apartment,spacious brooklyn apartment
16681,13341820,Manhattan,239,Upscale 1 Bedroom Hell's Kitchen Apartment,upscale 1 bedroom hell's kitchen apartment
19815,15865251,Manhattan,60,Manhattan - Upper East Side Lovely Private Bed...,manhattan - upper east side lovely private bed...
43826,33901138,Brooklyn,81,Location! Room in Bklyn 10 minutes to Manhatta...,location! room in bklyn 10 minutes to manhatta...
26543,21121441,Manhattan,195,Bright 1 bd apartment next to west villiage,bright 1 bd apartment next to west villiage


# Activity

- Preprocess the texts
- Repeat the preprocessing steps and re-estimate the topic model on the preprocessed texts (you can also experiment with other parameters, such as `n_topics`). Which settings seem to produce useful topics?


If you get stuck on the preprocessing part, you can use the function below; the cells that follow show an example of how to apply it.

## Preprocess and model estimation

In [None]:
def processtext(row, colname, stopword_list, min_token_length = 3, stemmer = None):
    """Clean the text of one listing: drop stopwords, keep alphabetic tokens, stem.

    Parameters
    ----------
    row : mapping-like
        Object supporting ``row[colname]``, e.g. the row handed over by
        ``DataFrame.apply(..., axis = 1)``. Only ``row[colname]`` is read.
    colname : str
        Key of the text to process. Matching against the stopwords is
        case-sensitive, so lowercase the text first if the stopwords are
        lowercase.
    stopword_list : iterable of str
        Words to remove before stemming. Entries that contain whitespace
        (e.g. "new york") are removed as whole phrases before tokenization,
        because a single token can never equal a multi-word entry.
    min_token_length : int, default 3
        Tokens shorter than this are dropped.
    stemmer : object with a ``stem(word)`` method, optional
        Stemmer applied to each kept token. When omitted, the module-level
        ``porter`` is used, so existing calls behave as before.

    Returns
    -------
    str
        Stemmed tokens joined by single spaces; "" when the value is missing
        or nothing survives the filters.
    """
    raw_value = row[colname]

    ## missing values would otherwise be stringified ("nan" / "None") and
    ## survive the filters as if they were real words
    if raw_value is None or (isinstance(raw_value, float) and raw_value != raw_value):
        return ""

    ## get string form of listing
    string_of_col = str(raw_value)

    ## fall back to the notebook-level stemmer when none is supplied
    if stemmer is None:
        stemmer = porter

    ## split the stopwords into single words (set -> fast membership test)
    ## and multi-word phrases (handled by substitution below)
    single_words = set()
    phrases = []
    for entry in stopword_list:
        if isinstance(entry, str) and len(entry.split()) > 1:
            phrases.append(entry)
        else:
            single_words.add(entry)

    try:
        ## remove multi-word stopwords as whole phrases
        for phrase in phrases:
            pattern = r"\b" + r"\s+".join(re.escape(part) for part in phrase.split()) + r"\b"
            string_of_col = re.sub(pattern, " ", string_of_col)

        ## remove single-word stopwords
        remove_stop = [word for word in wordpunct_tokenize(string_of_col)
                       if word not in single_words]

        ## keep alphabetic tokens that are long enough, then stem
        return " ".join(stemmer.stem(token)
                        for token in remove_stop
                        if token.isalpha() and len(token) >= min_token_length)
    except (TypeError, ValueError):
        ## best effort for malformed text only; setup problems such as a
        ## missing stemmer or tokenizer are deliberately NOT swallowed, since
        ## that would silently blank out every row
        return ""

In [None]:
## start from nltk's english stopwords
list_stopwords = stopwords.words("english")

## words that appear in most listings and so carry little topical signal
## NOTE(review): 'new york' and 'staten island' contain a space; a filter that
## compares single tokens against this list will never match them -- confirm
## that the filtering step handles phrases
custom_words_toadd = [
    'apartment', 'new york', 'nyc',
    'bronx', 'brooklyn', "room", "private", "apt",
    'manhattan', 'queens',
    'staten island', "bedroom",
]

## combined list: nltk defaults first, then the custom additions
list_stopwords_longer = [*list_stopwords, *custom_words_toadd]

## initialize stemmer
porter = PorterStemmer()

In [None]:
## run processtext on every row (axis = 1), pointing it at the
## lowercase name column and the extended stopword list;
## the cleaned text is stored in a new column
ab_small['text_preprocess'] = ab_small.apply(
    lambda one_row: processtext(one_row, "name_lower", list_stopwords_longer),
    axis = 1)
ab_small.head()


Unnamed: 0,id,neighbourhood_group,price_rawdata,name,name_lower,text_preprocess
39160,30553339,Brooklyn,50,Spacious Brooklyn Apartment,spacious brooklyn apartment,spaciou
16681,13341820,Manhattan,239,Upscale 1 Bedroom Hell's Kitchen Apartment,upscale 1 bedroom hell's kitchen apartment,upscal hell kitchen
19815,15865251,Manhattan,60,Manhattan - Upper East Side Lovely Private Bed...,manhattan - upper east side lovely private bed...,upper east side love
43826,33901138,Brooklyn,81,Location! Room in Bklyn 10 minutes to Manhatta...,location! room in bklyn 10 minutes to manhatta...,locat bklyn minut
26543,21121441,Manhattan,195,Bright 1 bd apartment next to west villiage,bright 1 bd apartment next to west villiage,bright next west villiag


In [None]:
## keep only listings that still have text after preprocessing
ab_small = ab_small.loc[ab_small['text_preprocess'].ne("")].copy()
ab_small.shape

## one list of tokens per remaining listing, in row order
tokenized_text = list(map(wordpunct_tokenize, ab_small['text_preprocess']))

(988, 6)

In [None]:
## preprocess and estimate topicmod
### create dictionary (token -> integer id) from the tokenized listings
text_proc_dict = corpora.Dictionary(tokenized_text)

### filter dictionary- using 2% as bounds
### no_below is an absolute number of documents, whereas no_above is a
### *fraction* of the corpus: passing a document count there (e.g. ~968)
### disables the upper bound entirely, so the fraction 0.98 is used instead
text_proc_dict.filter_extremes(no_below = round(ab_small.shape[0]*0.02),
                             no_above = 0.98)

### create corpus from dictionary: one bag-of-words per listing,
### in the same order as tokenized_text
corpus_fromdict_proc = [text_proc_dict.doc2bow(one_text) 
                   for one_text in tokenized_text]

### estimate model
### random_state is fixed so the estimated topics are reproducible
n_topics = 6
ldamod_proc = gensim.models.ldamodel.LdaModel(corpus_fromdict_proc, 
                                         num_topics = n_topics, id2word=text_proc_dict, 
                                         passes=6, alpha = 'auto',
                                        per_word_topics = True, random_state = 91988)


## Interpretation

In [None]:
### show each topic together with its 10 highest-weighted words
topics = ldamod_proc.print_topics(num_words = 10)
for topic_id, topic_words in topics:
    print((topic_id, topic_words))

(0, '0.190*"studio" + 0.179*"spaciou" + 0.119*"sunni" + 0.094*"luxuri" + 0.080*"midtown" + 0.059*"time" + 0.056*"squar" + 0.056*"bushwick" + 0.035*"east" + 0.032*"loft"')
(1, '0.236*"locat" + 0.186*"modern" + 0.120*"bath" + 0.094*"love" + 0.056*"garden" + 0.051*"spaciou" + 0.039*"luxuri" + 0.031*"bright" + 0.026*"park" + 0.024*"central"')
(2, '0.173*"bed" + 0.166*"beauti" + 0.079*"one" + 0.070*"close" + 0.070*"citi" + 0.061*"park" + 0.051*"view" + 0.049*"cozi" + 0.039*"bright" + 0.037*"central"')
(3, '0.144*"villag" + 0.100*"heart" + 0.095*"west" + 0.085*"bright" + 0.075*"comfort" + 0.073*"clean" + 0.070*"new" + 0.061*"charm" + 0.056*"east" + 0.043*"cozi"')
(4, '0.227*"cozi" + 0.132*"williamsburg" + 0.118*"home" + 0.087*"side" + 0.086*"east" + 0.074*"upper" + 0.073*"larg" + 0.045*"min" + 0.030*"beauti" + 0.028*"west"')
(5, '0.147*"near" + 0.118*"loft" + 0.107*"park" + 0.080*"big" + 0.077*"brownston" + 0.065*"train" + 0.062*"huge" + 0.060*"central" + 0.059*"subway" + 0.055*"sunni"')


In [None]:
### build the interactive pyLDAvis view of the fitted model and render it inline
pyLDAvis.enable_notebook()
lda_display_proc = gensimvis.prepare(topic_model = ldamod_proc,
                                     corpus = corpus_fromdict_proc,
                                     dictionary = text_proc_dict)
pyLDAvis.display(lda_display_proc)

## with the lambda slider moved below 1, we see things like:
## - topic 3 possibly reflecting east village
## - topic 4 sunny near park

## Extra interpretation code relevant for problem set

What if we want to find which topics are associated with higher listing prices?

In [None]:
## get topic probabilities by doc and find mean listing by topic
### get document topics - list of list tuples
### minimum_probability = 0 asks gensim not to drop low-probability topics;
### with the default threshold a document can come back with fewer than
### n_topics tuples
topic_probs_bydoc = [ldamod_proc.get_document_topics(item, minimum_probability = 0)
                     for item in corpus_fromdict_proc]

## documents are matched to listing ids by position, so both must have the
## same length -- fail early if the corpus and ab_small have drifted apart
assert len(topic_probs_bydoc) == len(ab_small), "corpus and ab_small are out of sync"

## each document has a list containing topic, probability
## tuples- example w/ first document
one_list_tup = topic_probs_bydoc[0]
one_list_tup

## create a long dataframe by flattening the list; each tuple is paired with
## the id of its own document, so the result stays correct even when a
## document returns fewer than n_topics tuples
topic_probs_bydoc_long = pd.DataFrame(
    [(topic, probability, one_id)
     for one_id, doc_topics in zip(ab_small['id'], topic_probs_bydoc)
     for topic, probability in doc_topics],
    columns = ['topic', 'probability', 'doc_id'])

## pivot to wide format: one row per document, one column per topic
## reindex guarantees all n_topics columns exist (in order) before renaming;
## a topic missing for a document is recorded as probability 0
topic_probs_bydoc_wide = (topic_probs_bydoc_long
                          .pivot_table(index = 'doc_id', columns = 'topic',
                                       values = 'probability')
                          .reindex(columns = range(n_topics))
                          .fillna(0)
                          .reset_index())
topic_probs_bydoc_wide.columns = ['doc_id'] + ["topic_" + str(i) for i in range(n_topics)]
topic_probs_bydoc_wide.head()


[(0, 0.6055935),
 (1, 0.07692643),
 (2, 0.080126114),
 (3, 0.07794536),
 (4, 0.08505535),
 (5, 0.074353255)]

Unnamed: 0,doc_id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5
0,57754,0.058895,0.050696,0.052801,0.051407,0.05686,0.72934
1,62903,0.207262,0.249777,0.215922,0.264471,0.033417,0.029151
2,125053,0.184139,0.158916,0.165842,0.161296,0.175851,0.153956
3,174527,0.044871,0.037733,0.039401,0.038356,0.803082,0.036557
4,177421,0.043722,0.038157,0.801151,0.038558,0.041796,0.036616


In [None]:
## merge with original data using doc id
topic_wmeta = pd.merge(topic_probs_bydoc_wide,
                      ab_small,
                      left_on = 'doc_id',
                      right_on = 'id')

## create indicator for listing's top topic: the name of the topic_*
## column holding the largest probability in each row
## (startswith rather than a substring test, so only the probability
## columns are picked up)
topic_wmeta['toptopic'] = topic_wmeta[[col for col in topic_wmeta.columns if 
                                    col.startswith("topic_")]].idxmax(axis=1)
topic_wmeta.head()

## group by topic and find mean price
## ('mean' as a string: passing np.mean to agg is deprecated in recent pandas)
topic_wmeta.groupby('toptopic').agg({'price_rawdata': 'mean'})

## group by borough and topic -- higher price for some also reflects
## diff borough composition
topic_wmeta.groupby(['toptopic', 'neighbourhood_group']).agg({'price_rawdata': 'mean'})

Unnamed: 0,doc_id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,id,neighbourhood_group,price_rawdata,name,name_lower,text_preprocess,toptopic
0,57754,0.058895,0.050696,0.052801,0.051407,0.05686,0.72934,57754,Manhattan,305,Stylish Large Gramercy Loft!,stylish large gramercy loft!,stylish larg gramerci loft,topic_5
1,62903,0.207262,0.249777,0.215922,0.264471,0.033417,0.029151,62903,Manhattan,205,Beautiful modern studio apartment in heart of NYC,beautiful modern studio apartment in heart of nyc,beauti modern studio heart,topic_3
2,125053,0.184139,0.158916,0.165842,0.161296,0.175851,0.153956,125053,Manhattan,395,⚡Quiet Gem w/roof deck on NY's Hottest Street⚡,⚡quiet gem w/roof deck on ny's hottest street⚡,quiet gem roof deck hottest street,topic_0
3,174527,0.044871,0.037733,0.039401,0.038356,0.803082,0.036557,174527,Brooklyn,150,Cozy private family home in Bushwick,cozy private family home in bushwick,cozi famili home bushwick,topic_4
4,177421,0.043722,0.038157,0.801151,0.038558,0.041796,0.036616,177421,Brooklyn,500,Brand New Beautiful Duplex Apartment with Garden,brand new beautiful duplex apartment with garden,brand new beauti duplex garden,topic_2


Unnamed: 0_level_0,price_rawdata
toptopic,Unnamed: 1_level_1
topic_0,148.963989
topic_1,164.75
topic_2,152.867257
topic_3,161.41129
topic_4,134.645714
topic_5,145.573913


Unnamed: 0_level_0,Unnamed: 1_level_0,price_rawdata
toptopic,neighbourhood_group,Unnamed: 2_level_1
topic_0,Bronx,89.0
topic_0,Brooklyn,109.387597
topic_0,Manhattan,201.796407
topic_0,Queens,92.06
topic_0,Staten Island,102.5
topic_1,Bronx,45.0
topic_1,Brooklyn,126.111111
topic_1,Manhattan,217.755556
topic_1,Queens,106.222222
topic_2,Bronx,72.5
