<a href="https://colab.research.google.com/github/andrew66882011/qss20_slides_activities/blob/main/activities/06_textasdata_partII_topicmodeling_examplecode.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Notebook for topic modeling 

# Imports

In [None]:
## load packages 
import pandas as pd
import re
import numpy as np

## nltk imports
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

## sklearn imports
from sklearn.feature_extraction.text import CountVectorizer

## lda 
from gensim import corpora
import gensim

## print mult things
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## random
import random

# Load data

In [None]:
## read the listings file into a dataframe and preview the first rows
## (the preview shows the columns id, name, name_upper,
## neighbourhood_group, and price)
ab = pd.read_csv("../public_data/airbnb_text.zip")
ab.head()

Unnamed: 0,id,name,name_upper,neighbourhood_group,price
0,2539,Clean & quiet apt home by the park,CLEAN & QUIET APT HOME BY THE PARK,Brooklyn,149
1,2595,Skylit Midtown Castle,SKYLIT MIDTOWN CASTLE,Manhattan,225
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,THE VILLAGE OF HARLEM....NEW YORK !,Manhattan,150
3,3831,Cozy Entire Floor of Brownstone,COZY ENTIRE FLOOR OF BROWNSTONE,Brooklyn,89
4,5022,Entire Apt: Spacious Studio/Loft by central park,ENTIRE APT: SPACIOUS STUDIO/LOFT BY CENTRAL PARK,Manhattan,80


## Preprocess documents

In this case, we treat each listing title (`name` / `name_upper`) as a document.

### Step 1- load stopwords and augment with our own custom ones

In [None]:
## start from nltk's built-in english stopword list
list_stopwords = stopwords.words("english")

## our own custom additions (place names and 'apartment')
custom_words_toadd = ['apartment', 'new york', 'nyc',
                      'bronx', 'brooklyn',
                      'manhattan', 'queens',
                      'staten island']

## the stopword filter compares ONE token at a time, so a multi-word
## entry such as 'new york' can never equal a token; also add each
## phrase's individual words so that they are actually removed
custom_tokens_toadd = [token for phrase in custom_words_toadd
                       for token in phrase.split()
                       if token != phrase]

list_stopwords_new = list_stopwords + custom_words_toadd + custom_tokens_toadd


### Step 2- remove stopwords from lowercase version of corpus


In [None]:
## lowercase every listing title and store the result as a plain list
corpus_lower = ab["name"].str.lower().to_list()
corpus_lower[0:5]

## tokenize one example listing with wordpunct_tokenize, then keep
## only the tokens that are not in the augmented stopword list
example_listing = corpus_lower[3]
example_tokens = wordpunct_tokenize(example_listing)
nostop_listing = [token for token in example_tokens
                  if token not in list_stopwords_new]
nostop_listing

['cozy', 'entire', 'floor', 'brownstone']

### Step 3- stem and remove non-alpha

In other contexts, we may want to leave digits in.

In [None]:
## create the porter stemmer once; it is reused for every token
porter = PorterStemmer()

## from the stopword-free tokens of the example listing, first keep the
## purely alphabetic tokens longer than two characters, then stem them
alpha_tokens = [token for token in nostop_listing
                if token.isalpha() and len(token) > 2]
example_listing_preprocess = [porter.stem(token) for token in alpha_tokens]

example_listing_preprocess

['cozi', 'entir', 'floor', 'brownston']

In [None]:
## compare the raw lowercase title with its preprocessed (stemmed) tokens
example_listing
example_listing_preprocess

['cozi', 'entir', 'floor', 'brownston']

# Activity

- Embed steps two and three into one or two functions
- Apply the function to all the texts in `corpus_lower`

## Create a document-term matrix and do some basic diagnostics (more manual approach)

Here we'll create a DTM first using the raw documents; in the activity, you'll create one using the preprocessed docs
that you created in the previous activity break

In [None]:
## function I'm providing
def create_dtm(list_of_strings, metadata):
    """Build a dense document-term matrix with metadata columns in front.

    Parameters
    ----------
    list_of_strings : iterable of str
        One document per element.
    metadata : pandas object with `reset_index` (e.g. a DataFrame)
        One row per document, in the same order as `list_of_strings`.

    Returns
    -------
    pandas.DataFrame
        `metadata.reset_index()` (so the previous index is kept as a
        column) followed by one count column per vocabulary term.
    """
    vectorizer = CountVectorizer(lowercase = True)
    dtm_sparse = vectorizer.fit_transform(list_of_strings)
    ## get_feature_names() was removed in scikit-learn 1.2; prefer its
    ## replacement and only fall back on older versions
    if hasattr(vectorizer, "get_feature_names_out"):
        term_names = vectorizer.get_feature_names_out()
    else:
        term_names = vectorizer.get_feature_names()
    dtm_dense_named = pd.DataFrame(dtm_sparse.toarray(), columns = term_names)
    ## reset_index gives the metadata the same 0..n-1 index as the dtm,
    ## so the column-wise concat lines the rows up by position
    dtm_dense_named_withid = pd.concat([metadata.reset_index(), dtm_dense_named], axis = 1)
    return(dtm_dense_named_withid)

In [None]:
## first, filter out na's
corpus_lower_nonull = ab.name[~ab.name.isnull()].str.lower()

## keep only listings with a non-missing name
## get metadata for those
## and also renaming price col since it's likely to be corpus word
## for shorter runtime, random sampling of 1000 (seeded so the same
## rows are drawn on every run)
ab_small = (ab.loc[ab.name.notnull(),
                   ['id', 'neighbourhood_group', 'price', 'name']]
              .copy()
              .rename(columns = {'price': 'price_rawdata'})
              .sample(n = 1000, random_state = 422))

ab_small['name_lower'] = ab_small['name'].str.lower()
ab_small.head()

Unnamed: 0,id,neighbourhood_group,price_rawdata,name,name_lower
21667,17387135,Brooklyn,90,Intimate Williamsburg Apartment,intimate williamsburg apartment
944,361803,Manhattan,115,"Luxury, Adorable Studio Apartment","luxury, adorable studio apartment"
43024,33401610,Brooklyn,120,"Park Slope, Brooklyn","park slope, brooklyn"
35198,27924639,Bronx,145,NYC Private home-1 bedroom/Private Entrance,nyc private home-1 bedroom/private entrance
10255,7853110,Brooklyn,60,"Carroll Gardens, 25 ft from F/G","carroll gardens, 25 ft from f/g"


In [None]:

## build the dtm from the raw lowercase titles, carrying along the
## id / neighbourhood_group / price_rawdata metadata of each sampled listing
metadata_nopre = ab_small[['id', 'neighbourhood_group', 'price_rawdata']]
dtm_nopre = create_dtm(list_of_strings = ab_small['name_lower'],
                       metadata = metadata_nopre)



In [None]:
## first set of rows/cols
dtm_nopre.head()

## dimensions of the dtm (rows, columns)
dtm_nopre.shape
## arbitrary later cols
dtm_nopre.iloc[0:5, 480:500]

Unnamed: 0,index,id,neighbourhood_group,price_rawdata,10,10min,110th,116,12,14,...,无其他费用,法拉盛美丽豪华大套房,独立洗手间privatebathroom,纽约之家,超级便利的豪华公寓次卧,近哥伦比亚大学超方便短租房,限1,限女生,韓城,한성
0,21667,17387135,Brooklyn,90,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,944,361803,Manhattan,115,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,43024,33401610,Brooklyn,120,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,35198,27924639,Bronx,145,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,10255,7853110,Brooklyn,60,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


(1000, 878)

Unnamed: 0,lincoln,line,lit,live,lively,living,loc,local,located,location,loft,lofted,long,looking,lorimer,lots,lounge,lovely,lovers,lower
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Can use column sums on that dtm to get basic summary stats of top words

In [None]:
## the first four columns are index + metadata (index, id,
## neighbourhood_group, price_rawdata); everything after is a term
term_columns = dtm_nopre.columns[4:]

## total count of each term across all documents
top_terms = dtm_nopre.loc[:, term_columns].sum(axis = "index")

## most frequent terms first
top_terms.sort_values(ascending = False)

in           323
room         204
bedroom      162
private      153
apartment    145
            ... 
can            1
lofted         1
carriage       1
casa           1
한성             1
Length: 874, dtype: int64

## Use built in functions within gensim to skip some steps and to estimate a topic model

### Creating the objects to feed the LDA modeling function

In [None]:

## Step 1: tokenize each title again and collect the token lists
## this example uses the raw (lowercased) random sample of titles;
## in the activity, use the preprocessed texts instead
text_raw_tokens = list(map(wordpunct_tokenize, ab_small.name_lower))

text_raw_tokens[0:5]

[['intimate', 'williamsburg', 'apartment'],
 ['luxury', ',', 'adorable', 'studio', 'apartment'],
 ['park', 'slope', ',', 'brooklyn'],
 ['nyc', 'private', 'home', '-', '1', 'bedroom', '/', 'private', 'entrance'],
 ['carroll', 'gardens', ',', '25', 'ft', 'from', 'f', '/', 'g']]

In [None]:
## Step 2: use gensim create dictionary - gets all unique words across documents
text_raw_dict = corpora.Dictionary(text_raw_tokens)

### explore first few keys and values
### see that key is just an arbitrary counter; value is the word itself
firstkpairs = {k: text_raw_dict[k] for k in list(text_raw_dict)[:5]}
firstkpairs

## Step 3: filter out very rare and very common words
## here, i'm using the threshold that a word needs to appear in at least
## 5% of docs but not more than 95%
## NOTE: filter_extremes takes its two bounds in DIFFERENT units:
##   no_below = absolute NUMBER of docs (an integer count, so i round)
##   no_above = FRACTION of docs (a float between 0 and 1)
## passing a count such as 950 as no_above would disable the upper filter
lower_prop = 0.05
upper_prop = 0.95
lower_bound = round(ab_small.shape[0]*lower_prop)
lower_bound
### doc count implied by the upper threshold; shown for reference only
upper_bound = round(ab_small.shape[0]*upper_prop)
upper_bound
### apply filtering to dictionary
text_raw_dict.filter_extremes(no_below = lower_bound,
                             no_above = upper_prop)

## Step 4: apply dictionary to TOKENIZED texts
## this creates a mapping between each word 
## in a specific listing and the key in the dictionary
## for words that remain in the filtered dictionary
## output is a list where len(list) == n documents
## and each element in the list is a list of tuples
## containing the mappings
corpus_fromdict = [text_raw_dict.doc2bow(one_text) 
                   for one_text in text_raw_tokens]
corpus_fromdict[0:5]
text_raw_tokens[0:5]

### can apply doc2bow(one_text, return_missing = True) to print words
### eliminated from the listing bc they're not in filtered dictionary
### but feeding that one with missing values to
### the lda function can cause errors
corpus_fromdict_showmiss = [text_raw_dict.doc2bow(one_text, return_missing = True)
                            for one_text in text_raw_tokens]
corpus_fromdict_showmiss[0:5]

{0: 'apartment', 1: 'intimate', 2: 'williamsburg', 3: ',', 4: 'adorable'}

50

950

[[(0, 1), (1, 1)],
 [(0, 1), (2, 1), (3, 1)],
 [(2, 1), (4, 1), (5, 1)],
 [(6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 2)],
 [(2, 1), (7, 1), (13, 1)]]

[['intimate', 'williamsburg', 'apartment'],
 ['luxury', ',', 'adorable', 'studio', 'apartment'],
 ['park', 'slope', ',', 'brooklyn'],
 ['nyc', 'private', 'home', '-', '1', 'bedroom', '/', 'private', 'entrance'],
 ['carroll', 'gardens', ',', '25', 'ft', 'from', 'f', '/', 'g']]

[([(0, 1), (1, 1)], {'intimate': 1}),
 ([(0, 1), (2, 1), (3, 1)], {'adorable': 1, 'luxury': 1}),
 ([(2, 1), (4, 1), (5, 1)], {'slope': 1}),
 ([(6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 2)],
  {'entrance': 1}),
 ([(2, 1), (7, 1), (13, 1)],
  {'25': 1, 'carroll': 1, 'f': 1, 'ft': 1, 'g': 1, 'gardens': 1})]

### Estimating the model

In [None]:
## Step 5: we're finally ready to estimate the model!
## full documentation here - https://radimrehurek.com/gensim/models/ldamodel.html
## here, we feed the lda function (1) the corpus we created from the dictionary
## (2) a parameter we decide on for the number of topics,
## (3) the dictionary itself,
## (4) parameter for number of passes through training data
## (5) parameter that returns, for each word remaining in dict, the 
## topic probabilities
## (6) a random seed, so that re-running the notebook gives the same topics
## see documentation for many other arguments you can vary
ldamod = gensim.models.ldamodel.LdaModel(corpus_fromdict, 
                                         num_topics = 5, id2word=text_raw_dict, 
                                         passes=6, alpha = 'auto',
                                         per_word_topics = True,
                                         random_state = 422)

print(type(ldamod))



<class 'gensim.models.ldamodel.LdaModel'>


### Post-model exploration

In [None]:
## Post-model 1: explore corpus-wide summary of topics
### getting the topics and top words; can retrieve diff top words
### by changing num_words
### each printed element is a (topic id, 'weight*"word" + ...') tuple
topics = ldamod.print_topics(num_words = 10)
for topic in topics:
    print(topic)


(0, '0.156*"2" + 0.150*"manhattan" + 0.097*"studio" + 0.078*"in" + 0.077*"bedroom" + 0.062*"," + 0.059*"apt" + 0.054*"-" + 0.047*"to" + 0.039*"/"')
(1, '0.165*"room" + 0.125*"private" + 0.115*"in" + 0.115*"," + 0.070*"/" + 0.059*"east" + 0.046*"-" + 0.044*"bedroom" + 0.036*"park" + 0.028*"with"')
(2, '0.246*"home" + 0.163*"-" + 0.130*"from" + 0.079*"near" + 0.068*"cozy" + 0.048*"&" + 0.041*"1" + 0.034*"spacious" + 0.033*"and" + 0.031*"park"')
(3, '0.131*"bedroom" + 0.124*"in" + 0.119*"!" + 0.101*"apartment" + 0.093*"the" + 0.061*"nyc" + 0.057*"with" + 0.053*"1" + 0.041*"cozy" + 0.034*"brooklyn"')
(4, '0.219*"in" + 0.156*"apartment" + 0.151*"williamsburg" + 0.099*"brooklyn" + 0.082*"room" + 0.057*"private" + 0.038*"&" + 0.037*"apt" + 0.037*"," + 0.028*"."')


In [None]:
    
## Post-model 2: explore topics associated with each document
### for each document's bag-of-words in the corpus, get its list of
### (topic id, probability) pairs
l = list(map(ldamod.get_document_topics, corpus_fromdict))
### show the first few tokenized texts, then their topic probabilities
text_raw_tokens[0:5]
l[0:5]

[['intimate', 'williamsburg', 'apartment'],
 ['luxury', ',', 'adorable', 'studio', 'apartment'],
 ['park', 'slope', ',', 'brooklyn'],
 ['nyc', 'private', 'home', '-', '1', 'bedroom', '/', 'private', 'entrance'],
 ['carroll', 'gardens', ',', '25', 'ft', 'from', 'f', '/', 'g']]

[[(0, 0.050828494),
  (1, 0.07230436),
  (2, 0.055267576),
  (3, 0.05542449),
  (4, 0.7661751)],
 [(0, 0.47102296),
  (1, 0.05487115),
  (2, 0.040755004),
  (3, 0.040973753),
  (4, 0.3923771)],
 [(0, 0.03749694),
  (1, 0.8420732),
  (2, 0.040822763),
  (3, 0.040919527),
  (4, 0.038687598)],
 [(0, 0.016214045),
  (1, 0.7037901),
  (2, 0.2456928),
  (3, 0.017770553),
  (4, 0.01653242)],
 [(0, 0.03755963),
  (1, 0.5739747),
  (2, 0.3094248),
  (3, 0.040759083),
  (4, 0.038281746)]]

### Visualizing 

In [None]:
## Visualize - may not work on jhub yet
## pyLDAvis 3.x renamed its gensim helper module from pyLDAvis.gensim
## to pyLDAvis.gensim_models; try the new name first and fall back to
## the old one so the cell runs with either version
import pyLDAvis
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis
pyLDAvis.enable_notebook()
lda_display = gensimvis.prepare(ldamod, corpus_fromdict, text_raw_dict)
pyLDAvis.display(lda_display)

# Activity

- Preprocess the texts
- Repeat the preprocessing steps and running of the topic model with preprocessed texts (can also play around with other parameters like n_topics)- what seems to produce useful topics?


If you get stuck on the preprocessing part, you can use the function below; a commented-out example of how to apply it follows the definition.

In [None]:
def processtext(row, colname, stop_words = None):
    """Preprocess the text in one column of a row into a string of stems.

    The text is tokenized with wordpunct_tokenize; tokens are kept when
    they are purely alphabetic and at least 3 characters long, then
    lowercased, stemmed with the notebook-level `porter` stemmer, and
    joined with single spaces.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by `apply(axis = 1)`)
        Object indexed as `row[colname]`.
    colname : str
        Name of the column holding the text.
    stop_words : iterable of str, optional
        Lowercase tokens to drop before stemming (e.g. the augmented
        stopword list). The default (None) drops nothing.

    Returns
    -------
    str
        Space-joined stems; "" when the value is missing.
    """
    raw_value = row[colname]

    ## handle missing values explicitly: str() would otherwise turn them
    ## into "nan" / "None", which pass the filters below and end up
    ## in the corpus as tokens
    if raw_value is None or (pd.api.types.is_scalar(raw_value) and pd.isna(raw_value)):
        return("")

    words_to_drop = set(stop_words) if stop_words is not None else set()

    stemmed_tokens = []
    for token in wordpunct_tokenize(str(raw_value)):
        token_lower = token.lower()
        if len(token) < 3 or not token_lower.isalpha():
            continue
        if token_lower in words_to_drop:
            continue
        stemmed_tokens.append(porter.stem(token_lower))

    processed_string = " ".join(stemmed_tokens)
    return(processed_string)

# ab_small['text_preprocess'] = ab_small.apply(processtext,
#                             axis = 1,
#                             args = ["name_lower"])
# ab_small.head()