# Notebook for topic modeling 

# 0. Imports

In [5]:
## load packages 
import pandas as pd
import re
import numpy as np

## nltk imports
#!pip install nltk # can install on terminal or by uncommenting this line
# import nltk; nltk.download('punkt'); nltk.download('stopwords')
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

## sklearn imports
from sklearn.feature_extraction.text import CountVectorizer

## lda
#!pip install gensim # can install by uncommenting this line
from gensim import corpora
import gensim

## visualizing LDA--likely need to install
#!pip install pyLDAvis # can install by uncommenting this line
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
pyLDAvis.enable_notebook()

## print mult things
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## random
import random
import string; punctlist = [char for char in string.punctuation] # list of english punctuation marks

# 0. Load data

In [2]:
ab = pd.read_csv("../public_data/airbnb_text.zip")
ab.head()

Unnamed: 0,id,name,name_upper,neighbourhood_group,price
0,2539,Clean & quiet apt home by the park,CLEAN & QUIET APT HOME BY THE PARK,Brooklyn,149
1,2595,Skylit Midtown Castle,SKYLIT MIDTOWN CASTLE,Manhattan,225
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,THE VILLAGE OF HARLEM....NEW YORK !,Manhattan,150
3,3831,Cozy Entire Floor of Brownstone,COZY ENTIRE FLOOR OF BROWNSTONE,Brooklyn,89
4,5022,Entire Apt: Spacious Studio/Loft by central park,ENTIRE APT: SPACIOUS STUDIO/LOFT BY CENTRAL PARK,Manhattan,80


# 1. Preprocess documents

In this case, each name/name_upper, or listing title, we're treating as a document

## 1.1 Load stopwords list and augment with our own custom ones

In [6]:
list_stopwords = stopwords.words("english")

custom_words_toadd = ['apartment', 'new york', 'nyc',
                      'bronx', 'brooklyn',
                     'manhattan', 'queens', 
                      'staten island']

list_stopwords_new = list_stopwords + custom_words_toadd


## 1.2 Remove stopwords from lowercase version of corpus


In [7]:
## convert to lowercase and a list
corpus_lower = ab.name.str.lower().to_list()
corpus_lower[0:5]

## use wordpunct tokenize and filter out with one
example_listing = corpus_lower[3]
nostop_listing = [word for word in wordpunct_tokenize(example_listing) 
                          if word not in list_stopwords_new]
nostop_listing

['clean & quiet apt home by the park',
 'skylit midtown castle',
 'the village of harlem....new york !',
 'cozy entire floor of brownstone',
 'entire apt: spacious studio/loft by central park']

['cozy', 'entire', 'floor', 'brownstone']

## 1.3 stem and remove non-alpha

Other contexts we may want to leave digits in

In [8]:
## initialize stemmer
porter = PorterStemmer()

## apply to one by iterating
## over the tokens in the list
example_listing_preprocess = [porter.stem(token) 
                            for token in nostop_listing 
                            if token.isalpha() and 
                            len(token) > 2]

example_listing_preprocess

['cozi', 'entir', 'floor', 'brownston']

In [9]:
example_listing
example_listing_preprocess

'cozy entire floor of brownstone'

['cozi', 'entir', 'floor', 'brownston']

## 1.4 Activity 1

The above example performed preprocessing on a single Airbnb listing. We want to generalize this preprocessing across all listings.

- Embed step two (remove stopwords) and step three (stem) into one or two functions that take in a raw string (eg the raw text of an Airbnb review) and return a preprocessed string 
- Apply the function iteratively to preprocess all the texts in `corpus_lower`. Output could either be a list where each list element is a string of a list (e.g., `cozy brownstone apt`), or a list of lists where each element is a tokenized string (e.g., `['cozy', 'brownstone', 'apt'])`

Output is flexible: it could be a list of lists containing tokenized/stemmed text or a list of strings.

In [20]:
# your code here to define the function(s)
def preprocess(raw_string_input):   
    
    if type(raw_string_input) == str:
        ## use wordpunct tokenize and filter out with one
        nostop_list = [word for word in wordpunct_tokenize(raw_string_input) 
                          if word not in list_stopwords_new]
    

        ## apply to one by iterating
        ## over the tokens in the list
        list_preprocessed = [porter.stem(token) 
                                    for token in nostop_list 
                                    if token.isalpha() and 
                                    len(token) > 2]

    
        return list_preprocessed

# your code here to apply the function
test = [preprocess(word) for word in corpus_lower]
test

[['clean', 'quiet', 'apt', 'home', 'park'],
 ['skylit', 'midtown', 'castl'],
 ['villag', 'harlem', 'new', 'york'],
 ['cozi', 'entir', 'floor', 'brownston'],
 ['entir', 'apt', 'spaciou', 'studio', 'loft', 'central', 'park'],
 ['larg', 'cozi', 'midtown', 'east'],
 ['blissartsspac'],
 ['larg', 'furnish', 'room', 'near', 'way'],
 ['cozi', 'clean', 'guest', 'room', 'famili', 'apt'],
 ['cute', 'cozi', 'lower', 'east', 'side', 'bdrm'],
 ['beauti', 'upper', 'west', 'side'],
 ['central', 'near', 'broadway'],
 ['love', 'room', 'garden', 'best', 'area', 'legal', 'rental'],
 ['wonder', 'guest', 'bedroom', 'singl'],
 ['west', 'villag', 'nest', 'superhost'],
 ['stop', 'studio'],
 ['perfect', 'parent', 'garden'],
 ['chelsea', 'perfect'],
 ['hip', 'histor', 'brownston', 'backyard'],
 ['huge', 'upper', 'east', 'cental', 'park'],
 ['sweet', 'spaciou', 'loft'],
 ['cbg', 'ctybgd', 'helpshaiti'],
 ['cbg', 'help', 'haiti', 'room'],
 ['cbg', 'help', 'haiti'],
 ['maison', 'de', 'bohemian'],
 ['sunni', 'bedroo

In [23]:
def process_step1(one_str):
    try:
        nostop_listing1 = [word for word in wordpunct_tokenize(one_str)
                          if word not in list_stopwords_new]
        clean_listing = [porter.stem(word) for word in nostop_listing1
                        if word.isalpha()
                        and len(word) > 3]
        clean_listing_str = " ".join(clean_listing)
        return(clean_listing_str)
    except:
        return("")
cleaned_listings = [process_step1(one_listing) for one_listing in
                   corpus_lower]

cleaned_listings

['clean quiet home park',
 'skylit midtown castl',
 'villag harlem york',
 'cozi entir floor brownston',
 'entir spaciou studio loft central park',
 'larg cozi midtown east',
 'blissartsspac',
 'larg furnish room near',
 'cozi clean guest room famili',
 'cute cozi lower east side bdrm',
 'beauti upper west side',
 'central near broadway',
 'love room garden best area legal rental',
 'wonder guest bedroom singl',
 'west villag nest superhost',
 'stop studio',
 'perfect parent garden',
 'chelsea perfect',
 'histor brownston backyard',
 'huge upper east cental park',
 'sweet spaciou loft',
 'ctybgd helpshaiti',
 'help haiti room',
 'help haiti',
 'maison bohemian',
 'sunni bedroom across prospect park',
 'magnifiqu suit cloitr',
 'midtown pie terr',
 'spaciou love furnish bedroom',
 'modern east villag',
 'front room doubl',
 'spaciou bedroom lux build',
 'loft williamsburg area roof',
 'back room bunk bed',
 'larg style room',
 'love room garden best area legal rental',
 'clean quiet',
 

# 2. Create a document-term matrix and do some basic diagnostics (more manual approach)

Here we'll create a DTM first using the raw documents; in the activity, you'll create one using the preprocessed docs
that you created in activity 1

## 2.1 Define the dtm function and select data to transform into a document-term matrix

In [24]:
## function provided
def create_dtm(list_of_strings, metadata):
    """ 
    Function to create dense document-term matrix (DTM) from a list of strings and provided metadata. 
    A sparse DTM is a list of term_index/doc_index tuples: if a given term occurs in a given doc at least once, 
        then this count is listed as a tuple; if not, that term/doc pair is omitted. 
    In a dense DTM, each row is one text (e.g., an Airbnb listing), each column is a term, and 
        each cell indicates the frequency of that word in that text. 
    
    Parameters:
        list_of_strings (Series): each row contains a preprocessed string (need not be tokenized)
        metadata (DataFrame): contains document-level covariates
    
    Returns:
        Dense DTM with metadata on left and then one column per word in lexicon
    """
    
    # initialize a sklearn tokenizer; this helps us tokenize the preprocessed string input
    vectorizer = CountVectorizer(lowercase = True) 
    dtm_sparse = vectorizer.fit_transform(list_of_strings)
    print('Sparse matrix form:\n', dtm_sparse[:3]) # take a look at sparse representation
    print()
    
    # switch the dataframe from the sparse representation to the normal dense representation (so we can treat it as regular dataframe)
    dtm_dense_named = pd.DataFrame(dtm_sparse.todense(), columns=vectorizer.get_feature_names_out ())
    print('Dense matrix form:\n', dtm_dense_named.head()) # take a look at dense representation
    dtm_dense_named_withid = pd.concat([metadata.reset_index(), dtm_dense_named], axis = 1) # add back document-level covariates

    return(dtm_dense_named_withid)

In [25]:

## filter out na's
## for shorter runtime, random sampling of 1000
## get metadata for those
## and also renaming price col since it's likely to be corpus word
ab_small = ab.loc[~ab.name.isnull(),
           ['id', 'neighbourhood_group', 'price', 'name']].copy().rename(columns = {'price':
            'price_rawdata'}).sample(n = 1000, random_state = 422)

ab_small['name_lower'] = ab_small['name'].str.lower()
ab_small.head()

Unnamed: 0,id,neighbourhood_group,price_rawdata,name,name_lower
23821,19227560,Queens,100,Super Cozy!,super cozy!
22905,18560625,Brooklyn,30,Beautiful Private Bedroom by Prospect Park,beautiful private bedroom by prospect park
20426,16289576,Manhattan,80,Best Location on the Upper West Side! - Part II,best location on the upper west side! - part ii
2018,893413,Manhattan,2500,Architecturally Stunning Former Synagogue!,architecturally stunning former synagogue!
18790,14882137,Queens,50,"Large, beautiful room near Bushwick","large, beautiful room near bushwick"


## 2.2 Execute the dtm function to create the document-term matrix

In [30]:
## example application on raw lowercase texts; 
dtm_nopre = create_dtm(list_of_strings= ab_small.name_lower,
                      metadata = ab_small[['id', 'neighbourhood_group', 'price_rawdata']])

# print(dtm_nopre)

Sparse matrix form:
   (0, 841)	1
  (0, 281)	1
  (1, 152)	1
  (1, 693)	1
  (1, 157)	1
  (1, 205)	1
  (1, 698)	1
  (1, 653)	1
  (2, 165)	1
  (2, 537)	1
  (2, 637)	1
  (2, 856)	1
  (2, 902)	1
  (2, 939)	1
  (2, 774)	1
  (2, 657)	1
  (2, 471)	1

Dense matrix form:
    001  10  10m  10min  10mins  1100  12mins  14  15  15min  ...  yoga  york  \
0    0   0    0      0       0     0       0   0   0      0  ...     0     0   
1    0   0    0      0       0     0       0   0   0      0  ...     0     0   
2    0   0    0      0       0     0       0   0   0      0  ...     0     0   
3    0   0    0      0       0     0       0   0   0      0  ...     0     0   
4    0   0    0      0       0     0       0   0   0      0  ...     0     0   

   you  your  yu  zen  ღღღsteps  法拉盛中心私人房間獨立衛浴  溫馨大套房  獨一無二的紐約閣樓  
0    0     0   0    0         0              0      0          0  
1    0     0   0    0         0              0      0          0  
2    0     0   0    0         0              0      0  

In [31]:
## show first set of rows/cols
dtm_nopre.head()

## show arbitrary later cols in resulting data
dtm_nopre.shape
dtm_nopre.iloc[0:5, 480:500]

Unnamed: 0,index,id,neighbourhood_group,price_rawdata,001,10,10m,10min,10mins,1100,...,yoga,york,you,your,yu,zen,ღღღsteps,法拉盛中心私人房間獨立衛浴,溫馨大套房,獨一無二的紐約閣樓
0,23821,19227560,Queens,100,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,22905,18560625,Brooklyn,30,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,20426,16289576,Manhattan,80,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2018,893413,Manhattan,2500,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,18790,14882137,Queens,50,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


(1000, 974)

Unnamed: 0,inclusive,incredible,incredibly,indoor,inn,inq,insane,int,interior,international,interns,invincible,inviting,inwood,island,it,italy,its,jefferson,jewel
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## 2.3 Use that matrix/column sums to get basic summary stats of top words

In [38]:
## summing each col
top_terms = dtm_nopre[dtm_nopre.columns[4:]].sum(axis = 0)

# dtm_nopre[dtm_nopre.columns[4:]].sum(axis=0)

## sorting from most frequent to least frequent
top_terms.sort_values(ascending = False)

in           367
room         244
private      163
bedroom      152
apartment    130
            ... 
gay            1
gente          1
geodesic       1
george         1
獨一無二的紐約閣樓      1
Length: 970, dtype: int64

## 2.4 Activity 2: repeat the above but using the preprocessed text data

- Stick with the same random sample of 1000 `ab_small`
- Apply the preprocessing steps from activity 1 to create a new column in `ab_small` with the preprocessed text (if you got stuck on that, try just removing stopwords)
- Use the `create_dtm` function to create a document-term matrix from the preprocessed data
- Use colsums to summarize

In [39]:
## your code here 
ab_small["name_preprocessed"] = [process_step1(one_listing) for one_listing in
                   ab_small.name]

dtm_pre = create_dtm(list_of_strings= ab_small.name_preprocessed,
                      metadata = ab_small[['id', 'neighbourhood_group', 'price_rawdata']])



Sparse matrix form:
   (0, 595)	1
  (0, 161)	1
  (1, 54)	1
  (1, 471)	1
  (1, 59)	1
  (1, 474)	1
  (1, 438)	1
  (2, 64)	1
  (2, 357)	1
  (2, 643)	1
  (2, 673)	1
  (2, 535)	1
  (2, 440)	1

Dense matrix form:
    abcd  abod  access  acidot  acogedor  across  address  ador  aesthet  \
0     0     0       0       0         0       0        0     0        0   
1     0     0       0       0         0       0        0     0        0   
2     0     0       0       0         0       0        0     0        0   
3     0     0       0       0         0       0        0     0        0   
4     0     0       0       0         0       0        0     0        0   

   afford  ...  yard  year  yellow  yoga  york  your  ღღღstep  法拉盛中心私人房間獨立衛浴  \
0       0  ...     0     0       0     0     0     0        0              0   
1       0  ...     0     0       0     0     0     0        0              0   
2       0  ...     0     0       0     0     0     0        0              0   
3       0  ...     0 

# 3. Use gensim to more automatically preprocess/estimate a topic model

## 3.1 Creating the objects to feed the LDA modeling function

Different outputs described below: 
- Tokenized and preprocessed text 
- Dictionary 
- Corpus 

In [40]:

## Step 1: re-tokenize and store in list
## here, i'm doing with the raw random sample of text
## in activity, you should do with the preprocessed texts
text_raw_tokens = [wordpunct_tokenize(one_text) for one_text in 
                  ab_small.name_lower]


## Step 2: use gensim create dictionary - gets all unique words across documents
text_raw_dict = corpora.Dictionary(text_raw_tokens)
raw_len = len(text_raw_dict) # get length for comparison below

### explore first few keys and values
### see that key is just an arbitrary counter; value is the word itself
{k: text_raw_dict[k] for k in list(text_raw_dict)[:5]}


## Step 3: filter out very rare and very common words
## here, i'm using the threshold that a word needs to appear in at least
## 5% of docs but not more than 95%
## this is an integer count of docs so i round
lower_bound = round(ab_small.shape[0]*0.05)
upper_bound = round(ab_small.shape[0]*0.95)

### apply filtering to dictionary
text_raw_dict.filter_extremes(no_below = lower_bound,
                             no_above = upper_bound)
print(f'Filtering out very rare and very common words reduced the \
length of dictionary from {str(raw_len)} to {str(len(text_raw_dict))}.')
{k: text_raw_dict[k] for k in list(text_raw_dict)[:5]} # show first five entries after filtering


## Step 4: apply dictionary to TOKENIZED texts
## this creates a mapping between each word 
## in a specific listing and the key in the dictionary.
## for words that remain in the filtered dictionary,
## output is a list where len(list) == n documents
## and each element in the list is a list of tuples
## containing the mappings
corpus_fromdict = [text_raw_dict.doc2bow(one_text) 
                   for one_text in text_raw_tokens]

### can apply doc2bow(one_text, return_missing = True) to print words
### eliminated from the listing bc they're not in filtered dictionary.
### but feeding that one with missing values to
### the lda function can cause errors
corpus_fromdict_showmiss = [text_raw_dict.doc2bow(one_text, return_missing = True)
                            for one_text in text_raw_tokens]
print('Sample of documents represented in dictionary format (with omitted words noted):')
corpus_fromdict_showmiss[:10]

{0: '!', 1: 'cozy', 2: 'super', 3: 'beautiful', 4: 'bedroom'}

Filtering out very rare and very common words reduced the length of dictionary from 1047 to 31.


{0: '!', 1: 'cozy', 2: 'beautiful', 3: 'bedroom', 4: 'park'}

Sample of documents represented in dictionary format (with omitted words noted):


[([(0, 1), (1, 1)], {'super': 1}),
 ([(2, 1), (3, 1), (4, 1), (5, 1)], {'by': 1, 'prospect': 1}),
 ([(0, 1), (6, 1), (7, 1)],
  {'best': 1,
   'ii': 1,
   'location': 1,
   'on': 1,
   'part': 1,
   'side': 1,
   'upper': 1,
   'west': 1}),
 ([(0, 1)],
  {'architecturally': 1, 'former': 1, 'stunning': 1, 'synagogue': 1}),
 ([(2, 1), (8, 1), (9, 1), (10, 1), (11, 1)], {'bushwick': 1}),
 ([(4, 1), (8, 1), (9, 1), (12, 1), (13, 2)],
  {'bath': 1, 'bed': 1, 'by': 1, 'central': 1, 'college': 1, 'hunter': 1}),
 ([(9, 1), (11, 1), (14, 1), (15, 1)], {'bohemian': 1, 'brownstone': 1}),
 ([(16, 1)],
  {'fidi': 1, 'huge': 1, 'loft': 1, 'views': 1, 'w': 1, 'water': 1}),
 ([], {'hillside': 1, 'hotel': 1}),
 ([(5, 1), (9, 1), (11, 1), (14, 1), (15, 1)], {'airy': 1})]

##  3.2 Estimating the model

In [41]:
## Step 5: we're finally ready to estimate the model!
## full documentation here - https://radimrehurek.com/gensim/models/ldamodel.html
## here, we're feeding the lda function:
## (1) the corpus we created from the dictionary,
## (2) a parameter we decide on for the number of topics (k),
## (3) the dictionary itself,
## (4) parameter for number of passes through training data (more means slower), and
## (5) parameter that returns, for each word remaining in dict, the topic probabilities.
## see documentation for many other arguments you can vary
ldamod = gensim.models.ldamodel.LdaModel(corpus_fromdict, 
                                         num_topics = 5, 
                                         id2word=text_raw_dict, 
                                         passes=6, 
                                         alpha = 'auto',
                                         per_word_topics = True)

print(type(ldamod))



<class 'gensim.models.ldamodel.LdaModel'>


## 3.3  Seeing what topics the estimated model discovers

In [42]:
## Post-model 1: explore corpus-wide summary of topics
### getting the topics and top words; can retrieve diff top words
topics = ldamod.print_topics(num_words = 10)
for topic in topics:
    print(topic)


(0, '0.216*"!" + 0.105*"and" + 0.093*"in" + 0.085*"bedroom" + 0.076*"room" + 0.075*"private" + 0.064*"," + 0.051*"cozy" + 0.044*"brooklyn" + 0.038*"manhattan"')
(1, '0.150*"," + 0.117*"-" + 0.090*"bedroom" + 0.086*"1" + 0.077*"2" + 0.052*"." + 0.047*"/" + 0.037*"spacious" + 0.036*"apt" + 0.036*"in"')
(2, '0.114*"apt" + 0.111*"near" + 0.090*"beautiful" + 0.083*"park" + 0.079*"to" + 0.076*"large" + 0.072*"&" + 0.056*"studio" + 0.055*"bedroom" + 0.041*"in"')
(3, '0.200*"in" + 0.093*"the" + 0.087*"room" + 0.084*"cozy" + 0.071*"/" + 0.066*"apartment" + 0.059*"manhattan" + 0.050*"of" + 0.043*"," + 0.032*"spacious"')
(4, '0.201*"room" + 0.179*"in" + 0.107*"private" + 0.092*"studio" + 0.073*"brooklyn" + 0.056*"williamsburg" + 0.050*"with" + 0.047*"apartment" + 0.035*"beautiful" + 0.034*"sunny"')


In [43]:
    
## Post-model 2: explore topics associated with each document
### for each item in our original dictionary, get list of topic probabilities
l=[ldamod.get_document_topics(item) for item in corpus_fromdict]
### print result
text_raw_tokens[0:5]
l[0:5]

[['super', 'cozy', '!'],
 ['beautiful', 'private', 'bedroom', 'by', 'prospect', 'park'],
 ['best',
  'location',
  'on',
  'the',
  'upper',
  'west',
  'side',
  '!',
  '-',
  'part',
  'ii'],
 ['architecturally', 'stunning', 'former', 'synagogue', '!'],
 ['large', ',', 'beautiful', 'room', 'near', 'bushwick']]

[[(0, 0.77083915),
  (1, 0.06579759),
  (2, 0.04396828),
  (3, 0.0633491),
  (4, 0.05604582)],
 [(0, 0.029221987),
  (1, 0.038913596),
  (2, 0.8624762),
  (3, 0.03642201),
  (4, 0.032966226)],
 [(0, 0.3458788),
  (1, 0.22728588),
  (2, 0.03232019),
  (3, 0.35335752),
  (4, 0.04115759)],
 [(0, 0.6428109),
  (1, 0.10295142),
  (2, 0.0687994),
  (3, 0.09781318),
  (4, 0.08762516)],
 [(0, 0.024121726),
  (1, 0.12312476),
  (2, 0.59727657),
  (3, 0.03039762),
  (4, 0.22507934)]]

### Visualizing 

In [44]:
lda_display = gensimvis.prepare(ldamod, corpus_fromdict, text_raw_dict)
pyLDAvis.display(lda_display)

## 3.4 Activity 3

- Preprocess the texts if you haven't already
- Run the topic model with preprocessed texts
- Play around with other parameters like `n_topics` to find a configuration that produces useful topics

If you get stuck on the preprocessing part, you can use below function and example code for applying it. Then continue as above (start with tokenizing).

In [46]:
# your code here
## Step 1: re-tokenize and store in list
## here, i'm doing with the raw random sample of text
## in activity, you should do with the preprocessed texts
text_preproc_tokens = [wordpunct_tokenize(one_text) for one_text in 
                  ab_small.name_preprocessed]


## Step 2: use gensim create dictionary - gets all unique words across documents
text_preproc_dict = corpora.Dictionary(text_preproc_tokens)
preproc_len = len(text_preproc_dict) # get length for comparison below

### explore first few keys and values
### see that key is just an arbitrary counter; value is the word itself
{k: text_preproc_dict[k] for k in list(text_preproc_dict)[:5]}


## Step 3: filter out very rare and very common words
## here, i'm using the threshold that a word needs to appear in at least
## 5% of docs but not more than 95%
## this is an integer count of docs so i round
lower_bound = round(ab_small.shape[0]*0.05)
upper_bound = round(ab_small.shape[0]*0.95)

### apply filtering to dictionary
text_preproc_dict.filter_extremes(no_below = lower_bound,
                             no_above = upper_bound)
print(f'Filtering out very rare and very common words reduced the \
length of dictionary from {str(preproc_len)} to {str(len(text_preproc_dict))}.')
{k: text_preproc_dict[k] for k in list(text_preproc_dict)[:5]} # show first five entries after filtering


## Step 4: apply dictionary to TOKENIZED texts
## this creates a mapping between each word 
## in a specific listing and the key in the dictionary.
## for words that remain in the filtered dictionary,
## output is a list where len(list) == n documents
## and each element in the list is a list of tuples
## containing the mappings
corpus_fromdict = [text_preproc_dict.doc2bow(one_text) 
                   for one_text in text_preproc_tokens]

### can apply doc2bow(one_text, return_missing = True) to print words
### eliminated from the listing bc they're not in filtered dictionary.
### but feeding that one with missing values to
### the lda function can cause errors
corpus_fromdict_showmiss = [text_preproc_dict.doc2bow(one_text, return_missing = True)
                            for one_text in text_preproc_tokens]
print('Sample of documents represented in dictionary format (with omitted words noted):')
corpus_fromdict_showmiss[:10]

{0: 'cozi', 1: 'super', 2: 'beauti', 3: 'bedroom', 4: 'park'}

Filtering out very rare and very common words reduced the length of dictionary from 697 to 16.


{0: 'cozi', 1: 'beauti', 2: 'bedroom', 3: 'park', 4: 'privat'}

Sample of documents represented in dictionary format (with omitted words noted):


[([(0, 1)], {'super': 1}),
 ([(1, 1), (2, 1), (3, 1), (4, 1)], {'prospect': 1}),
 ([], {'best': 1, 'locat': 1, 'part': 1, 'side': 1, 'upper': 1, 'west': 1}),
 ([], {'architectur': 1, 'former': 1, 'stun': 1, 'synagogu': 1}),
 ([(1, 1), (5, 1), (6, 1), (7, 1)], {'bushwick': 1}),
 ([(3, 1), (5, 1)], {'bath': 1, 'central': 1, 'colleg': 1, 'hunter': 1}),
 ([(5, 1), (7, 1), (8, 1)], {'bohemian': 1, 'brownston': 1}),
 ([(9, 1)], {'fidi': 1, 'huge': 1, 'loft': 1, 'view': 1, 'water': 1}),
 ([], {'hillsid': 1, 'hotel': 1}),
 ([(4, 1), (5, 1), (7, 1), (8, 1)], {'airi': 1})]

In [47]:
## Step 5: we're finally ready to estimate the model!
## full documentation here - https://radimrehurek.com/gensim/models/ldamodel.html
## here, we're feeding the lda function:
## (1) the corpus we created from the dictionary,
## (2) a parameter we decide on for the number of topics (k),
## (3) the dictionary itself,
## (4) parameter for number of passes through training data (more means slower), and
## (5) parameter that returns, for each word remaining in dict, the topic probabilities.
## see documentation for many other arguments you can vary
ldamod = gensim.models.ldamodel.LdaModel(corpus_fromdict, 
                                         num_topics = 5, 
                                         id2word=text_preproc_dict, 
                                         passes=6, 
                                         alpha = 'auto',
                                         per_word_topics = True)

print(type(ldamod))



<class 'gensim.models.ldamodel.LdaModel'>


In [48]:
## Post-model 1: explore corpus-wide summary of topics
### getting the topics and top words; can retrieve diff top words
topics1 = ldamod.print_topics(num_words = 10)
for topic in topics1:
    print(topic)

(0, '0.287*"room" + 0.158*"manhattan" + 0.140*"brooklyn" + 0.108*"larg" + 0.095*"sunni" + 0.078*"beauti" + 0.066*"cozi" + 0.031*"privat" + 0.024*"bedroom" + 0.008*"near"')
(1, '0.295*"privat" + 0.271*"room" + 0.218*"williamsburg" + 0.089*"east" + 0.056*"cozi" + 0.024*"sunni" + 0.017*"beauti" + 0.016*"park" + 0.004*"brooklyn" + 0.002*"bedroom"')
(2, '0.267*"spaciou" + 0.231*"park" + 0.177*"near" + 0.122*"room" + 0.058*"privat" + 0.049*"sunni" + 0.029*"beauti" + 0.022*"brooklyn" + 0.018*"cozi" + 0.009*"studio"')
(3, '0.456*"bedroom" + 0.202*"apart" + 0.101*"privat" + 0.062*"brooklyn" + 0.044*"east" + 0.040*"larg" + 0.036*"beauti" + 0.033*"spaciou" + 0.012*"williamsburg" + 0.004*"sunni"')
(4, '0.454*"studio" + 0.248*"cozi" + 0.096*"privat" + 0.070*"apart" + 0.064*"east" + 0.019*"brooklyn" + 0.015*"sunni" + 0.011*"larg" + 0.006*"park" + 0.006*"room"')


In [50]:
    
## Post-model 2: explore topics associated with each document
### for each item in our original dictionary, get list of topic probabilities
l=[ldamod.get_document_topics(item) for item in corpus_fromdict]
### print result
text_preproc_tokens[0:5]
l[0:5]


[['super', 'cozi'],
 ['beauti', 'privat', 'bedroom', 'prospect', 'park'],
 ['best', 'locat', 'upper', 'west', 'side', 'part'],
 ['architectur', 'stun', 'former', 'synagogu'],
 ['larg', 'beauti', 'room', 'near', 'bushwick']]

[[(0, 0.13156687),
  (1, 0.08989636),
  (2, 0.09294451),
  (3, 0.09796584),
  (4, 0.5876264)],
 [(0, 0.054060962),
  (1, 0.036087673),
  (2, 0.3549175),
  (3, 0.52229804),
  (4, 0.032635808)],
 [(0, 0.2614838),
  (1, 0.18234263),
  (2, 0.18906555),
  (3, 0.19958653),
  (4, 0.16752154)],
 [(0, 0.2614838),
  (1, 0.18234263),
  (2, 0.18906555),
  (3, 0.19958653),
  (4, 0.16752154)],
 [(0, 0.6326065),
  (1, 0.035606317),
  (2, 0.26023585),
  (3, 0.039009728),
  (4, 0.032541607)]]

In [51]:
lda_display = gensimvis.prepare(ldamod, corpus_fromdict, text_preproc_dict)
pyLDAvis.display(lda_display)

In [45]:
# Hint: example code for preprocessing
def processtext(row, colname):
    
    string_of_col = str(row[colname])
    try:
        processed_string = " ".join([porter.stem(i.lower()) 
                        for i in wordpunct_tokenize(string_of_col) if 
                        i.lower().isalpha() and len(i) >=3]) #and i not in punctlist]) # optional: remove punctuation too 
        return(processed_string)
    except:
        processed_string = "" # to handle data errors where not actually text
        return(processed_string)

# ab_small['text_preprocess'] = ab_small.apply(processtext,
#                             axis = 1,
#                             args = ["name_lower"])
#
# ab_small = ab_small[ab_small.text_preprocess != ""].copy()
#
# ab_small.head()