In [17]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD 
from sklearn.feature_extraction import text 
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
from scipy.sparse import random as sparse_random
from sklearn.random_projection import sparse_random_matrix


In [3]:
# Read in cleaned data.
muni_df = pd.read_pickle("./muni_clean.pkl")

In [9]:
# A useful function for getting topic words.
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

# Vectorize

In [7]:
# create a word count vectorizer to return word counts per document.
vectorizer = CountVectorizer(strip_accents = 'unicode',
                                #ngram_range=(1,2),
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_df = 0.60, #remove words in over 60% of dataset
                                min_df = 10) # words must be in at least 10 documents.

doc_word = vectorizer.fit_transform(muni_df['clean_text'])

# We can also make a tf-idf version to see how topics and clustering may change.
tfidf_vectorizer = TfidfVectorizer(strip_accents = 'unicode',
                                #ngram_range=(1,2),
                                token_pattern = r'\b[a-zA-Z]{3,}\b', #focus on words with 3 or more characters
                                max_df = 0.60,
                                min_df = 10)

doc_tf_idf = tfidf_vectorizer.fit_transform(muni_df['clean_text'])


# NMF Topic Model using count vectorizer

In [21]:
from sklearn.decomposition import NMF
nmf_model = NMF(10, max_iter = 500)
doc_topic = nmf_model.fit_transform(doc_word)

topic_word = pd.DataFrame(nmf_model.components_.round(10),
             index = ["component_1","component_2","component_3","component_4","component_5","component_6","component_7","component_8","component_9","component_10"],
             columns = vectorizer.get_feature_names())


display_topics(nmf_model, vectorizer.get_feature_names(), 15)

H = pd.DataFrame(doc_topic.round(10),
             columns = ["component_1","component_2","component_3","component_4","component_5","component_6","component_7","component_8","component_9","component_10"],
                 index=muni_df['Issuer Name'])




Topic  0
cityTM, citi, businesstyp, ordin, governmentwid, mayor, proprietari, increment, enterpris, mortal, retire, dissemin, urban, proportion, green

Topic  1
districtTM, apprais, unlimit, wastewat, drainag, acr, homestead, agentregistrar, flood, fithe, engin, tceq, master, fitax, governmentwid

Topic  2
variabl, swapsrec, per, biennium, mortgag, lotteri, forecast, univers, libor, assembl, stateTM, swap, businesstyp, proprietari, session

Topic  3
town, townTM, success, governmentwid, preserv, businesstyp, vehicl, enterpris, electr, excis, deficit, motor, boston, teacher, abat

Topic  4
countyTM, commission, governmentwid, businesstyp, counti, mortal, proportion, fairfield, proprietari, airport, enterpris, wast, librari, retire, intergovernment

Topic  5
colleg, technic, student, districtTM, enrol, technolog, foundat, proportion, remodel, gateway, vice, campu, morain, train, blackhawk

Topic  6
energi, saw, yearend, cityTM, ordin, electr, retire, airport, citi, solar, fuel, convent,

In [23]:
from sklearn.decomposition import NMF
nmf_model = NMF(10, max_iter = 500)
doc_topic = nmf_model.fit_transform(doc_tf_idf)

topic_word = pd.DataFrame(nmf_model.components_.round(10),
             index = ["component_1","component_2","component_3","component_4","component_5","component_6","component_7","component_8","component_9","component_10"],
             columns = tfidf_vectorizer.get_feature_names())


display_topics(nmf_model, tfidf_vectorizer.get_feature_names(), 15)

H = pd.DataFrame(doc_topic.round(10),
             columns = ["component_1","component_2","component_3","component_4","component_5","component_6","component_7","component_8","component_9","component_10"],
                 index=muni_df['Issuer Name'])




Topic  0
apprais, agentregistrar, homestead, unlimit, tceq, ozon, acr, flood, drainag, wastewat, hgb, annex, epa, groundwat, engin

Topic  1
cityTM, iper, citi, tif, increment, mayor, ordin, srf, businesstyp, dissemin, proprietari, dtcTM, tmr, urban, mfprsi

Topic  2
star, districtwid, boce, foundat, pupil, tier, appeal, stress, teacher, moni, syracus, expensebas, obligor, formula, stateTM

Topic  3
success, townTM, bid, town, boston, bidderrepres, excis, overlay, surcharg, preserv, lord, unsold, lock, bidder, selectmen

Topic  4
subdivisionTM, eip, etf, ofth, villag, bargain, citi, mera, settl, increment, lrlif, town, trajectori, thevillag, villageTM

Topic  5
countyTM, counti, commission, supervisor, unvot, airport, dissemin, dtcTM, mental, luca, sheriff, human, governmentwid, tcdr, jail

Topic  6
njsa, ordin, dissemin, seq, townshipTM, pfr, township, cap, per, rma, chief, psa, boroughTM, burlington, mayor

Topic  7
districtTM, governmentwid, wastewat, auditorsTM, drainag, brazoria,

In Both cases there are still a lot of words related to location, like city names and township county etc.

# LDA Topic Modeling

In [13]:
# for TF DTM
lda_tf = LatentDirichletAllocation(n_components=15, random_state=0)
lda_tf.fit(doc_word)

# for TFIDF DTM
lda_tfidf = LatentDirichletAllocation(n_components=15, random_state=0)
lda_tfidf.fit(doc_tf_idf)

LatentDirichletAllocation(n_components=15, random_state=0)

In [16]:
## word count version
pyLDAvis.sklearn.prepare(lda_tf, doc_word, vectorizer)

In [18]:
# tfidf version
pyLDAvis.sklearn.prepare(lda_tfidf, doc_tf_idf, tfidf_vectorizer)

We can see some amount of separation on the topics however the words don't really form anything too coherent.

# Look at plotting for any indication what might be captured here.

Sparsity of the matrix leads us to try SVD for dimension reduction.

In [27]:
# Take all of the data and plot it on 2 dimensions
svd = TruncatedSVD(n_components=2)
svd.fit_transform(doc_word)

chrt_dat= pd.DataFrame(svd.transform(doc_word))
chrt_dat.columns = ['x','y']

import plotly.express as px
df = px.data.iris()
fig = px.scatter(chrt_dat, x='x', y="y",color=H.idxmax(axis=1).tolist(), hover_name=muni_df['ID'])
fig.show()

We see a fairly clear blue line here pointing downward that are all seemingly in Texas. A flat red liner is mostly New York schools but as it moves away from the center is a few other places. The orange also flat line seems to be mostly Massachusetts. It seems like the topics are mostly identifying locations.

This is a similar chart but instead used the tfidf version.

In [26]:
# Take all of the data and plot it on 2 dimensions
svd = TruncatedSVD(n_components=2)
svd.fit_transform(doc_tf_idf)

chrt_dat= pd.DataFrame(svd.transform(doc_tf_idf))
chrt_dat.columns = ['x','y']

import plotly.express as px
df = px.data.iris()
fig = px.scatter(chrt_dat, x='x', y="y",color=H.idxmax(axis=1).tolist(), hover_name=muni_df['Issuer Name'])
fig.show()

Here we see a lot of the same groupings but with fewer outliers.

### Try and reduce noise with another layer of stop words.

In [29]:
secondary_stops=['cityTM', 
'businesstyp', 
'governmentwid',
'mayor',
'proprietari',
'increment', 
'dissemin', 
'proportion', 
'districtTM', 
'unlimit', 
'acr',
'agentregistrar',
'tceq',
'master', 
'variabl',
'per',
'biennium',
'forecast',
'libor',
'stateTM'
'session',
'town',
'townTM'
'vehicl',
'boston',
'countyTM', 
'counti',
'fairfield', 
'districtTM',
'blackhawk',
'saw', 
'yearend',
'ordin',
'stateTM',
'escrow',
'clackama',
'portland',
'johnson',
'baltimor', 
'annapoli',
'dtcTM',
'syracus',
'bidderrepres',
'subdivisionTM',
'thevillag',
'villageTM',
'township',
'boroughTM',
'burlington',
'auditorTM',
'endingprincip',
'dougla',
'omaha',
'kutak']

### run the vectorizers again with new stopwords.

In [31]:
vectorizer = CountVectorizer(strip_accents = 'unicode',
                                #ngram_range=(1,2),
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                stop_words = secondary_stops,
                                max_df = 0.60, #remove words in over 50% of dataset
                                min_df = 10)

doc_word = vectorizer.fit_transform(muni_df['clean_text'])

tfidf_vectorizer = TfidfVectorizer(strip_accents = 'unicode',
                                #ngram_range=(1,2),
                                stop_words = secondary_stops,
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_df = 0.60, #remove words in over 50% of dataset
                                min_df = 10)

doc_tf_idf = tfidf_vectorizer.fit_transform(muni_df['clean_text'])



Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['auditortm', 'boroughtm', 'citytm', 'countytm', 'districttm', 'dtctm', 'statetm', 'statetmsession', 'subdivisiontm', 'towntmvehicl', 'villagetm'] not in stop_words.



In [35]:
from sklearn.decomposition import NMF
nmf_model = NMF(10, max_iter = 500)
doc_topic = nmf_model.fit_transform(doc_word)

topic_word = pd.DataFrame(nmf_model.components_.round(10),
             index = ["component_1","component_2","component_3","component_4","component_5","component_6","component_7","component_8","component_9","component_10"],
             columns = vectorizer.get_feature_names())


display_topics(nmf_model, vectorizer.get_feature_names(), 15)

H = pd.DataFrame(doc_topic.round(10),
             columns = ["component_1","component_2","component_3","component_4","component_5","component_6","component_7","component_8","component_9","component_10"],
                 index=muni_df['Issuer Name'])


Topic  0
mortal, retire, commission, enterpris, airport, electr, wast, librari, urban, portfolio, site, nonmajor, intergovernment, port, healthcar

Topic  1
apprais, wastewat, drainag, homestead, flood, engin, fithe, fitax, qualiti, disast, ozon, annex, storm, plant, groundwat

Topic  2
townTM, success, preserv, enterpris, vehicl, excis, deficit, motor, electr, abat, teacher, somervil, overlay, department, surcharg

Topic  3
colleg, technic, student, technolog, enrol, gateway, remodel, campu, morain, foundat, train, vice, western, staff, baird

Topic  4
swapsrec, lotteri, mortgag, univers, assembl, swap, retire, session, enterpris, portfolio, healthcar, statewid, veteran, operf, common

Topic  5
energi, electr, airport, retire, citi, solar, plant, fuel, tmr, convent, paper, hedg, junior, air, militari

Topic  6
calper, univers, committe, clean, enterpris, coastal, flood, safe, qualiti, air, retire, actuari, show, paper, drink

Topic  7
bid, success, bidder, win, competit, pariti, asse

In [38]:
from sklearn.decomposition import NMF
nmf_model = NMF(10, max_iter = 500)
doc_topic = nmf_model.fit_transform(doc_tf_idf)

topic_word = pd.DataFrame(nmf_model.components_.round(10),
             index = ["component_1","component_2","component_3","component_4","component_5","component_6","component_7","component_8","component_9","component_10"],
             columns = tfidf_vectorizer.get_feature_names())


display_topics(nmf_model, tfidf_vectorizer.get_feature_names(), 15)

H = pd.DataFrame(doc_topic.round(10),
             columns = ["component_1","component_2","component_3","component_4","component_5","component_6","component_7","component_8","component_9","component_10"],
                 index=muni_df['Issuer Name'])




Topic  0
apprais, homestead, drainag, wastewat, flood, ozon, hgb, groundwat, annex, epa, engin, fithe, mud, qualiti, surfac

Topic  1
star, districtwid, boce, foundat, pupil, tier, appeal, stress, teacher, moni, expensebas, obligor, formula, homeown, lunch

Topic  2
townTM, success, bid, excis, overlay, surcharg, preserv, lord, selectmen, unsold, bidder, lock, hilltop, competit, abat

Topic  3
citi, retire, colleg, enterpris, mortal, airport, univers, nonmajor, inflat, focu, technic, vehicl, kper, tmr, actuari

Topic  4
eip, ofth, etf, villag, bargain, citi, mera, settl, lrlif, trajectori, strike, indirectparticip, thewisconsin, wisconsinstatut, toni

Topic  5
njsa, seq, townshipTM, pfr, cap, rma, psa, chief, bergen, newark, borough, baumann, freehold, brunswick, appeal

Topic  6
iper, tif, srf, citi, mfprsi, urban, rollback, issuerTM, dorsey, whitney, cedar, multiresidenti, polk, wraTM, renew

Topic  7
swapsrec, lotteri, swap, operf, rhipa, univers, pebb, mortgag, rhia, ostf, saif, a

### Try plotting again on word freq

In [37]:
# Take all of the data and plot it on 2 dimensions
svd = TruncatedSVD(n_components=2)
svd.fit_transform(doc_word)

chrt_dat= pd.DataFrame(svd.transform(doc_word))
chrt_dat.columns = ['x','y']

import plotly.express as px
df = px.data.iris()
fig = px.scatter(chrt_dat, x='x', y="y",color=H.idxmax(axis=1).tolist(), hover_name=muni_df['Issuer Name'])
fig.show()

In [39]:
# Take all of the data and plot it on 2 dimensions
svd = TruncatedSVD(n_components=2)
svd.fit_transform(doc_tf_idf)

chrt_dat= pd.DataFrame(svd.transform(doc_tf_idf))
chrt_dat.columns = ['x','y']

import plotly.express as px
df = px.data.iris()
fig = px.scatter(chrt_dat, x='x', y="y",color=H.idxmax(axis=1).tolist(), hover_name=muni_df['Issuer Name'])
fig.show()

### If anything it seems like the state level separations are more noticeable.

While it is hard to get the topics to necessarily indicate use of proceeds in the bonds, perhaps the outlier documents tell us something of interest, particularly as they compare to other documents within their geography.

In [None]:
# Oregon

In [41]:
OR_index = muni_df[muni_df.State=='OR'].index.tolist()
or_txt = [muni_df['clean_text'][i] for i in OR_index]

vectorizer = CountVectorizer(strip_accents = 'unicode',
                                #ngram_range=(1,2),
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                stop_words = secondary_stops,
                                max_df = 0.60, #remove words in over 50% of dataset
                                min_df = 10)

doc_word = vectorizer.fit_transform(or_txt)
nmf_model = NMF(10, max_iter = 500)

doc_topic = nmf_model.fit_transform(doc_word)


topic_word = pd.DataFrame(nmf_model.components_.round(10),
             index = ["component_1","component_2","component_3","component_4","component_5","component_6","component_7","component_8","component_9","component_10"],
             columns = vectorizer.get_feature_names())


display_topics(nmf_model, vectorizer.get_feature_names(), 15)

H = pd.DataFrame(doc_topic.round(10),
             columns = ["component_1","component_2","component_3","component_4","component_5","component_6","component_7","component_8","component_9","component_10"],
                 index=muni_df['Issuer Name'][OR_index])
H



import plotly.express as px
# Take all of the data and plot it on 2 dimensions
svd = TruncatedSVD(n_components=2)
svd.fit_transform(doc_word)

chrt_dat= pd.DataFrame(svd.transform(doc_word))
chrt_dat.columns = ['x','y']

or_df = muni_df.iloc[OR_index]

import plotly.express as px
df = px.data.iris()
fig = px.scatter(or_df, x=chrt_dat['x'], y=chrt_dat["y"],color=H.idxmax(axis=1).tolist(),
                 hover_name='Issuer Name', 
                 hover_data=['State','Description','ID'])
fig.show()


Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['auditortm', 'boroughtm', 'citytm', 'countytm', 'districttm', 'dtctm', 'statetm', 'statetmsession', 'subdivisiontm', 'towntmvehicl', 'villagetm'] not in stop_words.




Topic  0
swapsrec, lotteri, veteran, rhipa, pebb, saif, ohcsd, raini, intermedi, pollut, assetback, oitp, nav, euro, moodi

Topic  1
veteransTM, remarket, swapsrec, lotteri, xia, bondsfl, welfar, mode, thereof, veteran, oregonTM, rhipa, bookentri, pebb, raini

Topic  2
thereof, bookentri, hereund, waiver, constru, thereto, nomine, bankruptci, instruct, hereto, cede, lien, insolv, authent, attach

Topic  3
swapsrec, lotteri, governorTM, rhipa, raini, ohcsd, saif, pebb, employeesTM, intermedi, veteran, pollut, assetback, nav, fistate

Topic  4
umatilla, bookentri, thereof, attach, nomine, waiver, bankruptci, constru, warranti, insolv, clear, cede, duli, lien, reorgan

Topic  5
firecent, distanc, reopen, developmentsfl, nomine, fifinanci, bookentri, render, planTM, lien, indirect, clear, thereof, emma, relianc

Topic  6
thereof, hereund, bookentryonli, bookentri, constru, cede, hereto, bankruptci, nomine, uncondit, lien, waiver, duli, indirect, indirectli

Topic  7
knowledg, cfr, insolv,


Maximum number of iterations 500 reached. Increase it to improve convergence.



### We can see that on the right are some bonds generally geared towards vetern support and higher education as well as some community improvement projects.

# Florida

In [51]:
FL_index = muni_df[muni_df.State=='FL'].index.tolist()
fl_txt = [muni_df['clean_text'][i] for i in FL_index]

vectorizer = CountVectorizer(strip_accents = 'unicode',
                                #ngram_range=(1,2),
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                stop_words = secondary_stops+['miami','miamidad'],
                                max_df = 0.60, #remove words in over 50% of dataset
                                min_df = 5)

doc_word = vectorizer.fit_transform(fl_txt)
nmf_model = NMF(10, max_iter = 500)

doc_topic = nmf_model.fit_transform(doc_word)


topic_word = pd.DataFrame(nmf_model.components_.round(10),
             index = ["component_1","component_2","component_3","component_4","component_5","component_6","component_7","component_8","component_9","component_10"],
             columns = vectorizer.get_feature_names())


display_topics(nmf_model, vectorizer.get_feature_names(), 15)

H = pd.DataFrame(doc_topic.round(10),
             columns = ["component_1","component_2","component_3","component_4","component_5","component_6","component_7","component_8","component_9","component_10"],
                 index=muni_df['Issuer Name'][FL_index])
H



import plotly.express as px
# Take all of the data and plot it on 2 dimensions
svd = TruncatedSVD(n_components=2)
svd.fit_transform(doc_word)

chrt_dat= pd.DataFrame(svd.transform(doc_word))
chrt_dat.columns = ['x','y']

fl_df = muni_df.iloc[FL_index]

import plotly.express as px
df = px.data.iris()
fig = px.scatter(fl_df, x=chrt_dat['x'], y=chrt_dat["y"],color=H.idxmax(axis=1).tolist(),
                 hover_name='Issuer Name', 
                 hover_data=['State','Description','ID'])
fig.show()


Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['auditortm', 'boroughtm', 'citytm', 'countytm', 'districttm', 'dtctm', 'statetm', 'statetmsession', 'subdivisiontm', 'towntmvehicl', 'villagetm'] not in stop_words.




Topic  0
aviat, solid, appendix, bondhold, airport, rescu, swap, peopl, visitor, mortgag, environment, surtax, formul, expir, neighborhood

Topic  1
superintend, student, appendix, size, teacher, advertis, veteran, airport, colleg, museum, appeal, governor, display, avenu, spous

Topic  2
garden, citi, council, stormwat, varianc, block, franchis, neighborhood, nonpay, sidewalk, halfcent, reach, cola, crime, drainag

Topic  3
floridaschedul, fundfor, solid, fundsfor, fundsseptemb, wastewat, sheetnonmajor, balancesnonmajor, bondhold, chairman, managementTM, resourcesdef, tourist, halfcent, floridastat

Topic  4
leasepurchas, student, grade, elementari, middl, superintend, chairman, spring, foundat, motor, treasur, fixtur, nonexchang, fundsfor, furnitur

Topic  5
bondhold, solid, floridaschedul, surtax, stormwat, compound, mortgag, appendix, redevelop, wastewat, appreci, environment, fundsfor, depositari, retainag

Topic  6
citi, bondhold, enlarg, dilig, mortgag, inabl, holiday, sunday, 

To the far right again is a document outlining a community building program, and farther up we see school issuers. It seems we are getting closer to removing locations however into topical separations.

This indicates that possibly with better stop words to eliminate geographic bias we can potentially reach a better understanding of underlying use of proceeds.