Let's get our data into a dataframe!

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import spacy
nlp = spacy.load("en_core_web_sm")
import nltk
lemma = nltk.wordnet.WordNetLemmatizer()
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer

In [2]:
# Read the raw export into a DataFrame. No dtype conversion happens here; the
# `essay` column has its HTML tags stripped further down in this cell.
df = pd.read_csv("raw.csv")

def stripped_strings(text):
    """Return the visible text of an HTML fragment as one space-joined string.

    Args:
        text: HTML markup (or plain text) to clean.

    Returns:
        The text nodes of the parsed markup, each stripped of surrounding
        whitespace and joined with single spaces; the tags are discarded.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so output can vary by machine.
    soup = BeautifulSoup(text, "html.parser")
    return ' '.join(soup.stripped_strings)


# Overwrite each essay with its tag-free text.
df["essay"] = df["essay"].apply(stripped_strings)

Let's map out where all the titles are from:

In [None]:
import geopandas as gpd
import matplotlib.pyplot as plt
from shapely.geometry import Point, Polygon
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="geoapiExercises")

# How many rows share each distinct `latlong` string; parselong scales marker sizes from this.
values = df['latlong'].value_counts()
# Marker sizes, appended to as a side effect of parselong (one entry per call).
sizes = []

def parselat(tagliststr):
    """Pull the latitude out of a stringified ``[lat, long]`` pair.

    Returns the first element as a float, or ``None`` when the value is the
    sentinel string ``"none"``.
    """
    if tagliststr == "none":
        return None
    parts = tagliststr.strip('][').split(', ')
    return float(parts[0])

def parselong(tagliststr):
    """Pull the longitude out of a stringified ``[lat, long]`` pair.

    Side effects: appends one marker size to the module-level ``sizes`` list
    on every call (ten times the row count of this ``latlong`` value, or 0 for
    the ``"none"`` sentinel), and prints the matching titles when the
    longitude is positive.

    Returns the second element as a float, or ``None`` for ``"none"``.
    """
    if tagliststr == "none":
        sizes.append(0)
        return None

    sizes.append(values[tagliststr] * 10)
    longitude_text = tagliststr.strip('][').split(', ')[1]
    if float(longitude_text) > 0:
        # positive longitudes get their titles printed for inspection
        print(df[df['latlong'] == tagliststr]['title'])
    return float(longitude_text)

# Base layer for the plot, read from a shapefile (presumably US state outlines, going by the file name).
# NOTE(review): this name shadows Python's built-in `map` for the rest of the notebook.
map = gpd.read_file('States_shapefile.shp')

def printcities(tagliststr):
    """Reverse-geocode a stringified ``[lat, long]`` pair to its state name.

    Args:
        tagliststr: String such as ``"[38.5, -98.0]"``, or the sentinel
            ``"none"``.

    Returns:
        The ``state`` field of the geocoder's address; ``''`` when the address
        has no state or the geocoder finds no match; ``None`` for ``"none"``.
    """
    if tagliststr == "none":
        return None

    parts = tagliststr.strip('][').split(', ')
    # One request to the module-level geocoder per call.
    location = geolocator.reverse(parts[0] + "," + parts[1])
    if location is None:
        # reverse() yields None when nothing is found at the coordinates;
        # treat that like an address without a state instead of crashing.
        return ''
    return location.raw.get('address', {}).get('state', '')

# State name for every row, plus a tally of rows per state.
latlong_col = df['latlong']
df['locs'] = latlong_col.apply(printcities)
print(df['locs'].value_counts())

# Numeric coordinates; parselong also fills `sizes` as it goes.
df['lat'] = latlong_col.apply(parselat)
df['long'] = latlong_col.apply(parselong)

# Points are built from (long, lat) pairs, i.e. x first, then y.
geo = [Point(long_lat) for long_lat in zip(df['long'], df['lat'])]
geo_df = gpd.GeoDataFrame(geometry=geo)

fig, ax = plt.subplots(figsize=(30, 30))
map.plot(ax=ax, alpha=.4, color='gray', edgecolor='black')
ax.set_xlim(-128, -60)
ax.set_ylim(15, 51)

# For the Alaska and Hawaii maps use instead:
# ax.set_xlim(-180, -128)
# (y range noted by the author as 5-30)
print(values)
print(len(df['long']))
geo_df.plot(ax=ax, markersize=sizes, color="blue", marker="o")



Let's figure out which title essays are duplicated

In [16]:
import collections
from collections import defaultdict

def parse(tagliststr):
    """Split a stringified list such as ``"['a', 'b']"`` into its items.

    Only the outer square brackets are removed; each item keeps whatever
    quote characters it had in the string.
    """
    inner = tagliststr.strip('][')
    return inner.split(', ')

# Group rows that share the same essay text and collect, per group, the
# subjects (MARC tags), titles and contributors.
marc_records_df = df.groupby('essay')['subjects'].apply(list).reset_index(name="subject_groups")
titles_df = df.groupby('essay')['title'].apply(list).reset_index(name="title_groups")
contributors_df = df.groupby('essay')['essay_contributor'].apply(list).reset_index(name="cont_groups")

# Group-size cut-offs for the two special groups that get printed and kept.
LARGEST_GROUP_MIN_TITLES = 400
SECOND_GROUP_MIN_TITLES = 90

percent_sim = []   # one similarity score per duplicated essay
afamera_tags = []  # subject lists of groups with more than 90 (and at most 400) titles
nontags = []       # subject lists of groups with more than 400 titles

# tag -> number of occurrences across all duplicated essays
overall_occurences = defaultdict(int)
# All three frames are grouped by the same key, so zipping them walks the
# groups in step: subject list, title list, contributor list.
for marc_record, title, contributor in zip(marc_records_df["subject_groups"], titles_df["title_groups"], contributors_df["cont_groups"]):
    # only essays shared by more than one row are of interest here
    if isinstance(marc_record, list) and len(marc_record) > 1:
        # tag -> number of occurrences within this group
        total_occurences = defaultdict(int)
        total_words = 0
        for taglist in marc_record:
            tlist = parse(taglist)
            for t in tlist:
                total_occurences[t] += 1
                overall_occurences[t] += 1
            total_words += len(tlist)

        # Similarity = share of tag occurrences that belong to a tag seen
        # more than once within the group.
        repeated = sum(count for count in total_occurences.values() if count > 1)
        sim = 1-(total_words - repeated)/total_words
        percent_sim.append(sim)

        # printing some statistics for our special groups:
        if len(title) > LARGEST_GROUP_MIN_TITLES:
            print("above 400")
            print(title)
            print(contributor)
            nontags.append(marc_record)
        elif len(title) > SECOND_GROUP_MIN_TITLES:
            print("above 90")
            print(title)
            print(contributor)
            afamera_tags.append(marc_record)

# printing top MARC tags across the duplicated essays, most frequent first
sorted_marc_occurences = sorted(overall_occurences.items(), key=lambda x:x[1], reverse=True)
converted_dict = dict(sorted_marc_occurences)

print(converted_dict)
        # the number of titles per group
        # [2, 2, 2, 2, 3, 2, 2, 3, 2, 4, 2, 2, 2, 4, 2, 2, 2, 3, 2, 3, 3, 95, 4, 3, 3, 5, 2, 2, 8, 3, 3, 2, 5, 6, 2, 3, 2, 2, 2, 4, 5, 2, 6, 4, 2, 2, 2, 2, 2, 2, 6, 3, 2, 2, 5, 6, 2, 2, 4, 2, 5, 2, 6, 2, 2, 3, 3, 4, 2, 5, 2, 5, 8, 4, 9, 4, 7, 3, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 2, 4, 2, 2, 4, 3, 2, 7, 6, 2, 2, 2, 2, 3, 2, 5, 7, 2, 3, 3, 2, 2, 2, 7, 2, 4, 3, 3, 3, 3, 5, 2, 2, 4, 2, 8, 3, 3, 4, 3, 6, 2, 3, 2, 5, 8, 2, 2, 2, 4, 6, 5, 4, 6, 3, 2, 4, 7, 4, 2, 2, 3, 2, 2, 2, 5, 4, 2, 3, 2, 4, 2, 2, 2, 2, 4, 2, 2, 2, 4, 3, 10, 2, 5, 2, 3, 2, 5, 5, 2, 8, 2, 2, 3, 2, 6, 2, 2, 2, 3, 3, 4, 2, 2, 2, 3, 4, 2, 3, 4, 3, 3, 2, 7, 3, 2, 4, 2, 3, 2, 2, 2, 2, 4, 2, 2, 2, 9, 3, 3, 7, 2, 2, 3, 2, 2, 3, 3, 3, 6, 2, 2, 4, 2, 4, 4, 2, 2, 9, 3, 2, 2, 2, 2, 3, 2, 4, 2, 2, 2, 3, 2, 2, 3, 4, 7, 2, 4, 2, 2, 4, 2, 2, 3, 3, 2, 2, 3, 2, 2, 2, 2, 3, 2, 5, 3, 9, 2, 2, 2, 3, 5, 3, 2, 3, 6, 3, 8, 2, 2, 2, 3, 2, 2, 3, 2, 2, 2, 2, 2, 3, 6, 3, 2, 3, 2, 2, 3, 5, 2, 3, 5, 4, 2, 2, 2, 2, 2, 8, 3, 10, 2, 4, 2, 3, 9, 2, 3, 2, 2, 2, 6, 2, 2, 3, 6, 2, 2, 2, 6, 2, 2, 2, 4, 2, 2, 3, 3, 3, 3, 2, 3, 2, 2, 8, 3, 2, 2, 2, 2, 4, 4, 4, 3, 2, 2, 2, 4, 5, 2, 6, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 6, 2, 6, 4, 3, 8, 2, 4, 4, 2, 4, 2, 3, 5, 2, 3, 2, 4, 5, 2, 3, 5, 12, 4, 4, 4, 3, 3, 2, 2, 2, 2, 3, 3, 2, 3, 5, 2, 3, 3, 2, 6, 2, 2, 2, 4, 5, 6, 9, 5, 5, 2, 2, 4, 6, 2, 3, 3, 2, 2, 2, 3, 2, 2, 3, 2, 3, 3, 2, 2, 7, 2, 2, 3, 3, 2, 7, 2, 2, 3, 2, 2, 2, 10, 2, 9, 2, 5, 2, 2, 2, 2, 2, 4, 2, 8, 2, 2, 2, 3, 5, 6, 2, 2, 3, 7, 5, 3, 2, 6, 2, 4, 2, 3, 2, 2, 9, 2, 6, 2, 3, 3, 2, 3, 2, 3, 2, 3, 3, 3, 2, 2, 2, 4, 2, 2, 3, 2, 2, 5, 7, 2, 2, 3, 5, 2, 2, 3, 2, 2, 2, 4, 5, 5, 2, 2, 2, 5, 2, 2, 3, 11, 2, 3, 2, 2, 2, 2, 3, 3, 7, 2, 2, 8, 4, 2, 2, 2, 3, 2, 4, 2, 3, 2, 6, 6, 5, 2, 3, 2, 4, 2, 2, 2, 2, 3, 9, 2, 5, 2, 2, 6, 2, 2, 2, 3, 2, 2, 2, 2, 3, 2, 5, 2, 419]

above 90
['Africo-American Presbyterian.', 'The Afro-American.', 'The Aliened American.', 'The American citizen.', 'The American citizen.', 'The American eagle.', 'The American guide.', 'The American Negro.', 'The Arkansas freeman.', 'Atlanta age.', 'The Augusta union.', 'The benevolent banner.', 'The Black Republican.', 'The Boston advance.', 'The Boston courant.', 'The bulletin.', 'The Chicago world.', 'The Christian banner.', 'The colored American.', 'The colored citizen.', 'The colored patriot.', 'The colored Tennessean.', 'The colored visitor.', 'Columbus chronicle.', 'The commonwealth.', 'The conservator.', 'The crusader.', 'The defender.', 'The Detroit informer.', 'The echo.', 'The enterprise.', 'The Florida evangelist.', 'The Florida sentinel.', 'The free American.', "The free man's press.", 'The free state.', "The freedman's press.", 'The Gazette and land bulletin.', 'The gazette.', 'Herald of Kansas.', 'The Huntsville star.', 'The independent.', 'The Indianapolis world.', 'Io

In [17]:
# Tally the tags of our second largest duplicate group.
# (Swap afamera_tags for nontags to inspect the largest group instead.)
c = []
for item in afamera_tags[0]:
    c.extend(parse(item))
print(collections.Counter(c))

Counter({"'African Americans'": 95, "'African American newspapers'": 85, "'African American newspapers--Kansas'": 12, "'Kansas'": 12, "'African American newspapers--Georgia'": 9, "'Georgia'": 9, "'United States--Kansas--Shawnee--Topeka'": 6, "'African American newspapers--Texas'": 6, "'Texas'": 6, "'African American newspapers--Ohio'": 5, "'Ohio'": 5, "'Topeka (Kan.)--Newspapers'": 5, "'Kansas--Topeka'": 5, "'African American newspapers--Missouri'": 5, "'African Americans--Missouri--Newspapers'": 5, "'Missouri'": 5, "'African American newspapers--Louisiana'": 5, "'New Orleans (La.)--Newspapers'": 5, "'Louisiana'": 5, "'Louisiana--New Orleans'": 5, "'United States--Louisiana--Orleans--New Orleans'": 5, "'African Americans--Georgia--Newspapers'": 5, "'African American newspapers--Virginia'": 5, "'African Americans--Virginia--Newspapers'": 5, "'Virginia'": 5, "'Baltimore (Md.)--Newspapers'": 4, "'Maryland'": 4, "'Maryland--Baltimore'": 4, "'United States--Maryland--Baltimore'": 4, "'Afric

In [None]:
import matplotlib.pyplot as plt
from collections import Counter

# Bar chart of how often each value occurs, e.g. the number of titles in each
# group or the distribution of similarity across the records.
# The commented-out parts are for the box and whisker plots.
def plot_frequency(arr, xlabel='Percent similarity in MARC records', ylabel='Frequency', width=0.08):
    """Plot a bar chart of value frequencies in ``arr``.

    Args:
        arr: Iterable of hashable values to count.
        xlabel: Label for the x axis; defaults to the similarity label.
        ylabel: Label for the y axis.
        width: Width of each bar.

    Prints the frequency mapping and its keys, then shows the figure.
    """
    frequency = dict(Counter(arr))
    print(frequency)
    x = list(frequency.keys())
    print(x)
    y = list(frequency.values())
    plt.bar(x, y, width=width)
    # plt.boxplot(x, vert=False)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    # plt.yticks([])
    plt.show()

# Chart the similarity scores collected in percent_sim.
plot_frequency(percent_sim)

Now that we're done with duplicate work, let's drop duplicates and do some topic modeling

In [4]:
# Row counts before and after de-duplication: 3683, 1865
rows_before = len(df)
print(rows_before)
# Keep one row per distinct essay text; df itself is modified.
df.drop_duplicates(subset=['essay'], inplace=True)
rows_after = len(df)
print(rows_after)

3683
1865


Let's now pre-process our essays!

In [5]:
# Domain-specific words that are too common in these essays to be informative;
# they are appended to NLTK's English stop-word list.
common_words = [
    "editorial", "editor", "edit", "include", "article",
    "state", "new", "page", "herald", "press",
    "title", "tribune", "city", "later", "time",
    "paper", "edition", "newspaper", "publication", "news",
]
english_stop_words = nltk.corpus.stopwords.words('english')
stop_words = english_stop_words + common_words

def preprocess(essay):
    """Reduce an essay to a space-joined string of its lemmatized content words.

    Punctuation is removed, the text is run through the spaCy pipeline ``nlp``,
    and every token is lower-cased and lemmatized with ``lemma``. A token is
    kept when it is not a stop word (checked before and after lemmatization)
    and either carries a wanted entity type (NORP) or is a non-numeric
    adjective, noun or verb outside the unwanted entity types.

    Args:
        essay: Plain-text essay.

    Returns:
        The kept lemmas joined by single spaces.
    """
    # Strip punctuation: every character that is neither a word character nor
    # whitespace is deleted. This does not parse HTML, and it fuses hyphenated
    # words ("four-page" becomes "fourpage").
    # source: https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string
    essay = re.sub(r'[^\w\s]', '', essay)

    # Apply a model that tags our essay (categories as well as entities, etc.)
    # https://spacy.io/models#:~:text=For%20example%2C%20en_core_web_sm%20is%20a,includes%20vocabulary%2C%20syntax%20and%20entities
    doc = nlp(essay)

    # NOTE: may be pruning too many ents, but this is based off a couple samples of unwanted ents
    # NOTE: not every word comes with a label
    # Start by pruning pretty heavily and seeing what some top words are.
    # This essentially takes out states and classic English stopwords; all
    # that's left is domain-specific stopwords.
    # Sets give constant-time membership tests inside the token loop; the
    # stop-word set is built once per essay instead of scanning the list for
    # every token.
    stop_word_set = set(stop_words)
    unwanted_ents = {'PERSON', 'DATE', 'CARDINAL', 'ORDINAL', 'FAC', 'GPE', 'TIME'}
    wanted_ents = {'NORP'}
    pos_tags = {'ADJ', 'NOUN', 'VERB'}
    important_words = []
    non_important_words = []
    for tok in doc:
        word = tok.text.lower()
        lemmatized_word = lemma.lemmatize(word)
        # check if it's a stop word before AND after lemmatization just in case
        # (but usually, only needed after lemmatization)
        if word in stop_word_set or lemmatized_word in stop_word_set:
            non_important_words.append(lemmatized_word)
        # a specifically wanted entity is kept whatever its part of speech (POS)
        elif tok.ent_type_ in wanted_ents:
            important_words.append(lemmatized_word)
        # non-digit, a POS tag that we want, and not an unwanted entity type
        elif not word.isdigit() and tok.pos_ in pos_tags and tok.ent_type_ not in unwanted_ents:
            important_words.append(lemmatized_word)
        else:
            non_important_words.append(lemmatized_word)
    important_words_sentence = " ".join(important_words)
    # print(non_important_words) # this could be interesting to look at more! what is taken out?

    return important_words_sentence

# Store the cleaned, space-joined lemmas of every essay in a new column.
df["preprocessed"] = df["essay"].apply(preprocess)

Now that we're all preprocessed let's do some fun stuff!

WordClouds:

In [None]:
# let's make a quick word cloud
# this can be helpful to see if any unwanted words are appearing (then we can add them to our stopword list and re run preprocessing)
# source: https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0 

from wordcloud import WordCloud

# Join every preprocessed essay into one comma-separated string for the cloud.
long_string = ','.join(df["preprocessed"].tolist())
wordcloud = WordCloud(
    background_color="white",
    max_words=5000,
    contour_width=3,
    contour_color='steelblue',
)
wordcloud.generate(long_string)
# Last expression of the cell, so the resulting image is what gets displayed.
wordcloud.to_image()

LDA:

In [6]:
# lets run lda: https://medium.com/analytics-vidhya/topic-modeling-using-gensim-lda-in-python-48eaa2344920
# general other help: https://rstudio-pubs-static.s3.amazonaws.com/79360_850b2a69980c4488b1db95987a24867a.html 

from pprint import pprint
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
from gensim.models.wrappers import LdaMallet
import os

def split_string(text):
    """Break ``text`` into a list of tokens on runs of whitespace."""
    tokens = text.split()
    return tokens

# We don't want a string of the tokens, we want a list of the tokens.
df["split"] = df["preprocessed"].apply(split_string)
id2word = corpora.Dictionary(df["split"])
# This removes even more words: a token that occurs in more than 40% of the
# documents is discarded.
# https://radimrehurek.com/gensim/auto_examples/tutorials/run_lda.html
id2word.filter_extremes(no_above=0.4)
texts = df["split"]
corpus = [id2word.doc2bow(text) for text in texts]

models = []
coherence = []

# TODO: experiment with parameters! https://stackoverflow.com/questions/50805556/understanding-parameters-in-gensim-lda-model
# Location of the MALLET install. A MALLET_HOME that is already set in the
# environment wins; otherwise fall back to the path this notebook was
# developed with, so the notebook is not tied to one user's machine.
mallet_home = os.environ.get('MALLET_HOME', 'C:\\Users\\aashn\\Downloads\\mallet-2.0.8\\mallet-2.0.8')
os.environ['MALLET_HOME'] = mallet_home
mallet_path = os.path.join(mallet_home, 'bin', 'mallet')

# Topic counts to train with MALLET. Other values that were tried are
# 5, 10, 15, 20, 25 and 35; add them here to train those models as well.
MALLET_TOPIC_COUNTS = [30, 40]

# One model per topic count; `models` and `coherence` are filled in the same
# order so that entry i of each refers to the same model.
mallet_models = {}
for num_topics in MALLET_TOPIC_COUNTS:
    mallet_model = gensim.models.wrappers.LdaMallet(mallet_path, corpus, num_topics=num_topics, id2word=id2word, random_seed=42)
    mallet_models[num_topics] = mallet_model
    models.append(mallet_model)
    coherence.append(CoherenceModel(model=mallet_model, texts=texts, dictionary=id2word, coherence='c_v').get_coherence())

# Later cells refer to these two models by name.
mallet_model30 = mallet_models[30]
mallet_model40 = mallet_models[40]

# non_mallet models
# ldamodel5 = gensim.models.ldamodel.LdaModel(corpus, num_topics=5, id2word = id2word, random_state=42, passes=20)
# models.append(ldamodel5)
# print(CoherenceModel(model=ldamodel5, texts=texts, dictionary=id2word, coherence='c_v').get_coherence())

# ldamodel10 = gensim.models.ldamodel.LdaModel(corpus, num_topics=10, id2word = id2word, random_state=42, passes=20)
# models.append(ldamodel10)
# print(CoherenceModel(model=ldamodel10, texts=texts, dictionary=id2word, coherence='c_v').get_coherence())

# ldamodel15 = gensim.models.ldamodel.LdaModel(corpus, num_topics=15, id2word = id2word, random_state=42, passes=20)
# models.append(ldamodel15)
# print(CoherenceModel(model=ldamodel15, texts=texts, dictionary=id2word, coherence='c_v').get_coherence())

# ldamodel30 = gensim.models.ldamodel.LdaModel(corpus, num_topics=30, id2word = id2word, random_state=42, passes=20)
# models.append(ldamodel30)
# print(CoherenceModel(model=ldamodel30, texts=texts, dictionary=id2word, coherence='c_v').get_coherence())

# ldamodel40 = gensim.models.ldamodel.LdaModel(corpus, num_topics=40, id2word = id2word, random_state=42, passes=20)
# models.append(ldamodel40)
# print(CoherenceModel(model=ldamodel40, texts=texts, dictionary=id2word, coherence='c_v').get_coherence())

In [None]:
# Plotting the coherence values

import matplotlib.pyplot as plt
# One x value per trained model, so x always has the same length as
# `coherence` (a fixed list of eight topic counts makes plt.plot fail when
# only some of the models were trained).
x = [trained_model.num_topics for trained_model in models]
plt.plot(x, coherence)
plt.xlabel('Number of Topics')
plt.ylabel('Coherence Values')
plt.show()

In [None]:
# https://towardsdatascience.com/the-complete-guide-for-topics-extraction-in-python-a6aaa6cedbbc
# looking at words in each topic + finding which titles are in each topic

from operator import itemgetter
from collections import defaultdict
from collections import OrderedDict

def pretty_print_topics(model, num_topics=40, num_words=10):
    """Print the top words of each topic, one tuple of words per line.

    Args:
        model: Topic model exposing ``show_topics``.
        num_topics: Number of topics requested from the model (default 40).
        num_words: Number of top words requested per topic (default 10).
    """
    arry = model.show_topics(num_topics=num_topics, num_words=num_words, log=False, formatted=False)
    for _, word_probs in arry:
        # each entry is a (word, probability) pair; only the words are shown
        print(tuple(word for word, _prob in word_probs))

# WARNING topic inference over the whole corpus can take a long time to run
def print_titles_per_topic(model, bow_corpus=None, titles=None):
    """Group titles by their most probable topic.

    Despite the name, nothing is printed; the mapping is returned.

    Args:
        model: Topic model; indexing it with a corpus must yield, per
            document, a sequence of ``(topic_id, probability)`` pairs.
        bow_corpus: Bag-of-words documents; defaults to the notebook's
            ``corpus``.
        titles: Titles aligned with ``bow_corpus``; defaults to
            ``df["title"]``.

    Returns:
        ``defaultdict(list)`` mapping topic id to the titles whose highest
        probability topic it is.
    """
    if bow_corpus is None:
        bow_corpus = corpus
    if titles is None:
        titles = df["title"]

    topic_to_title_dict = defaultdict(list)
    # Query the model once with the whole corpus instead of once per document;
    # per-document queries are what made this take hours.
    for doc_topics, title in zip(model[bow_corpus], titles):
        max_topic_idx = max(doc_topics, key=itemgetter(1))[0]
        topic_to_title_dict[max_topic_idx].append(title)
    return topic_to_title_dict

# easiest to do for one model at a time, but could put in for-loop
print("Printing Model Stats for Model\n")
print("Word Probabilities: \n")
# pretty_print_topics(mallet_model30)
print(mallet_model30.print_topics(40))
topic_to_title_dict = print_titles_per_topic(mallet_model30)
topic_to_title_dict = OrderedDict(sorted(topic_to_title_dict.items()))
# Percentages are taken against the number of titles that were actually
# assigned to a topic, rather than a hard-coded row count.
total_titles = sum(len(topic_titles) for topic_titles in topic_to_title_dict.values())
for k,v in topic_to_title_dict.items():
    print("Topic #" + str(k+1) + " \n")
    print("Len: " + str(len(v)) + ", Percent: " + str(round(len(v)/total_titles*100, 2)) + "\n")
    print(v)
    print("\n")


In [21]:
#visualize the topics

import pyLDAvis
import pyLDAvis.gensim_models

# Switch pyLDAvis to notebook mode so the visualisation is shown inline.
pyLDAvis.enable_notebook()

# Convert a mallet model to a gensim lda model for visualization:
# https://neptune.ai/blog/pyldavis-topic-modelling-exploration-tool-that-every-nlp-data-scientist-should-know
def convertToldaGen(mallet_model):
    """Build a gensim ``LdaModel`` that carries a MALLET model's topics.

    A new LdaModel is created with the MALLET model's dictionary, topic count
    and alpha (eta fixed at 0); its sufficient statistics are then overwritten
    with the MALLET word-topic matrix and synced.

    Args:
        mallet_model: Trained model exposing ``id2word``, ``num_topics``,
            ``alpha`` and ``wordtopics``.

    Returns:
        The populated gensim LdaModel.
    """
    lda_kwargs = dict(
        id2word=mallet_model.id2word,
        num_topics=mallet_model.num_topics,
        alpha=mallet_model.alpha,
        eta=0,
    )
    model_gensim = gensim.models.ldamodel.LdaModel(**lda_kwargs)
    # copy the word-topic counts in place, then refresh the derived state
    model_gensim.state.sstats[...] = mallet_model.wordtopics
    model_gensim.sync_state()
    return model_gensim

# Only one model can be visualised at a time; choose it by its position in `models`.
mallet_model_to_show = models[0]
# the conversion step is only needed for mallet models
model = convertToldaGen(mallet_model_to_show)
vis = pyLDAvis.gensim_models.prepare(model, corpus, id2word, sort_topics=False)
vis

  from imp import reload
  result = func(self.values, **kwargs)
  default_term_info = default_term_info.sort_values(
  result = func(self.values, **kwargs)


Case Study into How Women are Portrayed

In [25]:
def get_fem_essays(arry):
    """Return ``arry`` unchanged if it mentions any word from the fem set.

    When none of the words occur, the string ``"nan"`` is returned instead as
    a marker.
    """
    fem_set = {"she", "her", "herself", "woman", "wife", "newspaperwoman", "women", "female"}
    matches = set(arry) & fem_set
    return arry if matches else "nan"

# For essays mentioning any word in our fem set this column holds the essay's
# token list; for all other essays it holds the marker string "nan".
df["fem"] = df["split"].apply(get_fem_essays)

In [26]:
# Build a data frame of only the fem title essays by dropping the rows marked "nan".
not_fem_rows = df[df['fem'] == "nan"].index
df_fem = df.drop(not_fem_rows)
print(len(df_fem))

# NOTE: id2word, texts, corpus and models are rebound here to the fem subset.
id2word = corpora.Dictionary(df_fem["split"])
id2word.filter_extremes(no_above=0.4)
texts = df_fem["split"]
corpus = [id2word.doc2bow(doc_tokens) for doc_tokens in texts]
models = []

# Other topic counts (5, 10) can be tried by changing num_topics below.
fmallet_model15 = gensim.models.wrappers.LdaMallet(
    mallet_path, corpus, num_topics=15, id2word=id2word, random_seed=42
)
models.append(fmallet_model15)
fem_coherence = CoherenceModel(
    model=fmallet_model15, texts=texts, dictionary=id2word, coherence='c_v'
).get_coherence()
print(fem_coherence)

# fmallet_model30 = gensim.models.wrappers.LdaMallet(mallet_path, corpus, num_topics=30, id2word = id2word, random_seed=42)
# models.append(fmallet_model30)
# print(CoherenceModel(model=fmallet_model30, texts=texts, dictionary=id2word, coherence='c_v').get_coherence())

0.37012506145677737


In [30]:
from operator import itemgetter
from collections import defaultdict
from collections import OrderedDict

def pretty_print_topics(model, num_topics=40, num_words=10):
    """Print the top words of each topic, one tuple of words per line.

    Args:
        model: Topic model exposing ``show_topics``.
        num_topics: Number of topics requested from the model (default 40).
        num_words: Number of top words requested per topic (default 10).
    """
    arry = model.show_topics(num_topics=num_topics, num_words=num_words, log=False, formatted=False)
    for _, word_probs in arry:
        # each entry is a (word, probability) pair; only the words are shown
        print(tuple(word for word, _prob in word_probs))

# WARNING topic inference over the whole corpus can take a long time to run
def print_titles_per_topic(model, bow_corpus=None, titles=None):
    """Group titles by their most probable topic.

    Despite the name, nothing is printed; the mapping is returned.

    Args:
        model: Topic model; indexing it with a corpus must yield, per
            document, a sequence of ``(topic_id, probability)`` pairs.
        bow_corpus: Bag-of-words documents; defaults to the notebook's
            ``corpus``.
        titles: Titles aligned with ``bow_corpus``; defaults to
            ``df_fem["title"]``.

    Returns:
        ``defaultdict(list)`` mapping topic id to the titles whose highest
        probability topic it is.
    """
    if bow_corpus is None:
        bow_corpus = corpus
    if titles is None:
        titles = df_fem["title"]

    topic_to_title_dict = defaultdict(list)
    # Query the model once with the whole corpus instead of once per document;
    # per-document queries are what made this take hours.
    for doc_topics, title in zip(model[bow_corpus], titles):
        max_topic_idx = max(doc_topics, key=itemgetter(1))[0]
        topic_to_title_dict[max_topic_idx].append(title)
    return topic_to_title_dict

# easiest to do for one model at a time, but could put in for-loop
print("Printing Model Stats for Model\n")
print("Word Probabilities: \n")
pretty_print_topics(fmallet_model15)
print(fmallet_model15.print_topics(40))
topic_to_title_dict = print_titles_per_topic(fmallet_model15)
topic_to_title_dict = OrderedDict(sorted(topic_to_title_dict.items()))
# Percentages are taken against the number of titles that were actually
# assigned to a topic, rather than a hard-coded row count.
total_titles = sum(len(topic_titles) for topic_titles in topic_to_title_dict.values())
for k,v in topic_to_title_dict.items():
    print("Topic #" + str(k+1) + " \n")
    print("Len: " + str(len(v)) + ", Percent: " + str(round(len(v)/total_titles*100, 2)) + "\n")
    print(v)
    print("\n")

Printing Model Stats for Model

Word Probabilities: 

('reader', 'movement', 'suffrage', 'printed', 'organ', 'good', 'man', 'campaign', 'special', 'great')
('republican', 'political', 'democratic', 'politics', 'party', 'supported', 'interest', 'election', 'candidate', 'public')
('community', 'american', 'african', 'served', 'black', 'business', 'serving', 'member', 'church', 'including')
('railroad', 'family', 'founded', 'worked', 'event', 'located', 'established', 'son', 'fire', 'moved')
('labor', 'reported', 'men', 'work', 'organization', 'immigrant', 'child', 'active', 'mining', 'union')
('war', 'story', 'made', 'short', 'life', 'native', 'writer', 'career', 'letter', 'religious')
('population', 'industry', 'small', 'region', 'resident', 'people', 'company', 'oil', 'world', 'operation')
('county', 'democrat', 'seat', 'general', 'early', 'interest', 'nearby', 'notice', 'land', 'fourpage')
('printing', 'owner', 'continued', 'left', 'subscription', 'print', 'returned', 'wrote', 'appear

In [37]:
import pyLDAvis
import pyLDAvis.gensim_models

pyLDAvis.enable_notebook()

# `models` holds a single entry for the fem subset (fmallet_model15), so
# indexing position 2 would raise IndexError; convert that model directly.
model = convertToldaGen(fmallet_model15)
vis = pyLDAvis.gensim_models.prepare(model, corpus, id2word, sort_topics=False)
vis

  result = func(self.values, **kwargs)
  default_term_info = default_term_info.sort_values(
  result = func(self.values, **kwargs)


Extra Resources: \
https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/