# Topic modeling with NMF

## NMF (Non-negative Matrix Factorization) 

We are going to start with one algorithm that can be used for topic modeling: NMF.

Some of the imports we'll need:

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# This is the new one
from sklearn.decomposition import NMF
import matplotlib
import matplotlib.pyplot as plt

Here are some functions that will let us display matrices nicely.

In [2]:
def display_matrix(mat, row_labels, col_labels, rows=10):
    """Return the first `rows` rows of `mat` as a labelled DataFrame.

    Args:
        mat: 2-D data accepted by the `pd.DataFrame` constructor.
        row_labels: Labels used as the DataFrame index.
        col_labels: Labels used as the DataFrame columns.
        rows: How many leading rows to keep (default 10).

    Returns:
        A DataFrame holding the leading rows, with values rounded to 3 decimals.
    """
    labelled = pd.DataFrame(mat, index=row_labels, columns=col_labels)
    preview = labelled.head(rows)
    return preview.round(3)

def matrix_heatmap(mtx, row_labels, col_labels, cmap='YlOrBr'):
    """Draw `mtx` as a heatmap on a new 10x10 figure using a log color scale.

    Args:
        mtx: 2-D array of values to draw.
        row_labels: One tick label per row (y axis).
        col_labels: One tick label per column (x axis), drawn rotated along the top.
        cmap: Name of the matplotlib colormap to use.
    """
    plt.figure(figsize=(10, 10), dpi=80, facecolor='w', edgecolor='k')

    # One tick per column / row, positioned at the cell index.
    col_positions = np.arange(len(col_labels))
    row_positions = np.arange(len(row_labels))
    plt.xticks(ticks=col_positions, labels=col_labels, fontsize=8, rotation=90)
    plt.yticks(ticks=row_positions, labels=row_labels, fontsize=8)

    # Show the column labels above the image instead of below it.
    plt.tick_params(axis="x", top=True, labeltop=True, bottom=False, labelbottom=False)

    plt.imshow(
        mtx,
        cmap=cmap,
        norm=matplotlib.colors.LogNorm(),
        interpolation='nearest',
    )

In [49]:
# Load the segmented corpus, expose the text column as "Document", and
# discard the pre-computed columns that are not used in this notebook.
corpus_df = (
    pd.read_parquet("corpora/engineer_df_segmented.parquet")
    .rename(columns={"text": "Document"})
    .drop(columns=["tokenized", "doc"])
)
corpus_df.head()

Unnamed: 0,Document
0,it s called the shaft module it s something li...
1,electrical jupiter i worked toward the project...
2,s since last year yeah exactly you remember ri...
3,s that one so before we ship it out to an actu...
4,an actual job i just wanted to make sure that ...


Some parameters we might want to vary

In [50]:
# Either the literal "english" (handed to the vectorizer as-is) or the name of a
# stop-word file, one word per line, read from the 'lists/' directory.
stop_list_source = "engineer_stop_list.txt"
# Passed to the vectorizer as `max_features`.
vocabulary_size = 100
# Passed to NMF as `n_components`.
n_topics = 10
# When True, the document x term matrix is L2-normalized after vectorizing.
norm = False
# Upper bound of the vectorizer's `ngram_range`, i.e. (1, max_ngram).
max_ngram = 1
# Vectorizer class to instantiate; swap the two lines below to switch to TF-IDF.
# Vectorizer = TfidfVectorizer
Vectorizer = CountVectorizer

# Extra words appended to the stop list loaded from file
# (not applied when stop_list_source == "english").
extra_stop_words = []

### Create the document x term matrix

We are also going to create a frequency distribution for use later

In [52]:
# Resolve the stop-word list: either the vectorizer's built-in "english" list,
# or a file under 'lists/' (one word per line) plus any extra words.
# NOTE: extra_stop_words is only applied in the file-based branch.
if stop_list_source == "english":
    stopwords = "english"
else:
    with open('lists/' + stop_list_source, 'r') as f:
        stopwords = f.read().splitlines()
    stopwords += extra_stop_words

vectorizer = Vectorizer(max_features=vocabulary_size,
                        stop_words=stopwords,
                        ngram_range=(1, max_ngram))

# Rows are documents, columns are vocabulary terms.
doc_term_matrix = vectorizer.fit_transform(corpus_df['Document'].values)
if norm:
    # Normalize the matrix we just built. The previous code passed an undefined
    # name `X`, so enabling `norm` raised a NameError.
    doc_term_matrix = normalize(doc_term_matrix, norm='l2')
# Keep each document's dense term vector alongside its text.
corpus_df["vector"] = list(doc_term_matrix.toarray())

# Frequency distribution: total weight of each term across all documents.
word_counts = np.array(doc_term_matrix.sum(axis=0)).flatten()
feature_names = vectorizer.get_feature_names_out()
fdist = dict(zip(feature_names, word_counts))

### Do the topic analysis

In [53]:
# Factorize the document x term matrix into n_topics components.
nmf_model = NMF(init="nndsvd", n_components=n_topics) 
# One row per document, one column per topic.
doc_topic_matrix = nmf_model.fit_transform(doc_term_matrix)
# One row per topic, one column per vocabulary term.
topic_term_matrix = nmf_model.components_

#### Some functions that make these topic_term matrices easier to interpret

The function displays the top-weighted terms in each topic.

It also has a fancier ability that we'll get to later

In [54]:
# Relevance weight passed to display_topics as `lbda`. With use_log=True a word's score is
#   lambda * log(p(w|topic)) + (1 - lambda) * log(p(w|topic) / p(w)),
# so 1.0 ranks purely by in-topic weight and lower values favor words that are
# unusually frequent in the topic relative to the whole corpus.
lambda_val = .7

from IPython.display import display_html
import math

def display_side_by_side(dfs, round_to=3):
    """Render several DataFrames next to each other as inline HTML tables.

    Each table is headed "Topic <i>", where <i> is its position in `dfs`.

    Args:
        dfs: Iterable of DataFrames to show.
        round_to: Number of decimals the values are rounded to before rendering.
    """
    panels = []
    for topic_no, frame in enumerate(dfs):
        table_html = frame.round(round_to).to_html(index=False)
        panels.append(
            f'<div style="display:inline-block; vertical-align:top; margin-right:10px"><h4>Topic {topic_no}</h4>{table_html}</div>'
        )
    display_html(''.join(panels), raw=True)

def wprob(w, cfdist, total_words):
    """Return the corpus-wide probability of word `w`.

    Args:
        w: The word to look up.
        cfdist: Mapping from word to its total count in the corpus.
        total_words: Sum of all counts in `cfdist`.

    Returns:
        The count of `w` divided by `total_words`.
    """
    occurrences = cfdist[w]
    return occurrences / total_words

def relevance(w, ld, pwt, cfdist, total_words, use_log=True):
    """Score how relevant word `w` is to a topic.

    The score blends the word's in-topic probability with its lift, i.e. the
    in-topic probability divided by the word's corpus-wide probability.

    Args:
        w: The word being scored.
        ld: Blend weight; `ld` goes to the probability term, `1 - ld` to the lift term.
        pwt: Probability of `w` within the topic.
        cfdist: Mapping from word to its total count in the corpus.
        total_words: Sum of all counts in `cfdist`.
        use_log: If True, both terms are log-transformed before blending.

    Returns:
        The blended score, or the sentinel -99999 when `use_log` is set and
        `pwt` is zero (its logarithm is undefined).
    """
    if not use_log:
        return ld * pwt + (1 - ld) * (pwt / wprob(w, cfdist, total_words))
    if pwt == 0:
        return -99999
    lift = pwt / wprob(w, cfdist, total_words)
    return ld * math.log(pwt) + (1 - ld) * math.log(lift)
    
def display_topics(model, vectorizer, fdist, lbda, n=10, use_log=True):
    """Show and return the `n` most relevant words of every topic in `model`.

    Args:
        model: Fitted model exposing `components_` (one row of term weights per topic).
        vectorizer: Fitted vectorizer providing `get_feature_names_out()`.
        fdist: Mapping from word to its total count in the corpus.
        lbda: Blend weight forwarded to `relevance`.
        n: Number of words to keep per topic.
        use_log: Forwarded to `relevance`.

    Returns:
        A list with one DataFrame per topic (columns "word" and "relevance"),
        sorted by descending relevance.
    """
    corpus_total = sum(fdist.values())
    vocab = vectorizer.get_feature_names_out()
    tables = []
    for weights in model.components_:
        weight_total = sum(weights)
        # Normalize each term weight into a within-topic probability before scoring.
        scores = {
            term: relevance(term, lbda, weights[pos] / weight_total, fdist, corpus_total, use_log=use_log)
            for pos, term in enumerate(vocab)
        }
        ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
        tables.append(pd.DataFrame(ranked[:n], columns=["word", "relevance"]))
    display_side_by_side(tables)
    return tables

topic_rel_dfs = display_topics(nmf_model, vectorizer, fdist, lambda_val, n=10, use_log=True)

# Label each topic with its two most relevant words joined by a dash.
top_two_words = [rel_df["word"].tolist()[:2] for rel_df in topic_rel_dfs]
topic_labels = [pair[0] + "-" + pair[1] for pair in top_two_words]
topic_labels

word,relevance
engineering,0.317
design,-2.042
class,-2.054
engineer,-2.179
science,-2.257
math,-2.357
northwestern,-2.402
general,-2.533
guess,-2.555
department,-2.592

word,relevance
people,0.287
person,-2.483
talk,-2.607
team,-2.697
feel,-2.791
big,-2.82
office,-2.825
company,-2.831
talking,-2.865
engineers,-2.975

word,relevance
work,0.313
day,-1.861
working,-2.597
life,-2.669
job,-2.787
hard,-2.838
doesn,-2.838
make,-2.86
team,-2.894
wanted,-2.998

word,relevance
things,0.358
make,-2.498
side,-2.731
learn,-2.833
feel,-2.844
basically,-2.935
client,-2.99
learned,-3.036
engineers,-3.039
important,-3.107

word,relevance
time,0.26
back,-1.91
long,-1.988
part,-2.208
day,-2.529
week,-2.581
worked,-2.736
working,-2.779
talk,-2.983
summer,-3.035

word,relevance
good,0.338
job,-1.866
engineer,-1.988
experience,-2.493
math,-2.572
great,-2.651
feel,-2.654
idea,-2.802
person,-2.814
thought,-2.942

word,relevance
project,0.146
working,-1.387
manager,-1.674
projects,-1.927
client,-2.374
basically,-2.448
part,-2.62
big,-2.765
couple,-2.825
person,-2.862

word,relevance
ve,0.192
before,-2.024
company,-2.151
years,-2.34
worked,-2.351
working,-2.552
learned,-2.647
back,-2.675
experience,-2.675
projects,-2.71

word,relevance
stuff,0.208
working,-1.973
cool,-2.095
guess,-2.286
make,-2.314
data,-2.385
design,-2.544
client,-2.788
week,-2.813
sort,-2.968

word,relevance
didn,-0.836
school,-0.949
year,-1.173
high,-1.497
guess,-2.347
back,-2.381
wanted,-2.442
years,-2.463
job,-2.505
wasn,-2.547


['engineering-design',
 'people-person',
 'work-day',
 'things-make',
 'time-back',
 'good-job',
 'project-working',
 've-before',
 'stuff-working',
 'didn-school']

In [55]:
# One column per topic, holding each document's weight on that topic.
topic_df = pd.DataFrame(doc_topic_matrix, columns=topic_labels)
# Drop topic columns left behind by an earlier run of this cell; otherwise
# re-running it appends duplicates and later lookups by topic label become ambiguous.
corpus_df = corpus_df.drop(columns=topic_labels, errors="ignore").reset_index(drop=True)
# Both frames now share a fresh 0..n-1 index, so the concat aligns row by row.
corpus_df = pd.concat([corpus_df, topic_df], axis=1)

In [56]:
corpus_df.head()

Unnamed: 0,Document,vector,engineering-design,people-person,work-day,things-make,time-back,good-job,project-working,ve-before,stuff-working,didn-school
0,it s called the shaft module it s something li...,"[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...",0.000705,0.00178,0.00773,0.002988,0.006644,0.009945,0.124989,0.017778,0.007551,0.10752
1,electrical jupiter i worked toward the project...,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.0,0.001706,0.007186,0.002957,0.005718,0.010181,0.121841,0.016975,0.006573,0.103911
2,s since last year yeah exactly you remember ri...,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.0,0.001974,0.00553,0.003271,0.003011,0.009892,0.0,0.011439,0.137054,0.096609
3,s that one so before we ship it out to an actu...,"[0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.0,0.002736,0.008552,0.008691,0.011688,0.013906,0.003538,0.028027,0.142018,0.042748
4,an actual job i just wanted to make sure that ...,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.0,0.003118,0.008994,0.006536,0.009567,0.014001,0.001059,0.018236,0.141016,0.035755


In [None]:

topic_label = "engineering-design"

import html
from IPython.display import display, HTML

# Rank documents by their weight on the single chosen topic. Sorting by the whole
# `topic_labels` list ignored `topic_label`: it always ranked by the first topic
# and used the remaining topics only as tie-breakers.
top_10 = (
    corpus_df
    .sort_values(by=topic_label, ascending=False)
    .head(10)
    .drop(columns=["vector"])
)

# Show each of the top documents in its own bordered box.
for index, row in top_10.iterrows():
    # Escape the raw text so characters such as "<" or "&" are displayed
    # literally instead of being interpreted as markup.
    document = html.escape(str(row['Document']))
    display(HTML(f"<div style='border: 1px solid #ccc; padding: 10px; margin: 10px;'><strong>Document {index + 1}:</strong><br>{document}</div>"))