# Topic modeling with NMF

## NMF (Non-negative Matrix Factorization) 

We are going to start with one algorithm that can be used for topic modeling: NMF.

Some of the imports we'll need:

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# This is the new one
from sklearn.decomposition import NMF
import matplotlib
import matplotlib.pyplot as plt

from utilities import *

In [None]:
# Alternative source (Google Drive copy of the same parquet file):
# file_id_segmented = "1ukY3sWTM3v26100ZYchGr66HPN8JI0UB-"
# url = f'https://drive.google.com/uc?id={file_id_segmented}'
url = "corpora/engineer_df_segmented.parquet"

# Load the segmented corpus, expose the raw text under the name "Document",
# and discard the pre-tokenized columns that this notebook does not use.
corpus_df = (
    pd.read_parquet(url)
    .rename(columns={"text": "Document"})
    .drop(columns=["tokenized", "doc"])
)
corpus_df.head()

Some parameters we might want to vary

In [3]:
# --- Stop words ---
# "english" selects the vectorizer's built-in list; any other value is read
# as a file name under lists/ (one stop word per line).
stop_list_source = "engineer_stop_list.txt"
# Extra words appended to a file-based stop list.
extra_stop_words = []

# --- Vectorization ---
# Vectorizer class to instantiate; use TfidfVectorizer instead of
# CountVectorizer for tf-idf weights rather than raw counts.
Vectorizer = CountVectorizer
vocabulary_size = 100   # passed as max_features
max_ngram = 1           # upper bound of ngram_range=(1, max_ngram)
norm = False            # when True, the document-term matrix is L2-normalized

# --- Topic model ---
n_topics = 10           # number of NMF components

### Create the document x term matrix

We are also going to create a frequency distribution (the total weight of each vocabulary term across all documents) for use later when displaying the topics.

In [4]:
# Resolve the stop-word setting: either the built-in "english" list, or the
# lines of a project file under lists/ extended with extra_stop_words.
# Note: extra_stop_words is only applied to the file-based list.
if stop_list_source == "english":
    stopwords = "english"
else:
    with open('lists/' + stop_list_source, 'r') as f:
        stopwords = f.read().splitlines()
    stopwords += extra_stop_words

vectorizer = Vectorizer(max_features=vocabulary_size,
                        stop_words=stopwords,
                        ngram_range=(1, max_ngram))

# Rows are documents, columns are vocabulary terms.
doc_term_matrix = vectorizer.fit_transform(corpus_df['Document'].values)
if norm:
    # Fixed: the original passed an undefined name `X` here, which raised
    # NameError whenever norm was switched on.
    doc_term_matrix = normalize(doc_term_matrix, norm='l2')

# Keep each document's dense term vector alongside its text.
corpus_df["vector"] = list(doc_term_matrix.toarray())

# Column sums give one total per vocabulary term; pair them with the term
# strings to form the frequency distribution used when displaying topics.
word_counts = np.asarray(doc_term_matrix.sum(axis=0)).flatten()
feature_names = vectorizer.get_feature_names_out()
fdist = dict(zip(feature_names, word_counts))

### Do the topic analysis

In [5]:
# Factorize the document-term matrix into n_topics non-negative components,
# using the NNDSVD initialization.
nmf_model = NMF(n_components=n_topics, init="nndsvd")

# fit_transform returns the per-document topic weights (documents x topics);
# the fitted components_ hold the per-topic term weights (topics x terms).
doc_topic_matrix = nmf_model.fit_transform(doc_term_matrix)
topic_term_matrix = nmf_model.components_

#### Display the topics

In [None]:
# Weighting parameter handed to display_topics (helper from utilities).
# NOTE(review): presumably a relevance weight between 0 and 1, and the
# trailing `10, True` look like "words per topic" and a display flag — confirm
# against the helper's definition.
lambda_val = .7

topic_rel_dfs = display_topics(nmf_model, vectorizer, fdist, lambda_val, 10, True)

# Label every topic with its first two listed words, joined by a hyphen.
top_words_per_topic = [rel_df["word"].tolist() for rel_df in topic_rel_dfs]
topic_labels = [words[0] + "-" + words[1] for words in top_words_per_topic]
topic_labels

In [7]:
# One column of topic weights per label, one row per document.
topic_df = pd.DataFrame(doc_topic_matrix, columns=topic_labels).reset_index(drop=True)

# Put the corpus on a fresh positional index so the column-wise concat pairs
# each document with its own row of topic weights.
corpus_df = pd.concat([corpus_df.reset_index(drop=True), topic_df], axis=1)

In [None]:
# Preview the first rows of the corpus, now including the per-topic weight columns.
corpus_df.head()

In [None]:

# Topic whose most strongly associated documents should be shown. Must be one
# of the generated entries in topic_labels.
topic_label = "engineering-design"

from html import escape as escape_html
from IPython.display import display, HTML

# Fail early with a readable message if the hard-coded label does not match
# any topic produced for this run (labels depend on the fitted model).
if topic_label not in topic_labels:
    raise KeyError(
        f"Topic {topic_label!r} not found; available topics: {topic_labels}"
    )

# Fixed: rank by the selected topic only. The original sorted by the whole
# topic_labels list, so the first topic column always dominated the ordering
# and topic_label was never used.
top_10 = (
    corpus_df
    .sort_values(by=topic_label, ascending=False)
    .head(10)
    .drop(columns=["vector"])
)

# Render each of the top documents in its own bordered box.
for index, row in top_10.iterrows():
    # Escape the text so characters such as "<" or "&" in a document are shown
    # literally instead of being interpreted as markup.
    document = escape_html(str(row['Document']))
    display(HTML(f"<div style='border: 1px solid #ccc; padding: 10px; margin: 10px;'><strong>Document {index + 1}:</strong><br>{document}</div>"))