In [1]:
import pandas as pd

# Load the pre-cleaned review table; the trailing expression reports (rows, columns).
df = pd.read_csv(filepath_or_buffer="../data/clean_reviews.csv")
df.shape


(4914, 6)

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over the cleaned reviews.
#   stop_words="english" -> use scikit-learn's built-in English stop-word list
#   min_df=20            -> ignore terms appearing in fewer than 20 reviews
#   max_df=0.9           -> ignore terms appearing in more than 90% of reviews
vectorizer = CountVectorizer(stop_words="english", min_df=20, max_df=0.9)

# Learn the vocabulary and build the document-term matrix in one pass.
X_counts = vectorizer.fit_transform(raw_documents=df["clean_review"])
X_counts.shape


(4914, 721)

In [3]:
from sklearn.decomposition import LatentDirichletAllocation

# Five-topic LDA on the raw term counts; the fixed seed keeps the fit repeatable.
lda = LatentDirichletAllocation(random_state=42, n_components=5)

# Left as a bare expression so the fitted estimator is shown as the cell output.
lda.fit(X=X_counts)


0,1,2
,"n_components  n_components: int, default=10 Number of topics. .. versionchanged:: 0.19  ``n_topics`` was renamed to ``n_components``",5
,"doc_topic_prior  doc_topic_prior: float, default=None Prior of document topic distribution `theta`. If the value is None, defaults to `1 / n_components`. In [1]_, this is called `alpha`.",
,"topic_word_prior  topic_word_prior: float, default=None Prior of topic word distribution `beta`. If the value is None, defaults to `1 / n_components`. In [1]_, this is called `eta`.",
,"learning_method  learning_method: {'batch', 'online'}, default='batch' Method used to update `_component`. Only used in :meth:`fit` method. In general, if the data size is large, the online update will be much faster than the batch update. Valid options: - 'batch': Batch variational Bayes method. Use all training data in each EM  update. Old `components_` will be overwritten in each iteration. - 'online': Online variational Bayes method. In each EM update, use mini-batch  of training data to update the ``components_`` variable incrementally. The  learning rate is controlled by the ``learning_decay`` and the  ``learning_offset`` parameters. .. versionchanged:: 0.20  The default learning method is now ``""batch""``.",'batch'
,"learning_decay  learning_decay: float, default=0.7 It is a parameter that control learning rate in the online learning method. The value should be set between (0.5, 1.0] to guarantee asymptotic convergence. When the value is 0.0 and batch_size is ``n_samples``, the update method is same as batch learning. In the literature, this is called kappa.",0.7
,"learning_offset  learning_offset: float, default=10.0 A (positive) parameter that downweights early iterations in online learning. It should be greater than 1.0. In the literature, this is called tau_0.",10.0
,"max_iter  max_iter: int, default=10 The maximum number of passes over the training data (aka epochs). It only impacts the behavior in the :meth:`fit` method, and not the :meth:`partial_fit` method.",10
,"batch_size  batch_size: int, default=128 Number of documents to use in each EM iteration. Only used in online learning.",128
,"evaluate_every  evaluate_every: int, default=-1 How often to evaluate perplexity. Only used in `fit` method. set it to 0 or negative number to not evaluate perplexity in training at all. Evaluating perplexity can help you check convergence in training process, but it will also increase total training time. Evaluating perplexity in every iteration might increase training time up to two-fold.",-1
,"total_samples  total_samples: int, default=1e6 Total number of documents. Only used in the :meth:`partial_fit` method.",1000000.0


In [4]:
import numpy as np

# Vocabulary terms of the fitted count vectorizer; passed to display_topics
# below, which looks terms up by integer position.
feature_names = vectorizer.get_feature_names_out()

def display_topics(model, feature_names, top_words=10):
    """Print the highest-weighted terms of every topic in ``model``.

    Parameters
    ----------
    model : object
        Anything exposing ``components_``; it is iterated row by row and each
        row (one topic) must support ``argsort()``.
    feature_names : indexable
        ``feature_names[i]`` is the term printed for position ``i`` of a row.
    top_words : int, default=10
        How many terms to print per topic, highest weight first. ``0`` prints
        only the topic headers; values larger than the row length print every
        term.

    Raises
    ------
    ValueError
        If ``top_words`` is negative.
    """
    if top_words < 0:
        raise ValueError(f"top_words must be non-negative, got {top_words}")

    for idx, topic in enumerate(model.components_):
        # Reverse first, then truncate. Slicing the ascending order with
        # ``[-top_words:]`` returns *every* term when top_words == 0,
        # because ``-0`` is just ``0``.
        ranked = topic.argsort()[::-1][:top_words]
        # Headers are numbered from 1 for readability.
        print(f"\nTopic {idx + 1}:")
        print(", ".join(feature_names[i] for i in ranked))

# Show the default ten top terms for each LDA topic.
display_topics(model=lda, feature_names=feature_names)



Topic 1:
galaxy, samsung, works, great, gb, note, fast, storage, card, tablet

Topic 2:
camera, video, use, card, works, hd, tablet, fast, data, speed

Topic 3:
phone, card, memory, music, great, good, works, storage, space, pictures

Topic 4:
card, sandisk, sd, great, cards, works, price, memory, micro, good

Topic 5:
card, gb, phone, sandisk, sd, read, cards, class, format, write


In [5]:
# One row per review, one column per LDA topic.
topic_distributions = lda.transform(X_counts)

# Tag every review with the position of its largest topic weight.
# NOTE: argmax positions start at 0, whereas display_topics prints headers
# starting at "Topic 1" -- so dominant_topic == 3 corresponds to "Topic 4".
df["dominant_topic"] = np.argmax(topic_distributions, axis=1)

df[["clean_review", "dominant_topic"]].head(5)


Unnamed: 0,clean_review,dominant_topic
0,issues,3
1,purchased device worked advertised never much ...,2
2,works expected sprung higher capacity think ma...,4
3,think worked greathad diff bran gb card went s...,4
4,bought retail packaging arrived legit orange e...,1


In [6]:
# Number of reviews assigned to each dominant topic, most frequent first.
df.loc[:, "dominant_topic"].value_counts()


dominant_topic
3    1265
4    1122
2    1021
0     986
1     520
Name: count, dtype: int64

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# TF-IDF weighting with the same vocabulary filters used for the count matrix:
# English stop words removed, terms kept only if they occur in at least
# 20 reviews and in no more than 90% of them.
tfidf_vectorizer = TfidfVectorizer(stop_words="english", min_df=20, max_df=0.9)
X_tfidf_topics = tfidf_vectorizer.fit_transform(raw_documents=df["clean_review"])

# Five-component NMF on the TF-IDF matrix, seeded for repeatability.
nmf = NMF(random_state=42, n_components=5)

# Left as a bare expression so the fitted estimator is shown as the cell output.
nmf.fit(X=X_tfidf_topics)


0,1,2
,"n_components  n_components: int or {'auto'} or None, default='auto' Number of components. If `None`, all features are kept. If `n_components='auto'`, the number of components is automatically inferred from W or H shapes. .. versionchanged:: 1.4  Added `'auto'` value. .. versionchanged:: 1.6  Default value changed from `None` to `'auto'`.",5
,"init  init: {'random', 'nndsvd', 'nndsvda', 'nndsvdar', 'custom'}, default=None Method used to initialize the procedure. Valid options: - `None`: 'nndsvda' if n_components <= min(n_samples, n_features),  otherwise random. - `'random'`: non-negative random matrices, scaled with:  `sqrt(X.mean() / n_components)` - `'nndsvd'`: Nonnegative Double Singular Value Decomposition (NNDSVD)  initialization (better for sparseness) - `'nndsvda'`: NNDSVD with zeros filled with the average of X  (better when sparsity is not desired) - `'nndsvdar'` NNDSVD with zeros filled with small random values  (generally faster, less accurate alternative to NNDSVDa  for when sparsity is not desired) - `'custom'`: Use custom matrices `W` and `H` which must both be provided. .. versionchanged:: 1.1  When `init=None` and n_components is less than n_samples and n_features  defaults to `nndsvda` instead of `nndsvd`.",
,"solver  solver: {'cd', 'mu'}, default='cd' Numerical solver to use: - 'cd' is a Coordinate Descent solver. - 'mu' is a Multiplicative Update solver. .. versionadded:: 0.17  Coordinate Descent solver. .. versionadded:: 0.19  Multiplicative Update solver.",'cd'
,"beta_loss  beta_loss: float or {'frobenius', 'kullback-leibler', 'itakura-saito'}, default='frobenius' Beta divergence to be minimized, measuring the distance between X and the dot product WH. Note that values different from 'frobenius' (or 2) and 'kullback-leibler' (or 1) lead to significantly slower fits. Note that for beta_loss <= 0 (or 'itakura-saito'), the input matrix X cannot contain zeros. Used only in 'mu' solver. .. versionadded:: 0.19",'frobenius'
,"tol  tol: float, default=1e-4 Tolerance of the stopping condition.",0.0001
,"max_iter  max_iter: int, default=200 Maximum number of iterations before timing out.",200
,"random_state  random_state: int, RandomState instance or None, default=None Used for initialisation (when ``init`` == 'nndsvdar' or 'random'), and in Coordinate Descent. Pass an int for reproducible results across multiple function calls. See :term:`Glossary `.",42
,"alpha_W  alpha_W: float, default=0.0 Constant that multiplies the regularization terms of `W`. Set it to zero (default) to have no regularization on `W`. .. versionadded:: 1.0",0.0
,"alpha_H  alpha_H: float or ""same"", default=""same"" Constant that multiplies the regularization terms of `H`. Set it to zero to have no regularization on `H`. If ""same"" (default), it takes the same value as `alpha_W`. .. versionadded:: 1.0",'same'
,"l1_ratio  l1_ratio: float, default=0.0 The regularization mixing parameter, with 0 <= l1_ratio <= 1. For l1_ratio = 0 the penalty is an elementwise L2 penalty (aka Frobenius Norm). For l1_ratio = 1 it is an elementwise L1 penalty. For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2. .. versionadded:: 0.17  Regularization parameter *l1_ratio* used in the Coordinate Descent  solver.",0.0


In [8]:
# Rebinds the notebook-level `feature_names` to the TF-IDF vocabulary before
# listing the default ten top terms of each NMF component.
feature_names = tfidf_vectorizer.get_feature_names_out()
display_topics(model=nmf, feature_names=feature_names)



Topic 1:
card, sd, gb, micro, class, adapter, cards, read, using, microsd

Topic 2:
great, works, gopro, price, use, camera, fast, hero, problems, issues

Topic 3:
galaxy, samsung, note, bought, tab, tablet, works, problems, ii, fast

Topic 4:
good, memory, sandisk, price, product, cards, quality, brand, buy, ive

Topic 5:
phone, music, storage, space, pictures, movies, gb, store, videos, got
