# Topic Modeling on Each Cluster

## Libraries Needed

In [1]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
import numpy as np
import pandas as pd
import pickle
import string
import os

In [2]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
!python -m spacy download en_core_web_lg

[38;5;2m[+] Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


## Load Data

In [3]:
df = pickle.load(open("plot_data/df_spins.p", "rb"))

y_pred = pickle.load(open("plot_data/y_pred.p", "rb"))

# kmeans = pickle.load(open("plot_data/kmeans.p", "rb"))

In [4]:
df['y'] = y_pred

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16491 entries, 0 to 17999
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   text        16491 non-null  object
 1   word_count  16491 non-null  int64 
 2   y           16491 non-null  int32 
dtypes: int32(1), int64(1), object(1)
memory usage: 450.9+ KB


## Example

In [6]:
spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm")

In [7]:
df["text"][23]

'en este breve artculo vamos a intentar ayudarte a comprender el negocio a averiguar cundo invertir y especialmente cmo hacerlo'

In [8]:
doc = nlp(df["text"][20])

In [9]:
# Lemmatization
review = str(" ".join([i.lemma_ for i in doc]))

In [10]:
doc = nlp(review)
spacy.displacy.render(doc, style='ent',jupyter=True)

  "__main__", mod_spec)


## Vectorize for Each Cluster using CountVectorizer

In [22]:
vectorizers = []
    
for ii in range(0, 15):
    # Creating a vectorizer
    vectorizers.append(CountVectorizer(stop_words='english', lowercase=True, token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}'))

In [23]:
vectorizers[0]

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='[a-zA-Z\\-][a-zA-Z\\-]{2,}',
                tokenizer=None, vocabulary=None)

In [24]:
vectorized_data = []

current_cluster = 0
for cvec in vectorizers:
    try:
        vectorized_data.append(cvec.fit_transform(df.loc[df['y'] == current_cluster, 'text']))
    except Exception as e:
        raise e
        print("Not enough instances in cluster: " + str(current_cluster))
        pass
    current_cluster += 1

In [25]:
vectorized_data[0]

<953x32 sparse matrix of type '<class 'numpy.int64'>'
	with 18886 stored elements in Compressed Sparse Row format>

In [26]:
len(vectorized_data)

15

## Get the Topics Per Cluster Using LatentDirichletAllocation

In [30]:
# number of topics per cluster
NUM_TOPICS_PER_CLUSTER = 20

In [31]:
lda_models = []

for ii in range(0, 15):
    # Latent Dirichlet Allocation Model
    lda = LatentDirichletAllocation(n_components=NUM_TOPICS_PER_CLUSTER, max_iter=10, learning_method='online',verbose=False)
    lda_models.append(lda)
    
lda_models[0]

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=20, n_jobs=None,
                          perp_tol=0.1, random_state=None,
                          topic_word_prior=None, total_samples=1000000.0,
                          verbose=False)

In [32]:
clusters_lda_data = []

current_cluster = 0
for lda in tqdm(lda_models):
    clusters_lda_data.append((lda.fit_transform(vectorized_data[current_cluster])))
    current_cluster+=1

100%|██████████| 15/15 [01:10<00:00,  4.72s/it]


## Get the Keywords from Each Topic on Each Cluster

In [33]:
# Functions for printing keywords for each topic
def selected_topics(model, vectorizer, top_n=3):
    current_words = []
    keywords = []
    
    for idx, topic in enumerate(model.components_):
        words = [(vectorizer.get_feature_names()[i], topic[i]) for i in topic.argsort()[:-top_n - 1:-1]]
        for word in words:
            if word[0] not in current_words:
                keywords.append(word)
                current_words.append(word[0])
                
    keywords.sort(key = lambda x: x[1])  
    keywords.reverse()
    return_values = []
    for ii in keywords:
        return_values.append(ii[0])
    return return_values

In [34]:
current_vectorizer = 0
all_keywords = []
for lda in lda_models:
    #print("Current Cluster: " + str(current_vectorizer) +"\n")
    
    #if current_vectorizer == 14:
    #    current_vectorizer += 1
        #print("Current Cluster Changed to (14 is empty): " + str(current_vectorizer))
    all_keywords.append(selected_topics(lda, vectorizers[current_vectorizer]))
    current_vectorizer += 1
    #print("____________________________________________________________\n")

In [35]:
all_keywords[0][:10]

['estudio',
 'este',
 'mejor',
 'invertir',
 'artculo',
 'muy',
 'importante',
 'gua',
 'esta',
 'comprar']

In [36]:
len(all_keywords)

15

## Output to a Text File

Note that cluster 14 didn't have enough instances to get the keywords.

In [37]:
f=open('lib/topics.txt','w')

count = 0
for ii in all_keywords:
    #if count == 14:
    #    f.write("Not enough instances to be determined. \n")
    #    f.write(', '.join(ii) + "\n")
    #else:
    f.write(', '.join(ii) + "\n")
    count += 1
f.close()    