In [28]:
import pickle
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis.lda_model
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

RANDOM_SEED = 42

In [25]:
def filter_null(df, verbose=True):
    '''filter null translations'''
    clean_corp = df[(df != ' ') & (df != '') & (df != 'source language unavailable for translation')]  
    if verbose:
        print(f'{len(df) - len(clean_corp)} null records dropped out of {len(df)}')  
    return clean_corp

In [26]:
fp = '../data/processed/desc_title_translated.pkl'
corpus = pd.read_pickle(fp)

clean_corpus = filter_null(corpus['title_en'])
train_data, test_data = train_test_split(clean_corpus, test_size=0.2, random_state=RANDOM_SEED)

# vectorize
tf_vectorizer = CountVectorizer(#preprocessor=custom_preprocessor,
                                       #max_features = 20000, # only top 10k by freq
                                       lowercase = True, # keep capitalization
                                       ngram_range = (2,3), # include 2-word phrases
                                       min_df=1,  # note: absolute count of doc
                                       max_df=0.90,   # note: % of docs
                                       stop_words='english') # default English stopwords

tf_doc_train = tf_vectorizer.fit_transform(train_data)
tf_doc_test = tf_vectorizer.transform(test_data)
tf_feature_names = tf_vectorizer.get_feature_names_out()

# train
n_topics = 4
lda = LatentDirichletAllocation(n_components=n_topics, random_state=RANDOM_SEED)
lda.fit(tf_doc_train)
topic_models = lda.components_

# scoring 
score = lda.score(tf_doc_test)
perplexity = lda.perplexity(tf_doc_test)
print(f'{score = }')
print(f'{perplexity = }')

26 null records dropped out of 3895
score = -23528.688577070636
perplexity = 1375118646231556.5


In [27]:
# visualize
pyLDAvis.enable_notebook()
panel = pyLDAvis.lda_model.prepare(lda, tf_doc_train, tf_vectorizer, mds='tsne')
pyLDAvis.save_html(panel, 'lda.html')

### sandbox

In [6]:
len(tf_feature_names)

17816

In [7]:
tf_docs, tf_cols = count_vectorize(filter_null(df['desc_en']))

NameError: name 'df' is not defined

In [None]:
len(tf_cols)

In [None]:
tf_docs.shape

In [None]:
tf_cols[:300]

In [None]:
tf_cols[-250:]

In [None]:
df[-40:].to_list()