### Load packages

In [1]:
# pip install bertopic
import os
import pandas as pd
import numpy as np
import time

from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer

### Load data

In [2]:
# Read the prepared topic-modeling extract and discard its "Unnamed: 0" column
# (presumably a row index written out by an earlier to_csv — TODO confirm).
data = (
    pd.read_csv("data/02b_topic_modeling_prep.csv", sep=",", encoding='utf-8')
    .drop(columns=["Unnamed: 0"])
)

In [3]:
# Display the (rows, columns) tuple of the loaded frame.
data.shape

(6976833, 2)

### Set some automatic analytic parameters according to the size of the subsample

In [4]:
# Number of documents drawn per text field; the thresholds below are derived from it.
samplesize = 1_000_000
# One thousandth of the subsample; later passed to HDBSCAN as min_cluster_size.
mintopic = samplesize // 1000
mintopic

1000

In [5]:
# One ten-thousandth of the subsample; later used as min_df of the first CountVectorizer.
mindfset = samplesize // 10000
mindfset

100

In [6]:
# A tenth of mindfset; later used as min_df of the second (n-gram) CountVectorizer.
mindfset2 = mindfset // 10
mindfset2

10

### Get a random subsample of the data

In [7]:
# Draw an independent random subsample of each text field. The seeds are fixed,
# so the same rows are selected on every run.
#
# The sample is taken BEFORE the string cast: which rows are chosen depends only
# on the seed and the column length, so the resulting lists are unchanged, but
# only `samplesize` values are stringified instead of the entire column.
#
# NOTE(review): astype(str) turns missing values into the literal "nan"; if the
# prep file can contain NaN, those rows are embedded and clustered as real
# documents — confirm they were removed upstream.
honors_docs = data['honors'].sample(n=samplesize, random_state=1234).astype(str).tolist()
details_docs = data['details'].sample(n=samplesize, random_state=5678).astype(str).tolist()

### Initialize some model parameters and convenience functions

In [8]:
def rescale(x, inplace=False):
    """Scale an embedding down so optimization will not have convergence issues.

    Every value is divided by 10000 times the standard deviation of the
    first column.

    Parameters
    ----------
    x : array-like, 2-D
        Embedding to scale. Must already be a float ndarray when
        ``inplace`` is true, because it is divided in place.
    inplace : bool, default False
        When false, a copy is scaled and ``x`` is left untouched; when true,
        ``x`` itself is modified.

    Returns
    -------
    numpy.ndarray
        The scaled embedding (``x`` itself when ``inplace`` is true).
    """
    scaled = x if inplace else np.array(x, copy=True)
    divisor = np.std(scaled[:, 0]) * 10000
    scaled /= divisor
    return scaled

In [9]:
# Sentence encoder used for both the Honors and Details runs; placed on the CUDA device.
sentence_model = SentenceTransformer("all-MiniLM-L6-v2", device="cuda")

# Clusterer handed to BERTopic; min_cluster_size comes from the subsample-derived mintopic.
hdbscan_model = HDBSCAN(
    prediction_data=True,
    cluster_selection_method='eom',
    metric='euclidean',
    min_cluster_size=mintopic,
)

# Vectorizer given to BERTopic at construction time (unigrams, min_df=mindfset).
vectorizer_model_1 = CountVectorizer(min_df=mindfset)

# Vectorizer given to update_topics afterwards: English stop words removed,
# 1- to 3-grams, and the lower min_df threshold mindfset2.
vectorizer_model_2 = CountVectorizer(
    min_df=mindfset2,
    ngram_range=(1,3),
    stop_words="english",
)

### Run BERTopic on the Honors data first

In [None]:
# Encode the Honors documents with the sentence model and report the elapsed wall-clock time.
start_time = time.time()
honors_embeddings = sentence_model.encode(honors_docs, show_progress_bar=False)
elapsed = time.time() - start_time
print("--- %s seconds ---" % (elapsed,))

In [9]:
start_time = time.time()

# Reduce the Honors embeddings to 5 PCA components and rescale them; the result
# is handed to UMAP as its starting layout.
honors_pca = PCA(n_components=5)
honors_pca_embeddings = rescale(honors_pca.fit_transform(honors_embeddings))

# UMAP reducer seeded with the PCA layout (only configured here, not fitted).
# NOTE(review): no random_state is passed, so the reduction — and therefore the
# topics — may differ between runs; confirm whether reproducibility is required.
honors_umap_model = UMAP(
    init=honors_pca_embeddings,
    metric="cosine",
    n_components=5,
    n_neighbors=15,
    min_dist=0.0,
    low_memory=True,
)

elapsed = time.time() - start_time
print("--- %s seconds ---" % (elapsed,))

--- 5.2391791343688965 seconds ---


In [10]:
start_time = time.time()
# Assemble the Honors topic model from the prepared UMAP, HDBSCAN and vectorizer objects.
honors_topic_model = BERTopic(
    vectorizer_model=vectorizer_model_1,
    hdbscan_model=hdbscan_model,
    umap_model=honors_umap_model,
)
elapsed = time.time() - start_time
print("--- %s seconds ---" % (elapsed,))

--- 0.0 seconds ---


In [11]:
# Fit the Honors model on the documents, reusing the embeddings computed earlier
# so they are not re-encoded; keeps the per-document topics and probabilities.
start_time = time.time()
honors_fit_result = honors_topic_model.fit_transform(honors_docs, honors_embeddings)
honors_topics, honors_probs = honors_fit_result
elapsed = time.time() - start_time
print("--- %s seconds ---" % (elapsed,))

--- 818.3645710945129 seconds ---


In [47]:
# Recompute the Honors topic representations with the n-gram / stop-word
# vectorizer; the document-to-topic assignments are passed in unchanged.
start_time = time.time()
honors_topic_model.update_topics(
    honors_docs,
    honors_topics,
    vectorizer_model=vectorizer_model_2,
)
elapsed = time.time() - start_time
print("--- %s seconds ---" % (elapsed,))

--- 18.5928053855896 seconds ---


### Get some of the various output datasets from the run of BERTopic

In [48]:
# Per-topic summary table for the Honors model (columns Topic, Count, Name);
# the row for topic -1 is listed first in the displayed output.
honors_topic_info = honors_topic_model.get_topic_info()
honors_topic_info

Unnamed: 0,Topic,Count,Name
0,-1,410215,-1_club_president_member_team
1,0,57472,0_band_choir_orchestra_marching
2,1,31474,1_food_cashier_sales_associate
3,2,18022,2_church_st_youth group_church youth
4,3,16766,3_theater_theatre_drama_actor
...,...,...,...
149,148,1018,148_player_team_varsity_club
150,149,1017,149_art_president national_national_member nat...
151,150,1012,150_singles_player_2nd_player varsity
152,151,1007,151_crew_crew member_tech crew_tech


In [49]:
# Topic-by-term c-TF-IDF matrix of the Honors model.
# BERTopic 0.13 renamed the fitted attribute `c_tf_idf` to `c_tf_idf_`; read the
# new name when it exists and fall back to the old one on earlier releases.
honors_topic_term_matrix = (
    honors_topic_model.c_tf_idf_
    if hasattr(honors_topic_model, "c_tf_idf_")
    else honors_topic_model.c_tf_idf
)

In [50]:
# Vocabulary of the vectorizer passed to update_topics; used below as the term labels.
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() and only fall back on releases that predate it.
# vectorizer_model_2 is refit by the Details update_topics call as well, so this
# cell must run before that one to capture the Honors vocabulary.
if hasattr(vectorizer_model_2, "get_feature_names_out"):
    honors_words = list(vectorizer_model_2.get_feature_names_out())
else:
    honors_words = vectorizer_model_2.get_feature_names()

### Prepare and then save the output

In [51]:
# Densify the topic-term matrix, label its columns with the vocabulary, then
# transpose so that each row is a term and each column is one matrix row.
# NOTE(review): the resulting column labels are positions in the matrix, not
# topic ids; the topic table starts at -1, so position 0 may correspond to
# topic -1 — confirm before joining on topic id.
honors_output = pd.DataFrame(
    honors_topic_term_matrix.toarray(),
    columns=honors_words,
).T

In [52]:
# Display the term-by-topic table built in the previous cell.
honors_output

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,144,145,146,147,148,149,150,151,152,153
10,0.000951,0.001454,0.000386,0.000134,0.001093,0.000670,0.000295,0.000152,0.000735,0.000788,...,0.0,0.000000,0.008956,0.0,0.001124,0.0,0.000325,0.004650,0.000837,0.0
10 11,0.000290,0.000425,0.000000,0.000050,0.000462,0.000000,0.000000,0.000038,0.000237,0.000198,...,0.0,0.000000,0.009289,0.0,0.000847,0.0,0.000000,0.000000,0.001263,0.0
10 11 12,0.000195,0.000323,0.000000,0.000058,0.000377,0.000000,0.000000,0.000044,0.000181,0.000226,...,0.0,0.000000,0.007719,0.0,0.000968,0.0,0.000000,0.000000,0.000000,0.0
10 12,0.000030,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000238,0.000000,...,0.0,0.000000,0.002534,0.0,0.000000,0.0,0.000737,0.002105,0.000000,0.0
10 member,0.000048,0.000036,0.000000,0.000000,0.000100,0.000000,0.000000,0.000058,0.000000,0.000000,...,0.0,0.000000,0.001279,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zero,0.000245,0.000245,0.000306,0.000000,0.000000,0.000000,0.000411,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0
zion,0.000035,0.000095,0.000071,0.002311,0.000000,0.000113,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0
zionsville,0.000018,0.000111,0.000000,0.000000,0.000000,0.000265,0.000000,0.000000,0.000623,0.000000,...,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0
zone,0.000324,0.000079,0.000473,0.000113,0.000074,0.000282,0.000099,0.000000,0.000088,0.000000,...,0.0,0.001371,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0


In [53]:
# Write the Honors term-by-topic table to the current working directory
# (the term labels are written as the first, index, column).
honors_output.to_csv("02c_honors_topics.csv")

### Run the whole thing again, looking only at the Details field data

In [11]:
# Encode the Details documents with the same sentence model and report the elapsed time.
start_time = time.time()
details_embeddings = sentence_model.encode(details_docs, show_progress_bar=False)
elapsed = time.time() - start_time
print("--- %s seconds ---" % (elapsed,))

--- 203.89643502235413 seconds ---


In [12]:
start_time = time.time()

# Reduce the Details embeddings to 5 PCA components and rescale them; the result
# is handed to UMAP as its starting layout.
details_pca = PCA(n_components=5)
details_pca_embeddings = rescale(details_pca.fit_transform(details_embeddings))

# UMAP reducer seeded with the PCA layout (only configured here, not fitted).
# NOTE(review): no random_state is passed, so the reduction — and therefore the
# topics — may differ between runs; confirm whether reproducibility is required.
details_umap_model = UMAP(
    init=details_pca_embeddings,
    metric="cosine",
    n_components=5,
    n_neighbors=15,
    min_dist=0.0,
    low_memory=True,
)

elapsed = time.time() - start_time
print("--- %s seconds ---" % (elapsed,))

--- 4.981011867523193 seconds ---


In [13]:
start_time = time.time()
# Build the Details topic model.
# The Honors model was constructed with the shared `hdbscan_model` instance, and
# BERTopic fits the clusterer object it is given rather than a copy. Reusing
# that instance here would refit the Honors model's clusterer on Details data,
# so the Details model gets its own clusterer with the same settings.
details_hdbscan_model = HDBSCAN(min_cluster_size=mintopic, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
details_topic_model = BERTopic(umap_model=details_umap_model, hdbscan_model=details_hdbscan_model, vectorizer_model=vectorizer_model_1)
print("--- %s seconds ---" % (time.time() - start_time))

--- 0.0 seconds ---


In [14]:
# Fit the Details model on the documents, reusing the embeddings computed
# earlier; keeps the per-document topics and probabilities.
start_time = time.time()
details_fit_result = details_topic_model.fit_transform(details_docs, details_embeddings)
details_topics, details_probs = details_fit_result
elapsed = time.time() - start_time
print("--- %s seconds ---" % (elapsed,))

--- 794.9176087379456 seconds ---


In [15]:
# Recompute the Details topic representations with the n-gram / stop-word
# vectorizer; the document-to-topic assignments are passed in unchanged.
start_time = time.time()
details_topic_model.update_topics(
    details_docs,
    details_topics,
    vectorizer_model=vectorizer_model_2,
)
elapsed = time.time() - start_time
print("--- %s seconds ---" % (elapsed,))

--- 58.52931308746338 seconds ---


In [16]:
# Per-topic summary table for the Details model (columns Topic, Count, Name);
# the row for topic -1 is listed first in the displayed output.
details_topic_info = details_topic_model.get_topic_info()
details_topic_info

Unnamed: 0,Topic,Count,Name
0,-1,358411,-1_school_team_students_events
1,0,34148,0_customers_customer_orders_store
2,1,26912,1_president_student_class_council
3,2,23949,2_musical_productions_theater_plays
4,3,21010,3_church_bible_faith_youth
...,...,...,...
126,125,1074,125_korean_korea_culture_language
127,126,1071,126_gymnastics_beam_vault_gymnastics team
128,127,1061,127_social media_media_social_accounts
129,128,1045,128_games_dragons_game_board games


In [17]:
# Topic-by-term c-TF-IDF matrix of the Details model.
# BERTopic 0.13 renamed the fitted attribute `c_tf_idf` to `c_tf_idf_`; read the
# new name when it exists and fall back to the old one on earlier releases.
details_topic_term_matrix = (
    details_topic_model.c_tf_idf_
    if hasattr(details_topic_model, "c_tf_idf_")
    else details_topic_model.c_tf_idf
)

In [18]:
# Vocabulary of the vectorizer passed to update_topics; used below as the term labels.
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() and only fall back on releases that predate it.
# vectorizer_model_2 is shared with the Honors run; it holds the Details
# vocabulary only if the Details update_topics call was the last one to fit it.
if hasattr(vectorizer_model_2, "get_feature_names_out"):
    details_words = list(vectorizer_model_2.get_feature_names_out())
else:
    details_words = vectorizer_model_2.get_feature_names()

In [19]:
# Densify the topic-term matrix, label its columns with the vocabulary, then
# transpose so that each row is a term and each column is one matrix row.
# NOTE(review): the resulting column labels are positions in the matrix, not
# topic ids; the topic table starts at -1, so position 0 may correspond to
# topic -1 — confirm before joining on topic id.
details_output = pd.DataFrame(
    details_topic_term_matrix.toarray(),
    columns=details_words,
).T

In [20]:
# Display the term-by-topic table built in the previous cell.
details_output

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,121,122,123,124,125,126,127,128,129,130
05,0.000022,0.000000,0.000000,0.000000,0.000140,0.000030,0.000000,0.000000,0.000000,0.000035,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
10,0.001879,0.000556,0.001221,0.002770,0.001244,0.003129,0.000934,0.003641,0.000435,0.001781,...,0.002589,0.001167,0.002714,0.000735,0.004547,0.001144,0.009237,0.000786,0.001244,0.000363
10 11,0.000316,0.000039,0.000364,0.000345,0.000033,0.000865,0.000019,0.000066,0.000000,0.000317,...,0.000547,0.000000,0.000000,0.000000,0.001240,0.000000,0.000315,0.000277,0.000000,0.000000
10 11 12,0.000088,0.000000,0.000069,0.000170,0.000022,0.000233,0.000000,0.000029,0.000000,0.000055,...,0.000354,0.000000,0.000000,0.000000,0.000401,0.000000,0.000000,0.000000,0.000000,0.000000
10 11 grade,0.000007,0.000000,0.000020,0.000000,0.000000,0.000033,0.000000,0.000000,0.000000,0.000078,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zone,0.000247,0.000321,0.000069,0.000622,0.000054,0.000331,0.000209,0.000310,0.000051,0.000091,...,0.000296,0.000000,0.000000,0.000280,0.000000,0.000000,0.000000,0.000000,0.000711,0.000276
zones,0.000055,0.000000,0.000016,0.000024,0.000000,0.000000,0.000142,0.000000,0.000035,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
zoning,0.000012,0.000023,0.000000,0.000000,0.000000,0.000000,0.000070,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
zoo,0.000047,0.000164,0.000000,0.000019,0.000000,0.000000,0.000202,0.000000,0.000055,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [21]:
# Write the Details term-by-topic table to the current working directory
# (the term labels are written as the first, index, column).
details_output.to_csv("02c_details_topics.csv")