In [1]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
import nltk, re, spacy, pickle


In [2]:
# Corpora handled by this notebook. Every name here is also a key of
# OPTIMAL_BERTOPIC_TOPICS below.
collections = [
    '20ng',
    'wsj',
    'wiki',
]

# Topic-model families by name.
# NOTE(review): not referenced in the cells visible here - confirm it is
# still needed elsewhere.
topic_models = [
    'lda',
    'bertopic',
]

# Topic count per collection. The value is interpolated into the saved
# BERTopic model filename: Results/BERTOPIC/<coll>_bertopic_<count>.model
OPTIMAL_BERTOPIC_TOPICS = {
    'wsj': 50,
    'wiki': 80,
    '20ng': 40,
}

In [3]:
def identity(x):
    """Return ``x`` itself, unmodified (a no-op pass-through).

    NOTE(review): not called in the cells visible here; presumably kept as a
    named module-level function so that pickled objects referencing it
    (e.g. a vectorizer configured with it) can be loaded - confirm.
    """
    return x


In [7]:
# For every collection: reload its trained BERTopic model, recompute the
# topic keywords from the pre-cleaned corpus, print topic 0 as a sanity
# check, and write the model back to the same file.
for coll in collections:
    # -------------------------------------------------
    # 1) Load the raw-trained model
    # -------------------------------------------------
    # The same path is used for loading and for saving at the end, so the
    # stored model is overwritten in place with the refreshed keywords.
    model_path = f"Results/BERTOPIC/{coll}_bertopic_{OPTIMAL_BERTOPIC_TOPICS[coll]}.model"
    topic_model = BERTopic.load(model_path)

    # -------------------------------------------------
    # 2) Prepare cleaned documents  (same ordering!)
    # -------------------------------------------------
    # NOTE: pickle.load can execute arbitrary code embedded in the file;
    # only point this at locally produced, trusted artifacts.
    with open(f"Processed{coll.upper()}/{coll}_preprocessed.pkl", 'rb') as fh:
        clean_tokens = pickle.load(fh)
    # Re-join each token list into one whitespace-separated string; the
    # vectorizer below splits it back with str.split.
    clean_docs = [" ".join(toks) for toks in clean_tokens]

    # update_topics() pairs document i with the i-th stored topic
    # assignment, so the cleaned corpus must line up with the documents the
    # model was fitted on. Fail early with a clear message if the sizes differ.
    trained_topics = getattr(topic_model, "topics_", None)
    if trained_topics is not None and len(trained_topics) != len(clean_docs):
        raise ValueError(
            f"{coll}: {len(clean_docs)} cleaned documents but the model holds "
            f"{len(trained_topics)} topic assignments; documents are not aligned"
        )

    # -------------------------------------------------
    # 3) Create a new vectorizer for the already-cleaned text
    # -------------------------------------------------
    vectorizer = CountVectorizer(
        tokenizer=str.split,   # documents are pre-tokenized; split on whitespace only
        token_pattern=None,    # unused when a tokenizer is given; None avoids sklearn's warning
        preprocessor=None,
        lowercase=False,       # keep tokens exactly as stored in the pickle
        stop_words=None,       # no stop-word filtering is applied by the vectorizer itself
        min_df=5,              # drop terms that occur in fewer than 5 documents
        ngram_range=(1, 2),    # unigrams and bigrams built from the split tokens
    )

    # -------------------------------------------------
    # 4) Refresh topic representations ONLY
    #    (clusters/embeddings stay untouched)
    # -------------------------------------------------
    topic_model.update_topics(
        clean_docs,            # list[str]: one whitespace-joined string per document
        vectorizer_model=vectorizer,
        top_n_words=10,
    )

    # -------------------------------------------------
    # 5) Inspect or save
    # -------------------------------------------------
    print(topic_model.get_topic(0))           # now shows cleaned keywords
    topic_model.save(model_path)



[('game', 0.05048387741717325), ('team', 0.039625324396245235), ('play', 0.030656911662835344), ('player', 0.03000357281171763), ('win', 0.02812369911995167), ('year', 0.023166878391370306), ('season', 0.0231021892285651), ('hockey', 0.020585262253893136), ('league', 0.018054549370035915), ('hit', 0.017042076975244756)]




[('share', 0.015063530306401387), ('say', 0.014771119921314552), ('company', 0.013742347801186456), ('million', 0.013100449619726609), ('stock', 0.012566718860859303), ('year', 0.008534055545636196), ('market', 0.008489784884636444), ('inc', 0.008078880288366229), ('new', 0.007830484376173022), ('sale', 0.00690521927735708)]




[('roman', 0.027518215393759274), ('rome', 0.020245745603117368), ('emperor', 0.013552410545035027), ('empire', 0.012133531555414022), ('city', 0.009376609408084734), ('caesar', 0.008756551622107584), ('century', 0.008219158485480839), ('senate', 0.00817566289578052), ('romans', 0.007783947419325011), ('military', 0.00763470483812735)]
