In [1]:
import numpy as np
import spacy
import random
import pandas as pd 
from tqdm import tqdm
from collections import Counter

In [6]:
# Load the training CSV into `df`.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell
# only runs where that file exists. `df` is rebound from 'emails_messages.csv'
# in a later cell.
df = pd.read_csv(r'D:\From scratch series\LDA\train.csv')

In [7]:
# Show the loaded frame as the cell output.
df

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
20967,20968,Contemporary machine learning: a guide for pra...,Machine learning is finding increasingly bro...,1,1,0,0,0,0
20968,20969,Uniform diamond coatings on WC-Co hard alloy c...,Polycrystalline diamond coatings have been g...,0,1,0,0,0,0
20969,20970,Analysing Soccer Games with Clustering and Con...,We present a new approach for identifying si...,1,0,0,0,0,0
20970,20971,On the Efficient Simulation of the Left-Tail o...,The sum of Log-normal variates is encountere...,0,0,1,1,0,0


In [1]:
pip install pandas

Collecting pandasNote: you may need to restart the kernel to use updated packages.

  Downloading pandas-2.2.3-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.3-cp312-cp312-win_amd64.whl (11.5 MB)
   ---------------------------------------- 0.0/11.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.5 MB 960.0 kB/s eta 0:00:12
   ---------------------------------------- 0.1/11.5 MB 1.3 MB/s eta 0:00:09
    --------------------------------------- 0.2/11.5 MB 1.9 MB/s eta 0:00:06
   - -------------------------------------- 0.5/11.5 MB 3.0 MB/s eta 0:00:04
   --- ------------------------------------ 0.9/11.5 MB 3.7 MB/s eta 0:00:03
   ---- ----------------------------------- 1.4/11.5 MB 4.8 MB/s eta 0:00:03
   ------- -------------------------------


[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# Dirichlet smoothing hyperparameters, topic count, spaCy pipeline and RNG seeds.
# ALPHA is added to the document-topic counts and BETA to the topic-word counts
# when the Gibbs sampler computes its sampling weights.
ALPHA = 0.1
BETA = 0.1
NUM_TOPICS = 20 # TODO: use a coherence score to decide how many topics are needed
# The loaded pipeline provides the tokenizer and stop words for the preprocessing helpers.
sp = spacy.load("en_core_web_sm")


# Seed both generators: the sampler draws from np.random and from random.
np.random.seed(42)
random.seed(42)

In [7]:
def generate_frequencies(data, max_docs = 10000):
    """Count the content words in the first ``max_docs`` documents.

    A token is counted when its lower-cased text is not a stop word and the
    token is purely alphabetic (``token.is_alpha``).

    Args:
        data: Sliceable sequence of raw document strings.
        max_docs: Only ``data[:max_docs]`` is scanned.

    Returns:
        collections.Counter mapping each kept lower-cased token to the number
        of times it occurred.
    """
    # The language defaults live on `Defaults`. Copy the set so that adding the
    # corpus-specific stop word does not mutate the defaults shared by the
    # loaded pipeline.
    all_stopwords = set(sp.Defaults.stop_words)
    all_stopwords.add("enron")

    freqs = Counter()
    for doc in data[:max_docs]:
        for token in sp.tokenizer(doc):
            token_text = token.text.lower()
            if token_text not in all_stopwords and token.is_alpha:
                freqs[token_text] += 1

    return freqs


def get_vocab(freqs, freq_threshold = 3):
    """Build word <-> id lookup tables from a frequency table.

    Words are visited in the iteration order of ``freqs``; every word whose
    count is at least ``freq_threshold`` receives the next consecutive id,
    starting at 0.

    Args:
        freqs: Mapping from word to its count.
        freq_threshold: Minimum count a word needs to enter the vocabulary.

    Returns:
        (vocab, vocab_idx_str): ``vocab`` maps word -> id and
        ``vocab_idx_str`` maps id -> word.
    """
    kept_words = [term for term in freqs if freqs[term] >= freq_threshold]
    vocab = {term: position for position, term in enumerate(kept_words)}
    vocab_idx_str = dict(enumerate(kept_words))
    return vocab, vocab_idx_str
         

def tokenize_dataset(data, vocab, max_docs = 10000):
    """Tokenize documents and convert their in-vocabulary tokens to ids.

    Only ``data[:max_docs]`` is processed. A document is skipped when its
    tokenization has one token or fewer. For every other document the
    lower-cased tokens that are keys of ``vocab`` are kept, in order. A
    document may end up empty if none of its tokens are in the vocabulary.

    Args:
        data: Sliceable sequence of raw document strings.
        vocab: Mapping from token string to integer id.
        max_docs: Maximum number of leading documents to process.

    Returns:
        (docs, corpus): ``docs`` is a list of token-string lists and
        ``corpus`` is a parallel list of 1-D integer arrays such that
        ``corpus[d][i] == vocab[docs[d][i]]``.
    """
    nr_tokens = 0
    docs = []
    for raw_doc in data[:max_docs]:
        tokens = sp.tokenizer(raw_doc)
        if len(tokens) <= 1:
            continue

        lowered = (token.text.lower() for token in tokens)
        kept = [token_text for token_text in lowered if token_text in vocab]
        nr_tokens += len(kept)
        docs.append(kept)

    nr_docs = len(docs)
    print(f"Number of emails: {nr_docs}")
    print(f"Number of tokens: {nr_tokens}")

    # Numericalize: one id array per document. The ids must be collected per
    # document (not in the outer list) so that `corpus` stays parallel to `docs`.
    # An explicit integer dtype keeps empty documents usable as index arrays.
    corpus = [
        np.asarray([vocab[token_text] for token_text in doc], dtype = int)
        for doc in docs
    ]

    return docs, corpus

In [None]:
# Load the e-mail messages from the working directory; this rebinds `df`.
df = pd.read_csv('emails_messages.csv')

In [None]:
# Draw a reproducible (random_state = 42) random half of the 'message_text' column.
data = df['message_text'].sample(frac = 0.5, random_state = 42).values
# Word counts -> vocabulary -> tokenized and numericalized corpus.
# Both helpers only look at the first `max_docs` (default 10000) entries of `data`.
freqs = generate_frequencies(data)
vocab, vocab_idx_str = get_vocab(freqs)
docs, corpus = tokenize_dataset(data, vocab)
# Read as a module-level global by LDA_collapsed_Gibbs.
vocab_size = len(vocab)

In [None]:
def LDA_collapsed_Gibbs(corpus, num_iter = 200):
    """Sample LDA topic assignments with a collapsed Gibbs sampler.

    Reads the module-level names NUM_TOPICS, ALPHA, BETA and vocab_size.
    Initial assignments are drawn with ``np.random`` and resampled
    assignments with ``random.choices``.

    Args:
        corpus: Sequence of documents, each an indexable sequence of integer
            word ids that are valid column indices into a
            (NUM_TOPICS, vocab_size) array.
        num_iter: Number of full sweeps over every token of the corpus.

    Returns:
        (Z, ndk, nkw, nk) where
        ``Z[d][i]`` is the topic assigned to token ``i`` of document ``d``,
        ``ndk[d, k]`` counts the tokens of document ``d`` assigned to topic ``k``,
        ``nkw[k, w]`` counts how often word ``w`` is assigned to topic ``k``,
        ``nk[k]`` is the total number of tokens assigned to topic ``k``.
    """
    num_docs = len(corpus)

    # Start from a uniformly random topic for every token.
    Z = []
    for doc in corpus:
        Z.append(np.random.randint(low = 0, high = NUM_TOPICS, size = len(doc)))

    # Count tables that match the initial assignment.
    ndk = np.zeros((num_docs, NUM_TOPICS))
    for d in range(num_docs):
        for k in range(NUM_TOPICS):
            ndk[d, k] = np.sum(Z[d] == k)

    nkw = np.zeros((NUM_TOPICS, vocab_size))
    for doc_idx, doc in enumerate(corpus):
        for i, word in enumerate(doc):
            nkw[Z[doc_idx][i], word] += 1

    nk = np.sum(nkw, axis = 1)
    topic_list = list(range(NUM_TOPICS))

    for _ in tqdm(range(num_iter)):
        for doc_idx, doc in enumerate(corpus):
            for i in range(len(doc)):
                word = doc[i]
                topic = Z[doc_idx][i]

                # Take the current token out of the counts: the conditional
                # is over all other assignments Z_(-i).
                ndk[doc_idx, topic] -= 1
                nkw[topic, word] -= 1
                nk[topic] -= 1

                # Unnormalized conditional over topics; random.choices
                # normalizes the weights itself.
                p_z = (ndk[doc_idx, :] + ALPHA) * (nkw[:, word] + BETA) / (nk[:] + BETA*vocab_size)
                topic = random.choices(topic_list, weights = p_z, k=1)[0]

                # Put the token back under its newly sampled topic.
                Z[doc_idx][i] = topic
                ndk[doc_idx, topic] += 1
                nkw[topic, word] += 1
                nk[topic] += 1

    return Z, ndk, nkw, nk

# Run the sampler on the numericalized corpus with the default number of sweeps.
Z, ndk, nkw, nk = LDA_collapsed_Gibbs(corpus)

In [None]:
# Smoothed per-topic word distributions:
#   phi[k, w] = (nkw[k, w] + BETA) / (nk[k] + BETA * vocab_size)
# This is the same smoothing the sampler applies to the topic-word counts. It
# keeps each row a proper distribution and avoids dividing by zero for a topic
# that ended up with no tokens. The ranking of words within a topic is the
# same as for the raw counts.
phi = (nkw + BETA) / (nk.reshape(NUM_TOPICS, 1) + BETA * vocab_size)

num_words = 10

for k in range(NUM_TOPICS):
    # Word ids of topic k in descending probability, truncated to the top `num_words`.
    most_common_words = np.argsort(phi[k])[::-1][:num_words]
    print(f"Topic {k} and its most common words are: ")

    for word in most_common_words:
        print(vocab_idx_str[word])

    print('\n')