# LDA topic modeling of the Dostoevsky corpus

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel

In [2]:
# Read the chunked corpus table from CSV and preview its first four rows.
df_docs = pd.read_csv(
    filepath_or_buffer="data_cleaned/dostoevsky_chunks.csv",
)
df_docs.head(n=4)

Unnamed: 0,book_id,chunk_index,chunk_id,text_base,text_lda,text_bert
0,Crime_and_Punishment,0,Crime_and_Punishment__0000,CRIME AND PUNISHMENT PART I CHAPTER I On an ex...,crime punishment part chapter exceptionally ho...,crime and punishment part i chapter i on an ex...
1,Crime_and_Punishment,1,Crime_and_Punishment__0001,of mind; he walked along not observing what wa...,mind walk along observe care observe mutter so...,of mind; he walked along not observing what wa...
2,Crime_and_Punishment,2,Crime_and_Punishment__0002,the other into the street. This house was let ...,street house let tiny tenement inhabit work pe...,the other into the street. this house was let ...
3,Crime_and_Punishment,3,Crime_and_Punishment__0003,"paused, as though hesitating; then stepped on ...",pause hesitate step side point door room let v...,"paused, as though hesitating; then stepped on ..."


In [3]:
# Bag-of-words counts over the LDA text column.
# min_df=15 and max_df=0.95 prune the rarest and the most widespread terms;
# revisit both thresholds later if the vocabulary ends up too small or too large.
vectorizer = CountVectorizer(max_df=0.95, min_df=15)
dtm = vectorizer.fit_transform(df_docs["text_lda"])
vocab = vectorizer.get_feature_names_out()

# dtm is laid out as (n_docs, n_terms)
print("dtm shape:", dtm.shape)

dtm shape: (2395, 4499)


In [4]:
# Rows of dtm are documents, columns are vocabulary terms.
n_docs, vocab_size = dtm.shape
print("n_docs:", n_docs)
print("vocab_size:", vocab_size)

n_docs: 2395
vocab_size: 4499


In [5]:
# Number of topics to fit.
k = 20

# A fixed random_state keeps the fitted topics repeatable between runs.
lda = LatentDirichletAllocation(n_components=k, random_state=1881)

# Per-document topic weights, shape (n_docs, k).
doc_topic = lda.fit_transform(dtm)

In [6]:
def show_topics(model, feature_names, n_top_words=20):
    """Return the highest-weighted words of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``
        Each row of ``components_`` is treated as one topic's per-term weights.
    feature_names : sequence
        Indexable by term position; maps a column of ``components_`` to a word.
    n_top_words : int, default 20
        How many words to keep per topic.

    Returns
    -------
    list of (int, list)
        One ``(topic_index, top_words)`` pair per topic, with the words
        ordered from highest to lowest weight.
    """
    def _top_words(weights):
        # argsort is ascending, so reverse it before keeping the leading entries.
        ranked = np.argsort(weights)[::-1]
        return [feature_names[pos] for pos in ranked[:n_top_words]]

    return [(idx, _top_words(row)) for idx, row in enumerate(model.components_)]

# Collect the 20 strongest words per topic, then print one line per topic.
topics = show_topics(lda, vocab, n_top_words=20)
for topic_id, top_words in topics:
    word_list = ", ".join(top_words)
    print(f"topic {topic_id}: {word_list}")

topic 0: take, run, old, murder, open, door, house, away, think, hear, kill, get, father, find, window, might, begin, last, put, day
topic 1: pavlovitch, father, speak, rakitin, evgenie, elder, son, sov, old, suddenly, word, ask, begin, moment, talk, much, doctor, take, superior, laugh
topic 2: old, good, get, take, day, wife, young, think, little, friend, crocodile, sir, dear, give, marie, begin, gentleman, poor, last, year
topic 3: upon, yet, old, also, take, yes, money, think, good, never, grandmother, may, give, thing, stake, mlle, much, leave, lose, fact
topic 4: father, boy, day, good, brother, take, little, cry, hand, think, lise, ilusha, suddenly, laugh, woman, begin, alexey, get, want, give
topic 5: russian, believe, criminal, nothing, may, crime, question, law, article, take, men, right, exist, church, think, fact, thing, idea, become, speak
topic 6: prince, think, much, good, quite, word, seem, take, little, speak, ask, day, gania, last, great, fact, moment, question, upon, 

In [7]:
# Label every chunk's topic weights with the book the chunk came from.
topic_cols = [f"topic_{i}" for i in range(k)]
df_topic = pd.DataFrame(doc_topic, columns=topic_cols).assign(
    book_id=df_docs["book_id"].values
)

# Average topic weight per book (one row per book, one column per topic).
book_topic = df_topic.groupby("book_id")[topic_cols].mean()

# most prominent topic per book
top_topic = book_topic.idxmax(axis=1)

print("most prominent topic per book:", top_topic, sep="\n")

book_topic

most prominent topic per book:
book_id
Crime_and_Punishment              topic_11
Notes_from_the_Underground         topic_9
Poor_Folk                          topic_3
Short_Stories                      topic_2
The_Brothers_Karamazov             topic_4
The_Gambler                        topic_3
The_Grand_Inquisitor              topic_14
The_Idiot                          topic_6
The_Possessed _or_The_Devils      topic_17
White_Nights_and_Other_Stories    topic_11
dtype: str


Unnamed: 0_level_0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Crime_and_Punishment,0.099731,0.000551,0.016764,0.009445,0.020625,0.020958,0.079693,0.012434,0.050905,0.044201,0.012607,0.260155,0.251549,0.009586,0.027017,0.001837,0.006092,0.010249,0.046519,0.019081
Notes_from_the_Underground,0.006002,0.007904,0.069975,0.004015,0.01119,0.012442,0.002166,0.000185,0.045842,0.355561,0.098065,0.205014,0.033959,0.047912,0.005896,0.055178,0.003756,0.011531,0.018743,0.004665
Poor_Folk,0.020048,0.002023,0.026329,0.655487,0.01022,0.002843,0.050192,0.000179,0.00549,0.022997,0.003723,0.057121,0.00758,0.020246,0.021516,0.001242,0.007628,0.000179,0.078861,0.006098
Short_Stories,0.043754,0.000913,0.268128,0.013322,0.009377,0.043905,0.018394,0.000183,0.028519,0.055006,0.024075,0.137848,0.054973,0.026008,0.039175,0.007618,0.027438,0.153884,0.038971,0.00851
The_Brothers_Karamazov,0.133826,0.07861,0.019449,0.007685,0.163136,0.020287,0.008296,0.000218,0.144604,0.045406,0.012483,0.109207,0.034589,0.019777,0.10238,0.001672,0.008503,0.007462,0.036892,0.045518
The_Gambler,0.002192,0.006841,0.004172,0.731515,0.002183,0.006678,0.061123,0.000182,0.008417,0.025027,0.030696,0.055183,0.008049,0.004008,0.003798,0.002696,0.013388,0.010127,0.007325,0.016401
The_Grand_Inquisitor,0.000171,0.004373,0.000171,0.010434,0.000171,0.000171,0.002893,0.000171,0.000171,0.003731,0.000171,0.045047,0.000171,0.00108,0.914469,0.002132,0.000171,0.000171,0.008843,0.005285
The_Idiot,0.016208,0.02644,0.030631,0.006097,0.003531,0.015393,0.449061,0.001442,0.035864,0.028929,0.004292,0.045066,0.005741,0.018048,0.019999,0.006239,0.251271,0.00267,0.024229,0.00885
The_Possessed _or_The_Devils,0.050445,0.00241,0.015203,0.0043,0.031111,0.009295,0.013853,0.000173,0.051632,0.043498,0.02442,0.133068,0.222479,0.013504,0.04797,0.000981,0.005711,0.268515,0.053286,0.008149
White_Nights_and_Other_Stories,0.058572,0.004293,0.080368,0.019458,0.053704,0.006479,0.019586,0.000218,0.040952,0.17556,0.050889,0.221692,0.058692,0.042533,0.007783,0.021563,0.008354,0.008045,0.109849,0.011412


In [8]:
from nltk.corpus import wordnet
import numpy as np

# doc frequency for each term: number of documents with a non-zero count
df = np.asarray((dtm > 0).sum(axis=0)).ravel()
terms = np.array(vocab)

# positions of the 400 terms that occur in the most documents
top_ids = np.argsort(df)[::-1][:400]

# Heuristic name detector: among those widespread terms, keep the ones with
# at least 3 characters for which wordnet returns no synsets -- a word that
# wordnet doesn't know might be a name.
candidates = [
    term
    for term in terms[top_ids]
    if len(term) >= 3 and not wordnet.synsets(term)
]

print("candidate tokens (inspect and pick names):")
print(candidates)

candidate tokens (inspect and pick names):
['something', 'without', 'upon', 'everything', 'anything', 'since', 'else', 'whether', 'cannot', 'ought', 'towards', 'among', 'beside', 'others', 'anyone']


In [9]:
from collections import Counter

# Corpus-wide token counts over the LDA text; show the ten most frequent tokens.
freq = Counter(
    token
    for doc in df_docs["text_lda"]
    for token in doc.split()
)
print(freq.most_common(10))

[('take', 4372), ('think', 3904), ('day', 3373), ('good', 3237), ('give', 3057), ('get', 3055), ('begin', 2960), ('hand', 2723), ('prince', 2680), ('nothing', 2660)]


In [10]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the books' mean topic vectors,
# labelled by book on both axes.
book_ids = book_topic.index
sim = cosine_similarity(book_topic.values)
df_sim = pd.DataFrame(data=sim, index=book_ids, columns=book_ids)

df_sim

book_id,Crime_and_Punishment,Notes_from_the_Underground,Poor_Folk,Short_Stories,The_Brothers_Karamazov,The_Gambler,The_Grand_Inquisitor,The_Idiot,The_Possessed _or_The_Devils,White_Nights_and_Other_Stories
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Crime_and_Punishment,1.0,0.491842,0.134931,0.491587,0.556949,0.107965,0.103498,0.281977,0.705116,0.73581
Notes_from_the_Underground,0.491842,1.0,0.096857,0.488978,0.392889,0.086991,0.040932,0.123354,0.360736,0.863806
Poor_Folk,0.134931,0.096857,1.0,0.134057,0.116184,0.991427,0.049298,0.103366,0.082529,0.1983
Short_Stories,0.491587,0.488978,0.134057,1.0,0.392392,0.095213,0.129645,0.195028,0.620833,0.639047
The_Brothers_Karamazov,0.556949,0.392889,0.116184,0.392392,1.0,0.077344,0.340888,0.161941,0.431827,0.606819
The_Gambler,0.107965,0.086991,0.991427,0.095213,0.077344,1.0,0.020822,0.103657,0.066597,0.145423
The_Grand_Inquisitor,0.103498,0.040932,0.049298,0.129645,0.340888,0.020822,1.0,0.04656,0.141304,0.061651
The_Idiot,0.281977,0.123354,0.103366,0.195028,0.161941,0.103657,0.04656,1.0,0.112571,0.201152
The_Possessed _or_The_Devils,0.705116,0.360736,0.082529,0.620833,0.431827,0.066597,0.141304,0.112571,1.0,0.516105
White_Nights_and_Other_Stories,0.73581,0.863806,0.1983,0.639047,0.606819,0.145423,0.061651,0.201152,0.516105,1.0
