In [1]:
pip install gensim





In [2]:
from scipy import sparse
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
import pandas as pd
from gensim import corpora, models
from gensim.models.coherencemodel import CoherenceModel
from gensim.matutils import Sparse2Corpus

In [3]:
# Load the articles table; later cells read its 'preprocessed_text' column.
# NOTE(review): the file name is spelled 'processced' -- confirm it matches the
# file on disk before "fixing" the spelling here.
df = pd.read_csv('processced text.csv')


### Preprocessing the text for calculating Coherence score

In [4]:
def _to_tokens(value):
    """Return the token list for one cell of the 'preprocessed_text' column.

    * str         -> split on whitespace.
    * list/tuple  -> returned as a list unchanged (the column has already been
                     tokenized by a previous run of this cell).
    * anything else (e.g. the NaN pandas yields for an empty CSV field)
                  -> empty document, so the row is kept and row positions
                     stay the same.
    """
    if isinstance(value, str):
        return value.split()
    if isinstance(value, (list, tuple)):
        return list(value)
    return []


# Tokenize in a way that is safe to re-run and tolerant of missing text.
df['preprocessed_text'] = df['preprocessed_text'].apply(_to_tokens)
texts = df['preprocessed_text'].tolist()
# create a gensim dictionary from the texts
dictionary = corpora.Dictionary(texts)
# create a gensim corpus (one bag-of-words per document)
corpus = [dictionary.doc2bow(text) for text in texts]

### Model

In [5]:
# Load the precomputed bag-of-words matrix stored in SciPy's sparse .npz format.
# NOTE(review): presumably produced by the CountVectorizer unpickled further
# down, with one row per row of `df` -- confirm against the notebook that
# wrote this file.
bow_matrix = sparse.load_npz("bow_matrix.npz")

In [6]:
# Number of latent topics the LDA model will learn.
num_topics = 10

# Unfitted scikit-learn LDA estimator: at most 50 iterations, with a fixed
# random_state so repeated runs give the same topics.
lda_model = LatentDirichletAllocation(
    n_components=num_topics,
    max_iter=50,
    random_state=42,
)

In [7]:
# Fit the LDA model on the loaded bag-of-words matrix. As the last expression
# of the cell, the returned estimator is also shown as the cell output.
lda_model.fit(bow_matrix)

In [8]:
import pickle

# Restore the pickled vectorizer so topic-word indices can be mapped back to
# vocabulary terms.
# NOTE: pickle.load can execute arbitrary code -- only load files you created
# or otherwise trust.
with open('out/count_vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)
    
# Vocabulary terms; indexed below with column indices of lda_model.components_.
feature_names = vectorizer.get_feature_names_out()

In [9]:


# How many of the highest-weighted words to list for each topic.
num_top_words = 50

# topic_words[k] holds the top words of topic k, strongest first.
topic_words = []
for topic_number, weights in enumerate(lda_model.components_, start=1):
    # argsort is ascending, so reverse it and keep the leading entries
    strongest_first = weights.argsort()[::-1][:num_top_words]
    words = [feature_names[idx] for idx in strongest_first]

    print(f"Topic {topic_number}:")
    print(", ".join(words))
    topic_words.append(words)

Topic 1:
new, york, city, year, nyc, mayor, ny, child, jersey, theater, brooklyn, restaurant, manhattan, island, state, connecticut, food, long, number, street, bronx, michael, day, play, school, open, week, blasio, lottery, bloomberg, say, art, square, guide, mental, family, come, public, event, citys, parent, center, look, cultural, times, east, disorder, old, childhood, yearold
Topic 2:
new, city, york, coronavirus, year, real, state, estate, home, housing, travel, ny, ncov, park, space, area, building, safety, hurricane, residential, gas, water, change, oil, andrew, global, cuomo, energy, center, climate, pandemic, long, work, hotel, say, time, traffic, india, manhattan, national, week, vacation, accident, storm, service, million, leave, warming, road, administration
Topic 3:
united, states, trump, government, president, politic, election, party, donald, state, washington, house, republican, obama, international, say, presidential, ukraine, democratic, senate, biden, military, russ

### Topic Coherence Measurement 

In [None]:
# Index -> term lookup for the vectorizer vocabulary.
# NOTE(review): `id2word` is not referenced again in this cell.
id2word = dict(enumerate(feature_names))

num_top_words = 50
# For every topic, the `num_top_words` highest-weighted vocabulary terms,
# strongest first.
topics = [
    [feature_names[idx] for idx in weights.argsort()[::-1][:num_top_words]]
    for weights in lda_model.components_
]

# compute Coherence Score
# NOTE(review): gensim's CoherenceModel has its own `topn` argument (default 20
# per the gensim docs), so possibly only the first 20 words of each topic are
# scored -- confirm whether topn=num_top_words was intended.
coherence_model_lda = CoherenceModel(
    topics=topics,
    texts=texts,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score (C_v): ', coherence_lda)

# Assessing how similar (or dissimilar) the topics are to each other

In [None]:
# Join each topic's top words into one space-separated pseudo-document and
# vectorize it, giving one bag-of-words row per topic.
bow = vectorizer.transform([' '.join(words) for words in topic_words])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between every pair of rows of `bow` (one row per topic),
# computed in a single call. The result is a square array whose size follows
# the number of topics, so nothing here is tied to a hard-coded 10, and no
# np.matrix (discouraged by NumPy) is involved.
similarity_matrix = cosine_similarity(bow)

similarity_matrix

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Work on a plain 2-D array and take the topic count from the data itself
# instead of assuming exactly 10 topics.
similarity_values = np.asarray(similarity_matrix)
topic_total = similarity_values.shape[0]

# Heatmap cell i spans [i, i + 1] on each axis, so its centre is at i + 0.5.
# Placing the ticks there keeps each label aligned with its row/column.
tick_positions = np.arange(topic_total) + 0.5
tick_labels = np.arange(1, topic_total + 1)  # topics are numbered from 1

plt.figure(figsize=(10, 8))
sns.heatmap(similarity_values, annot=True, cmap="YlGnBu", fmt=".2f")

plt.xticks(tick_positions, tick_labels)
plt.yticks(tick_positions, tick_labels)

plt.title("Topic Similarity")
plt.show()


### Visualizing the distribution of articles across the 10 topics

In [None]:

# Topic mixture of every document in the bag-of-words matrix
doc_topic_dist = lda_model.transform(bow_matrix)

# Index of the highest-probability topic for each document ...
dominant_topic = doc_topic_dist.argmax(axis=1)
# ... and the number of documents assigned to each topic (topics that win no
# document still get a 0 thanks to minlength)
topic_counts = np.bincount(dominant_topic, minlength=lda_model.n_components)

# Bar chart of the per-topic document counts
topic_labels = [f'Topic {k + 1}' for k in range(lda_model.n_components)]

plt.figure(figsize=(12, 6))
plt.bar(range(lda_model.n_components), topic_counts, tick_label=topic_labels)
plt.xlabel('Topic')
plt.ylabel('Number of Articles')
plt.title('Distribution of Articles Across Topics')
plt.xticks(rotation=45)
plt.show()


### Hyperparameter Tuning

# Save the fitted model as a pickle for future use (e.g., in a new notebook that uses it for classification)

In [None]:
import pickle

# Serialize the fitted LDA model to the 'out' directory so it can be reloaded
# later with pickle.load. The directory is not created here.
with open('out/lda_model.pkl', 'wb') as f:
    pickle.dump(lda_model, f)