# 📚 Topic Modeling of Research Papers
This notebook demonstrates topic modeling with Latent Dirichlet Allocation (LDA) on research paper abstracts.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from wordcloud import WordCloud


## 📥 Load Dataset

In [None]:
# Load the research-paper table from a CSV path relative to the working directory.
df = pd.read_csv(filepath_or_buffer="data/simulated_research_papers.csv")
# Preview the first five rows to confirm the file parsed as expected.
df.head(n=5)


## 🔍 Data Exploration

In [None]:
# Histogram of abstract lengths (character counts) in 10 bins. The title and
# axis labels are applied through the Axes object that `hist` returns.
(
    df["Abstract"]
    .str.len()
    .hist(bins=10)
    .set(
        title="Distribution of Abstract Length",
        xlabel="Characters",
        ylabel="Frequency",
    )
)
plt.show()


## 🧹 Text Vectorization

In [None]:
# Bag-of-words counts over the abstracts. English stop words are removed, and a
# term is kept only if it appears in at least 2 documents (min_df) and in no
# more than 95% of them (max_df).
vectorizer = CountVectorizer(
    stop_words="english",
    min_df=2,
    max_df=0.95,
)
X_counts = vectorizer.fit_transform(raw_documents=df["Abstract"])
# Shape of the document-term matrix: (documents, vocabulary terms).
X_counts.shape


## 🧠 Topic Modeling with LDA

In [None]:
# Five-topic LDA fitted with the online learning method for 10 passes;
# random_state is pinned so repeated runs give the same topics.
lda_model = LatentDirichletAllocation(
    n_components=5,
    learning_method="online",
    max_iter=10,
    random_state=42,
)
lda_model.fit(X=X_counts)


## 🔍 Top Words per Topic

In [None]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted vocabulary terms of every topic.

    Args:
        model: Object exposing ``components_``; iterating it yields one
            weight vector per topic, each of which supports ``argsort()``.
        feature_names: Sequence indexed by vocabulary position that gives
            the term for each weight.
        no_top_words: Number of terms to print per topic.

    For each topic this prints a ``Topic N:`` header (N starts at 1) and then
    one line of space-separated terms ordered from largest weight to smallest.
    """
    for topic_number, weights in enumerate(model.components_, start=1):
        print(f"Topic {topic_number}:")
        # argsort is ascending, so reverse it and keep the leading positions.
        strongest = weights.argsort()[::-1][:no_top_words]
        print(" ".join(feature_names[position] for position in strongest))

# Show the 10 highest-weighted vocabulary terms for each fitted topic.
display_topics(
    model=lda_model,
    feature_names=vectorizer.get_feature_names_out(),
    no_top_words=10,
)


## 📊 Topic Distribution for Documents

In [None]:
# Per-document topic mixture: one row per abstract, one column per topic.
topic_distribution = lda_model.transform(X_counts)
topic_columns = [f"Topic_{i+1}" for i in range(topic_distribution.shape[1])]
# Reuse df's index so the column-wise concat below lines rows up one-to-one.
# With a fresh 0..n-1 index, concat would align on labels and pad with NaN
# whenever df carries any other index (e.g. after filtering rows).
df_topic = pd.DataFrame(topic_distribution, columns=topic_columns, index=df.index)
# Drop topic columns left by an earlier run of this cell. Without this,
# re-running appends duplicate Topic_* columns and df['Topic_1'] stops
# returning a single Series.
df = pd.concat([df.drop(columns=topic_columns, errors="ignore"), df_topic], axis=1)
df.head()


## 💾 Save Model and Vectorizer

In [None]:
import os

# Persist the fitted LDA model and the vectorizer as pickle files under model/.
os.makedirs("model", exist_ok=True)
for artifact_path, artifact in (
    ("model/lda_model.pkl", lda_model),
    ("model/vectorizer.pkl", vectorizer),
):
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)
