# Clustering and Topic Modeling

This notebook demonstrates TF-IDF vectorization and K-Means clustering on article abstracts.

In [None]:
import sys
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Colab Setup
if 'google.colab' in sys.modules:
    !pip install -q pandas numpy scikit-learn
    if not os.path.exists('aire-researcher-sandbox'):
        !git clone https://github.com/YOUR_USERNAME/aire-researcher-sandbox.git
    %cd aire-researcher-sandbox

In [None]:
# Load the sample articles; the path is relative to the current working directory.
data_path = 'data/sample_texts/articles_sample.csv'
df = pd.read_csv(filepath_or_buffer=data_path)

# Replace missing abstracts with empty strings before vectorization.
texts = df.loc[:, 'abstract'].fillna(value='')

In [None]:
# Vectorization
# Turn the abstracts into a TF-IDF matrix, dropping English stop words and
# limiting the vectorizer to max_features=1000.
vectorizer = TfidfVectorizer(
    max_features=1000,
    stop_words='english',
)
X = vectorizer.fit_transform(texts)

# Report the dimensions of the resulting matrix.
print(f"Matrix shape: {X.shape}")

In [None]:
# Clustering
# Fit K-Means with 5 clusters on the TF-IDF matrix; random_state is fixed so
# repeated runs use the same seed.
kmeans = KMeans(
    n_clusters=5,
    n_init=10,
    random_state=42,
)
clusters = kmeans.fit_predict(X)

# Store each article's cluster label on the frame and preview the first ten rows.
df['cluster'] = clusters
display(df.loc[:, ['title', 'cluster']].head(10))