# 🗞️ News Article Clustering Project

This notebook clusters news articles based on their content using TF-IDF vectorization and KMeans clustering.

In [None]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score


## 📥 Load Dataset

In [None]:
# Read the news-article CSV into a DataFrame and preview the first rows.
csv_path = "data/simulated_news.csv"
df = pd.read_csv(csv_path)
df.head()


## 🔍 Exploratory Data Analysis

In [None]:
# Basic info: print a summary of the frame (columns, dtypes, non-null counts, memory usage)
df.info()


In [None]:
# Article length distribution.
# `.str.len()` is the vectorized equivalent of `.apply(len)`; unlike `len`, it yields
# NaN for a missing article instead of raising TypeError, so one empty cell in the CSV
# does not abort the notebook (the histogram simply ignores NaN lengths).
df['length'] = df['Article'].str.len()

fig, ax = plt.subplots()
sns.histplot(df['length'], kde=True, ax=ax)
ax.set_title("Article Length Distribution")
ax.set_xlabel("Length (characters)")
ax.set_ylabel("Count")
plt.show()


## 🧹 Data Preprocessing and Feature Extraction

In [None]:
# Convert the raw article text into a sparse TF-IDF document-term matrix,
# ignoring scikit-learn's built-in English stop words.
corpus = df["Article"]
vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(corpus)
X.shape  # (number of documents, vocabulary size)


## 🧠 Model Training

In [None]:
# Partition the TF-IDF vectors into 5 clusters.
# `n_init` is pinned explicitly: scikit-learn emits a FutureWarning about it in 1.2-1.3
# and changed the default from 10 to "auto" in 1.4, so leaving it implicit makes the
# cluster assignments depend on the installed library version.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)

# fit_predict is equivalent to fit(X) followed by predict(X) on the same data.
df["Cluster"] = kmeans.fit_predict(X)


## 📊 Model Evaluation

In [None]:
# Silhouette coefficient of the cluster assignment, computed on the TF-IDF matrix
# (ranges from -1 to 1; higher means tighter, better-separated clusters).
cluster_labels = df["Cluster"]
score = silhouette_score(X, cluster_labels)
print(f"Silhouette Score: {score:.4f}")


## 📉 PCA Visualization

In [None]:
# Project the TF-IDF vectors onto their first two principal components for plotting.
# random_state is fixed because PCA's default "auto" solver may select the randomized
# SVD for large matrices, which would otherwise make the projection vary between runs.
pca = PCA(n_components=2, random_state=42)

# NOTE: .toarray() densifies the sparse matrix (n_documents x vocabulary_size);
# acceptable for a small corpus, but memory-hungry for a large one.
X_pca = pca.fit_transform(X.toarray())
df["PCA1"], df["PCA2"] = X_pca[:, 0], X_pca[:, 1]

# One point per article, coloured by its KMeans cluster label.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x="PCA1", y="PCA2", hue="Cluster", palette="tab10", s=100, ax=ax)
ax.set_title("News Article Clusters (PCA View)")
plt.show()


## 💾 Save Model and Vectorizer

In [None]:
# Persist the fitted KMeans model and TF-IDF vectorizer so they can be reused
# without retraining. (`os` is imported in the notebook's top import cell.)
# NOTE: pickle files can execute arbitrary code when loaded; only unpickle these
# artifacts from a trusted location.
os.makedirs("model", exist_ok=True)

artifacts = {
    "model/kmeans_model.pkl": kmeans,
    "model/tfidf_vectorizer.pkl": vectorizer,
}
for artifact_path, artifact in artifacts.items():
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)
