In [None]:
# ==========================
# 🧠 DOSHA CLASSIFICATION NOTEBOOK
# ==========================

# --- IMPORT LIBRARIES ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import pickle
import os

# --- LOAD DATA ---
# Reads the dataset into `df`, prints a quick overview, then drops incomplete rows.
file_path = "dataset.csv"   # Change to your dataset name

# Choose the reader from the file extension. The comparison is case-insensitive
# so that names such as "DATA.CSV" are still parsed as CSV; anything that is not
# a CSV is handed to the Excel reader.
if file_path.lower().endswith(".csv"):
    df = pd.read_csv(file_path)
else:
    df = pd.read_excel(file_path)

print("âœ… Data Loaded Successfully\n")
print("Shape:", df.shape)
# `display` is provided by the IPython/Jupyter runtime; it is not imported in this file.
display(df.head())

# --- CHECK FOR MISSING VALUES ---
# Per-column count of null cells, printed before any rows are removed.
print("\nMissing Values:\n", df.isnull().sum())

# --- HANDLE MISSING VALUES IF ANY ---
# Drop every row containing at least one null value and renumber the index from 0.
df = df.dropna().reset_index(drop=True)

# --- ENCODE CATEGORICAL VARIABLES ---
# Fail early, before `df` is modified, if the target column is absent.
if 'Dosha' not in df.columns:
    raise KeyError("Target column 'Dosha' not found in the dataset")

# Every object-dtype column (the 'Dosha' target included, if it is text) is
# replaced in place by integer codes. The fitted encoders are kept per column
# so the codes can be mapped back to the original labels later.
label_encoders = {}
for col in df.columns:
    if df[col].dtype == 'object':
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le

# --- DEFINE FEATURES & TARGET ---
X = df.drop('Dosha', axis=1)
y = df['Dosha']

# --- TRAIN/TEST SPLIT ---
# Split BEFORE scaling. The row assignment depends only on `y` and `random_state`,
# so the same rows land in train/test as when splitting already-scaled data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- STANDARDIZE FEATURES ---
# Fit the scaler on the training rows only, then apply the same transform to the
# test rows. Fitting on the full dataset would leak test-set mean/variance into
# training and make the reported scores optimistic.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Full feature matrix under the train-fitted scaling; kept so that any code
# relying on the `X_scaled` name continues to work.
X_scaled = scaler.transform(X)

# ==========================
# 📊 DATA VISUALIZATION
# ==========================

# Distribution of target
# Bar chart of how many rows fall into each target class. If the target column
# was text, `y` holds its integer codes at this point, so the axis shows codes.
plt.figure(figsize=(8,5))
sns.countplot(x=y)
plt.title("Dosha Distribution", fontsize=14)
plt.xlabel("Dosha Type")
plt.ylabel("Count")
plt.show()

# Correlation Heatmap
# Pairwise correlation of all columns of `df` (features and target), without
# per-cell annotations.
plt.figure(figsize=(12,8))
sns.heatmap(df.corr(), cmap='coolwarm', annot=False)
plt.title("Feature Correlation Heatmap", fontsize=14)
plt.show()

# Pairplot of first few features
# Take the columns from `X` (features only) rather than `df`: if 'Dosha' were
# among the first five columns of `df`, joining `y` back on would fail because
# the column names overlap. When 'Dosha' is not among them the selection is
# identical to `df.iloc[:, :5]`.
sns.pairplot(X.iloc[:, :5].join(y), hue="Dosha", diag_kind="kde")
plt.suptitle("Pairplot of Selected Features", y=1.02)
plt.show()

# ==========================
# 🤖 MODEL TRAINING
# ==========================

# Candidate classifiers, keyed by the display name that is reused in the console
# output, the plot titles and the pickle file names.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "SVM": SVC(kernel='rbf', probability=True, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5)
}

# Test-set accuracy per model name, filled in by the loop below.
results = {}

# Fix the class order once, from the full target column, so the report, the
# confusion matrix and its tick labels always agree -- even when a class does
# not occur in the test split or in a model's predictions.
class_ids = np.unique(y)
target_encoder = label_encoders.get('Dosha')
if target_encoder is not None:
    # The target was label-encoded: show the original class names instead of codes.
    class_names = [str(label) for label in target_encoder.classes_[class_ids]]
else:
    class_names = [str(label) for label in class_ids]

# NOTE: the models are fitted on scaled features, so anything that loads the
# pickles later must apply the same fitted `scaler` to its inputs.
for name, model in models.items():
    print("\n==============================")
    print(f"ðŸ”¹ Training {name}")
    print("==============================")

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # --- Classification Report ---
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, labels=class_ids, target_names=class_names))

    # --- Confusion Matrix ---
    # `labels` pins the matrix rows/columns to `class_ids`, matching `display_labels`.
    cm = confusion_matrix(y_test, y_pred, labels=class_ids)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    disp.plot(cmap='Blues')
    plt.title(f"{name} - Confusion Matrix")
    plt.show()

    # --- Save Model ---
    # e.g. "Random Forest" -> "random_forest_model.pkl", written to the working directory.
    model_filename = f"{name.replace(' ', '_').lower()}_model.pkl"
    # The context manager closes the file handle even if pickling fails.
    with open(model_filename, "wb") as model_file:
        pickle.dump(model, model_file)
    print(f"ðŸ’¾ Model saved as {model_filename}")

    # --- Accuracy ---
    # Fraction of correct test predictions; reuses `y_pred` rather than running
    # a second prediction pass through `model.score`.
    acc = float(np.mean(y_pred == np.asarray(y_test)))
    results[name] = acc
    print(f"âœ… Accuracy: {acc:.3f}")

# ==========================
# 📈 PERFORMANCE SUMMARY
# ==========================
# Pull the names and scores out of `results` once; both the console table and
# the bar chart below use them in the same (insertion) order.
model_names = list(results.keys())
accuracies = list(results.values())

# Console table: one row per model, name left-justified in a 25-character column.
print("\nMODEL PERFORMANCE SUMMARY:\n")
for model_name, acc in zip(model_names, accuracies):
    print(f"{model_name:25s} -> Accuracy: {acc:.3f}")

# --- Accuracy Bar Plot ---
# One bar per model, with the y axis pinned to the 0-1 range.
plt.figure(figsize=(8, 5))
sns.barplot(x=model_names, y=accuracies, palette="viridis")
plt.title("Model Accuracy Comparison", fontsize=14)
plt.ylabel("Accuracy")
plt.ylim(0, 1)
plt.xticks(rotation=15)
plt.show()

print("\nðŸŽ¯ All models trained and saved successfully!")
