In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.preprocessing import label_binarize
import matplotlib.pyplot as plt
import seaborn as sns


# ================================
# CONFIG & LOAD DATA
# ================================
# Name of the label column in train.csv.
TARGET = "NObeyesdad"

# Raw frames exactly as read from disk.
train = pd.read_csv("/kaggle/input/ai-201-b-mse-2-aiml-a/train.csv")
test = pd.read_csv("/kaggle/input/ai-201-b-mse-2-aiml-a/test.csv")

# Work on copies so the raw frames stay untouched.
train_df, test_df = train.copy(), test.copy()
print("Dataset loaded successfully!")

# Pull the "id" column out of the test features before any processing;
# it is kept aside in test_id for the submission file.
test_id = test_df.pop("id")
# ==========================================
# EDA PART 1: NULL VALUE TABLE + BAR PLOT
# ==========================================
print("\nChecking null values...\n")

# Number of missing entries per column of the training frame.
nulls = train_df.isna().sum()
print(nulls)

# One bar per column; labels are rotated so long column names stay readable.
fig, ax = plt.subplots(figsize=(10, 4))
sns.barplot(x=nulls.index, y=nulls.values, ax=ax)
ax.set_title("Null Values in Each Column")
plt.xticks(rotation=90)
plt.show()
# ================================
# 1. Split X & y
# ================================
# Features are every training column except the target.
X = train_df.drop(columns=[TARGET])

# The "id" column was already removed from test_df, so it has to be removed
# from the training features too. Otherwise it would be picked up as a numeric
# feature, the imputer/scaler would be fitted on a column test_df does not
# have (KeyError), and the model would learn from a row identifier.
# errors="ignore" keeps this a no-op when train.csv has no "id" column.
X = X.drop(columns=["id"], errors="ignore")

# Target labels, index-aligned with X.
y = train_df[TARGET]

# ================================
# 2. Numeric & Categorical Columns
# ================================
# Numeric dtypes go to num_cols, everything else is treated as categorical.
num_cols = list(X.select_dtypes(include="number").columns)
cat_cols = list(X.select_dtypes(exclude="number").columns)

print("Numeric columns:", len(num_cols))
print("Categorical columns:", len(cat_cols))
# ==========================================
# EDA PART 2: NUMERICAL COLUMN DISTRIBUTIONS
# ==========================================
print("\nPlotting numeric distributions...\n")

# One histogram panel per numeric feature, all on a single figure.
X.hist(column=num_cols, bins=30, figsize=(12, 10))
plt.tight_layout()
plt.show()
# ==========================================
# EDA PART 3: CATEGORY-WISE DISTRIBUTION (COUNT PLOTS)
# ==========================================
print("\nPlotting category-wise countplots...\n")

# A separate figure for every categorical feature.
for col in cat_cols:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.countplot(data=X, x=col, ax=ax)
    ax.set_title(f"Countplot of {col}")
    plt.xticks(rotation=45)
    plt.show()


# ================================
# 3. Missing value imputation
# ================================
# Both imputers learn their fill values from the training features only and
# are then applied unchanged to the test features.

# Numeric columns: fill with the column median.
num_imputer = SimpleImputer(strategy="median").fit(X[num_cols])
X[num_cols] = num_imputer.transform(X[num_cols])
test_df[num_cols] = num_imputer.transform(test_df[num_cols])

# Categorical columns: fill with the most frequent value.
cat_imputer = SimpleImputer(strategy="most_frequent").fit(X[cat_cols])
X[cat_cols] = cat_imputer.transform(X[cat_cols])
test_df[cat_cols] = cat_imputer.transform(test_df[cat_cols])


# ==========================================
# EDA PART 4: OUTLIER BOXPLOTS FOR NUMERIC COLUMNS
# ==========================================
print("\nPlotting numeric outliers (boxplots)...\n")

# A separate boxplot figure for every numeric feature (values after imputation).
for col in num_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x=X[col], ax=ax)
    ax.set_title(f"Outlier Boxplot for {col}")
    plt.show()
# ================================
# 4. Outlier Capping Category-wise
# ================================
def cap_iqr_categorywise(df, cat_cols, num_cols):
    """Clip numeric columns to their per-category IQR fences.

    The categorical columns are processed one after another. For each one the
    rows are grouped by its values and, inside every group, each numeric
    column is clipped to ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` computed from
    that group. Later passes therefore see values already capped by earlier
    passes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``cat_cols`` and ``num_cols``.
        It is not modified.
    cat_cols : iterable of str
        Columns whose values define the groups.
    num_cols : iterable of str
        Numeric columns to clip.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the same index, row order and columns, and with
        the numeric columns capped. Rows whose grouping value is missing
        belong to no group and are returned unchanged for that pass.
    """
    df = df.copy()
    num_cols = list(num_cols)

    def cap_series(values):
        # Tukey fences computed from the values of a single group.
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        return values.clip(q1 - 1.5 * iqr, q3 + 1.5 * iqr)

    for c in cat_cols:
        has_key = df[c].notna()
        grouped = df.groupby(c, observed=True)

        # Transform only the selected numeric columns. Unlike
        # ``groupby(...).apply`` on the whole frame, this never depends on the
        # grouping column being handed to the function, and the result always
        # comes back in the original row order.
        capped = {col: grouped[col].transform(cap_series) for col in num_cols}

        for col, capped_col in capped.items():
            # Keep the original value where the row had no group.
            df[col] = df[col].where(~has_key, capped_col)

    return df

# Cap outliers in the training features only; test_df is not passed through
# cap_iqr_categorywise.
print("Capping Outliers...")
X = cap_iqr_categorywise(X, cat_cols, num_cols)

# ================================
# 5. One-Hot Encoding
# ================================
print("Encoding categorical columns...")

# The dense-output switch is called `sparse_output` from scikit-learn 1.2 on;
# the older `sparse` spelling was removed in 1.4 and raises TypeError there.
# Try the current name first and fall back for older installations.
try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)

# Categories are learned from the training features; categories that only
# occur in the test set encode as all-zero rows (handle_unknown="ignore").
X_cat = ohe.fit_transform(X[cat_cols])
test_cat = ohe.transform(test_df[cat_cols])

# Wrap the dense arrays in frames that keep the original row index so they
# can be concatenated column-wise with the remaining features.
ohe_columns = ohe.get_feature_names_out(cat_cols)
X_cat_df = pd.DataFrame(X_cat, index=X.index, columns=ohe_columns)
test_cat_df = pd.DataFrame(test_cat, index=test_df.index, columns=ohe_columns)

# Replace the raw categorical columns with their one-hot expansion.
X = pd.concat([X.drop(columns=cat_cols), X_cat_df], axis=1)
test_df = pd.concat([test_df.drop(columns=cat_cols), test_cat_df], axis=1)


# ================================
# 6. Standard Scaler
# ================================
# NOTE(review): the scaler is fitted on all training rows before the
# train/validation split below, so validation rows contribute to the
# scaling statistics.
scaler = StandardScaler().fit(X[num_cols])
X[num_cols] = scaler.transform(X[num_cols])
test_df[num_cols] = scaler.transform(test_df[num_cols])

# ================================
# 7. Train/Valid split
# ================================
# 80/20 hold-out, stratified on the target, with a fixed seed.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_valid, y_train, y_valid = split_parts

# ================================
# 8. RandomForest + RandomSearch
# ================================
# Base estimator; class weights are balanced against the label frequencies.
rf = RandomForestClassifier(class_weight="balanced", random_state=42)

# Discrete grid the randomized search samples from.
param_dist = dict(
    n_estimators=[200, 400, 600, 800],
    max_depth=[None, 10, 20, 25],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=["sqrt", "log2"],
)

# 5-fold stratified CV with a fixed shuffle seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 25 sampled configurations, scored by macro F1, using all available cores.
search_settings = dict(
    param_distributions=param_dist,
    n_iter=25,
    cv=cv,
    scoring="f1_macro",
    random_state=42,
    verbose=1,
    n_jobs=-1,
)
search = RandomizedSearchCV(rf, **search_settings)

print("\nTraining RandomForest...")
search.fit(X_train, y_train)

# Estimator from the best-scoring configuration found by the search.
best_model = search.best_estimator_
print("\nBest Parameters:", search.best_params_)
print("=" * 70)


# ================================
# 9. Validation Metrics
# ================================
pred_valid = best_model.predict(X_valid)

# ==========================================
# CONFUSION MATRIX
# ==========================================
# Pin the row/column order to the model's class order and show the class
# names on both axes; without this the heatmap only shows integer positions.
class_labels = best_model.classes_
cm = confusion_matrix(y_valid, pred_valid, labels=class_labels)

plt.figure(figsize=(10,6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.xticks(rotation=45, ha="right")
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()


# ==========================================
# CLASSIFICATION REPORT HEATMAP
# ==========================================
# Per-class metrics as a dict, turned into a frame with one row per report entry.
report = classification_report(y_valid, pred_valid, output_dict=True)
report_df = pd.DataFrame(report).transpose()

# The last row and the last column are left out of the plot
# (presumably the weighted average row and the support column - TODO confirm).
heatmap_data = report_df.iloc[:-1, :-1]

fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(heatmap_data, annot=True, cmap="Greens", ax=ax)
ax.set_title("Classification Report Heatmap")
plt.show()


# ==========================================
# MULTICLASS ROC CURVE (One-vs-Rest)
# ==========================================
# One indicator column per class, in the same order as the probability columns.
y_valid_bin = label_binarize(y_valid, classes=best_model.classes_)
valid_probs = best_model.predict_proba(X_valid)

fig, ax = plt.subplots(figsize=(10, 7))

# One curve per class: that class against all the others.
for i, cls in enumerate(best_model.classes_):
    fpr, tpr, _ = roc_curve(y_valid_bin[:, i], valid_probs[:, i])
    roc_auc = auc(fpr, tpr)
    ax.plot(fpr, tpr, label=f"{cls} (AUC = {roc_auc:.2f})")

# Chance-level reference diagonal.
ax.plot([0, 1], [0, 1], 'k--')
ax.set_title("Multiclass ROC Curve")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.legend()
plt.show()


# Headline validation scores followed by the full per-class text report.
valid_accuracy = accuracy_score(y_valid, pred_valid)
valid_macro_f1 = f1_score(y_valid, pred_valid, average='macro')
valid_report_text = classification_report(y_valid, pred_valid)

print("Validation Accuracy:", valid_accuracy)
print("Macro F1:", valid_macro_f1)
print(valid_report_text)

# ==================================================
# 10. FIX: Make sure test_df columns match training
# ==================================================
# Same columns, in the same order, as the frame the model was fitted on.
test_df = test_df.loc[:, X_train.columns]

# ================================
# 10. Predict on Test
# ================================
test_probs = best_model.predict_proba(test_df)
classes = best_model.classes_

# One probability column per class, named "<TARGET>_<class>", with the saved
# test ids as the first column.
prob_columns = [f"{TARGET}_{c}" for c in classes]
submission = pd.DataFrame(test_probs, columns=prob_columns)
submission.insert(0, "id", test_id)

submission.to_csv("submission.csv", index=False)
print("\nsubmission.csv saved!")

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/ai-201-b-mse-2-aiml-a/train.csv'

In [None]:
# Hard class predictions for the test set.
test_pred = best_model.predict(test_df)

# Two-column submission: the saved ids next to the predicted label.
submission = test_id.to_frame(name="id").assign(**{TARGET: test_pred})

submission.to_csv("verma.csv", index=False)
print("Submission saved!")





# test_probs = best_model.predict_proba(test_df)
# classes = best_model.classes_

# submission = pd.DataFrame(test_probs, columns=[f"{TARGET}_{c}" for c in classes])
# submission.insert(0, "id", test_id)

# submission.to_csv("submission.csv", index=False)
# print("\nsubmission.csv saved!")