# In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# --------------------------------------------------------
# 1. LOAD DATA
# --------------------------------------------------------
# Locations of the train and test CSV files under the Kaggle input directory.
train_path = "/kaggle/input/mock-test-2-mse-2/train.csv"
test_path = "/kaggle/input/mock-test-2-mse-2/test.csv"

# Read both files into DataFrames, train first and test second.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_path, test_path)
)

# --------------------------------------------------------
# 2. SEPARATE FEATURES & TARGET
# --------------------------------------------------------
# Test side: remember the ids for the submission and work on a copy so the
# raw test frame stays untouched.
X_test = test_df.copy()
test_ids = test_df["id"]

# Train side: every column except "Status" is a feature; "Status" is the label.
X = train_df.drop("Status", axis=1)
y = train_df["Status"]

# --------------------------------------------------------
# 3. IDENTIFY COLUMN TYPES
# --------------------------------------------------------
# Numeric features are the int64/float64 columns and categorical features are
# the object columns; the "id" column is excluded from both lists.
numeric_frame = X.select_dtypes(include=["int64", "float64"])
object_frame = X.select_dtypes(include=["object"])

num_cols = [name for name in numeric_frame.columns if name != "id"]
cat_cols = [name for name in object_frame.columns if name != "id"]

# --------------------------------------------------------
# 4. VISUALIZATIONS
# --------------------------------------------------------
# One 30-bin histogram per feature for the first five numeric columns,
# ignoring missing values.
for feature in num_cols[:5]:
    fig, ax = plt.subplots(figsize=(5, 3))
    ax.hist(X[feature].dropna(), bins=30)
    ax.set_title(f"Histogram - {feature}")
    fig.tight_layout()
    plt.show()

# Pairwise correlations between all numeric features, drawn without
# per-cell annotations.
corr_matrix = X[num_cols].corr()
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=False, ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

# --------------------------------------------------------
# 5. OUTLIER CAPPING (with warnings ignored)
# --------------------------------------------------------
def cap_outliers(df, cols, reference=None, factor=1.5):
    """Return a copy of ``df`` with the given columns clipped to IQR fences.

    For every column in ``cols`` the fences are ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR``, where ``Q1``/``Q3`` are the 25th/75th percentiles
    and ``IQR = Q3 - Q1``. Values outside the fences are replaced by the
    nearest fence.

    Args:
        df: DataFrame to cap. It is copied first and never modified.
        cols: Iterable of column names to cap.
        reference: Optional DataFrame whose quantiles define the fences,
            e.g. the training data when capping a test set so that both
            receive the same thresholds. When ``None`` (the default) the
            fences are computed from ``df`` itself.
        factor: Multiplier applied to the IQR when building the fences.
            Defaults to the conventional 1.5.

    Returns:
        A new DataFrame with the same columns as ``df`` and the requested
        columns clipped.
    """
    capped = df.copy()
    # Quantiles come from the reference frame when one is supplied, otherwise
    # from the working copy of the data being capped.
    source = capped if reference is None else reference
    for col in cols:
        q1 = source[col].quantile(0.25)
        q3 = source[col].quantile(0.75)
        iqr = q3 - q1
        lower = q1 - factor * iqr
        upper = q3 + factor * iqr
        # Ignore runtime warnings for NaN or invalid comparisons
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=RuntimeWarning)
            capped[col] = capped[col].clip(lower, upper)
    return capped

# Learn the 1.5 * IQR fences from the training data only and apply those same
# fences to the test set. Capping the test set with its own quartiles would
# transform train and test differently and let test-set statistics influence
# preprocessing.
for col in num_cols:
    q1 = X[col].quantile(0.25)
    q3 = X[col].quantile(0.75)
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    X[col] = X[col].clip(lower, upper)
    X_test[col] = X_test[col].clip(lower, upper)

# --------------------------------------------------------
# 6. PREPROCESSING PIPELINE
# --------------------------------------------------------
# Numeric features: fill missing values with the column median.
numeric_tf = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode; categories unseen during fit are ignored at transform.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
categorical_tf = Pipeline(steps=categorical_steps)

# Route each group of columns to its own transformer.
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_tf, num_cols),
        ("cat", categorical_tf, cat_cols),
    ]
)

# --------------------------------------------------------
# 7. FINAL MODEL (RandomForest)
# --------------------------------------------------------
# Hyperparameters for the forest, kept together so they are easy to review.
rf_params = dict(
    n_estimators=500,
    max_depth=25,
    min_samples_split=3,
    min_samples_leaf=1,
    max_features='sqrt',
    bootstrap=True,
    random_state=42,
    class_weight="balanced",
)

# Full pipeline: preprocessing followed by the random forest classifier.
model = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("rf", RandomForestClassifier(**rf_params)),
    ]
)

# --------------------------------------------------------
# 8. TRAIN/VALIDATION SPLIT
# --------------------------------------------------------
# Hold out 20% of the rows for validation, stratified on the target and
# seeded for reproducibility.
split_parts = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)
X_train, X_valid, y_train, y_valid = split_parts

# --------------------------------------------------------
# 9. TRAIN MODEL
# --------------------------------------------------------
# Fit the preprocessing steps and the forest on the training split.
model.fit(X_train, y_train)

# --------------------------------------------------------
# 10. VALIDATION ACCURACY
# --------------------------------------------------------
# Score the fitted pipeline on the held-out rows.
preds = model.predict(X_valid)
valid_accuracy = accuracy_score(y_valid, preds)
print("Validation Accuracy:", valid_accuracy)

# --------------------------------------------------------
# 11. PREDICT TEST PROBABILITIES
# --------------------------------------------------------
# Class probabilities for the test rows; the columns of `probs` follow the
# order of the fitted forest's `classes_`.
probs = model.predict_proba(X_test)
class_order = model.named_steps["rf"].classes_

# --------------------------------------------------------
# 12. SAVE SUBMISSION
# --------------------------------------------------------
# Look up each label's column position once instead of rebuilding the list
# for every output column.
labels = list(class_order)
submission = pd.DataFrame(
    {
        "id": test_ids,
        "Status_C": probs[:, labels.index("C")],
        "Status_CL": probs[:, labels.index("CL")],
        "Status_D": probs[:, labels.index("D")],
    }
)

submission.to_csv("submission.csv", index=False)
print("submission.csv generated successfully!")