In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather the full path of every file found below the Kaggle input directory,
# then print the paths one per line.
input_paths = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import (
    accuracy_score, 
    log_loss, 
    precision_score, 
    recall_score, 
    f1_score, 
    classification_report
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, log_loss
from sklearn.calibration import CalibratedClassifierCV

# ==================== Load Data ====================
df = pd.read_csv("/kaggle/input/mock-test-2-mse-2/train.csv")

# Encode the target as integers: D -> 0, C -> 1, CL -> 2.
df["Status_bin"] = df["Status"].map({"D": 0, "C": 1, "CL": 2})

# Series.map turns every label that is absent from the mapping (including a
# missing value) into NaN. Fail right here, naming the offending values,
# instead of letting a NaN target surface later as an obscure error while
# splitting or fitting.
unmapped_rows = df["Status_bin"].isna()
if unmapped_rows.any():
    raise ValueError(
        "Unexpected 'Status' values: "
        f"{df.loc[unmapped_rows, 'Status'].unique().tolist()}"
    )

# Features are everything except the raw target, the encoded target and the row id.
X = df.drop(["Status", "Status_bin", "id"], axis=1)
y = df["Status_bin"]

# ==================== Encode categorical columns ====================
cat_cols = X.select_dtypes(include=["object"]).columns

# Ordinal-encode all categorical columns in one pass. Categories not seen
# during fit are encoded as -1 at transform time.
oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
# The encoder rejects an input with zero columns, so skip it when the frame
# has no object-typed features.
if len(cat_cols) > 0:
    # astype(str) makes every cell a string before encoding, so a missing
    # value becomes its own category rather than staying NaN.
    X[cat_cols] = oe.fit_transform(X[cat_cols].astype(str))

# ==================== Fill missing values ====================
# Replace remaining NaNs with the per-column median.
# NOTE(review): medians are computed on the full frame, i.e. before the
# train/test split below, so the hold-out rows influence the imputation.
X = X.fillna(X.median(numeric_only=True))
# ==================== Cap outliers ====================
def cap_outliers(df, cols, lower=1, upper=99):
    """Clip the given columns to their own lower/upper percentile values.

    Args:
        df: DataFrame holding the columns to cap. It is left unmodified.
        cols: Iterable of column labels to cap; each column must support
            ``quantile`` and ``clip`` (i.e. be numeric).
        lower: Lower percentile on a 0-100 scale; values below it are raised
            to that percentile's value.
        upper: Upper percentile on a 0-100 scale; values above it are lowered
            to that percentile's value.

    Returns:
        A new DataFrame in which every column listed in ``cols`` has been
        clipped to its [lower, upper] percentile range. Columns not listed
        are returned unchanged.
    """
    # Work on a copy: the original version wrote the clipped values back into
    # the caller's frame, silently altering data the caller may still need.
    capped = df.copy()
    for col in cols:
        # Percentiles are given on a 0-100 scale; quantile() expects 0-1.
        low_val = capped[col].quantile(lower / 100)
        high_val = capped[col].quantile(upper / 100)
        capped[col] = capped[col].clip(low_val, high_val)
    return capped

# Cap only the genuinely numeric features. After ordinal encoding the
# categorical columns hold float64 codes as well; quantile-clipping those
# codes would distort or merge rare categories, so they are excluded here.
num_cols = (
    X.select_dtypes(include=['float64', 'int64'])
    .columns
    .difference(cat_cols, sort=False)
)
X = cap_outliers(X, num_cols)

# ==================== Train-test split ====================
# Stratified split so every class keeps its share in both parts.
# NOTE(review): with test_size=0.02 the hold-out set is very small, so the
# metrics computed on it will be noisy.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.02, random_state=42, stratify=y
)

# ==================== Random Forest Classifier ====================
# Number of distinct target classes present in the training split.
num_classes = len(np.unique(y_train))

rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=None,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1
)

# Wrap the forest in an isotonic calibrator (5-fold CV) so predict_proba
# returns calibrated probabilities.
# The estimator is passed positionally on purpose: the keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name
# was removed in 1.4, while the first positional argument works on all of them.
model = CalibratedClassifierCV(rf, method='isotonic', cv=5)
model.fit(X_train, y_train)

# ==================== Evaluate on the hold-out split ====================
# Hard class predictions and per-class probabilities for the held-out rows.
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)

acc = accuracy_score(y_test, y_pred)
# Pass the fitted class order explicitly so the probability columns line up
# with the labels.
log_l = log_loss(y_test, y_pred_prob, labels=model.classes_)

# Weighted averages across the classes.
weighted = dict(average='weighted')
precision = precision_score(y_test, y_pred, **weighted)
recall = recall_score(y_test, y_pred, **weighted)
f1 = f1_score(y_test, y_pred, **weighted)

# ======== DISPLAY ========
for metric_name, metric_value in [
    ("Accuracy:", acc),
    ("Log Loss:", log_l),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
]:
    print(metric_name, metric_value)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# ==================== Prepare test data ====================
# submission = pd.DataFrame({
#     "id": test_ids,
#     "Status": y_pred_labels
# })

# submission.to_csv("professor_rf_classes.csv", index=False)
# print("Submission saved as professor_rf_classes.csv")
# print(submission.head())







# # ==================== Predict encoded classes ====================
# y_pred_test = model.predict(x_test)

# # ==================== Create new sequential ID ====================
# new_ids = np.arange(1, len(x_test) + 1)

# # ==================== Create final CSV ====================
# submission = pd.DataFrame({
#     "id": new_ids,
#     "Status": y_pred_test
# })

# submission.to_csv("professor_rf_encoded_classes.csv", index=False)
# print("Saved as professor_rf_encoded_classes.csv")
# print(submission.head())





#  If your dataset is SMALL (<10k rows)
# rf = RandomForestClassifier(
#     n_estimators=300,
#     max_depth=None,
#     min_samples_split=3,
#     min_samples_leaf=1,
#     max_features='sqrt',
#     random_state=42,
#     n_jobs=-1
# )
# ⭐ If your dataset is MEDIUM (10k–100k rows)
# rf = RandomForestClassifier(
#     n_estimators=600,
#     max_depth=20,
#     min_samples_split=5,
#     min_samples_leaf=2,
#     max_features='sqrt',
#     random_state=42,
#     n_jobs=-1
# )
# ⭐ If your dataset is LARGE (>100k rows)
# rf = RandomForestClassifier(
#     n_estimators=1200,
#     max_depth=15,
#     min_samples_split=10,
#     min_samples_leaf=5,
#     max_features='log2',
#     random_state=42,
#     n_jobs=-1
# )
