In [13]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/mock-test-2-mse-2/sample_submission.csv
/kaggle/input/mock-test-2-mse-2/train.csv
/kaggle/input/mock-test-2-mse-2/test.csv


In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, log_loss
from sklearn.calibration import CalibratedClassifierCV

# ==================== Load Data ====================
df = pd.read_csv("/kaggle/input/mock-test-2-mse-2/train.csv")
df["Status_bin"] = df["Status"].map({"D": 0, "C": 1, "CL": 2})

X = df.drop(["Status", "Status_bin", "id"], axis=1)
y = df["Status_bin"]

# ==================== Encode categorical columns ====================
cat_cols = X.select_dtypes(include=["object"]).columns

# Option 1: Use OrdinalEncoder for all categorical columns at once

oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X[cat_cols] = oe.fit_transform(X[cat_cols].astype(str))

# ==================== Fill missing values ====================
X = X.fillna(X.median(numeric_only=True))

# ==================== Cap outliers ====================
def cap_outliers(df, cols, lower=1, upper=99):
    for col in cols:
        low_val = df[col].quantile(lower / 100)
        high_val = df[col].quantile(upper / 100)
        df[col] = df[col].clip(low_val, high_val)
    return df

num_cols = X.select_dtypes(include=['float64', 'int64']).columns
X = cap_outliers(X, num_cols)

# ==================== Train-test split ====================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.02, random_state=42, stratify=y
)

# ==================== Random Forest Classifier ====================
num_classes = len(np.unique(y_train))

rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=None,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1
)

# Optional: Use calibrated classifier for probabilities
model = CalibratedClassifierCV(base_estimator=rf, method='isotonic', cv=5)
model.fit(X_train, y_train)

# ==================== Evaluate on train/test ====================
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Accuracy:", acc)

y_pred_prob = model.predict_proba(X_test)
log_l = log_loss(y_test, y_pred_prob, labels=model.classes_)
print("Log Loss:", log_l)

# ==================== Prepare test data ====================
df_test = pd.read_csv("/kaggle/input/mock-test-2-mse-2/test.csv")
test_ids = df_test["id"]
x_test = df_test.drop(["id"], axis=1)

# Encode categorical columns same as train
if len(cat_cols) > 0:
    # Use ordinal encoding fitted on train
    x_test[cat_cols] = oe.transform(x_test[cat_cols].astype(str))

# Fill missing values
x_test = x_test.fillna(x_test.median(numeric_only=True))

# ==================== Predict ====================
y_pred1 = model.predict(x_test)
y_pred_prob1 = model.predict_proba(x_test)

# ==================== Create submission ====================
class_labels = model.classes_
submission_cols = [f"Status_{label}" for label in class_labels]

submission = pd.DataFrame(y_pred_prob1, columns=submission_cols)
submission.insert(0, "id", test_ids)
submission = submission.rename(columns={
    "Status_0": "Status_D",
    "Status_1": "Status_C",
    "Status_2": "Status_CL"
})

# Reorder columns: id, Status_C, Status_CL, Status_D
submission = submission[["id", "Status_C", "Status_CL", "Status_D"]]

submission.to_csv("professor_rf.csv", index=False)
print("Submission file saved as professor_rf.csv")
print(submission.head())




Accuracy: 0.8533333333333334
Log Loss: 0.38290025653796705
Submission file saved as professor_rf.csv
      id  Status_C  Status_CL  Status_D
0  15000  0.963867   0.005101  0.031032
1  15001  0.954017   0.002515  0.043468
2  15002  0.949566   0.005294  0.045139
3  15003  0.225272   0.077335  0.697394
4  15004  0.945342   0.002450  0.052208
