In [None]:
🔵 TYPE-1 : LABEL SUBMISSION
(Metric: Accuracy / F1 etc.)
Submission: id,Status
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report

# ================= LOAD DATA =================
# Read the training and test splits from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))
# Keep the test ids now: the "id" column is dropped from `test` further down,
# but the submission file still needs it.
test_ids = test["id"]

# ================= DATA CLEANING =================
# Impute missing values. Every fill statistic is computed on `train` and then
# reused for `test`, so both frames go through the same transformation (the
# encoder and scaler below are likewise fitted on the training data only).
numeric_medians = train.median(numeric_only=True)
train = train.fillna(numeric_medians)
test = test.fillna(numeric_medians)

# Object columns are filled with the most frequent training value.
# Columns are reassigned instead of calling `fillna(..., inplace=True)` on a
# column selection: that chained in-place form is deprecated in pandas 2.x and
# does not update the parent frame once Copy-on-Write is in effect.
for c in train.select_dtypes(include="object").columns:
    modes = train[c].mode()
    if modes.empty:
        # No non-missing value in this column, so there is nothing to fill with.
        continue
    most_frequent = modes.iloc[0]
    train[c] = train[c].fillna(most_frequent)
    if c in test.columns:
        test[c] = test[c].fillna(most_frequent)

# ================= EDA =================
# Quick visual checks; every plot is rendered with its own plt.show() call.
numeric_train = train.select_dtypes(include=np.number)

# Number of rows per target class.
sns.countplot(x="Status", data=train)
plt.title("Class Distribution")
plt.show()

# Box plots of the numeric columns, to eyeball outliers.
sns.boxplot(data=numeric_train)
plt.title("Outliers")
plt.show()

# Histogram of each numeric column.
numeric_train.hist(figsize=(10, 6))
plt.show()

# Pairwise correlations between the numeric columns.
sns.heatmap(numeric_train.corr(), cmap="coolwarm")
plt.show()

# ================= FEATURES & TARGET =================
# Predictors are every training column except the label and the row id;
# the row id is removed from the test frame as well.
X = train.drop(columns=["Status", "id"], errors="ignore")
y = train["Status"]
test = test.drop(columns=["id"], errors="ignore")

# ================= ENCODING =================
# Target labels -> integer codes. `le` is kept so that predictions can be
# mapped back to the original labels later.
le = LabelEncoder()
y_enc = le.fit_transform(y)

# Object-typed predictors -> ordinal codes learned from the training frame.
# A category that was not seen during fitting is encoded as -1.
cat_cols = X.select_dtypes(include="object").columns
oe = OrdinalEncoder(unknown_value=-1, handle_unknown="use_encoded_value")
oe.fit(X[cat_cols])
X[cat_cols] = oe.transform(X[cat_cols])
test[cat_cols] = oe.transform(test[cat_cols])

# ================= ALIGN & SCALE =================
# Give both frames the same columns (a column missing on one side is added
# there and filled with 0), then standardise with statistics fitted on the
# training features only.
X, test = X.align(test, fill_value=0, axis=1)
scaler = StandardScaler().fit(X)
X = scaler.transform(X)
test = scaler.transform(test)

# ================= TRAIN / VALIDATION =================
# Stratified 80/20 hold-out split of the encoded training data.
X_tr, X_val, y_tr, y_val = train_test_split(
    X,
    y_enc,
    test_size=0.2,
    random_state=42,
    stratify=y_enc,
)

# Baseline random forest (library defaults, fixed seed) fitted on the 80% part.
rf = RandomForestClassifier(random_state=42).fit(X_tr, y_tr)

# ================= EVALUATION =================
val_pred = rf.predict(X_val)

# Accuracy first, then the class-weighted precision / recall / F1.
print("Accuracy :", accuracy_score(y_val, val_pred))
for metric_label, metric_fn in (
    ("Precision:", precision_score),
    ("Recall   :", recall_score),
    ("F1 Score :", f1_score),
):
    print(metric_label, metric_fn(y_val, val_pred, average="weighted"))

# Per-class breakdown; classes appear as their encoded integer codes.
report = classification_report(y_val, val_pred)
print("\nClassification Report:\n", report)

# Confusion matrix of the hold-out predictions, drawn as an annotated heatmap.
cm = confusion_matrix(y_val, val_pred)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix")
plt.show()

# ================= HYPERPARAMETER TUNING =================
# 3-fold cross-validated search over the number of trees, ranked by accuracy.
# The search runs on the training part of the hold-out split only.
param_grid = {"n_estimators": [100, 200]}
grid = GridSearchCV(rf, param_grid, scoring="accuracy", cv=3)
grid.fit(X_tr, y_tr)

# ================= TEST PREDICTION =================
# Predict encoded classes with the best estimator found by the search, then
# map the integer codes back to the original Status labels.
test_pred = le.inverse_transform(grid.best_estimator_.predict(test))

# ================= SUBMISSION =================
# Two-column file: id, Status.
submission = pd.DataFrame({"id": test_ids, "Status": test_pred})
submission.to_csv("submission_type1_labels.csv", index=False)

🔴 TYPE-2 : PROBABILITY SUBMISSION
(Metric: Multiclass Log Loss)
Submission: id,Status_C,Status_CL,Status_D
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report

# ================= LOAD DATA =================
# Read the training and test splits from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))
# Keep the test ids now: the "id" column is dropped from `test` further down,
# but the submission frame still needs it as its first column.
test_ids = test["id"]

# ================= CLEANING =================
# Impute missing values. Every fill statistic is computed on `train` and then
# reused for `test`, so both frames go through the same transformation (the
# encoder and scaler below are likewise fitted on the training data only).
numeric_medians = train.median(numeric_only=True)
train = train.fillna(numeric_medians)
test = test.fillna(numeric_medians)

# Object columns are filled with the most frequent training value.
# Columns are reassigned instead of calling `fillna(..., inplace=True)` on a
# column selection: that chained in-place form is deprecated in pandas 2.x and
# does not update the parent frame once Copy-on-Write is in effect.
for c in train.select_dtypes(include="object").columns:
    modes = train[c].mode()
    if modes.empty:
        # No non-missing value in this column, so there is nothing to fill with.
        continue
    most_frequent = modes.iloc[0]
    train[c] = train[c].fillna(most_frequent)
    if c in test.columns:
        test[c] = test[c].fillna(most_frequent)

# ================= EDA =================
# Quick visual checks; every plot is rendered with its own plt.show() call.
numeric_train = train.select_dtypes(include=np.number)

# Class balance as a bar chart ...
sns.countplot(x="Status", data=train)
plt.show()

# ... and as a pie chart with percentage labels.
status_counts = train["Status"].value_counts()
status_counts.plot.pie(autopct="%1.1f%%")
plt.show()

# Box plots of the numeric columns, to eyeball outliers.
sns.boxplot(data=numeric_train)
plt.show()

# Pairwise correlations between the numeric columns.
sns.heatmap(numeric_train.corr(), cmap="coolwarm")
plt.show()

# ================= FEATURES & TARGET =================
# Predictors are every training column except the label and the row id;
# the row id is removed from the test frame as well.
X = train.drop(columns=["Status", "id"], errors="ignore")
y = train["Status"]
test = test.drop(columns=["id"], errors="ignore")

# ================= ENCODING =================
# Target labels -> integer codes. `le.classes_` is used later to name the
# probability columns.
le = LabelEncoder()
y_enc = le.fit_transform(y)

# Object-typed predictors -> ordinal codes learned from the training frame.
# A category that was not seen during fitting is encoded as -1.
cat_cols = X.select_dtypes(include="object").columns
oe = OrdinalEncoder(unknown_value=-1, handle_unknown="use_encoded_value")
oe.fit(X[cat_cols])
X[cat_cols] = oe.transform(X[cat_cols])
test[cat_cols] = oe.transform(test[cat_cols])

# ================= ALIGN & SCALE =================
# Give both frames the same columns (a column missing on one side is added
# there and filled with 0), then standardise with statistics fitted on the
# training features only.
X, test = X.align(test, fill_value=0, axis=1)
scaler = StandardScaler().fit(X)
X = scaler.transform(X)
test = scaler.transform(test)

# ================= TRAIN =================
# Stratified 80/20 hold-out split of the encoded training data.
X_tr, X_val, y_tr, y_val = train_test_split(
    X,
    y_enc,
    test_size=0.2,
    random_state=42,
    stratify=y_enc,
)

# Baseline random forest (library defaults, fixed seed) fitted on the 80% part.
rf = RandomForestClassifier(random_state=42).fit(X_tr, y_tr)

# ================= EVALUATION =================
val_pred = rf.predict(X_val)

# Accuracy first, then the class-weighted precision / recall / F1.
print("Accuracy :", accuracy_score(y_val, val_pred))
for metric_label, metric_fn in (
    ("Precision:", precision_score),
    ("Recall   :", recall_score),
    ("F1 Score :", f1_score),
):
    print(metric_label, metric_fn(y_val, val_pred, average="weighted"))

# Confusion matrix of the hold-out predictions, drawn as an annotated heatmap.
cm = confusion_matrix(y_val, val_pred)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.show()

# ================= TUNING =================
# This submission is scored with multiclass log loss, so the candidates are
# ranked with the "neg_log_loss" scorer (scikit-learn negates losses so that
# greater is better) rather than the classifier default, accuracy.
grid = GridSearchCV(
    rf, {"n_estimators":[100,200]}, cv=3, scoring="neg_log_loss"
)
grid.fit(X_tr, y_tr)

# Report the winning setting; the sign is flipped back to a plain log loss.
print("Best params :", grid.best_params_)
print("CV Log Loss :", -grid.best_score_)

# ================= PROBABILITY OUTPUT =================
# Class probabilities for every test row from the tuned forest.
proba = grid.best_estimator_.predict_proba(test)

# The required header is id,Status_C,Status_CL,Status_D, so each class label
# gets the "Status_" prefix (unless it already carries it).
class_names = [str(label) for label in le.classes_]
proba_cols = [
    n if n.startswith("Status_") else f"Status_{n}" for n in class_names
]

submission = pd.DataFrame(proba, columns=proba_cols)
# Insert the ids by position so the result does not depend on the index of
# `test_ids` lining up with the fresh 0..n-1 index of `submission`.
submission.insert(0, "id", test_ids.to_numpy())
submission.to_csv("submission_type2_probabilities.csv", index=False)

🟢 TYPE-3 : ONE-HOT SUBMISSION
Submission: id,Status_C,Status_CL,Status_D (0/1)
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier

# Load
# Read the training and test splits from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))
# Keep the test ids now: the "id" column is dropped from `test` further down,
# but the submission frame still needs it as its first column.
test_ids = test["id"]

# Cleaning
# Numeric gaps are filled with the training medians, applied to both frames so
# train and test go through the same transformation.
numeric_medians = train.median(numeric_only=True)
train = train.fillna(numeric_medians)
test = test.fillna(numeric_medians)

# Categorical gaps are filled with the most frequent training value, so that
# no missing value reaches the encoder, scaler or model further down.
for c in train.select_dtypes(include="object").columns:
    modes = train[c].mode()
    if modes.empty:
        # No non-missing value in this column, so there is nothing to fill with.
        continue
    most_frequent = modes.iloc[0]
    train[c] = train[c].fillna(most_frequent)
    if c in test.columns:
        test[c] = test[c].fillna(most_frequent)

# Target & features
# Predictors are every training column except the label and the row id;
# the row id is removed from the test frame as well.
X = train.drop(columns=["Status", "id"], errors="ignore")
y = train["Status"]
test = test.drop(columns=["id"], errors="ignore")

# Encoding
# Target labels -> integer codes. `le.classes_` is used later to name the
# one-hot columns.
le = LabelEncoder()
y_enc = le.fit_transform(y)

# Object-typed predictors -> ordinal codes learned from the training frame.
# A category that was not seen during fitting is encoded as -1.
cat_cols = X.select_dtypes(include="object").columns
oe = OrdinalEncoder(unknown_value=-1, handle_unknown="use_encoded_value")
oe.fit(X[cat_cols])
X[cat_cols] = oe.transform(X[cat_cols])
test[cat_cols] = oe.transform(test[cat_cols])

# Align & scale
# Give both frames the same columns (a column missing on one side is added
# there and filled with 0), then standardise with statistics fitted on the
# training features only.
X, test = X.align(test, fill_value=0, axis=1)
scaler = StandardScaler().fit(X)
X = scaler.transform(X)
test = scaler.transform(test)

# Train
# Fit a 200-tree random forest with a fixed seed on the full training set
# (this section makes no hold-out split).
rf = RandomForestClassifier(random_state=42, n_estimators=200).fit(X, y_enc)

# One-hot output
proba = rf.predict_proba(test)

# Exactly one 1 per row. np.argmax returns the first maximal column, so rows
# whose top probabilities are tied get a single winner instead of several 1s
# (which is what comparing every entry against the row maximum would produce).
winners = proba.argmax(axis=1)
one_hot = np.zeros(proba.shape, dtype=int)
one_hot[np.arange(len(winners)), winners] = 1

# The required header is id,Status_C,Status_CL,Status_D, so each class label
# gets the "Status_" prefix (unless it already carries it).
class_names = [str(label) for label in le.classes_]
one_hot_cols = [
    n if n.startswith("Status_") else f"Status_{n}" for n in class_names
]

submission = pd.DataFrame(one_hot, columns=one_hot_cols)
# Insert the ids by position so the result does not depend on the index of
# `test_ids` lining up with the fresh 0..n-1 index of `submission`.
submission.insert(0, "id", test_ids.to_numpy())
submission.to_csv("submission_type3_onehot.csv", index=False)