In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#       UNIVERSAL CLASSIFICATION NOTEBOOK FOR KAGGLE
#   Outlier Capping + Label Encoding + Robust/Standard Scaling
# IMPORTS
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, confusion_matrix,
                             classification_report, f1_score)

# Confirmation message printed after the imports in this cell have run.
print("üî• Kaggle Universal Classification Notebook Loaded")

In [None]:
# CHANGE THESE FOR YOUR COMPETITION
train_filename = ""
test_filename = ""
target_col = "target"   #CHANGE THIS FOR YOUR COMPETITION


def find_dataset(filename, search_root="/kaggle/input"):
    """Locate a dataset file and return its full path.

    Parameters
    ----------
    filename : str
        Bare file name (e.g. "train.csv"), or a path that already points
        at an existing file.
    search_root : str
        Directory tree that is searched recursively for ``filename``.

    Returns
    -------
    str
        ``filename`` itself when it is an existing file, otherwise the first
        file with that name found under ``search_root``.

    Raises
    ------
    ValueError
        If ``filename`` is empty (the placeholders above were not filled in).
    FileNotFoundError
        If no file with that name exists under ``search_root``.
    """
    if not filename:
        raise ValueError(
            "Dataset filename is empty - set train_filename / test_filename first."
        )

    # Allow callers to pass a ready-made path instead of a bare file name.
    if os.path.isfile(filename):
        return filename

    for dirname, dirnames, filenames in os.walk(search_root):
        # Sorting in place makes os.walk descend in a stable order, so the
        # same file wins when several datasets ship an identically named file.
        dirnames.sort()
        if filename in filenames:
            return os.path.join(dirname, filename)

    raise FileNotFoundError(f"'{filename}' not found under '{search_root}'")


train_path = find_dataset(train_filename)
test_path = find_dataset(test_filename)

print("Train Path:", train_path)
print("Test Path:", test_path)


In [None]:
# LOAD DATA
# Read both competition CSVs into DataFrames, then report their dimensions.
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

train_shape, test_shape = train.shape, test.shape
print("Train Shape:", train_shape)
print("Test Shape:", test_shape)


In [None]:
# NULL VALUES
# Per-column count of missing entries for each split.
train_missing = train.isna().sum()
print("\nMissing values in Train:\n", train_missing)

test_missing = test.isna().sum()
print("\nMissing values in Test:\n", test_missing)


In [None]:
# COLUMN SPLIT
# Partition the training columns by dtype into numeric and categorical features.
numeric_cols = train.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_cols = train.select_dtypes(include=["object"]).columns.tolist()

# The target is a label, not a feature, and it does not exist in `test`.
# Drop it from BOTH lists (a string-valued target lands in categorical_cols),
# otherwise later per-feature steps would try to index test[target_col].
numeric_cols = [c for c in numeric_cols if c != target_col]
categorical_cols = [c for c in categorical_cols if c != target_col]

print("Numeric Columns:", numeric_cols)
print("Categorical Columns:", categorical_cols)

In [None]:
# HANDLE MISSING VALUES
# Imputation statistics are computed on train only and reused for test, so
# both splits are filled with the same value.
#
# Columns are reassigned (df[col] = df[col].fillna(...)) instead of calling
# fillna(inplace=True) on a column selection: the in-place form operates on
# a temporary object, is deprecated, and does not update the frame under
# pandas Copy-on-Write.
for col in numeric_cols:
    median_val = train[col].median()
    train[col] = train[col].fillna(median_val)
    if col in test.columns:  # e.g. the target column is absent from test
        test[col] = test[col].fillna(median_val)

for col in categorical_cols:
    modes = train[col].mode()
    if modes.empty:
        # Column is entirely missing in train: there is no mode to impute.
        continue
    mode_val = modes.iloc[0]
    train[col] = train[col].fillna(mode_val)
    if col in test.columns:
        test[col] = test[col].fillna(mode_val)

print("‚úî Missing values handled.")

In [None]:
# LABEL ENCODING (store encoders)
# Each object-dtype column gets its own LabelEncoder, kept in `encoders`
# (keyed by column name) so predictions can be decoded later.
encoders = {}

# dict.fromkeys de-duplicates while preserving order, so the target is
# visited once even if it is already listed in categorical_cols.
for col in dict.fromkeys(categorical_cols + [target_col]):
    if str(train[col].dtype) != "object":
        continue

    # The target column does not exist in test; only touch test[col] when present.
    in_test = col in test.columns
    parts = [train[col]]
    if in_test:
        parts.append(test[col])

    le = LabelEncoder()
    # Fit on train + test values so transform() never meets an unseen label.
    le.fit(pd.concat(parts, axis=0).astype(str))

    train[col] = le.transform(train[col].astype(str))
    if in_test:
        test[col] = le.transform(test[col].astype(str))
    encoders[col] = le

print("‚úî Label Encoding completed.")

In [None]:
# OUTLIER ANALYSIS & CAPPING
def plot_box(df, title):
    """Show a titled box plot of the `numeric_cols` columns of `df`.

    `numeric_cols` is read from the notebook's global namespace.
    """
    _, ax = plt.subplots(figsize=(15, 6))
    df[numeric_cols].boxplot(ax=ax)
    ax.set_title(title)
    plt.show()

# Baseline picture of the numeric features prior to any clipping.
print("\nüì¶ BEFORE Outlier Capping")
plot_box(df=train, title="Before Outlier Capping")

def cap_outliers(df, col, reference=None):
    """Clip ``df[col]`` to the IQR fences ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column is clipped; the column is reassigned on ``df``.
    col : str
        Name of the column to cap.
    reference : pandas.DataFrame, optional
        Frame whose ``col`` supplies the quartiles. Defaults to ``df``
        itself. Pass the training frame when capping test data so both
        splits share the same fences.

    Returns
    -------
    pandas.DataFrame
        ``df``, with ``col`` clipped. The frame is returned unchanged when
        the IQR is zero or NaN.
    """
    source = df if reference is None else reference
    Q1 = source[col].quantile(0.25)
    Q3 = source[col].quantile(0.75)
    IQR = Q3 - Q1

    # `not IQR > 0` covers both a zero spread and NaN (all-missing column).
    # With IQR == 0 the fences coincide (lower == upper == Q1) and clipping
    # would flatten the entire column to one constant, e.g. a mostly-zero flag.
    if not IQR > 0:
        return df

    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    df[col] = np.clip(df[col], lower, upper)
    return df

for col in numeric_cols:
    # Cap test first, using fences taken from the still-uncapped train
    # column, then cap train with the same fences.
    test = cap_outliers(test, col, reference=train)
    train = cap_outliers(train, col)

# Same plot again so the effect of the clipping can be compared visually.
print("\nüì¶ AFTER Outlier Capping")
plot_box(df=train, title="After Outlier Capping")


In [None]:
# SCALING (RobustScaler OR StandardScaler)

# Pick exactly one scaler by leaving a single line uncommented:
# scaler = StandardScaler()       # Option A: StandardScaler
scaler = RobustScaler()           # Option B: RobustScaler (recommended with outliers)

# Fit the scaler on the training features only, then apply the fitted
# transform to both splits.
scaler.fit(train[numeric_cols])
train[numeric_cols] = scaler.transform(train[numeric_cols])
test[numeric_cols] = scaler.transform(test[numeric_cols])

print("‚úî Scaling completed using:", type(scaler).__name__)

In [None]:
# TRAIN-VALID SPLIT
# Separate the label from the features, then hold out 20% for validation.
y = train[target_col]
X = train.drop(columns=[target_col])

split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = split_parts

print("\nTrain shapes:")
print(X_train.shape, X_valid.shape)

In [None]:
# RANDOMIZED SEARCH CV (RF tuning)
# Base estimator whose hyperparameters are sampled from `param_dist`.
rf = RandomForestClassifier(random_state=42)

# Candidate values for each tuned hyperparameter.
param_dist = dict(
    n_estimators=[200, 300, 400, 500, 600],
    max_depth=[None, 5, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

print("\nüîç Running Hyperparameter Tuning...")

# 20 sampled configurations, 3-fold CV each, ranked by macro-averaged F1.
search_settings = dict(
    n_iter=20,
    cv=3,
    scoring="f1_macro",
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
rs = RandomizedSearchCV(rf, param_distributions=param_dist, **search_settings)
rs.fit(X_train, y_train)

# Keep the estimator from the best-scoring configuration for later cells.
best_model = rs.best_estimator_

print("Best Parameters:", rs.best_params_)

In [None]:
# EVALUATION
# Score the tuned model on the held-out validation split.
pred = best_model.predict(X_valid)

print("\n================ METRICS ================")

# (label, metric) pairs; each metric is computed right before it is printed.
metric_rows = (
    ("Accuracy:", lambda: accuracy_score(y_valid, pred)),
    ("\nConfusion Matrix:\n", lambda: confusion_matrix(y_valid, pred)),
    ("\nClassification Report:\n", lambda: classification_report(y_valid, pred)),
    ("Macro F1:", lambda: f1_score(y_valid, pred, average='macro')),
)
for label, compute in metric_rows:
    print(label, compute())

In [None]:
# FINAL PREDICTION
# Feed the model exactly the training feature columns, in training order.
# Selecting by X.columns drops any extra test-only column and avoids a
# column-order mismatch with the frame the model was fitted on.
test_pred = best_model.predict(test[X.columns])

# Inverse transform (if target is categorical)
if target_col in encoders:
    try:
        test_pred = encoders[target_col].inverse_transform(test_pred)
    except ValueError as exc:
        # Keep the encoded predictions (best effort), but say so instead of
        # silently writing integer codes into the submission.
        print("Warning: could not decode predictions to original labels:", exc)

In [None]:
# Multi-column submission for multi-class probability submissions
# Optional: uncomment if competition requires probs
# proba = best_model.predict_proba(test)
# classes = encoders[target_col].classes_
# submission = pd.DataFrame(proba, columns=classes)

In [None]:
# SUBMISSION FILE
# Two-column frame: a row identifier plus the predicted label.
submission_columns = {
    "id": test.index,       # change if competition gives an ID column
    target_col: test_pred,
}
submission = pd.DataFrame(submission_columns)

# Written without the DataFrame index column.
submission.to_csv("submission.csv", index=False)

print("\n‚úÖ submission.csv saved successfully!")