In [None]:
# ================================
# IMPORTS
# ================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

sns.set(style="whitegrid")

# ================================
# CONFIG
# ================================
# Seed reused by every stochastic step below, and the name of the label column.
RANDOM_STATE = 42
TARGET = "quality_grade"

# ================================
# LOAD DATA
# ================================
# `train` / `test` hold the frames exactly as read from disk.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/mle-ese-mock/train (5).csv",
        "/kaggle/input/mle-ese-mock/test (4).csv",
    )
)

# Working copies, so the raw frames above are never modified.
train_df, test_df = train.copy(), test.copy()

print("Data loaded successfully")

In [None]:
# ================================
# SAVE ID & DROP
# ================================
# Build the working frames from the untouched raw frames (`train` / `test`)
# instead of dropping "id" in place. In-place drops made this cell fail with a
# KeyError when run a second time; this version gives the same result on every run.
test_id = test["id"].copy()          # kept for the submission files
train_df = train.drop(columns=["id"])
test_df = test.drop(columns=["id"])

In [None]:
# ================================
# ---------- EDA ----------
# ================================
# Missing values: count the nulls per column once, then show the counts both
# as text and as a bar chart.
missing_counts = train_df.isnull().sum()

print("\nMissing values per column:")
print(missing_counts)

fig, ax = plt.subplots(figsize=(10, 4))
missing_counts.plot(kind="bar", ax=ax)
ax.set_title("Missing Values per Column")
plt.show()

In [None]:
# Target distribution: one bar per class, tick labels rotated for readability.
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(x=train_df[TARGET], ax=ax)
ax.set_title("Target Distribution (quality_grade)")
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

In [None]:
# Fraction of rows belonging to each class of the target column.
class_share = train_df[TARGET].value_counts(normalize=True)
print("\nTarget distribution (normalized):")
print(class_share)

In [None]:
# ================================
# TARGET CLEANING (DROP UNLABELED ROWS)
# ================================
# Rows without a label are removed instead of being filled with the most
# frequent class: mode-filling invents labels, which adds label noise and
# inflates the majority class the model is trained and validated on.
# The index is reset so it stays contiguous after rows are removed.
train_df = train_df.dropna(subset=[TARGET]).reset_index(drop=True)

In [None]:
# ================================
# SPLIT X & y
# ================================
# y is the label column; X is every other column of the training frame.
y = train_df[TARGET]
X = train_df.drop(columns=[TARGET])

In [None]:
# ================================
# COLUMN TYPES
# ================================
# Columns with a numeric dtype go to num_cols; every other column is handled
# as categorical in the steps below.
num_cols = list(X.select_dtypes(include=[np.number]).columns)
cat_cols = list(X.select_dtypes(exclude=[np.number]).columns)

In [None]:
# ================================
# IMPUTATION
# ================================
# Numeric gaps are filled with the column median, categorical gaps with the
# most frequent value. Fill values are learned from X and reused for test_df.
# NOTE(review): X still includes the rows that the later train/valid split
# holds out, so the fill values are learned from validation rows as well —
# confirm this is acceptable.
num_imputer = SimpleImputer(strategy="median")
cat_imputer = SimpleImputer(strategy="most_frequent")

# Fit on the training features ...
X[num_cols] = num_imputer.fit_transform(X[num_cols])
X[cat_cols] = cat_imputer.fit_transform(X[cat_cols])

# ... then apply the learned fill values to the test features.
test_df[num_cols] = num_imputer.transform(test_df[num_cols])
test_df[cat_cols] = cat_imputer.transform(test_df[cat_cols])

In [None]:
# ================================
# ONE HOT ENCODING
# ================================
# The encoder is fitted on the training features only; handle_unknown="ignore"
# lets test rows with categories not seen during fitting pass through
# transform without raising.
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
X_cat = ohe.fit_transform(X[cat_cols])
test_cat = ohe.transform(test_df[cat_cols])

# Both encoded frames use the same generated column names and keep the row
# index of the frame they came from, so the concat below lines up row by row.
encoded_names = ohe.get_feature_names_out(cat_cols)
X_cat_df = pd.DataFrame(X_cat, index=X.index, columns=encoded_names)
test_cat_df = pd.DataFrame(test_cat, index=test_df.index, columns=encoded_names)

# Replace the raw categorical columns with their encoded counterparts.
X = pd.concat([X.drop(columns=cat_cols), X_cat_df], axis=1)
test_df = pd.concat([test_df.drop(columns=cat_cols), test_cat_df], axis=1)

In [None]:
# ================================
# TRAIN / VALID SPLIT
# ================================
# Hold out 20% of the rows for validation; stratifying on y keeps the class
# proportions the same in both parts.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y,
)
X_train, X_valid, y_train, y_valid = split_parts

In [None]:
# ================================
# RANDOM FOREST + RANDOMIZED SEARCH
# ================================
# Hyper-parameter values the randomized search samples from.
param_dist = dict(
    n_estimators=[200, 400, 600, 800],
    max_depth=[None, 10, 20, 25],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=["sqrt", "log2"],
)

# Base estimator; the search clones it for every sampled configuration.
rf = RandomForestClassifier(
    class_weight="balanced",
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

# 5-fold stratified CV, shuffled with the shared seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# 25 sampled configurations, ranked by macro-averaged F1.
search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=25,
    scoring="f1_macro",
    cv=cv,
    random_state=RANDOM_STATE,
    verbose=1,
    n_jobs=-1,
)

print("\nTraining RandomForest with RandomizedSearchCV...")
search.fit(X_train, y_train)

# Keep the best estimator found by the search for validation and prediction.
best_model = search.best_estimator_
print("\nBest Parameters Found:")
print(search.best_params_)

In [None]:
# ================================
# VALIDATION METRICS
# ================================
# Score the tuned model on the held-out validation rows.
pred_valid = best_model.predict(X_valid)
valid_accuracy = accuracy_score(y_valid, pred_valid)
valid_macro_f1 = f1_score(y_valid, pred_valid, average="macro")

print("\nValidation Accuracy:", valid_accuracy)
print("Macro F1 Score:", valid_macro_f1)
print("\nClassification Report:\n")
print(classification_report(y_valid, pred_valid))


In [None]:
# ================================
# ALIGN TEST FEATURES
# ================================
# Put the test columns in exactly the order the model was trained on.
test_df = test_df.loc[:, X_train.columns]

# ================================
# TEST PREDICTION
# ================================
# Per-class probabilities; column i corresponds to classes[i].
classes = best_model.classes_
test_probs = best_model.predict_proba(test_df)

# ================================
# SUBMISSION FILE
# ================================
# One probability column per class, with the saved ids as the first column.
# NOTE(review): the "Status_" prefix does not match TARGET ("quality_grade");
# confirm the column names the submission is expected to have.
prob_columns = [f"Status_{c}" for c in classes]
submission = pd.DataFrame(test_probs, columns=prob_columns)
submission.insert(0, "id", test_id)

submission.to_csv("submission.csv", index=False)
print("\nsubmission.csv saved successfully!")

In [None]:
# ================================
# TEST PREDICTION (LABELS)
# ================================
# Hard class labels, one per test row, instead of probabilities.
test_labels = best_model.predict(test_df)

# ================================
# LABEL SUBMISSION FILE
# ================================
# Two columns: the saved id and the predicted label.
submission_labels = pd.DataFrame(dict(id=test_id, Status=test_labels))
submission_labels.to_csv("submission_labels.csv", index=False)

print("\nsubmission_labels.csv saved successfully!")
print(submission_labels.head())


In [None]:
# ================================
# TEST PREDICTION (PROBABILITIES)
# ================================
# predict_proba returns an array of shape [n_samples, n_classes]; column i
# corresponds to classes[i].
classes = best_model.classes_
test_probs = best_model.predict_proba(test_df)

# Submission frame: the saved ids first, then one probability column per class.
prob_columns = [f"quality_grade_{c}" for c in classes]
submission = pd.DataFrame(test_probs, columns=prob_columns)
submission.insert(0, "id", test_id)

# Write the file and preview the first rows.
submission.to_csv("sub_probs.csv", index=False)
print("\nsub_probs.csv saved successfully!")
print(submission.head())
