In [1]:
# 1 code
# ================================
# FINAL EXAM-SAFE ML CLASSIFICATION CODE
# (Binary + Multiclass supported)
# ================================

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# ================================
# 1. LOAD DATA
# ================================
train = pd.read_csv("/kaggle/input/mse-2-ai-201-b-aiml-a/train.csv")
test = pd.read_csv("/kaggle/input/mse-2-ai-201-b-aiml-a/test.csv")

# If a path error is raised, check the dataset name under /kaggle/input.

# ================================
# 2. BASIC EDA (OPTIONAL IN EXAM)
# ================================
print(train.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
train.info()
print(train.isnull().sum())

# If the 'Class' column is not found, run print(train.columns) and change the name.
print(train['Class'].value_counts())

# ================================
# 3. FEATURES & TARGET
# ================================
# Target column. If this raises KeyError, inspect train.columns and rename.
y = train["Class"]

# Feature frame: everything except the target and the row identifier.
# Left in X, an integer "id" column would be selected into num_cols below,
# which means it would (a) be fed to the model as a predictor and (b) be
# included in the outlier capping that is applied to `test` with the same
# num_cols list, so the ids later written to the submission could be altered.
# Only X loses the column; `test` keeps "id" for the submission step.
# errors="ignore" keeps this working when the data has no "id" column.
X = train.drop(columns=["Class", "id"], errors="ignore")

# Column groups used by the outlier capping, the imputation and the ColumnTransformer.
num_cols = X.select_dtypes(include=['int64','float64']).columns
cat_cols = X.select_dtypes(include=['object']).columns

# ================================
# 4. CATEGORY-WISE OUTLIER CAPPING
# ================================
def cap_outliers_categorywise_all(df, cat_col, num_cols):
    """Clip numeric columns to per-category IQR fences.

    For every column in ``num_cols`` the rows are grouped by ``cat_col``; within
    each group the 25th and 75th percentiles are computed and values outside
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` are clipped to the nearest fence.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, never modified.
    cat_col : str
        Column whose values define the groups.
    num_cols : iterable of str
        Columns to clip.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the listed columns clipped.
    """
    capped = df.copy()
    for name in num_cols:
        per_group = capped.groupby(cat_col)[name]
        # transform() broadcasts each group's quartile back onto its rows,
        # giving row-aligned fence Series that clip() can consume directly.
        q1 = per_group.transform(lambda s: s.quantile(0.25))
        q3 = per_group.transform(lambda s: s.quantile(0.75))
        spread = q3 - q1
        capped[name] = capped[name].clip(q1 - 1.5 * spread, q3 + 1.5 * spread)
    return capped

# ❌ ERROR FIX:
# If groupby error / time issue → COMMENT THIS WHOLE LOOP
for c in cat_cols:
    X = cap_outliers_categorywise_all(X, c, num_cols)
    # NOTE(review): test is capped with fences computed from test itself,
    # not from the training data; confirm that this is intended.
    test = cap_outliers_categorywise_all(test, c, num_cols)

# ================================
# 5. HANDLE MISSING VALUES
# ================================

# Numeric columns: impute with the TRAINING medians in both frames so that
# train and test go through the same transformation.
train_medians = X[num_cols].median()
X[num_cols] = X[num_cols].fillna(train_medians)
test[num_cols] = test[num_cols].fillna(train_medians)

# Categorical columns: impute with the most frequent training value.
# The result is assigned back to the column. The earlier
# `X[col].fillna(..., inplace=True)` form acts on a temporary Series and is a
# silent no-op under pandas Copy-on-Write (the warning about it is hidden by
# warnings.filterwarnings('ignore') at the top of this cell).
for col in cat_cols:
    train_modes = X[col].mode()
    if train_modes.empty:
        # Column is entirely missing in train: there is no mode to fill with.
        continue
    fill_value = train_modes.iloc[0]
    X[col] = X[col].fillna(fill_value)
    test[col] = test[col].fillna(fill_value)

# ================================
# 6. RESET INDEX
# ================================
X = X.reset_index(drop=True)
test = test.reset_index(drop=True)

# ================================
# 7. LABEL ENCODE TARGET
# ================================
from sklearn.preprocessing import LabelEncoder
# Map the raw class labels to integer codes; `le` is kept so that predictions
# can be converted back to the original labels later.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

# ✔ Multiclass handled automatically

# ================================
# 8. PREPROCESS + MODEL
# ================================
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Categorical columns are one-hot encoded (handle_unknown="ignore" lets the
# encoder accept categories it did not see during fit); numeric columns are
# forwarded unchanged.
encode_categoricals = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
keep_numerics = ("num", "passthrough", num_cols)
preprocess = ColumnTransformer(transformers=[encode_categoricals, keep_numerics])

# Preprocessing and classifier are chained so that fit/predict always apply
# the same column handling.
forest = RandomForestClassifier(random_state=42)
model = Pipeline(steps=[("preprocess", preprocess), ("clf", forest)])

# ================================
# 9. TRAIN / VALID SPLIT
# ================================
from sklearn.model_selection import train_test_split
# Stratify the hold-out split so every class shows up in both parts in roughly
# its overall proportion. An unstratified split can leave a rare class out of
# the validation set, which distorts the macro-averaged metrics and makes the
# multiclass ROC AUC fail. Stratification needs at least two rows per class and
# a validation part at least as large as the number of classes; when that does
# not hold, fall back to the plain random split.
n_classes = len(le.classes_)
smallest_class = min(
    (int((y_encoded == code).sum()) for code in range(n_classes)),
    default=0,
)
can_stratify = smallest_class >= 2 and 0.2 * len(y_encoded) >= n_classes

X_train, X_val, y_train, y_val = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42,
    stratify=y_encoded if can_stratify else None
)

# ================================
# 10. TRAIN MODEL
# ================================
model.fit(X_train, y_train)

# ================================
# 11. EVALUATION
# ================================
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

y_pred = model.predict(X_val)
y_prob = model.predict_proba(X_val)   # shape (n_samples, n_classes)

print("Accuracy :", accuracy_score(y_val, y_pred))
print("Precision:", precision_score(y_val, y_pred, average='macro'))
print("Recall   :", recall_score(y_val, y_pred, average='macro'))
print("F1 Score :", f1_score(y_val, y_pred, average='macro'))

# roc_auc_score expects a 1-D score (probability of the class with the greater
# label) for binary targets and the full probability matrix only for
# multiclass targets; passing the (n, 2) matrix for a binary problem raises.
if y_prob.shape[1] == 2:
    roc_auc = roc_auc_score(y_val, y_prob[:, 1])
else:
    roc_auc = roc_auc_score(y_val, y_prob, multi_class='ovr')
print("ROC AUC  :", roc_auc)

# ================================
# 12. TRAIN FULL MODEL
# ================================
# Refit the whole pipeline on every labelled row before scoring the test set.
model.fit(X, y_encoded)

# ================================
# 13. TEST PREDICTION
# ================================
test_pred = model.predict(test)
# Convert the integer codes back to the original class labels.
test_pred_labels = le.inverse_transform(test_pred)

# ================================
# 14. SUBMISSION FILE
# ================================

# If the 'id' column is missing, build the id column from range(len(test)) instead.
# NOTE(review): the prediction column is named "NObeyesdad" while the target
# read from train is "Class"; confirm against the competition's sample submission.
submission = test[["id"]].copy()
submission["NObeyesdad"] = test_pred_labels

submission.to_csv("submission.csv", index=False)
print("submission.csv CREATED SUCCESSFULLY!")


In [3]:
# =========================================================
# UNIVERSAL TABULAR CLASSIFICATION TEMPLATE WITH VISUALS
# Change only TRAIN_PATH, TEST_PATH, TARGET_COL, ID_COL
# =========================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report



# ---------------- CHANGE ONLY THESE ----------------
TRAIN_PATH = "/kaggle/input/mock-test-2-mse-2/train.csv"
TEST_PATH  = "/kaggle/input/mock-test-2-mse-2/test.csv"
TARGET_COL = "Status"
ID_COL     = "id"      # set None if not present
# --------------------------------------------------

# ================= LOAD DATA ======================
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

# Header cells sometimes carry stray whitespace; normalise the names in both
# frames so later lookups by TARGET_COL / ID_COL match.
train.columns = train.columns.str.strip()
test.columns = test.columns.str.strip()

# ================= BASIC VISUALIZATION =============
# Class frequencies, computed once and reused for the printout and the chart.
target_counts = train[TARGET_COL].value_counts()

print("\nDataset Shape:", train.shape)
print("\nTarget Distribution:")
print(target_counts)

plt.figure(figsize=(6,4))
ax = target_counts.plot(kind="bar")
ax.set_title("Target Distribution")
ax.set_xlabel("Class")
ax.set_ylabel("Count")
plt.show()

# ================= HANDLE ID ======================
# Keep the test identifiers for the submission, then remove the id column from
# both frames so it is not part of the feature set.
test_has_id = bool(ID_COL) and ID_COL in test.columns
test_ids = test[ID_COL] if test_has_id else None
if test_has_id:
    test = test.drop(columns=[ID_COL])

train_has_id = bool(ID_COL) and ID_COL in train.columns
if train_has_id:
    train = train.drop(columns=[ID_COL])

# ================= SPLIT X & y ===================
y = train[TARGET_COL]
X = train.drop(columns=[TARGET_COL])

# ================= COLUMN TYPES ==================
# These two groups are routed to the numeric / categorical branches of the
# preprocessor.
num_cols = X.select_dtypes(include=["int64", "float64"]).columns
cat_cols = X.select_dtypes(include=["object", "category"]).columns

# ================= MISSING VALUE VISUAL ===========
# Ten columns with the most missing entries, largest first.
missing_top10 = train.isnull().sum().sort_values(ascending=False).head(10)

plt.figure(figsize=(8,4))
ax = missing_top10.plot(kind="bar")
ax.set_title("Top Missing Values per Column")
ax.set_ylabel("Missing Count")
plt.show()

# ================= PREPROCESSOR ==================
# Numeric branch: fill gaps with the column median, then standardise.
numeric_branch = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode (handle_unknown="ignore" accepts categories not seen during fit).
categorical_branch = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_branch, num_cols),
        ("cat", categorical_branch, cat_cols),
    ]
)

# ================= MODELS ========================
# Candidate classifiers. Every estimator that uses randomness gets the same
# seed; SVC draws random numbers for its probability estimates when
# probability=True, so it is seeded as well to make predict_proba repeatable.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=300, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "SVM": SVC(probability=True, random_state=42)
}

# Best-so-far bookkeeping. All five names are initialised up front so the
# cells below never hit an undefined name.
best_model = None
best_score = -1
best_name = ""
best_preds = None
best_yval = None

# ================= TRAIN & SELECT =================
# The hold-out split does not depend on the model, so it is made once; all
# candidates are scored on exactly the same validation rows. Stratification is
# skipped for targets with 20 or more distinct values.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42,
    stratify=y if y.nunique() < 20 else None
)

for name, clf in models.items():
    # NOTE(review): every pipeline wraps the same `preprocessor` instance, so
    # each fit re-fits that shared object (always on X_train here).
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("classifier", clf)
    ])

    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_val)
    score = accuracy_score(y_val, preds)

    print(f"{name} Accuracy: {score:.4f}")

    # NOTE(review): selection is by accuracy although the submission built
    # later contains probabilities; confirm this matches the competition metric.
    if score > best_score:
        best_score = score
        best_model = pipeline
        best_name = name
        best_preds = preds
        best_yval = y_val

print("\nBEST MODEL:", best_name)

# ================= CONFUSION MATRIX VISUAL =========
# Rows are actual classes, columns are predicted classes.
cm = confusion_matrix(best_yval, best_preds)

plt.figure(figsize=(5,4))
plt.imshow(cm, cmap="Blues")
plt.title(f"Confusion Matrix ({best_name})")
plt.colorbar()
plt.xlabel("Predicted")
plt.ylabel("Actual")
# Write each count into the centre of its cell (x = column, y = row).
for row_idx, row in enumerate(cm):
    for col_idx, count in enumerate(row):
        plt.text(col_idx, row_idx, count, ha="center", va="center")
plt.show()

print("\nClassification Report:")
print(classification_report(best_yval, best_preds))

# ================= FINAL TRAIN ====================
# Refit the selected pipeline on every labelled row.
best_model.fit(X, y)



# ================= TEST PREDICTION (PROBABILITIES) ======
sample_sub = pd.read_csv("/kaggle/input/mock-test-2-mse-2/sample_submission.csv")

# One probability column per class, ordered like best_model.classes_.
probs = best_model.predict_proba(test)

# Identifier column of the submission: ID_COL when the sample file has it,
# otherwise the sample file's first column (covers ID_COL = None).
sub_id_col = ID_COL if ID_COL in sample_sub.columns else sample_sub.columns[0]
prob_cols = [c for c in sample_sub.columns if c != sub_id_col]

# Label the probability columns from the model's own class order and then
# arrange them like the sample file, instead of assuming both orders agree.
class_cols = [f"{TARGET_COL}_{cls}" for cls in best_model.classes_]
if set(class_cols) == set(prob_cols):
    submission = pd.DataFrame(probs, columns=class_cols)[prob_cols]
elif len(prob_cols) == probs.shape[1]:
    # Sample file uses another naming scheme: fall back to positional
    # labelling (assumes its columns follow the sorted class order).
    submission = pd.DataFrame(probs, columns=prob_cols)
else:
    raise ValueError(
        f"sample submission has {len(prob_cols)} probability columns but the "
        f"model predicts {probs.shape[1]} classes"
    )

# Use the ids captured from test; without an id column in test, take them from
# the sample file (assumes it lists the test rows in the same order).
if test_ids is not None:
    submission_ids = test_ids.values
else:
    submission_ids = sample_sub[sub_id_col].values

submission.insert(0, sub_id_col, submission_ids)
submission.to_csv("submission.csv", index=False)

print("submission.csv CREATED CORRECTLY (PROBABILITIES)!")

In [None]:
# ============================================
# MULTI-CLASS LOG LOSS – FULL KAGGLE CODE
# ============================================

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# ML
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from lightgbm import LGBMClassifier

# ============================================
# 1. LOAD DATA
# ============================================

# Column names used throughout this cell.
TARGET = "Status"
ID_COL = "id"

# Replace "your-dataset-name" with the dataset folder under /kaggle/input.
train = pd.read_csv("/kaggle/input/your-dataset-name/train.csv")
test  = pd.read_csv("/kaggle/input/your-dataset-name/test.csv")

print(train.shape, test.shape)

# ============================================
# 2. TARGET & FEATURES
# ============================================

y = train[TARGET]
X = train.drop(columns=[TARGET])

# ============================================
# 3. LABEL ENCODING (C, CL, D → 0,1,2)
# ============================================

# Integer-encode the target; `le` is reused later to name the probability columns.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

print("Class mapping:")
# The position of a label in le.classes_ is the integer code assigned to it.
for code, label in enumerate(le.classes_):
    print(label, "→", code)

# ============================================
# 4. HANDLE CATEGORICAL FEATURES
# ============================================

cat_cols = X.select_dtypes(include="object").columns

# Each text column is integer-encoded with an encoder fitted on the train and
# test values together, so a value that occurs only in test still has a code.
# Values are cast to str first, which also turns missing entries into a label.
for col in cat_cols:
    train_text = X[col].astype(str)
    test_text = test[col].astype(str)
    encoder = LabelEncoder().fit(pd.concat([train_text, test_text], axis=0))
    X[col] = encoder.transform(train_text)
    test[col] = encoder.transform(test_text)

# ============================================
# 5. TRAIN–VALIDATION SPLIT
# ============================================

# The row identifier is not a predictor: keep it out of every fit/predict call.
# Selecting the same explicit column list from X and from test also guarantees
# that both frames reach the model with identical columns in identical order.
feature_cols = [c for c in X.columns if c != ID_COL]
X_features = X[feature_cols]

X_train, X_val, y_train, y_val = train_test_split(
    X_features, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# ============================================
# 6. MODEL (MULTI-CLASS PROBABILITY)
# ============================================

# Number of classes comes from the fitted target encoder rather than a literal.
n_classes = len(le.classes_)

model = LGBMClassifier(
    objective="multiclass",
    num_class=n_classes,
    n_estimators=500,
    learning_rate=0.05,
    max_depth=-1,
    random_state=42
)

model.fit(X_train, y_train)

# ============================================
# 7. VALIDATION LOG LOSS
# ============================================

val_preds = model.predict_proba(X_val)
val_loss = log_loss(y_val, val_preds)

print("Validation Log Loss:", val_loss)

# ============================================
# 8. TRAIN ON FULL DATA
# ============================================

model.fit(X_features, y_encoded)

# ============================================
# 9. TEST PREDICTIONS (PROBABILITIES)
# ============================================

test_preds = model.predict_proba(test[feature_cols])

# ============================================
# 10. CREATE SUBMISSION FILE
# ============================================

# One "<TARGET>_<label>" column per class. The model was trained on the integer
# codes produced by `le`, so probability column i belongs to le.classes_[i].
# For the labels C / CL / D this yields Status_C, Status_CL, Status_D.
submission = pd.DataFrame({"id": test[ID_COL]})
for code, label in enumerate(le.classes_):
    submission[f"{TARGET}_{label}"] = test_preds[:, code]

submission.to_csv("submission.csv", index=False)

print("submission.csv created successfully!")
print(submission.head())
