In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# ------------------------------
# Load Dataset
# ------------------------------
df = pd.read_csv("Yatharth Kumar Saxena - ml_preprocessing_dataset_1000.csv")
df.columns = df.columns.str.strip()

target_col = "Target"
id_col = "Legacy_Customer_ID"

# Backup
df_raw = df.copy()
df_clean = df.copy()

# --------------------------------------------
# 🔧 Label Encode Target for both datasets
# --------------------------------------------
if df[target_col].dtype == "object":
    df_raw[target_col] = LabelEncoder().fit_transform(df_raw[target_col])
    df_clean[target_col] = LabelEncoder().fit_transform(df_clean[target_col])

# --------------------------------------------
# 🔧 Encode Categorical Columns in RAW (only)
# --------------------------------------------
for col in df_raw.columns:
    if df_raw[col].dtype == 'object' and col not in [target_col, id_col]:
        df_raw[col] = LabelEncoder().fit_transform(df_raw[col].astype(str))

# --------------------------------------------
# 🧹 Preprocessing for Cleaned Dataset
# --------------------------------------------
for col in df_clean.columns:
    if col in [target_col, id_col]:
        continue
    if df_clean[col].dtype == 'object':
        df_clean[col] = df_clean[col].fillna(df_clean[col].mode()[0])
        df_clean[col] = LabelEncoder().fit_transform(df_clean[col].astype(str))
    else:
        df_clean[col] = df_clean[col].fillna(df_clean[col].mean())

# ------------------------------
# Split X and y
# ------------------------------
def separate_X_y(df):
    X = df.drop([target_col, id_col], axis=1)
    y = df[target_col]
    return X, y

# ------------------------------
# Evaluate
# ------------------------------
def evaluate_model(X, y, split_ratio, preprocess=False):
    X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=(1 - split_ratio), random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.2, random_state=42)

    if preprocess:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_val = scaler.transform(X_val)
        X_test = scaler.transform(X_test)

    clf = RandomForestClassifier(random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    print(f"\n📊 Results | Split: {int(split_ratio*100)}:{int((1-split_ratio)*100)} | Preprocessed: {preprocess}")
    print("✅ Accuracy:", clf.score(X_test, y_test))
    print("📄 Classification Report:\n", classification_report(y_test, y_pred))
    print("📦 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# ------------------------------
# Run All Evaluations
# ------------------------------
X_raw, y_raw = separate_X_y(df_raw)
X_clean, y_clean = separate_X_y(df_clean)

# 70:30
evaluate_model(X_raw, y_raw, split_ratio=0.7, preprocess=False)
evaluate_model(X_clean, y_clean, split_ratio=0.7, preprocess=True)

# 80:20
evaluate_model(X_raw, y_raw, split_ratio=0.8, preprocess=False)
evaluate_model(X_clean, y_clean, split_ratio=0.8, preprocess=True)



📊 Results | Split: 70:30 | Preprocessed: False
✅ Accuracy: 0.5913621262458472
📄 Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.86      0.72       188
           1       0.39      0.15      0.22       113

    accuracy                           0.59       301
   macro avg       0.51      0.50      0.47       301
weighted avg       0.54      0.59      0.53       301

📦 Confusion Matrix:
 [[161  27]
 [ 96  17]]

📊 Results | Split: 70:30 | Preprocessed: True
✅ Accuracy: 0.5780730897009967
📄 Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.86      0.72       188
           1       0.33      0.12      0.17       113

    accuracy                           0.58       301
   macro avg       0.47      0.49      0.44       301
weighted avg       0.51      0.58      0.51       301

📦 Confusion Matrix:
 [[161  27]
 [100  13]]

📊 Results | Split: 80:19 | Preprocessed: False
✅

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# ------------------------------
# Load & Prepare Dataset
# ------------------------------
df = pd.read_csv("Yatharth Kumar Saxena - ml_preprocessing_dataset_1000.csv")
df.columns = df.columns.str.strip()

target_col = "Target"
id_col = "Legacy_Customer_ID"

# Raw and clean copies
df_raw = df.copy()
df_clean = df.copy()

# ------------------------------
# Encode Target Variable
# ------------------------------
if df[target_col].dtype == "object":
    df_raw[target_col] = LabelEncoder().fit_transform(df_raw[target_col])
    df_clean[target_col] = LabelEncoder().fit_transform(df_clean[target_col])

# ------------------------------
# Encode RAW dataset (no imputation)
# ------------------------------
for col in df_raw.columns:
    if col not in [target_col, id_col] and df_raw[col].dtype == 'object':
        df_raw[col] = LabelEncoder().fit_transform(df_raw[col].astype(str))

# ------------------------------
# Preprocess CLEAN dataset (fill + encode)
# ------------------------------
for col in df_clean.columns:
    if col in [target_col, id_col]:
        continue
    if df_clean[col].dtype == 'object':
        df_clean[col] = df_clean[col].fillna(df_clean[col].mode()[0])
        df_clean[col] = LabelEncoder().fit_transform(df_clean[col].astype(str))
    else:
        df_clean[col] = df_clean[col].fillna(df_clean[col].mean())

# ------------------------------
# Split Features and Target
# ------------------------------
def separate_X_y(df):
    X = df.drop([target_col, id_col], axis=1)
    y = df[target_col]
    return X, y

# ------------------------------
# Evaluate 80:20
# ------------------------------
def evaluate_80_20(X, y, preprocess=False):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    if preprocess:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    clf = RandomForestClassifier(random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    acc = clf.score(X_test, y_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    conf = confusion_matrix(y_test, y_pred)

    # Terminal output
    print(f"\n📊 Results | Split: 80:20 | Preprocessed: {preprocess}")
    print("✅ Accuracy:", acc)
    print("📄 Classification Report:\n", classification_report(y_test, y_pred))
    print("📦 Confusion Matrix:\n", conf)

# ------------------------------
# Run 80:20 Evaluation
# ------------------------------
X_raw, y_raw = separate_X_y(df_raw)
X_clean, y_clean = separate_X_y(df_clean)

evaluate_80_20(X_raw, y_raw, preprocess=False)
evaluate_80_20(X_clean, y_clean, preprocess=True)



📊 Results | Split: 80:20 | Preprocessed: False
✅ Accuracy: 0.615
📄 Classification Report:
               precision    recall  f1-score   support

           0       0.65      0.88      0.75       128
           1       0.40      0.14      0.21        72

    accuracy                           0.61       200
   macro avg       0.52      0.51      0.48       200
weighted avg       0.56      0.61      0.55       200

📦 Confusion Matrix:
 [[113  15]
 [ 62  10]]

📊 Results | Split: 80:20 | Preprocessed: True
✅ Accuracy: 0.615
📄 Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.89      0.75       128
           1       0.39      0.12      0.19        72

    accuracy                           0.61       200
   macro avg       0.52      0.51      0.47       200
weighted avg       0.55      0.61      0.55       200

📦 Confusion Matrix:
 [[114  14]
 [ 63   9]]
