In [2]:
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced_learn-0.12.4-py3-none-any.whl.metadata (8.3 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading imbalanced_learn-0.12.4-py3-none-any.whl (258 kB)
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.12.4 imblearn-0.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np
# Load the raw table; the path is resolved relative to the working directory.
data = pd.read_csv(filepath_or_buffer="churn.csv")


In [None]:

# Build a new frame without the three columns below; `data` is not modified.
# NOTE(review): presumably these are identifier-like fields with no
# predictive value — confirm against the dataset description.
data_cleaned = data.drop(
    labels=["RowNumber", "CustomerId", "Surname"],
    axis="columns",
)


In [None]:
# Column groups used to build the preprocessing ColumnTransformer, plus the
# name of the label column.
# NOTE(review): only the columns listed here are handed to the preprocessor;
# confirm that no feature present in the input file has been left out.
categorical_columns = [
    "Geography",
    "Gender",
]
numerical_columns = [
    "CreditScore",
    "Age",
    "Tenure",
    "Balance",
    "NumOfProducts",
    "EstimatedSalary",
]
target_column = "Exited"


In [None]:
# Numeric features: fill missing values with the column median, then
# standardize with StandardScaler.
numerical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical features: fill missing values with the most frequent value, then
# one-hot encode, dropping the first level of every feature (drop="first").
# The step name "onehot" is looked up later to recover the output column names.
categorical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(drop="first")),
])

# Route each column group through its pipeline; output order is numeric
# columns first, then the one-hot columns. `remainder` is left at its default,
# so input columns in neither list do not appear in the output.
# sparse_threshold=0 makes the transformer always return a dense array:
# OneHotEncoder emits a sparse matrix by default, and the stacked result would
# otherwise become sparse whenever its overall density falls below the
# threshold, which the later pd.DataFrame(..., columns=...) cannot consume.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_pipeline, numerical_columns),
        ("cat", categorical_pipeline, categorical_columns),
    ],
    sparse_threshold=0,
)


In [None]:

# Separate the label column from the feature columns.
y = data_cleaned[target_column]
X = data_cleaned.drop(labels=[target_column], axis="columns")

# Fit the preprocessor on X and transform it in a single pass.
# NOTE(review): the transformer is fitted on every row of X — no train/test
# split is visible in this notebook; confirm that is intended if the output
# is later used for model evaluation.
X_preprocessed = preprocessor.fit_transform(X)

In [None]:

# Oversample with SMOTE; the fixed random_state keeps the resampling
# repeatable across runs.
# NOTE(review): the resampler receives the one-hot encoded columns along with
# the scaled numeric ones — confirm plain SMOTE (rather than a
# categorical-aware variant) is what is wanted here.
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X=X_preprocessed, y=y)

In [None]:

# Recover the names of the one-hot columns from the fitted encoder inside the
# "cat" branch of the preprocessor.
onehot_columns = (
    preprocessor
    .named_transformers_["cat"]
    .named_steps["onehot"]
    .get_feature_names_out(categorical_columns)
)

# Column order mirrors the transformer order: numeric block first, then the
# one-hot block.
processed_columns = [*numerical_columns, *onehot_columns]

# Label the resampled feature matrix and append the resampled target as the
# last column (index reset so both pieces align positionally).
X_smote_df = pd.DataFrame(data=X_smote, columns=processed_columns)
final_dataset = pd.concat(
    objs=[X_smote_df, y_smote.reset_index(drop=True)],
    axis="columns",
)

In [None]:
# Persist the resampled, preprocessed table; index=False keeps the row index
# out of the file.
final_dataset.to_csv(
    path_or_buf="preprocessed_with_smote.csv",
    index=False,
)