In [77]:
# Pickled input frame for the exploratory cells. The path is relative, so it
# resolves against the kernel's current working directory.
# NOTE(review): the pipeline cell further down reassigns DATA_PATH to a different
# file name ('preprocessed_data.pkl') -- confirm which artifact is the current one.
DATA_PATH = '../artifacts/data_preprocessed/1_preprocessed_df.pkl'

In [78]:
# Load packages
import pandas as pd 
import numpy as np
import logging
import pickle
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

In [79]:
# Load the pickled frame produced by the previous stage.
# Unpickling can execute arbitrary code, so only point DATA_PATH at trusted files.
df = pd.read_pickle(DATA_PATH)
# Bare last expression -> rich display of the frame.
df

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,IsActiveMember,EstimatedSalary,Exited
0,619,0,0,42,2,0,1,1,101348,1
1,608,2,0,41,1,83807,1,1,112542,0
2,502,0,0,42,8,159660,3,0,113931,1
3,699,0,0,39,1,0,2,0,93826,0
4,850,2,0,43,2,125510,1,1,79084,0
...,...,...,...,...,...,...,...,...,...,...
9995,771,0,1,39,5,0,2,0,96270,0
9996,516,0,1,35,10,57369,1,1,101699,0
9997,709,0,0,36,7,0,1,1,42085,1
9998,772,1,1,42,3,75075,2,0,92888,1


In [80]:
# Continuous features; these are the columns standardized in the next section.
numbCol = ['EstimatedSalary', 'Balance', 'CreditScore', 'Age']
# Remaining (categorical / discrete) feature columns.
# NOTE(review): not referenced by any later cell visible in this notebook.
nomCol =  ['IsActiveMember', 'Geography', 'Gender', 'NumOfProducts', 'Tenure']

### 1. Standardizing Numerical Data with StandardScaler

In [81]:
# Learn per-column mean / standard deviation of the continuous features first ...
sc = StandardScaler().fit(df[numbCol])
# ... then overwrite those columns of `df` with their standardized values.
df[numbCol] = sc.transform(df[numbCol])
df

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,IsActiveMember,EstimatedSalary,Exited
0,-0.326221,0,0,0.293517,2,-1.225847,1,1,0.021880,1
1,-0.440036,2,0,0.198164,1,0.117342,1,1,0.216532,0
2,-1.536794,0,0,0.293517,8,1.333050,3,0,0.240686,1
3,0.501521,0,0,0.007457,1,-1.225847,2,0,-0.108920,0
4,2.063884,2,0,0.388871,2,0.785723,1,1,-0.365269,0
...,...,...,...,...,...,...,...,...,...,...
9995,1.246488,0,1,0.007457,5,-1.225847,2,0,-0.066422,0
9996,-1.391939,0,1,-0.373958,10,-0.306384,1,1,0.027983,0
9997,0.604988,0,0,-0.278604,7,-1.225847,1,1,-1.008645,1
9998,1.256835,1,1,0.293517,3,-0.022607,2,0,-0.125231,1


### 2. Splitting Data into Features and Target Variable

In [82]:
# Features: every column except the label.
x1 = df.drop('Exited', axis='columns')
# Target: the label column on its own.
y1 = df.loc[:, 'Exited']

In [83]:
# Display the feature frame (label column removed).
x1

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,IsActiveMember,EstimatedSalary
0,-0.326221,0,0,0.293517,2,-1.225847,1,1,0.021880
1,-0.440036,2,0,0.198164,1,0.117342,1,1,0.216532
2,-1.536794,0,0,0.293517,8,1.333050,3,0,0.240686
3,0.501521,0,0,0.007457,1,-1.225847,2,0,-0.108920
4,2.063884,2,0,0.388871,2,0.785723,1,1,-0.365269
...,...,...,...,...,...,...,...,...,...
9995,1.246488,0,1,0.007457,5,-1.225847,2,0,-0.066422
9996,-1.391939,0,1,-0.373958,10,-0.306384,1,1,0.027983
9997,0.604988,0,0,-0.278604,7,-1.225847,1,1,-1.008645
9998,1.256835,1,1,0.293517,3,-0.022607,2,0,-0.125231


### 3. Oversampling Minority Class with SMOTE

In [84]:
# Class balance of the label before oversampling.
y1.value_counts()

Exited
0    7963
1    2037
Name: count, dtype: int64

In [85]:
# sampling_strategy=1: synthesize minority-class rows until both classes have the
# same number of samples. random_state pins the synthetic rows so re-running the
# notebook reproduces the same data.
over = SMOTE(sampling_strategy=1, random_state=0)

# Resample exactly once and unpack both results from that single run. Calling
# fit_resample separately for X and y would take them from two independent
# random runs and double the work.
x1_resampled, y1_resampled = (part.values for part in over.fit_resample(x1, y1))

x1_resampled, y1_resampled

(array([[-0.32622142,  0.        ,  0.        , ...,  1.        ,
          1.        ,  0.02187974],
        [-0.44003595,  2.        ,  0.        , ...,  1.        ,
          1.        ,  0.21653222],
        [-1.53679418,  0.        ,  0.        , ...,  3.        ,
          0.        ,  0.24068554],
        ...,
        [ 0.53667434,  0.        ,  0.        , ...,  1.        ,
          0.        , -0.35291971],
        [ 0.56534303,  0.        ,  0.        , ...,  2.        ,
          1.        ,  0.02175822],
        [ 0.77550661,  1.        ,  0.        , ...,  1.        ,
          0.        , -0.00553711]]),
 array([1, 0, 1, ..., 1, 1, 1], dtype=int64))

In [87]:
# Collect the (features, target) shapes on each side of the oversampling step.
shapes_before = [x1.shape, y1.shape]
shapes_after = [x1_resampled.shape, y1_resampled.shape]

# sep="\n" puts the shape list on its own line directly under the heading.
print("Before Over-Sampling \n-----------------------", shapes_before, sep="\n")

print('\n')

print("After Over-Sampling\n-----------------------", shapes_after, sep="\n")


Before Over-Sampling 
-----------------------
[(10000, 9), (10000,)]


After Over-Sampling
-----------------------
[(15926, 9), (15926,)]


In [93]:
# Display the oversampled feature matrix (a NumPy array at this point).
x1_resampled

array([[-0.32622142,  0.        ,  0.        , ...,  1.        ,
         1.        ,  0.02187974],
       [-0.44003595,  2.        ,  0.        , ...,  1.        ,
         1.        ,  0.21653222],
       [-1.53679418,  0.        ,  0.        , ...,  3.        ,
         0.        ,  0.24068554],
       ...,
       [ 0.53667434,  0.        ,  0.        , ...,  1.        ,
         0.        , -0.35291971],
       [ 0.56534303,  0.        ,  0.        , ...,  2.        ,
         1.        ,  0.02175822],
       [ 0.77550661,  1.        ,  0.        , ...,  1.        ,
         0.        , -0.00553711]])

In [95]:
# Append the labels as the last column: lift y to shape (n, 1), then stack it
# horizontally next to the feature matrix.
df_resampled = np.hstack((x1_resampled, y1_resampled[:, np.newaxis]))

df_resampled

array([[-0.32622142,  0.        ,  0.        , ...,  1.        ,
         0.02187974,  1.        ],
       [-0.44003595,  2.        ,  0.        , ...,  1.        ,
         0.21653222,  0.        ],
       [-1.53679418,  0.        ,  0.        , ...,  0.        ,
         0.24068554,  1.        ],
       ...,
       [ 0.53667434,  0.        ,  0.        , ...,  0.        ,
        -0.35291971,  1.        ],
       [ 0.56534303,  0.        ,  0.        , ...,  1.        ,
         0.02175822,  1.        ],
       [ 0.77550661,  1.        ,  0.        , ...,  0.        ,
        -0.00553711,  1.        ]])

### 4. Splitting Oversampled Data for Training and Testing

In [96]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the oversampled rows; the fixed seed makes this split itself repeatable.
train, test = train_test_split(
    df_resampled,
    test_size=0.2,
    random_state=0,
)

print(train.shape, test.shape)

(12740, 10) (3186, 10)


----------------------------------------------------

#### Update Entity

In [13]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage.

    Built by ``ConfigurationManager.get_data_transformation_config`` from the
    ``data_transformation`` section of the loaded config.
    """
    # Directory the stage writes train.pkl / test.pkl into.
    root_dir: Path
    # Taken from the config's ``data_path`` entry; presumably the stage's input
    # file -- TODO confirm, the component below receives its path separately.
    data_path: Path

In [14]:
import os, sys
%pwd

'f:\\End-to-End-DS-Projects\\Bank Churn Prediction'

In [70]:
os.chdir('../')
%pwd

'f:\\End-to-End-DS-Projects\\Bank Churn Prediction'

#### Update Config manager

In [15]:
from src.BankChurn.constants import *
from src.BankChurn.utils.common import read_yaml, create_directories

class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the config, params and schema files and create the artifacts root.

        Args:
            config_filepath: Path of the main config YAML.
            params_filepath: Path of the params YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the data-transformation settings and create the stage directory.

        Returns:
            DataTransformationConfig whose fields are ``pathlib.Path`` objects,
            as its annotations declare.
        """
        stage_cfg = self.config.data_transformation

        # Hand the raw config value to create_directories so its behavior and
        # log output stay exactly as before.
        create_directories([stage_cfg.root_dir])

        # The dataclass declares Path fields; coerce the raw YAML values so the
        # declared types actually hold for consumers of the config.
        return DataTransformationConfig(
            root_dir=Path(stage_cfg.root_dir),
            data_path=Path(stage_cfg.data_path),
        )



#### Update the components

In [16]:
import os
import pandas as pd
import numpy as np
from src.BankChurn import logger
import pickle
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

class DataTransformationClass:
    """Scale, oversample and split the preprocessed data, then persist the splits.

    Each stage is its own method; ``transform_data`` chains them.
    """

    # Continuous columns that get standardized.
    NUMERIC_COLUMNS = ['EstimatedSalary', 'Balance', 'CreditScore', 'Age']
    # Label column.
    TARGET_COLUMN = 'Exited'

    def __init__(self, data_path, config: DataTransformationConfig):
        """Store the input location and the stage configuration.

        Args:
            data_path: Path of the pickled input DataFrame.
            config: Stage settings; ``config.root_dir`` is the output directory.
        """
        self.data_path = data_path
        self.config = config
        logger.info('Initialized DataTransformationClass')

    def load_data(self):
        """Read and return the pickled input DataFrame.

        Unpickling can execute arbitrary code, so ``data_path`` must point at a
        trusted file.
        """
        logger.info('Loading data from %s', self.data_path)
        df = pd.read_pickle(self.data_path)
        logger.info('Loaded data with shape %s', df.shape)
        return df

    def preprocess_data(self, df):
        """Return a copy of ``df`` with the numeric columns standardized.

        The input frame is left untouched, so the method can be called again on
        the same frame without compounding the transformation.
        """
        scaled = df.copy()
        scaled[self.NUMERIC_COLUMNS] = StandardScaler().fit_transform(scaled[self.NUMERIC_COLUMNS])
        logger.info('Preprocessed data')
        return scaled

    def resample_data(self, df, random_state=0):
        """Oversample the minority class with SMOTE until the classes are balanced.

        Args:
            df: Frame containing the features and the target column.
            random_state: Seed for SMOTE so the synthetic rows are reproducible.

        Returns:
            Tuple ``(features, target)`` after resampling.
        """
        features = df.drop(columns=[self.TARGET_COLUMN])
        target = df[self.TARGET_COLUMN]
        over = SMOTE(sampling_strategy=1, random_state=random_state)
        x1_resampled, y1_resampled = over.fit_resample(features, target)
        logger.info('Oversampled data')
        return x1_resampled, y1_resampled

    def split_data(self, df_resampled):
        """Split the resampled data 80/20 into train and test with a fixed seed."""
        train, test = train_test_split(df_resampled, test_size=0.2, random_state=0)
        logger.info('Split data into train and test with shapes %s and %s', train.shape, test.shape)
        return train, test

    def save_data(self, train, test):
        """Pickle both splits as train.pkl / test.pkl inside ``config.root_dir``."""
        # Create the directory that is actually written to, not a hard-coded one.
        os.makedirs(self.config.root_dir, exist_ok=True)
        train.to_pickle(os.path.join(self.config.root_dir, "train.pkl"), protocol=pickle.HIGHEST_PROTOCOL)
        test.to_pickle(os.path.join(self.config.root_dir, "test.pkl"), protocol=pickle.HIGHEST_PROTOCOL)

    def transform_data(self):
        """Run load -> scale -> oversample -> split -> save.

        NOTE(review): scaling and SMOTE both run before the train/test split, so
        the test split contains synthetic rows and the scaler statistics include
        test rows. The order is kept to match the exploratory steps; consider
        splitting first and fitting the scaler / SMOTE on the training part only.
        """
        df = self.load_data()
        preprocessed_data = self.preprocess_data(df)
        resampled_data_x, resampled_data_y = self.resample_data(preprocessed_data)

        # Re-attach the labels before splitting; otherwise the saved train/test
        # files would contain features only and be unusable for model training.
        # np.asarray drops the index so the assignment is purely positional.
        resampled = pd.DataFrame(resampled_data_x).assign(
            **{self.TARGET_COLUMN: np.asarray(resampled_data_y)}
        )

        train, test = self.split_data(resampled)
        self.save_data(train, test)

#### Update Pipeline

In [18]:
try:
    # NOTE(review): this reassigns the notebook-level DATA_PATH that the exploratory
    # cells at the top also use (with a different file name) -- re-running those
    # cells afterwards will load this file instead.
    DATA_PATH = 'artifacts/data_preprocessed/preprocessed_data.pkl'
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformationClass(data_path=DATA_PATH, config=data_transformation_config)
    data_transformation.transform_data()
except Exception:
    # Record the failure with its traceback in the project log, then re-raise the
    # same exception with a bare `raise` so the original traceback is preserved.
    logger.exception('Data transformation stage failed')
    raise

[2024-06-05 20:07:42,925: 31 - Bank Churn Project Logger: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-06-05 20:07:42,928: 31 - Bank Churn Project Logger: INFO: common: yaml file: params.yaml loaded successfully]
[2024-06-05 20:07:42,933: 31 - Bank Churn Project Logger: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-06-05 20:07:42,935: 51 - Bank Churn Project Logger: INFO: common: created directory at: artifacts]
[2024-06-05 20:07:42,938: 51 - Bank Churn Project Logger: INFO: common: created directory at: artifacts/data_transformation]
[2024-06-05 20:07:42,939: 14 - Bank Churn Project Logger: INFO: 671027987: Initialized DataTransformationClass]
[2024-06-05 20:07:42,940: 17 - Bank Churn Project Logger: INFO: 671027987: Loading data from artifacts/data_preprocessed/preprocessed_data.pkl]
[2024-06-05 20:07:42,953: 19 - Bank Churn Project Logger: INFO: 671027987: Loaded data with shape (10000, 10)]
[2024-06-05 20:07:42,968: 25 - Bank Churn Projec

In [3]:
import os
%pwd
os.chdir('../')
%pwd

'f:\\End-to-End-DS-Projects\\Bank Churn Prediction'

In [9]:
%pwd

'f:\\End-to-End-DS-Projects\\Bank Churn Prediction'