# Data Loading

In [24]:
import pandas as pd

# Load the raw churn data and discard the columns that will not be useful
# during training. The drop is chained onto the read (rather than applied with
# inplace=True) so `churn_data` is bound exactly once and never exists in a
# half-processed state.
churn_data = pd.read_csv("ChurnData.csv").drop(
    columns=["RowNumber", "CustomerId", "Surname"]
)

# Stratified Sampling

In [25]:
from sklearn.model_selection import StratifiedShuffleSplit

# One shuffled 80/20 split, stratified on "Geography"; the fixed random_state
# makes the split reproducible.
# NOTE(review): the strata come from "Geography", not from the "Exited" column
# used as the label later on - confirm this is the intended variable.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair.
train_index, test_index = next(split.split(churn_data, churn_data["Geography"]))

# split() yields positional row indices, so rows are selected with .iloc.
# .loc would interpret them as index labels, which only gives the right rows
# while the frame still carries the default 0..n-1 index.
strat_train_set = churn_data.iloc[train_index]
strat_test_set = churn_data.iloc[test_index]

In [26]:
# Fraction of training rows in each Geography level: per-level counts divided
# by the total number of training rows.
strat_train_set["Geography"].value_counts().div(len(strat_train_set))

France     0.501375
Germany    0.250875
Spain      0.247750
Name: Geography, dtype: float64

In [27]:
# Fraction of test rows in each Geography level: per-level counts divided by
# the total number of test rows.
strat_test_set["Geography"].value_counts().div(len(strat_test_set))

France     0.5015
Germany    0.2510
Spain      0.2475
Name: Geography, dtype: float64

In [28]:
# Persist both halves of the split as CSV files in the working directory.
# index=True is the pandas default, spelled out here: the row labels are
# written as the first (unnamed) column of each file.
strat_train_set.to_csv("strat_train_set.csv", index=True)
strat_test_set.to_csv("strat_test_set.csv", index=True)

# Data Preparation

In [29]:
# Training set: feature frame without the "Exited" column, plus an independent
# copy of that column to use as labels.
churn_train_data = strat_train_set.drop(columns=["Exited"])
churn_train_data_labels = strat_train_set.loc[:, "Exited"].copy()

# Test set: same separation of features and labels.
churn_test_data = strat_test_set.drop(columns=["Exited"])
churn_test_data_labels = strat_test_set.loc[:, "Exited"].copy()

# Transformation Pipeline

In [30]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Transformation Pipeline

# Column groups routed to each branch of the preprocessing pipeline.
num_attribs = [
    'CreditScore', 'Age', 'Tenure', 'Balance',
    'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
]
cat_attribs = ["Gender", "Geography"]

# Numeric branch: fill missing values with the per-column median learned at
# fit time (the original note names EstimatedSalary as the column with gaps),
# then standardise each column.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    # handle_unknown="ignore": this fitted transformer is saved and reused, so
    # a category that was not present at fit time is encoded as an all-zero
    # group instead of raising at transform time. Output for categories seen
    # during fit is unchanged.
    # NOTE(review): missing Gender values are not imputed; this relies on
    # OneHotEncoder accepting NaN as its own category (scikit-learn >= 0.24) -
    # confirm against the installed version.
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_attribs),
])

# Fit on the training features only, then preview the first transformed rows.
churn_training_data_preprocessed = full_pipeline.fit_transform(churn_train_data)
pd.DataFrame(churn_training_data_preprocessed).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.988298,-0.950006,0.685885,-1.240112,0.804769,0.643372,0.965833,0.003099,0.0,1.0,0.0,1.0,0.0,0.0
1,0.097887,-1.806886,1.032598,-1.240112,-0.90614,0.643372,-1.035375,-0.021787,0.0,1.0,0.0,0.0,0.0,1.0
2,0.874408,2.287096,1.726025,-1.240112,0.804769,-1.554311,0.965833,-0.108617,1.0,0.0,0.0,1.0,0.0,0.0
3,0.253191,-0.950006,0.685885,0.878347,-0.90614,-1.554311,-1.035375,0.040728,0.0,1.0,0.0,1.0,0.0,0.0
4,-2.097079,1.239798,0.339172,-1.240112,0.804769,-1.554311,0.965833,-0.029546,0.0,1.0,0.0,1.0,0.0,0.0


In [31]:
# Apply the already-fitted pipeline to the test features. Only transform() is
# called here, so the pipeline is not refit on the test data.
churn_testing_data_preprocessed = full_pipeline.transform(churn_test_data)

# Preview the first five transformed rows.
pd.DataFrame(data=churn_testing_data_preprocessed).head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.59486,0.382918,1.032598,-1.240112,0.804769,-1.554311,-1.035375,0.023929,1.0,0.0,0.0,1.0,0.0,0.0
1,-0.129893,-1.140424,-0.700967,1.180736,-0.90614,-1.554311,0.965833,-0.0416,0.0,1.0,0.0,0.0,1.0,0.0
2,0.087533,-0.093126,0.685885,-1.240112,0.804769,-1.554311,0.965833,0.006959,0.0,1.0,0.0,1.0,0.0,0.0
3,-0.668281,0.478127,-0.007541,0.775993,0.804769,0.643372,0.965833,-0.075902,1.0,0.0,0.0,0.0,1.0,0.0
4,-0.595805,1.620634,0.685885,-1.240112,-0.90614,0.643372,-1.035375,-0.088273,1.0,0.0,0.0,1.0,0.0,0.0


# Save The Transformation Pipeline

In [32]:
import joblib

# Serialise the fitted preprocessing pipeline to disk so it can be reloaded
# later without refitting. The call's return value is left as the cell output.
joblib.dump(value=full_pipeline, filename="full_pipeline.pkl")

['full_pipeline.pkl']