In [14]:
import os
from pathlib import Path

import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [2]:
# Remote location of the customer churn CSV (fields are separated by ';')
file_path = 'https://raw.githubusercontent.com/arvioa/bank_customer_churn_prediction/main/assets/Customer-Churn-Transformed.csv'

# Read the CSV into a DataFrame
df = pd.read_csv(file_path, sep=';')

# Preview the first five rows
df.head(5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,...,IsActiveMember,EstimatedSalary,Exited,Complain,SatisfactionScore,CardType,PointEarned,Age_segment,CreditScore_segment,Salary_segment
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,...,1,101348.88,1,1,2,DIAMOND,464,40-49,Medium,Middle Income
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,...,1,112542.58,0,1,3,DIAMOND,456,40-49,Medium,Middle Income
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,...,0,113931.57,1,1,3,DIAMOND,377,40-49,Low,Middle Income
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,...,0,93826.63,0,0,5,GOLD,350,30-39,Medium,Middle Income
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,...,1,79084.1,0,0,5,GOLD,425,40-49,Very High,Middle Income


First, we are going to remove the outliers from the dataset.

In [3]:
# Build one mask covering every outlier rule, then drop the flagged rows in a
# single pass. The age limits depend on the value of the Exited column.
has_exited = df['Exited'] == 1
has_stayed = df['Exited'] == 0

outlier_mask = (
    (has_exited & (df['Age'] > 60))
    | (has_stayed & (df['Age'] > 53))
    | (has_exited & (df['Age'] < 23))
    | (has_exited & (df['CreditScore'] < 383))
)
df = df.drop(index=df.index[outlier_mask])

Next, we are going to keep only the features that are good predictors of customer churn.

In [4]:
# Numerical predictors
num_col = ['Tenure', 'Age', 'Balance', 'EstimatedSalary', 'CreditScore']

# Categorical predictors
cat_col = ['Complain', 'Geography', 'IsActiveMember', 'Gender']

# Full feature set: the numerical columns followed by the categorical ones
selected_features = num_col + cat_col

Although Age_segment correlates well with the Exited column, it is replaced by its numerical counterpart, Age.

In [5]:
# Feature matrix: the selected predictor columns only
X = df[selected_features]

# Target: the Exited column
y = df['Exited']

In [6]:
# Show the numerical columns as they are before imputation and scaling
df.loc[:, num_col]

Unnamed: 0,Tenure,Age,Balance,EstimatedSalary,CreditScore
0,2,42,0.00,101348.88,619
1,1,41,83807.86,112542.58,608
2,8,42,159660.80,113931.57,502
3,1,39,0.00,93826.63,699
4,2,43,125510.82,79084.10,850
...,...,...,...,...,...
9995,5,39,0.00,96270.64,771
9996,10,35,57369.61,101699.77,516
9997,7,36,0.00,42085.58,709
9998,3,42,75075.31,92888.52,772


In [7]:
# Handle missing values using mean imputation; the imputed values are written
# back into df, overwriting the original numerical columns.
imputer = SimpleImputer(strategy='mean')
df[num_col] = imputer.fit_transform(df[num_col])

# Perform Min-Max scaling to the range [0, 1], again writing the result back into df.
# NOTE(review): the imputer and the scaler are both fitted on the full dataset,
# before the train/test split performed later in this notebook, so test-set
# statistics influence the training features (data leakage). Consider splitting
# first and fitting these transformers on the training portion only.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
df[num_col] = min_max_scaler.fit_transform(df[num_col])

In [8]:
# New frame holding only the numerical columns of df, in num_col order
scaled_df = df.reindex(columns=num_col)

In [10]:
 # Apply one-hot encoding to the categorical columns
# Encode from df rather than from X: this cell overwrites X, so reading
# X[cat_col] would raise a KeyError if the cell were executed a second time.
# The categorical columns of df are the same ones X was built from.
X = pd.get_dummies(df[cat_col], columns=cat_col)

In [12]:
# Collect the imputed and scaled numerical columns from df
scaled_df = pd.DataFrame(data=df, columns=num_col)

# Concatenate the one-hot encoded columns and the scaled numerical columns.
# Numerical columns already present in X are dropped first, so executing this
# cell again does not append a duplicate copy of them.
X = pd.concat([X.drop(columns=num_col, errors='ignore'), scaled_df], axis=1)

In [32]:
# Inspect the assembled feature matrix: one-hot encoded categorical columns
# followed by the scaled numerical columns.
X

Unnamed: 0,Complain_0,Complain_1,Geography_France,Geography_Germany,Geography_Spain,IsActiveMember_0,IsActiveMember_1,Gender_Female,Gender_Male,Tenure,Age,Balance,EstimatedSalary,CreditScore
0,0,1,1,0,0,0,1,1,0,0.2,0.571429,0.000000,0.506735,0.505353
1,0,1,0,0,1,0,1,1,0,0.1,0.547619,0.334031,0.562709,0.481799
2,0,1,1,0,0,1,0,1,0,0.8,0.571429,0.636357,0.569654,0.254818
3,1,0,1,0,0,1,0,1,0,0.1,0.500000,0.000000,0.469120,0.676660
4,1,0,0,0,1,0,1,1,0,0.2,0.595238,0.500246,0.395400,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1,0,1,0,0,1,0,0,1,0.5,0.500000,0.000000,0.481341,0.830835
9996,1,0,1,0,0,0,1,0,1,1.0,0.404762,0.228657,0.508490,0.284797
9997,0,1,1,0,0,0,1,1,0,0.7,0.428571,0.000000,0.210390,0.698073
9998,0,1,0,1,0,1,0,0,1,0.3,0.571429,0.299226,0.464429,0.832976


In [16]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [19]:
# Directory that receives the exported splits. It defaults to the original
# location; set the CHURN_OUTPUT_DIR environment variable to export somewhere
# else without editing the notebook.
OUTPUT_DIR = Path(os.environ.get('CHURN_OUTPUT_DIR', 'C:/Users/user/Downloads'))

# Save training data (features plus target column) to CSV file
training_data = pd.concat([X_train, y_train], axis=1)
training_data.to_csv(OUTPUT_DIR / 'train_data_customer_churn.csv', index=False)

# Save testing data (features plus target column) to CSV file
testing_data = pd.concat([X_test, y_test], axis=1)
testing_data.to_csv(OUTPUT_DIR / 'test_data_customer_churn.csv', index=False)