In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler , LabelEncoder
import pickle

## Load Dataset

In [28]:
data = pd.read_csv('Churn_Modelling.csv')
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [10]:
# checking for null values
data.isna().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [11]:
# check for features that not numeric
data.select_dtypes(include=['object']).columns

Index(['Surname', 'Geography', 'Gender'], dtype='object')

In [14]:
data.Surname.value_counts()

Surname
Smith       32
Scott       29
Martin      29
Walker      28
Brown       26
            ..
Izmailov     1
Bold         1
Bonham       1
Poninski     1
Burbidge     1
Name: count, Length: 2932, dtype: int64

## Preprocess Data By droping irelevant columns 

In [15]:
data.head(2)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0


In [29]:
data.drop(['Surname', 'CustomerId','Surname'], axis=1, inplace=True)

In [17]:
data.head(2)

Unnamed: 0,RowNumber,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0


In [18]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CreditScore      10000 non-null  int64  
 2   Geography        10000 non-null  object 
 3   Gender           10000 non-null  object 
 4   Age              10000 non-null  int64  
 5   Tenure           10000 non-null  int64  
 6   Balance          10000 non-null  float64
 7   NumOfProducts    10000 non-null  int64  
 8   HasCrCard        10000 non-null  int64  
 9   IsActiveMember   10000 non-null  int64  
 10  EstimatedSalary  10000 non-null  float64
 11  Exited           10000 non-null  int64  
dtypes: float64(2), int64(8), object(2)
memory usage: 937.6+ KB


In [20]:
# numercical features
data.select_dtypes(exclude=['object']).columns

Index(['RowNumber', 'CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
       'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [21]:
# non numerical features
data.select_dtypes(include=['object']).columns

Index(['Geography', 'Gender'], dtype='object')

## Encode Categories Features

In [22]:
data.Geography.value_counts()

Geography
France     5014
Germany    2509
Spain      2477
Name: count, dtype: int64

In [23]:
data.Gender.value_counts()

Gender
Male      5457
Female    4543
Name: count, dtype: int64

### LabelEncoder in Gender Feature

In [30]:
Label_Encoder_gender = LabelEncoder()
data['Gender'] = Label_Encoder_gender.fit_transform(data['Gender'])

In [31]:
data.head()

Unnamed: 0,RowNumber,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,619,France,0,42,2,0.0,1,1,1,101348.88,1
1,2,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,3,502,France,0,42,8,159660.8,3,1,0,113931.57,1
3,4,699,France,0,39,1,0.0,2,0,0,93826.63,0
4,5,850,Spain,0,43,2,125510.82,1,1,1,79084.1,0


### One-Hot-Encoder in Gegoraphy Feature

In [32]:
from sklearn.preprocessing import OneHotEncoder

In [33]:
Oht_Geography = OneHotEncoder()
geo_Encoder = Oht_Geography.fit_transform(data[['Geography']]).toarray()
geo_Encoder

array([[1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [35]:
Oht_Geography.get_feature_names_out(['Geography'])

array(['Geography_France', 'Geography_Germany', 'Geography_Spain'],
      dtype=object)

In [37]:
geo_encoded_df =pd.DataFrame(geo_Encoder, columns=Oht_Geography.get_feature_names_out(['Geography']))

In [38]:
geo_Encoder.shape

(10000, 3)

In [40]:
data.head()

Unnamed: 0,RowNumber,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,619,France,0,42,2,0.0,1,1,1,101348.88,1
1,2,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,3,502,France,0,42,8,159660.8,3,1,0,113931.57,1
3,4,699,France,0,39,1,0.0,2,0,0,93826.63,0
4,5,850,Spain,0,43,2,125510.82,1,1,1,79084.1,0


##### Combine OneHotEncoder with original data 

In [41]:
data  = pd.concat([data.drop(['Geography'],axis=1), geo_encoded_df], axis=1)

In [42]:
data.head()

Unnamed: 0,RowNumber,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,1,619,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,2,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,3,502,0,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0
3,4,699,0,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0
4,5,850,0,43,2,125510.82,1,1,1,79084.1,0,0.0,0.0,1.0


In [44]:
data.shape

(10000, 14)

In [45]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   RowNumber          10000 non-null  int64  
 1   CreditScore        10000 non-null  int64  
 2   Gender             10000 non-null  int32  
 3   Age                10000 non-null  int64  
 4   Tenure             10000 non-null  int64  
 5   Balance            10000 non-null  float64
 6   NumOfProducts      10000 non-null  int64  
 7   HasCrCard          10000 non-null  int64  
 8   IsActiveMember     10000 non-null  int64  
 9   EstimatedSalary    10000 non-null  float64
 10  Exited             10000 non-null  int64  
 11  Geography_France   10000 non-null  float64
 12  Geography_Germany  10000 non-null  float64
 13  Geography_Spain    10000 non-null  float64
dtypes: float64(5), int32(1), int64(8)
memory usage: 1.0 MB


## Save the encoder and scalaer


In [46]:
with open('Oht_Geography.pkl', 'wb') as file:
    pickle.dump(Oht_Geography, file)
# check for correlation between features
with open('Label_Encoder_gender.pkl', 'wb') as file:
    pickle.dump(Label_Encoder_gender, file)

## Split data into dependent and independent fearures

In [47]:
X=data.drop(['Exited'], axis=1)
y=data['Exited']

### Split data into training and testing 

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8000, 13), (2000, 13), (8000,), (2000,))

## Scale features

In [49]:
sclaer = StandardScaler()
X_train = sclaer.fit_transform(X_train)
X_test = sclaer.transform(X_test)

In [50]:
with open('sclaer.pkl', 'wb') as file:
    pickle.dump(sclaer, file)