In [None]:
## Importing Library 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [9]:
df=pd.read_csv('assets/Churn_Modelling.csv')  

In [10]:
df.info(())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [11]:
## Make some Null value from null value 
df.loc[df.sample(54).index, 'Gender'] = np.nan
df.loc[df.sample(300).index, 'Age'] = np.nan

# Verify the changes
print(df.isnull().sum())

RowNumber            0
CustomerId           0
Surname              0
CreditScore          0
Geography            0
Gender              54
Age                300
Tenure               0
Balance              0
NumOfProducts        0
HasCrCard            0
IsActiveMember       0
EstimatedSalary      0
Exited               0
dtype: int64


In [12]:
# Introduce null values in 'Gender' and 'Age' columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           9946 non-null   object 
 6   Age              9700 non-null   float64
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(3), int64(8), object(3)
memory usage: 1.1+ MB


## Technique For handelling Null value 

### 1.Dropping the column which has null value 

In [15]:
updated_df=df.dropna(axis=1)


updated_df_row_droppping=df.dropna(axis=0)

In [14]:
updated_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Tenure           10000 non-null  int64  
 6   Balance          10000 non-null  float64
 7   NumOfProducts    10000 non-null  int64  
 8   HasCrCard        10000 non-null  int64  
 9   IsActiveMember   10000 non-null  int64  
 10  EstimatedSalary  10000 non-null  float64
 11  Exited           10000 non-null  int64  
dtypes: float64(2), int64(8), object(2)
memory usage: 937.6+ KB


In [16]:
updated_df_row_droppping.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9648 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        9648 non-null   int64  
 1   CustomerId       9648 non-null   int64  
 2   Surname          9648 non-null   object 
 3   CreditScore      9648 non-null   int64  
 4   Geography        9648 non-null   object 
 5   Gender           9648 non-null   object 
 6   Age              9648 non-null   float64
 7   Tenure           9648 non-null   int64  
 8   Balance          9648 non-null   float64
 9   NumOfProducts    9648 non-null   int64  
 10  HasCrCard        9648 non-null   int64  
 11  IsActiveMember   9648 non-null   int64  
 12  EstimatedSalary  9648 non-null   float64
 13  Exited           9648 non-null   int64  
dtypes: float64(3), int64(8), object(3)
memory usage: 1.1+ MB


In [19]:

## Fill the missing values with the mean value of the column
df_copy = df.copy()
df_copy['Age'].fillna(df_copy['Age'].mean(), inplace=True)
# Verify the changes
print(df_copy.isnull().sum())

RowNumber           0
CustomerId          0
Surname             0
CreditScore         0
Geography           0
Gender             54
Age                 0
Tenure              0
Balance             0
NumOfProducts       0
HasCrCard           0
IsActiveMember      0
EstimatedSalary     0
Exited              0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_copy['Age'].fillna(df_copy['Age'].mean(), inplace=True)


### 2.Algorithimic Imputation


In [20]:
display(df)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42.0,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41.0,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42.0,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39.0,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43.0,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39.0,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35.0,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36.0,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42.0,3,75075.31,2,1,0,92888.52,1


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           9946 non-null   object 
 6   Age              9700 non-null   float64
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(3), int64(8), object(3)
memory usage: 1.1+ MB


In [28]:
df2=df.copy()

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


# Drop irrelevant columns (RowNumber, CustomerId, Surname)
df2.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1, inplace=True)

# Encoding categorical variable 'Geography'
df2 = pd.get_dummies(df2, columns=['Geography'], drop_first=True)

# Label encoding 'Gender' (Male=1, Female=0)
df2['Gender_encoded'] = df2['Gender'].map({'Male': 1, 'Female': 0})

# Splitting dataset into Train (non-null gender) and Test (null gender)
train_df = df2[df2['Gender_encoded'].notnull()]
test_df = df2[df2['Gender_encoded'].isnull()]

## All the feature except the Gender and gender Encoded
# Splitting dataset into Train (non-null gender) and Test (null gender)

X_train = train_df.drop(['Gender', 'Gender_encoded'], axis=1)

## Gender Incoded in 0,1 ..  0 for female and 1 for male -->
y_train = train_df['Gender_encoded']

## All the feature except Gender and gender Encoded
## Test Df is which gender is null
X_test = test_df.drop(['Gender', 'Gender_encoded'], axis=1)

# Handling missing numerical values in Age using KNN Imputer
knn_imputer = KNNImputer(n_neighbors=5)
X_train_imputed = knn_imputer.fit_transform(X_train) ## All the feature except the Gender and gender Encoded
## Xtrain the feature which gender is not null
X_test_imputed = knn_imputer.transform(X_test)


## With the using KNNImputer we can fill the missing value so that we can train the model and thn we can predict the missing value.

# **1. Random Forest Model**
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_imputed, y_train)
pred_rf = rf_model.predict(X_test_imputed)

# **2. Naïve Bayes Model**
nb_model = GaussianNB()
nb_model.fit(X_train_imputed, y_train)
pred_nb = nb_model.predict(X_test_imputed)

# **3. Majority Voting**
final_pred = np.round((pred_rf + pred_nb) / 2)  # Taking the majority vote

# Assigning the predicted values to the missing Gender column
df2.loc[df2['Gender_encoded'].isnull(), 'Gender_encoded'] = final_pred
df2['Gender'] = df2['Gender_encoded'].map({1: 'Male', 0: 'Female'})

# Final DataFrame
print(df2['Gender'].isnull().sum())  # Should print 0, meaning all missing values are filled

# Save cleaned dataset
df2.to_csv("cleaned_data.csv", index=False)

0


In [31]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   CreditScore        10000 non-null  int64  
 1   Gender             10000 non-null  object 
 2   Age                9700 non-null   float64
 3   Tenure             10000 non-null  int64  
 4   Balance            10000 non-null  float64
 5   NumOfProducts      10000 non-null  int64  
 6   HasCrCard          10000 non-null  int64  
 7   IsActiveMember     10000 non-null  int64  
 8   EstimatedSalary    10000 non-null  float64
 9   Exited             10000 non-null  int64  
 10  Geography_Germany  10000 non-null  bool   
 11  Geography_Spain    10000 non-null  bool   
 12  Gender_encoded     10000 non-null  float64
dtypes: bool(2), float64(4), int64(6), object(1)
memory usage: 879.0+ KB


In [34]:
X_test_imputed.shape

(54, 11)