In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input directory,
# then print the paths one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/binary-classification-io-t-a/sample_submission.csv
/kaggle/input/binary-classification-io-t-a/train.csv
/kaggle/input/binary-classification-io-t-a/test.csv


In [2]:
# Dataset Loading
import pandas as pd

# Locations of the competition's training and test CSV files.
train_csv = '/kaggle/input/binary-classification-io-t-a/train.csv'
test_csv = '/kaggle/input/binary-classification-io-t-a/test.csv'

# Read both files into DataFrames (training file first, then the test file).
train, test = pd.read_csv(train_csv), pd.read_csv(test_csv)

In [3]:
# All imports
import numpy as np # Numpy
from sklearn.pipeline import Pipeline # Ease of understanding
from sklearn.compose import ColumnTransformer # Helps combine transformations for both categorical and numerical variables, applies them selectively based on the feature type
from sklearn.preprocessing import StandardScaler,OneHotEncoder # Ease of pre-processing
from sklearn.impute import SimpleImputer # Fill NaN values
from sklearn.model_selection import train_test_split # Dataset splits
from sklearn.linear_model import LogisticRegression # Bin class using LReg
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,f1_score

In [4]:
# Feature selection
# The target is the 'Exited' column. The feature matrix is the training frame
# without the columns treated as irrelevant ('id', 'CustomerId', 'Surname')
# and without the target itself.

dropped_columns = ['id','CustomerId','Surname','Exited']

y = train['Exited']
X = train.drop(columns=dropped_columns)

In [5]:
# Pre-processing
# Features are handled in two groups, each with its own transformation pipeline.

categorical = ['Geography','Gender']
numerical = [
    'CreditScore','Age','Tenure','Balance',
    'NumOfProducts','HasCrCard','IsActiveMember','EstimatedSalary',
]

# Numerical columns: replace NaN with the column mean, then standardise.
numerical_steps = [
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('scaler', StandardScaler()),
]
numerical_trans = Pipeline(steps=numerical_steps)

# Categorical columns: replace NaN with the most frequent value, then one-hot
# encode with drop='first'.
categorical_steps = [
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first')),
]
categorical_trans = Pipeline(steps=categorical_steps)

# Route each column group to its pipeline.
preprocess = ColumnTransformer(
    transformers=[
        ('num', numerical_trans, numerical),
        ('cat', categorical_trans, categorical),
    ]
)

In [6]:
# Model
# End-to-end estimator: the preprocessing transformer followed by a
# logistic-regression classifier created with its default settings.
classifier = LogisticRegression()
model = Pipeline(
    steps=[
        ('preprocess', preprocess),
        ('LReg', classifier),
    ]
)

In [7]:
# Train-Test Splits
# Hold out 30% of the rows, stratified on the target, with a fixed random seed.

split_options = dict(test_size=0.3, random_state=52, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Fit the full pipeline on the training portion only.
model.fit(X_train, y_train)

In [8]:
# Predictions on the held-out split: class labels and Prob(Exited).
y_pred = model.predict(X_test)

# predict_proba returns one column per class; column index 1 is kept and used
# as the probability of 'Exited'.
holdout_proba = model.predict_proba(X_test)
y_pred_prob = holdout_proba[:, 1]

In [9]:
# Performance Metrics
# All metrics compare the held-out labels (y_test) with the predicted labels (y_pred).

cm = confusion_matrix(y_test, y_pred)

# Evaluate the four scalar scores in one pass over the metric functions.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("Confusion Matrix:\n", cm)
# One score per line, each rounded to two decimals.
print(
    f"Accuracy: {accuracy:.2f}",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1 Score: {f1:.2f}",
    sep="\n",
)

Confusion Matrix:
 [[3415  152]
 [ 403  530]]
Accuracy: 0.88
Precision: 0.78
Recall: 0.57
F1 Score: 0.66


In [10]:
# Submission
# Score the competition test frame with the fitted pipeline, keeping column
# index 1 of the predicted probabilities.
competition_proba = model.predict_proba(test)
y_test_prob = competition_proba[:, 1]

# Use the provided sample submission as a template: set its 'Exited' column
# to the predicted probabilities and write the result without the index.
template_path = '/kaggle/input/binary-classification-io-t-a/sample_submission.csv'
output_path = '/kaggle/working/submission.csv'

sample_submission = pd.read_csv(template_path).assign(Exited=y_test_prob)
sample_submission.to_csv(output_path, index=False)