# **Import Library**

In [1]:
# pandas: A powerful data manipulation and analysis library.
# - It is used for handling and analyzing structured data. It provides data structures like DataFrame 
#   and Series to work with heterogeneous data in tabular form (similar to SQL tables or Excel sheets).
# - Useful for data cleaning, transformation, and analysis.

import pandas as pd

# numpy: A package for scientific computing with support for large, multi-dimensional arrays and matrices.
# - It provides tools for numerical computations, including support for arrays, random number generation, 
#   and linear algebra operations.
# - Essential for working with numerical data and performing mathematical operations.

import numpy as np

# seaborn: A Python visualization library based on matplotlib that provides a high-level interface for drawing 
# attractive statistical graphics.
# - Useful for creating aesthetically pleasing and informative data visualizations, especially for 
#   exploring relationships in data.
# - Works well with pandas DataFrames for easy plotting.

import seaborn as sns

# matplotlib: A popular 2D plotting library for Python.
# - Used to create a variety of static, animated, and interactive visualizations, including line plots, 
#   histograms, scatter plots, etc.
# - Often used in conjunction with seaborn for more advanced visualizations.

import matplotlib.pyplot as plt

# sklearn.model_selection.train_test_split: A utility function to split datasets into training and testing sets.
# - It is used for dividing your dataset into a training set (to train the model) and a test set 
#   (to evaluate the model's performance).
# - Helps detect overfitting by evaluating the model on data it did not see during training.

from sklearn.model_selection import train_test_split

# sklearn.preprocessing: A module that provides various methods for scaling and encoding features.
# - LabelEncoder: Converts categorical labels into numeric labels.
# - StandardScaler: Standardizes the features by removing the mean and scaling to unit variance.
# - MinMaxScaler: Scales features to a given range, often [0, 1].

from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler

# sklearn.neighbors.KNeighborsClassifier: An implementation of the K-Nearest Neighbors (KNN) algorithm.
# - KNN is a simple, instance-based learning algorithm used for classification or regression.
# - It works by finding the closest training examples in the feature space and using their labels 
#   to predict the label for new instances.

from sklearn.neighbors import KNeighborsClassifier

# sklearn.tree.DecisionTreeClassifier: A classifier that builds a decision tree based on feature values.
# - Decision trees partition the feature space into regions and assign a label to each region.
# - Used for classification tasks and interpretable models where the decisions can be traced back 
#   to the tree structure.

from sklearn.tree import DecisionTreeClassifier

# sklearn.ensemble.RandomForestClassifier: An ensemble method that combines multiple decision trees to 
# improve classification accuracy.
# - It builds multiple decision trees using random subsets of the data and features, which helps 
#   reduce overfitting and improves generalization.

from sklearn.ensemble import RandomForestClassifier

# sklearn.svm.SVC: A support vector machine classifier for binary or multi-class classification tasks.
# - SVMs work by finding a hyperplane that best separates data points of different classes.
# - Good at handling high-dimensional data and effective for complex datasets.

from sklearn.svm import SVC

# sklearn.naive_bayes.GaussianNB: A Naive Bayes classifier that assumes features follow a Gaussian (normal) distribution.
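# - It is used for classification problems where the features are assumed to be conditionally 
#   independent given the class.
# - Best suited to continuous numeric features; text classification and spam filtering typically 
#   use the MultinomialNB or BernoulliNB variants instead.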

from sklearn.naive_bayes import GaussianNB

# sklearn.metrics: A module that provides functions to evaluate model performance.
# - confusion_matrix: Computes the confusion matrix to evaluate classification results.
# - accuracy_score: Calculates the ratio of correctly predicted instances.
# - precision_score: Measures the accuracy of positive predictions.
# - recall_score: Measures the ability of the model to identify all positive samples.
# - f1_score: Harmonic mean of precision and recall, useful for imbalanced datasets.

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# **Data Loading**

In [2]:
# Google Drive file ID of the dataset CSV
file_id = '19IfOP0QmCHccMu8A6B2fCUpFqZwCxuzO'
 
# Build the direct-download URL for that file from its ID
download_url = f'https://drive.google.com/uc?id={file_id}'
 
# Load the CSV at that URL into a DataFrame (this fetches over the network)
data = pd.read_csv(download_url)
 
# Preview the first rows to confirm the file was read correctly
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
# Print a header, then a summary of the DataFrame: number of entries,
# column names, non-null counts, dtypes and memory usage
print("\n Dataset information:")
data.info()


 Dataset information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


- **RowNumber**: The row number in the dataset used to uniquely identify each entry. This feature has no analytical significance.
- **CustomerId**: A unique ID that identifies each customer in the system. This is useful for referencing and aggregating data.
- **Surname**: The customer’s last name. This feature is not used in model analysis as it is not relevant.
- **CreditScore**: A credit score that indicates the creditworthiness of a customer. This score can influence their decision to stay or churn.
- **Geography**: The geographic location where the customer lives. This information can influence the customer’s behavior and service needs.
- **Gender**: The customer’s gender. While it does not always directly influence churn, this information is useful for demographic analysis.
- **Age**: The customer’s age. Age can influence their habits and preferences in using the service.
- **Tenure**: The length of the customer’s subscription. This subscription duration is often related to the customer’s likelihood of churn.
- **Balance**: The customer’s account balance. This balance can influence customer satisfaction and their likelihood of staying with the service.
- **NumOfProducts**: The number of products a customer owns. This feature helps understand customer engagement with various products.
- **HasCrCard**: This indicates whether the customer has a credit card or not. This feature can affect the customer experience with the service.
- **IsActiveMember**: The customer's active membership status. This indicates whether the customer is still actively using the service.
- **EstimatedSalary**: The estimated salary of the customer. Salary can affect the customer's decision to subscribe or quit the service.
- **Exited**: The target label indicating whether the customer has exited the service (1) or not (0). This is the variable that we want to predict in the classification model.

In [4]:
# Count the missing entries in each column and print the per-column totals
print("\nMissing values per feature:")
print(data.isna().sum())


Missing values per feature:
RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64


In [5]:
# Remove the identifier columns that are not used as model features.
# errors='ignore' keeps this cell idempotent: `data` is reassigned to the
# result, so re-running the cell would otherwise raise a KeyError for the
# columns that were already dropped on the first run.
data = data.drop(columns=['RowNumber', 'CustomerId', 'Surname'], errors='ignore')

# Preview the DataFrame to confirm the columns have been removed
data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0
