# Setup & Load Data

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the raw churn dataset from a path relative to the working directory
df = pd.read_csv("../data/raw/churn_data.csv")

# Quick sanity check.
# Only the last expression of a cell is rendered automatically, so a bare
# `df.head()` placed before `df.info()` is evaluated and thrown away.
# Display the preview explicitly so both the sample rows and the schema show.
display(df.head())
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customer_id       10000 non-null  int64  
 1   credit_score      10000 non-null  int64  
 2   country           10000 non-null  object 
 3   gender            10000 non-null  object 
 4   age               10000 non-null  int64  
 5   tenure            10000 non-null  int64  
 6   balance           10000 non-null  float64
 7   products_number   10000 non-null  int64  
 8   credit_card       10000 non-null  int64  
 9   active_member     10000 non-null  int64  
 10  estimated_salary  10000 non-null  float64
 11  churn             10000 non-null  int64  
dtypes: float64(2), int64(8), object(2)
memory usage: 937.6+ KB


# Drop Irrelevant Columns

In [3]:
# customer_id is treated as irrelevant for modelling (presumably a pure row
# identifier -- confirm against the data dictionary), so it is removed.
# errors='ignore' keeps the cell idempotent: re-running it after the column
# is already gone is a no-op instead of raising KeyError.
df = df.drop(columns=['customer_id'], errors='ignore')

# Check for Missing or Duplicated Data

In [4]:
# Count of null entries in each column
print(df.isna().sum())

# Number of rows that exactly repeat an earlier row
print(f"Duplicate rows: {df.duplicated().sum()}")

# Keep the first occurrence of every repeated row and discard the rest
df = df.drop_duplicates(keep='first')


credit_score        0
country             0
gender              0
age                 0
tenure              0
balance             0
products_number     0
credit_card         0
active_member       0
estimated_salary    0
churn               0
dtype: int64
Duplicate rows: 0


# Encode Categorical Variables

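The dataset has two categorical columns, which must be converted to numbers before modelling:
- country (France, Spain, Germany) – one-hot encoded; with `drop_first=True`, France becomes the baseline and only `country_Germany` and `country_Spain` are kept
- gender (Male, Female) – binary encoded as Male = 1, Female = 0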


In [5]:
# One-hot encode 'country'. drop_first=True omits the first category level,
# so one fewer indicator column than there are countries is added.
df = pd.get_dummies(df, columns=['country'], drop_first=True)

# Binary encode 'gender'
gender_codes = {'Male': 1, 'Female': 0}
df['gender'] = df['gender'].map(gender_codes)

# Series.map silently turns every value that is not a key of the mapping into
# NaN (e.g. a differently spelled label). Fail loudly here rather than let
# missing values leak into later steps.
assert df['gender'].notna().all(), "Unexpected value(s) in 'gender' column"

In [6]:
# Preview the first five rows to check the encoded columns
df.head(n=5)

Unnamed: 0,credit_score,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn,country_Germany,country_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,False,False
1,608,0,41,1,83807.86,1,0,1,112542.58,0,False,True
2,502,0,42,8,159660.8,3,1,0,113931.57,1,False,False
3,699,0,39,1,0.0,2,0,0,93826.63,0,False,False
4,850,0,43,2,125510.82,1,1,1,79084.1,0,False,True


# Handle Skewness or Scale Differences

In [7]:
from sklearn.preprocessing import StandardScaler

# Columns to standardise; every column not listed here is left unchanged.
num_features = [
    'credit_score',
    'age',
    'tenure',
    'balance',
    'products_number',
    'estimated_salary',
]

# NOTE(review): the scaler is fitted on the whole frame. If a train/test split
# happens later in the notebook, the test rows influence the fitted mean and
# variance (data leakage) -- consider fitting on the training split only.
scaler = StandardScaler()
df[num_features] = scaler.fit_transform(df[num_features])

In [8]:
# Preview the first five rows to check the standardised columns
df.head(n=5)

Unnamed: 0,credit_score,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn,country_Germany,country_Spain
0,-0.326221,0,0.293517,-1.04176,-1.225848,-0.911583,1,1,0.021886,1,False,False
1,-0.440036,0,0.198164,-1.387538,0.11735,-0.911583,0,1,0.216534,0,False,True
2,-1.536794,0,0.293517,1.032908,1.333053,2.527057,1,0,0.240687,1,False,False
3,0.501521,0,0.007457,-1.387538,-1.225848,0.807737,0,0,-0.108918,0,False,False
4,2.063884,0,0.388871,-1.04176,0.785728,-0.911583,1,1,-0.365276,0,False,True
