# Dataset: Customer Churn (intentionally messy) 
#### Missing age values 
#### Duplicate rows 
#### Inconsistent gender formats 
#### Salary outliers 
# Task: 
#### “Clean this dataset so it is ready for machine learning.” 

In [1]:
import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv("customer_churn_messy.csv")
df

Unnamed: 0,CustomerID,Age,Gender,Salary,Churn
0,101,25.0,Male,40000,No
1,102,,FEMALE,42000,Yes
2,103,35.0,female,39000,No
3,104,45.0,M,1200000,No
4,105,28.0,male,41000,Yes
5,106,35.0,F,39000,No
6,107,35.0,F,39000,No
7,108,,Male,38000,Yes
8,109,52.0,Female,45000,No
9,110,23.0,Male,37000,Yes


In [2]:
df.head()

Unnamed: 0,CustomerID,Age,Gender,Salary,Churn
0,101,25.0,Male,40000,No
1,102,,FEMALE,42000,Yes
2,103,35.0,female,39000,No
3,104,45.0,M,1200000,No
4,105,28.0,male,41000,Yes


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   CustomerID  21 non-null     int64  
 1   Age         17 non-null     float64
 2   Gender      21 non-null     object 
 3   Salary      21 non-null     int64  
 4   Churn       21 non-null     object 
dtypes: float64(1), int64(2), object(2)
memory usage: 972.0+ bytes


In [4]:
df.isnull().sum()

CustomerID    0
Age           4
Gender        0
Salary        0
Churn         0
dtype: int64

In [5]:
df.describe()

Unnamed: 0,CustomerID,Age,Salary
count,21.0,17.0,21.0
mean,110.761905,35.058824,570000.0
std,5.889862,7.956684,2175413.0
min,101.0,23.0,36000.0
25%,106.0,29.0,39000.0
50%,111.0,35.0,40000.0
75%,116.0,38.0,43000.0
max,120.0,52.0,9999999.0


# Remove Duplicate Rows

In [6]:
df = df.drop_duplicates()

In [7]:
df

Unnamed: 0,CustomerID,Age,Gender,Salary,Churn
0,101,25.0,Male,40000,No
1,102,,FEMALE,42000,Yes
2,103,35.0,female,39000,No
3,104,45.0,M,1200000,No
4,105,28.0,male,41000,Yes
5,106,35.0,F,39000,No
6,107,35.0,F,39000,No
7,108,,Male,38000,Yes
8,109,52.0,Female,45000,No
9,110,23.0,Male,37000,Yes


# Handle Missing Values

In [8]:
mean_age = df["Age"].mean()
df["Age"] = df["Age"].fillna(mean_age)
df.isnull().sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Age"] = df["Age"].fillna(mean_age)


CustomerID    0
Age           0
Gender        0
Salary        0
Churn         0
dtype: int64

# Fix inconsistent Gender values

In [9]:
df["Gender"] = df["Gender"].str.lower()

df["Gender"] = df["Gender"].replace({
    "m": "male",
    "f": "female"
})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Gender"] = df["Gender"].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Gender"] = df["Gender"].replace({


In [10]:
df

Unnamed: 0,CustomerID,Age,Gender,Salary,Churn
0,101,25.0,male,40000,No
1,102,35.0625,female,42000,Yes
2,103,35.0,female,39000,No
3,104,45.0,male,1200000,No
4,105,28.0,male,41000,Yes
5,106,35.0,female,39000,No
6,107,35.0,female,39000,No
7,108,35.0625,male,38000,Yes
8,109,52.0,female,45000,No
9,110,23.0,male,37000,Yes


# Handle Salary outliers using IQR

In [11]:
Q1 = df["Salary"].quantile(0.25)
Q3 = df["Salary"].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df = df[(df["Salary"] >= lower_bound) & (df["Salary"] <= upper_bound)]

In [12]:
df

Unnamed: 0,CustomerID,Age,Gender,Salary,Churn
0,101,25.0,male,40000,No
1,102,35.0625,female,42000,Yes
2,103,35.0,female,39000,No
4,105,28.0,male,41000,Yes
5,106,35.0,female,39000,No
6,107,35.0,female,39000,No
7,108,35.0625,male,38000,Yes
8,109,52.0,female,45000,No
9,110,23.0,male,37000,Yes
11,112,41.0,female,43000,No


In [14]:
df.isnull().sum()

CustomerID    0
Age           0
Gender        0
Salary        0
Churn         0
dtype: int64

In [15]:
df

Unnamed: 0,CustomerID,Age,Gender,Salary,Churn
0,101,25.0,0,40000,0
1,102,35.0625,1,42000,1
2,103,35.0,1,39000,0
4,105,28.0,0,41000,1
5,106,35.0,1,39000,0
6,107,35.0,1,39000,0
7,108,35.0625,0,38000,1
8,109,52.0,1,45000,0
9,110,23.0,0,37000,1
11,112,41.0,1,43000,0


# Save cleaned dataset

In [16]:
df.to_csv("customer_churn_cleaned.csv", index=False)

In [17]:
df

Unnamed: 0,CustomerID,Age,Gender,Salary,Churn
0,101,25.0,0,40000,0
1,102,35.0625,1,42000,1
2,103,35.0,1,39000,0
4,105,28.0,0,41000,1
5,106,35.0,1,39000,0
6,107,35.0,1,39000,0
7,108,35.0625,0,38000,1
8,109,52.0,1,45000,0
9,110,23.0,0,37000,1
11,112,41.0,1,43000,0
