In [15]:
import pandas as pd
import numpy as np
train_df = pd.read_csv("customer_churn_messy.csv")

In [16]:
train_df

Unnamed: 0,CustomerID,Age,Gender,Salary,Churn
0,101,25.0,Male,40000,No
1,102,,FEMALE,42000,Yes
2,103,35.0,female,39000,No
3,104,45.0,M,1200000,No
4,105,28.0,male,41000,Yes
5,106,35.0,F,39000,No
6,107,35.0,F,39000,No
7,108,,Male,38000,Yes
8,109,52.0,Female,45000,No
9,110,23.0,Male,37000,Yes


## Handle Duplicate Rows

In [17]:
train_df.drop_duplicates(inplace=True)
train_df

Unnamed: 0,CustomerID,Age,Gender,Salary,Churn
0,101,25.0,Male,40000,No
1,102,,FEMALE,42000,Yes
2,103,35.0,female,39000,No
3,104,45.0,M,1200000,No
4,105,28.0,male,41000,Yes
5,106,35.0,F,39000,No
6,107,35.0,F,39000,No
7,108,,Male,38000,Yes
8,109,52.0,Female,45000,No
9,110,23.0,Male,37000,Yes


In [18]:
train_df.head()

Unnamed: 0,CustomerID,Age,Gender,Salary,Churn
0,101,25.0,Male,40000,No
1,102,,FEMALE,42000,Yes
2,103,35.0,female,39000,No
3,104,45.0,M,1200000,No
4,105,28.0,male,41000,Yes


## Handling Missing Age Values: median imputation

In [19]:
median_age = train_df['Age'].median()
train_df['Age'] = train_df['Age'].fillna(median_age)
test_df['Age']  = test_df['Age'].fillna(median_age)

In [20]:
train_df

Unnamed: 0,CustomerID,Age,Gender,Salary,Churn
0,101,25.0,Male,40000,No
1,102,35.0,FEMALE,42000,Yes
2,103,35.0,female,39000,No
3,104,45.0,M,1200000,No
4,105,28.0,male,41000,Yes
5,106,35.0,F,39000,No
6,107,35.0,F,39000,No
7,108,35.0,Male,38000,Yes
8,109,52.0,Female,45000,No
9,110,23.0,Male,37000,Yes


## Fix gender format

In [21]:
gender_map = {
    'male': 'Male', 'm': 'Male',
    'female': 'Female', 'f': 'Female'
}
train_df['Gender'] = train_df['Gender'].str.lower().map(gender_map)
test_df['Gender'] = test_df['Gender'].str.lower().map(gender_map)

In [22]:
train_df

Unnamed: 0,CustomerID,Age,Gender,Salary,Churn
0,101,25.0,Male,40000,No
1,102,35.0,Female,42000,Yes
2,103,35.0,Female,39000,No
3,104,45.0,Male,1200000,No
4,105,28.0,Male,41000,Yes
5,106,35.0,Female,39000,No
6,107,35.0,Female,39000,No
7,108,35.0,Male,38000,Yes
8,109,52.0,Female,45000,No
9,110,23.0,Male,37000,Yes


In [23]:
print(train_df.columns.tolist())

['CustomerID', 'Age', 'Gender', 'Salary', 'Churn']


In [24]:
Q1 = train_df['Salary'].quantile(0.25)
Q3 = train_df['Salary'].quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR
train_df['Salary'] = np.clip(train_df['Salary'], lower, upper)

In [25]:
test_df.to_csv("customer_churn_cleaned.csv", index=False)

In [26]:
orig = pd.read_csv("customer_churn_messy.csv")
clean = pd.read_csv("customer_churn_cleaned.csv")

print("Original shape:", orig.shape)
print("Cleaned shape :", clean.shape)
print("Duplicates removed:", orig.duplicated().sum())
print("Missing Age before:", orig['Age'].isnull().sum())
print("Missing Age after :", clean['Age'].isnull().sum())


Original shape: (21, 5)
Cleaned shape : (64374, 12)
Duplicates removed: 1
Missing Age before: 4
Missing Age after : 0
