In [1]:
#Import Libraries + Load Dataset
import pandas as pd
import numpy as np

df = pd.read_csv("customer_churn_messey.csv")   # use your file name
df.head()


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,age,salary
0,7590-VHVEG,Female,0.0,Yes,No,1.0,No,No phone service,DSL,No,...,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No,56.0,69144.0
1,5575-GNVDE,Male,0.0,No,No,34.0,Yes,No,DSL,Yes,...,No,No,One year,No,Mailed check,56.95,1889.5,No,46.0,19025.0
2,3668-QPYBK,Male,0.0,No,No,2.0,Yes,No,DSL,Yes,...,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,32.0,94380.0
3,7795-CFOCW,Male,0.0,No,No,45.0,No,No phone service,DSL,Yes,...,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No,,133910.0
4,9237-HQITU,Female,0.0,No,No,2.0,Yes,No,Fiber optic,No,...,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,25.0,111055.0


In [2]:
#Check Basic Info
print("Shape:", df.shape)
df.info()
df.describe(include="all")


Shape: (7043, 23)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 23 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        5883 non-null   object 
 1   gender            5878 non-null   object 
 2   SeniorCitizen     5871 non-null   float64
 3   Partner           5878 non-null   object 
 4   Dependents        5881 non-null   object 
 5   tenure            5880 non-null   float64
 6   PhoneService      5881 non-null   object 
 7   MultipleLines     5874 non-null   object 
 8   InternetService   5881 non-null   object 
 9   OnlineSecurity    5878 non-null   object 
 10  OnlineBackup      5884 non-null   object 
 11  DeviceProtection  5884 non-null   object 
 12  TechSupport       5879 non-null   object 
 13  StreamingTV       5881 non-null   object 
 14  StreamingMovies   5881 non-null   object 
 15  Contract          5881 non-null   object 
 16  PaperlessBilling  5882 n

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,age,salary
count,5883,5878,5871.0,5878,5881,5880.0,5881,5874,5881,5878,...,5881,5881,5881,5882,5880,5882.0,5883.0,5884,5880.0,5882.0
unique,5883,2,,2,2,,2,3,3,3,...,3,3,3,2,4,,5508.0,2,,
top,7590-VHVEG,Male,,No,No,,Yes,No,Fiber optic,No,...,No,No,Month-to-month,Yes,Electronic check,,,No,,
freq,1,2979,,3053,4115,,5315,2805,2588,2928,...,2343,2319,3235,3483,1966,,10.0,4327,,
mean,,,0.159257,,,32.362075,,,,,...,,,,,,64.933951,,,39.071088,82132.031452
std,,,0.365947,,,24.616017,,,,,...,,,,,,30.057657,,,12.261414,39186.851924
min,,,0.0,,,0.0,,,,,...,,,,,,18.25,,,18.0,15015.0
25%,,,0.0,,,9.0,,,,,...,,,,,,35.9,,,29.0,48059.0
50%,,,0.0,,,29.0,,,,,...,,,,,,70.425,,,39.0,82015.0
75%,,,0.0,,,56.0,,,,,...,,,,,,90.0375,,,50.0,116377.75


In [3]:
#Check Missing Values
df.isnull().sum()
#Fill Missing age
df['age'] = df['age'].fillna(df['age'].median())


In [5]:
#Remove Duplicate Rows
print("Duplicates:", df.duplicated().sum())
df = df.drop_duplicates()
print("After removing duplicates:", df.shape)


Duplicates: 0
After removing duplicates: (5886, 23)


In [6]:
#Fix Gender Inconsistency
df['gender'].unique()
df['gender'] = df['gender'].astype(str).str.strip().str.lower()

df['gender'] = df['gender'].replace({
    'm': 'male',
    'male': 'male',
    'f': 'female',
    'female': 'female'
})
df['gender'].unique()


array(['female', 'male', 'nan'], dtype=object)

In [8]:
#Convert TotalCharges to Numeric
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
#Check missing after conversion
df['TotalCharges'].isnull().sum()
#Fill missing TotalCharges with median
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())


In [9]:
#Handle Salary Outliers
df['salary'].describe()
#Use IQR method to remove outliers
Q1 = df['salary'].quantile(0.25)
Q3 = df['salary'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df = df[(df['salary'] >= lower_bound) & (df['salary'] <= upper_bound)]
print("After removing salary outliers:", df.shape)


After removing salary outliers: (5882, 23)


In [11]:
#Convert Target Column (Churn) to 0/1
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})
df['Churn'].value_counts()
#Encode Categorical Columns
#Drop customerID
df = df.drop(columns=['customerID'])
df = pd.get_dummies(df, drop_first=True)
df.head()



Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,age,salary,gender_male,gender_nan,Partner_Yes,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0.0,1.0,29.85,29.85,,56.0,69144.0,False,False,True,...,False,False,False,False,False,False,True,False,True,False
1,0.0,34.0,56.95,1889.5,,46.0,19025.0,True,False,False,...,False,False,False,False,True,False,False,False,False,True
2,0.0,2.0,53.85,108.15,,32.0,94380.0,True,False,False,...,False,False,False,False,False,False,True,False,False,True
3,0.0,45.0,42.3,1840.75,,39.0,133910.0,True,False,False,...,False,False,False,False,True,False,False,False,False,False
4,0.0,2.0,70.7,151.65,,25.0,111055.0,False,False,False,...,False,False,False,False,False,False,True,False,True,False


In [12]:
#Final Check
print("Final Shape:", df.shape)
print("Missing values:", df.isnull().sum().sum())
df.head()

Final Shape: (5882, 34)
Missing values: 5904


Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,age,salary,gender_male,gender_nan,Partner_Yes,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0.0,1.0,29.85,29.85,,56.0,69144.0,False,False,True,...,False,False,False,False,False,False,True,False,True,False
1,0.0,34.0,56.95,1889.5,,46.0,19025.0,True,False,False,...,False,False,False,False,True,False,False,False,False,True
2,0.0,2.0,53.85,108.15,,32.0,94380.0,True,False,False,...,False,False,False,False,False,False,True,False,False,True
3,0.0,45.0,42.3,1840.75,,39.0,133910.0,True,False,False,...,False,False,False,False,True,False,False,False,False,False
4,0.0,2.0,70.7,151.65,,25.0,111055.0,False,False,False,...,False,False,False,False,False,False,True,False,True,False
