In [2]:
import pandas as pd

df = pd.read_csv("customer_churn_messy (1).csv")
print(df.columns)


Index(['CustomerID', 'Age', 'Gender', 'Tenure', 'Usage Frequency',
       'Support Calls', 'Payment Delay', 'Subscription Type',
       'Contract Length', 'Total Spend', 'Last Interaction', 'Churn'],
      dtype='object')


In [41]:
# Summary of missing values
df = pd.read_csv("customer_churn_messy (1).csv")

print(" Missing values per column:\n")
print(df.isna().sum()[df.isna().sum() > 0])
print(" Number of duplicate rows:")
print(df.duplicated().sum())
print(" Gender values (sample):")
print(df["Gender"].value_counts())

 Missing values per column:

CustomerID               1
Age                  30860
Gender                   1
Tenure                   1
Usage Frequency          1
Support Calls            1
Payment Delay            1
Subscription Type        1
Contract Length          1
Total Spend              1
Last Interaction         1
Churn                    1
dtype: int64
 Number of duplicate rows:
8378
 Gender values (sample):
Gender
male      51236
m         51117
Male      51027
M         50992
MALE      50940
F         39120
female    39087
Female    38767
FEMALE    38727
f         38636
Name: count, dtype: int64


In [None]:
#1.Remove Duplicate Rows
df = df.drop_duplicates()
print("After removing duplicates:", df.shape)


After removing duplicates: (441272, 12)


In [6]:
#Handle Missing Age Values
df['Age'] = df['Age'].fillna(df['Age'].median())


In [7]:
#Encode Gender
df['Gender'] = (
    df['Gender']
    .astype(str)
    .str.lower()
    .map({'male': 0, 'female': 1})
)

df['Gender'] = df['Gender'].fillna(df['Gender'].mode().iloc[0])


In [8]:
#Handle Salary Outliers
Q1 = df['Total Spend'].quantile(0.25)
Q3 = df['Total Spend'].quantile(0.75)

IQR = Q3 - Q1

lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

df = df[(df['Total Spend'] >= lower) & (df['Total Spend'] <= upper)]

print("After removing outliers:", df.shape)


After removing outliers: (436778, 12)


In [9]:
#Encode Target Variable (Churn)
df['Churn'] = df['Churn'].map({'No': 0, 'Yes': 1})


In [10]:

df = pd.get_dummies(
    df,
    columns=['Subscription Type', 'Contract Length'],
    drop_first=True
)


In [11]:
#Dataset Check
print(df.info())
print(df.isna().sum())


<class 'pandas.core.frame.DataFrame'>
Index: 436778 entries, 0 to 449643
Data columns (total 14 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   CustomerID                  436778 non-null  float64
 1   Age                         436778 non-null  float64
 2   Gender                      436778 non-null  float64
 3   Tenure                      436778 non-null  float64
 4   Usage Frequency             436778 non-null  float64
 5   Support Calls               436778 non-null  float64
 6   Payment Delay               436778 non-null  float64
 7   Total Spend                 436778 non-null  float64
 8   Last Interaction            436778 non-null  float64
 9   Churn                       0 non-null       float64
 10  Subscription Type_Premium   436778 non-null  bool   
 11  Subscription Type_Standard  436778 non-null  bool   
 12  Contract Length_Monthly     436778 non-null  bool   
 13  Contract Length_Qua

In [12]:
X = df.drop(['CustomerID', 'Churn'], axis=1)
y = df['Churn']


In [13]:
#Feature Scaling
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
