In [2]:
import pandas as pd

# Read the raw churn export and list its column names so later cells can
# refer to them exactly as they appear in the file.
csv_path = "customer_churn_messy (1).csv"
df = pd.read_csv(csv_path)
print(df.columns)


Index(['CustomerID', 'Age', 'Gender', 'Tenure', 'Usage Frequency',
       'Support Calls', 'Payment Delay', 'Subscription Type',
       'Contract Length', 'Total Spend', 'Last Interaction', 'Churn'],
      dtype='object')


In [None]:
# 1. Remove duplicate rows
# Flag every row that repeats an earlier, fully identical row and keep
# only the first occurrence of each.
is_repeat = df.duplicated(keep="first")
df = df[~is_repeat]
print("After removing duplicates:", df.shape)


After removing duplicates: (441272, 12)


In [None]:
# Handle missing Age values: fill the gaps with the column median.
age_median = df['Age'].median()
df['Age'] = df['Age'].fillna(age_median)


In [None]:
# Encode Gender as 0 (male) / 1 (female).
# Values are normalised first (cast to string, surrounding whitespace
# removed, lower-cased) so variants such as " Male" or "FEMALE " still map
# correctly instead of being silently treated as missing.
df['Gender'] = (
    df['Gender']
    .astype(str)
    .str.strip()
    .str.lower()
    .map({'male': 0, 'female': 1})
)

# Anything that did not map is NaN at this point (this includes genuinely
# missing values, which astype(str) turned into the text "nan"); impute
# those rows with the most frequent encoded value.
df['Gender'] = df['Gender'].fillna(df['Gender'].mode().iloc[0])


In [None]:
# Handle Total Spend outliers with the 1.5 * IQR rule.
spend = df['Total Spend']
Q1 = spend.quantile(0.25)
Q3 = spend.quantile(0.75)
IQR = Q3 - Q1

# Fences beyond which a Total Spend value is treated as an outlier.
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

# between() is inclusive on both ends. Rows whose Total Spend is missing
# fail the comparison and are therefore removed as well.
df = df[spend.between(lower, upper)]

print("After removing outliers:", df.shape)


In [None]:
# Encode Target Variable (Churn) as 0 (no) / 1 (yes).
# The labels are normalised (string cast, whitespace stripped, lower-cased)
# before mapping so variants such as "YES" or " no" are recognised.
# Already-numeric labels (0/1, 0.0/1.0) are accepted too, which keeps this
# cell safe to re-run: an exact-match map on 'No'/'Yes' would turn an
# already-encoded column entirely into NaN.
churn_text = df['Churn'].astype(str).str.strip().str.lower()
churn_codes = churn_text.map(
    {'no': 0, 'yes': 1, '0': 0, '1': 1, '0.0': 0, '1.0': 1}
)

# Rows without a usable label cannot be used for supervised training and
# would put NaN into the target, so drop them explicitly and report how
# many were removed.
unlabeled = churn_codes.isna()
if unlabeled.any():
    print("Dropping rows with missing/unrecognised Churn:", int(unlabeled.sum()))

df = df.loc[~unlabeled].copy()
df['Churn'] = churn_codes.loc[~unlabeled].astype(int)


In [None]:

# One-hot encode the categorical plan columns. drop_first=True omits the
# first level of each column, so that level is represented by all of the
# column's dummies being zero.
categorical_cols = ['Subscription Type', 'Contract Length']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)


In [None]:
# Dataset check
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()

# Remaining missing values per column.
print(df.isna().sum())


In [None]:
# Target: the encoded churn label.
y = df['Churn']

# Features: every other column except the row identifier and the target.
X = df.drop(columns=['CustomerID', 'Churn'])


In [None]:
# Feature scaling
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from X, then apply them to X.
# NOTE(review): the statistics are computed from every row of X, including
# rows that a later train/test split would assign to the test set.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)


In [None]:
# Train-Test Split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the unscaled features so the scaler can be fitted on the training
# rows only; a scaler fitted on all rows leaks test-set statistics into
# training. random_state fixes the shuffle and stratify=y preserves the
# churn class proportions in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Learn the scaling statistics from the training rows, then apply the same
# transformation to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
