In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the raw churn export into the working frame used by every later cell,
# then preview the first rows as the cell's rendered output.
source_csv = 'customer-churn.csv'
df = pd.read_csv(source_csv)
df.head(n=5)

In [None]:
# Structural overview: info() prints its own report (dtypes, non-null counts).
df.info()

# Only the LAST bare expression of a cell is rendered automatically, so the
# summary statistics must be shown explicitly or they are silently discarded.
# `display` is the builtin that IPython/Jupyter injects into notebook kernels.
display(df.describe())

# Names of the object-dtype (string) columns; rendered as the cell output.
df.select_dtypes(include=['object']).columns

In [None]:
# Per-column count of missing values, captured BEFORE any rows are removed and
# shown explicitly (a mid-cell bare expression would otherwise be discarded).
missing_counts = df.isnull().sum()
display(missing_counts)

# Drop every row that contains at least one missing value.
df.dropna(inplace=True)

# Object-dtype (string) columns; this name is reused by the later
# normalisation cell, so it must stay defined here.
categorical_columns = df.select_dtypes(include=['object']).columns

# NOTE(review): the former per-column `fillna(mode)` loop was removed because
#   * it ran after dropna(), so there was never a NaN left for it to fill;
#   * it used chained `df[col].fillna(..., inplace=True)`, which is deprecated
#     in pandas 2.x and does not modify `df` under Copy-on-Write;
#   * `mode()[0]` raises when the frame is empty after dropna().
# The resulting data is unchanged. If mode imputation was the actual intent,
# it would have to run BEFORE dropna() -- confirm with the author.

# Report how many fully duplicated rows exist, then remove them.
duplicate_count = df.duplicated().sum()
print(f"Duplicate rows removed: {duplicate_count}")
df.drop_duplicates(inplace=True)

In [None]:
# Normalise every string column (lower-case, surrounding whitespace removed)
# and store the result as a pandas categorical. Columns are independent of
# each other, so cleaning and conversion are done together, one column at a time.
for col in categorical_columns:
    df[col] = (
        df[col]
        .str.lower()
        .str.strip()
        .astype('category')
    )

In [None]:
# Pin explicit 64-bit dtypes. A bare `int` maps to a platform-dependent width
# (32-bit on Windows with NumPy < 2), and the outlier cell that follows selects
# numeric columns by the exact names 'int64'/'float64' -- a 32-bit `tenure`
# would silently be left out of that selection.
df['tenure'] = df['tenure'].astype('int64')
df['MonthlyCharges'] = df['MonthlyCharges'].astype('float64')

In [None]:
# All 64-bit numeric columns of the working frame.
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns

# Classic Tukey fences: values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are flagged.
Q1 = df[numerical_columns].quantile(0.25)
Q3 = df[numerical_columns].quantile(0.75)
IQR = Q3 - Q1

# Boolean frame (same shape as df[numerical_columns]); True marks a flagged cell.
outliers = ((df[numerical_columns] < (Q1 - 1.5 * IQR)) | (df[numerical_columns] > (Q3 + 1.5 * IQR)))

# Only columns with more than two distinct values take part in row removal.
# For a two-valued flag whose minority value covers < 25% of rows, Q1 == Q3 and
# IQR == 0, so every minority row would be "an outlier" and the whole group
# would be deleted (e.g. a 0/1 indicator such as the SeniorCitizen column that
# the next cell compares against 1).
outlier_columns = [col for col in numerical_columns if df[col].nunique() > 2]

# .copy() detaches the result from the original frame so that later column
# assignments do not trigger SettingWithCopyWarning.
df = df[~outliers[outlier_columns].any(axis=1)].copy()

In [None]:
# Binary flag: 1 where SeniorCitizen equals 1, otherwise 0 (NaN also maps to 0).
# Vectorised comparison instead of a Python-level lambda per row.
df['is_senior'] = (df['SeniorCitizen'] == 1).astype('int64')

# Product of the monthly charge and tenure. Computed BEFORE scaling so it is
# built from the raw (unstandardised) values.
df['monthly_spend'] = df['MonthlyCharges'] * df['tenure']

# Standardise tenure and MonthlyCharges WITHOUT leaking test-set statistics:
# the scaler is fit only on the rows that will land in the training set.
# train_test_split's shuffle depends only on the number of rows and the seed,
# so splitting the index here with the same test_size/random_state as the
# later split cell (test_size=0.2, random_state=42) selects the same training
# rows. Keep the two calls in sync if either parameter is changed.
scaled_columns = ['tenure', 'MonthlyCharges']
train_index, _ = train_test_split(df.index, test_size=0.2, random_state=42)

scaler = StandardScaler()
scaler.fit(df.loc[train_index, scaled_columns])
df[scaled_columns] = scaler.transform(df[scaled_columns])

# Feature matrix: every column except the customer identifier and the label.
X = df.drop(columns=['customerID', 'Churn'])
# Label vector.
y = df['Churn']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Persist the cleaned frame and the four split parts, in this order, each
# written without the index column.
csv_outputs = {
    'Cleaned_Telecom_Customer_Churn.csv': df,
    'X_train.csv': X_train,
    'X_test.csv': X_test,
    'y_train.csv': y_train,
    'y_test.csv': y_test,
}
for csv_name, table in csv_outputs.items():
    table.to_csv(csv_name, index=False)