In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
df=pd.read_csv("../data/raw/Telco-Customer-Churn.csv")

In [3]:
df.columns = df.columns.str.strip().str.replace(" ", "").str.lower()

In [4]:
df['churn'] = df['churn'].astype(str).str.title().str.strip()
df['churn'] = df['churn'].map({'Yes':1, 'No':0})

In [5]:
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce')

In [6]:
df.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [7]:
df = df.fillna({
    col: df[col].mode()[0] if df[col].dtype == 'object' else df[col].median()
    for col in df.columns
})

In [8]:
df = df.drop(['customerid'], axis=1)

In [9]:
num_cols = df.select_dtypes(include=['int64','float64']).columns.tolist()
num_cols.remove('churn')  # remove target

cat_cols = df.select_dtypes(include=['object']).columns.tolist()

In [10]:
df_encoded = pd.get_dummies(df, columns=cat_cols, drop_first=True)


In [19]:
num_cols = ['tenure', 'monthlycharges', 'totalcharges']
cat_cols = [col for col in df.columns if df[col].dtype == 'object' and col != 'churn']

In [23]:
# One-hot encode categorical features
# pd.get_dummies() converts categorical features into one-hot encoded numeric columns, making them usable for ML models.
df_encoded = pd.get_dummies(df, columns=cat_cols, drop_first=True)

In [26]:
df.shape, df_encoded.shape

((7043, 21), (7043, 7073))

In [11]:
df_encoded.columns

Index(['seniorcitizen', 'tenure', 'monthlycharges', 'totalcharges', 'churn',
       'gender_Male', 'partner_Yes', 'dependents_Yes', 'phoneservice_Yes',
       'multiplelines_No phone service', 'multiplelines_Yes',
       'internetservice_Fiber optic', 'internetservice_No',
       'onlinesecurity_No internet service', 'onlinesecurity_Yes',
       'onlinebackup_No internet service', 'onlinebackup_Yes',
       'deviceprotection_No internet service', 'deviceprotection_Yes',
       'techsupport_No internet service', 'techsupport_Yes',
       'streamingtv_No internet service', 'streamingtv_Yes',
       'streamingmovies_No internet service', 'streamingmovies_Yes',
       'contract_One year', 'contract_Two year', 'paperlessbilling_Yes',
       'paymentmethod_Credit card (automatic)',
       'paymentmethod_Electronic check', 'paymentmethod_Mailed check'],
      dtype='object')

In [28]:
[x for x in df_encoded.columns if "_" in x]

['customerid_0003-MKNFE',
 'customerid_0004-TLHLJ',
 'customerid_0011-IGKFF',
 'customerid_0013-EXCHZ',
 'customerid_0013-MHZWF',
 'customerid_0013-SMEOE',
 'customerid_0014-BMAQU',
 'customerid_0015-UOCOJ',
 'customerid_0016-QLJIS',
 'customerid_0017-DINOC',
 'customerid_0017-IUDMW',
 'customerid_0018-NYROU',
 'customerid_0019-EFAEP',
 'customerid_0019-GFNTW',
 'customerid_0020-INWCK',
 'customerid_0020-JDNXP',
 'customerid_0021-IKXGC',
 'customerid_0022-TCJCI',
 'customerid_0023-HGHWL',
 'customerid_0023-UYUPN',
 'customerid_0023-XUOPT',
 'customerid_0027-KWYKW',
 'customerid_0030-FNXPP',
 'customerid_0031-PVLZI',
 'customerid_0032-PGELS',
 'customerid_0036-IHMOT',
 'customerid_0040-HALCW',
 'customerid_0042-JVWOJ',
 'customerid_0042-RLHYP',
 'customerid_0048-LUMLS',
 'customerid_0048-PIHNL',
 'customerid_0052-DCKON',
 'customerid_0052-YNYOT',
 'customerid_0056-EPFBG',
 'customerid_0057-QBUQH',
 'customerid_0058-EVZWM',
 'customerid_0060-FUALY',
 'customerid_0064-SUDOG',
 'customerid

In [31]:
df_encoded[['paymentmethod_Electronic check']].head()

Unnamed: 0,paymentmethod_Electronic check
0,True
1,False
2,False
3,False
4,True


In [32]:
df_encoded = df_encoded.astype(int)

In [33]:
df_encoded[['paymentmethod_Electronic check']].head()

Unnamed: 0,paymentmethod_Electronic check
0,1
1,0
2,0
3,0
4,1


### One-Hot Encoding Verification

- Encoding added new dummy columns for each category.
- Encoded columns appear as **True/False**, which are equivalent to **1/0**.
- Dataset shape increased after encoding, confirming categorical expansion.
- Verified encoding using `.columns`, `.dtypes` checks.


SECTION 1.1
- ✔ Handled missing values  
- ✔ Cleaned and standardized column names  
- ✔ Converted data types  
- ✔ Encoded categorical variables (One-Hot Encoding)  

In [12]:
scaler = StandardScaler()
df_encoded[num_cols] = scaler.fit_transform(df_encoded[num_cols])

In [13]:
X = df_encoded.drop('churn', axis=1)
y = df_encoded['churn']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [15]:
X_train.to_csv("D:/customer_churn/data/processed/X_train.csv", index=False)
X_test.to_csv("D:/customer_churn/data/processed/X_test.csv", index=False)
y_train.to_csv("D:/customer_churn/data/processed/y_train.csv", index=False)
y_test.to_csv("D:/customer_churn/data/processed/y_test.csv", index=False)

print("Preprocessing complete. Files saved in /data/processed/")

Preprocessing complete. Files saved in /data/processed/



SECTION 1.2
- ✔ Scaled numerical features (StandardScaler)  
- ✔ Created final feature matrix (X) and target vector (y)  
- ✔ Performed train–test split  
- ✔ Saved processed data for modeling  


In [19]:
import joblib
joblib.dump(scaler, "D:/customer_churn/models/scaler.pkl")


['D:/customer_churn/models/scaler.pkl']

In [21]:
joblib.dump(df_encoded.columns, "D:/customer_churn/models/columns.pkl")


['D:/customer_churn/models/columns.pkl']

In [24]:
import joblib
columns = joblib.load("D:/customer_churn/models/columns.pkl")
columns

Index(['seniorcitizen', 'tenure', 'monthlycharges', 'totalcharges', 'churn',
       'gender_Male', 'partner_Yes', 'dependents_Yes', 'phoneservice_Yes',
       'multiplelines_No phone service', 'multiplelines_Yes',
       'internetservice_Fiber optic', 'internetservice_No',
       'onlinesecurity_No internet service', 'onlinesecurity_Yes',
       'onlinebackup_No internet service', 'onlinebackup_Yes',
       'deviceprotection_No internet service', 'deviceprotection_Yes',
       'techsupport_No internet service', 'techsupport_Yes',
       'streamingtv_No internet service', 'streamingtv_Yes',
       'streamingmovies_No internet service', 'streamingmovies_Yes',
       'contract_One year', 'contract_Two year', 'paperlessbilling_Yes',
       'paymentmethod_Credit card (automatic)',
       'paymentmethod_Electronic check', 'paymentmethod_Mailed check'],
      dtype='object')