# Feature Engineering

In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler

In [54]:
## Load data
# Forward slashes are understood by pandas on Windows, Linux and macOS alike
# (the export cell at the end of this notebook already uses them), whereas a
# backslash-separated relative path only resolves on Windows.
df = pd.read_csv("../data/processed/cleaned_telco_customer_churn.csv")
print(df.shape)
df.head()

(7021, 20)


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [55]:
## Categorical features
# Every object-dtype column except the target 'Churn' is treated as a categorical predictor.
object_cols = df.select_dtypes(include=['object'])
cat_features = object_cols.drop(columns='Churn').columns
cat_features

Index(['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod'],
      dtype='object')

In [56]:
## Numerical features
# Any column whose dtype is not object is treated as numerical.
non_object_cols = df.select_dtypes(exclude=['object'])
num_features = non_object_cols.columns
num_features

Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges'], dtype='object')

In [57]:
# Total number of predictor columns across the categorical and numerical groups.
sum(len(group) for group in (cat_features, num_features))

19

In [58]:
## Unique values in cat_features
# One record per categorical feature: its distinct values and how often each occurs
# (both arrays come from the same value_counts() call, so they line up).
feature_counts = ((name, df[name].value_counts()) for name in cat_features)
unique_cat_list = [
    {'FeatureName': name, 'UniqueValues': counts.index.values, 'Numbers': counts.values}
    for name, counts in feature_counts
]

unique_cat_df = pd.DataFrame(unique_cat_list)
unique_cat_df

Unnamed: 0,FeatureName,UniqueValues,Numbers
0,gender,"[Male, Female]","[3545, 3476]"
1,Partner,"[No, Yes]","[3636, 3385]"
2,Dependents,"[No, Yes]","[4928, 2093]"
3,PhoneService,"[Yes, No]","[6342, 679]"
4,MultipleLines,"[No, Yes, No phone service]","[3381, 2961, 679]"
5,InternetService,"[Fiber optic, DSL, No]","[3090, 2412, 1519]"
6,OnlineSecurity,"[No, Yes, No internet service]","[3493, 2009, 1519]"
7,OnlineBackup,"[No, Yes, No internet service]","[3082, 2420, 1519]"
8,DeviceProtection,"[No, Yes, No internet service]","[3092, 2410, 1519]"
9,TechSupport,"[No, Yes, No internet service]","[3468, 2034, 1519]"


#### Comment:
The table above shows that most service-related categorical features (e.g. MultipleLines, OnlineSecurity) have 3 unique categories, but only 2 of them are meaningful — "Yes" and "No" — because "No phone service" / "No internet service" effectively means "No".

So, we can replace "Yes" with 1 and "No", "No phone service", or "No internet service" with 0.

### Encoding

In [59]:
## Encode binary features
binary_feat = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling']

# 'gender' (the first entry) is encoded separately below; the rest are Yes/No-style flags.
yes_no_feat = binary_feat[1:]


def _flag_to_int(value):
    """Return 1 for 'Yes' (or an already-encoded 1), otherwise 0.

    Strings other than 'Yes' (e.g. 'No', 'No internet service') map to 0.
    Accepting the number 1 as well keeps this cell safe to re-run: without it,
    a second execution would see the integers written by the first one, find
    no 'Yes' strings, and silently overwrite every flag with 0.
    """
    if isinstance(value, str):
        return 1 if value == 'Yes' else 0
    return 1 if value == 1 else 0


# Encode binary cat cols manually except 'gender'
df[yes_no_feat] = df[yes_no_feat].map(_flag_to_int)

# Encode 'gender' with OneHotEncoder; drop='first' removes the first category's
# column, leaving a single indicator for the two-valued gender feature.
ohe = OneHotEncoder(sparse_output=False, drop='first')
# fit_transform returns a 2-D (n_rows, 1) array here; flatten it so that a plain
# 1-D column is assigned.
df['gender'] = ohe.fit_transform(df[['gender']]).ravel()

df.head()


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0.0,0,1,0,1,0,0,DSL,0,1,0,0,0,0,Month-to-month,1,Electronic check,29.85,29.85,No
1,1.0,0,0,0,34,1,0,DSL,1,0,1,0,0,0,One year,0,Mailed check,56.95,1889.5,No
2,1.0,0,0,0,2,1,0,DSL,1,1,0,0,0,0,Month-to-month,1,Mailed check,53.85,108.15,Yes
3,1.0,0,0,0,45,0,0,DSL,1,0,1,1,0,0,One year,0,Bank transfer (automatic),42.3,1840.75,No
4,0.0,0,0,0,2,1,0,Fiber optic,0,0,0,0,0,0,Month-to-month,1,Electronic check,70.7,151.65,Yes


In [60]:
## Encode multi-category features
non_binary_cats = ['InternetService', 'Contract', 'PaymentMethod']

## Category orders are defined by hand; the intent is that categories which
## influence churn more receive higher codes.
internet_service_order = ['No', 'DSL', 'Fiber optic']
contract_order = ['Two year', 'One year', 'Month-to-month']
payment_method_order = ['Credit card (automatic)', 'Bank transfer (automatic)', 'Mailed check', 'Electronic check']

# The orders must be listed in the same sequence as the columns in non_binary_cats.
category_orders = [internet_service_order, contract_order, payment_method_order]
oe = OrdinalEncoder(categories=category_orders)
df[non_binary_cats] = oe.fit_transform(df[non_binary_cats])

df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0.0,0,1,0,1,0,0,1.0,0,1,0,0,0,0,2.0,1,3.0,29.85,29.85,No
1,1.0,0,0,0,34,1,0,1.0,1,0,1,0,0,0,1.0,0,2.0,56.95,1889.5,No
2,1.0,0,0,0,2,1,0,1.0,1,1,0,0,0,0,2.0,1,2.0,53.85,108.15,Yes
3,1.0,0,0,0,45,0,0,1.0,1,0,1,1,0,0,1.0,0,1.0,42.3,1840.75,No
4,0.0,0,0,0,2,1,0,2.0,0,0,0,0,0,0,2.0,1,3.0,70.7,151.65,Yes


In [61]:
## Encode the target column
# Fit the encoder on the target labels, then replace the column with its integer codes.
le = LabelEncoder()
le.fit(df['Churn'])
df['Churn'] = le.transform(df['Churn'])

df['Churn'].head()

0    0
1    0
2    1
3    0
4    1
Name: Churn, dtype: int64

### Feature Scaling

In [62]:
# Preview the encoded frame before scaling.
df.head(n=5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0.0,0,1,0,1,0,0,1.0,0,1,0,0,0,0,2.0,1,3.0,29.85,29.85,0
1,1.0,0,0,0,34,1,0,1.0,1,0,1,0,0,0,1.0,0,2.0,56.95,1889.5,0
2,1.0,0,0,0,2,1,0,1.0,1,1,0,0,0,0,2.0,1,2.0,53.85,108.15,1
3,1.0,0,0,0,45,0,0,1.0,1,0,1,1,0,0,1.0,0,1.0,42.3,1840.75,0
4,0.0,0,0,0,2,1,0,2.0,0,0,0,0,0,0,2.0,1,3.0,70.7,151.65,1


In [None]:
## Scaling except target
# Select the predictors by name rather than by position, so the target is
# excluded wherever 'Churn' sits in the frame.
feature_cols = df.columns.drop('Churn')

# StandardScaler standardises each column independently, so a single fit over
# all predictors produces the same values as fitting column by column, while
# leaving `scaler` holding the statistics of every feature (refitting the same
# scaler per column keeps only those of the last column, making it unusable
# for transforming new data later).
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[feature_cols])
df[feature_cols] = pd.DataFrame(scaled_values, columns=feature_cols, index=df.index)

# NOTE(review): the scaler is fitted on the full dataset; if a train/test split
# happens downstream, consider fitting on the training portion only to avoid leakage.
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,-1.009876,-0.440278,1.036412,-0.651702,-1.27991,-3.056175,-0.853997,-0.287343,-0.633118,1.378854,-0.722954,-0.63864,-0.789527,-0.795959,0.826337,0.829546,1.144624,-1.16101,-0.993745,0
1,0.99022,-0.440278,-0.964867,-0.651702,0.064885,0.327206,-0.853997,-0.287343,1.579485,-0.72524,1.383213,-0.63864,-0.789527,-0.795959,-0.3741,-1.205479,0.274318,-0.260031,-0.172844,0
2,0.99022,-0.440278,-0.964867,-0.651702,-1.239159,0.327206,-0.853997,-0.287343,1.579485,1.378854,-0.722954,-0.63864,-0.789527,-0.795959,0.826337,0.829546,0.274318,-0.363095,-0.959182,1
3,0.99022,-0.440278,-0.964867,-0.651702,0.513151,-3.056175,-0.853997,-0.287343,1.579485,-0.72524,1.383213,1.565829,-0.789527,-0.795959,-0.3741,-1.205479,-0.595988,-0.747091,-0.194364,0
4,-1.009876,-0.440278,-0.964867,-0.651702,-1.239159,0.327206,-0.853997,0.996829,-0.633118,-0.72524,-0.722954,-0.63864,-0.789527,-0.795959,0.826337,0.829546,1.144624,0.197108,-0.93998,1


#### Comment:
All features have now been converted to the numeric format required by ML algorithms. We can export this data and use it for machine learning.

### Export the cleaned and transformed data

In [64]:
# Write the transformed frame to disk; index=False keeps the row index out of the CSV.
output_path = '../data/processed/cleaned_feature_engineering_telco_customer_churn.csv'
df.to_csv(output_path, index=False)

<br/>

---
### 👨‍💻 Author Information
**Name:** [Amaresh Maity]  
**Date:** 2026-01-18  
**Role:** [Data Scientist | AI Engineer]



#### Let's Connect!

If you have questions about this analysis or would like to collaborate, feel free to reach out:

* **LinkedIn:** [LinkedIn](https://www.linkedin.com/in/amareshmaity/)
* **GitHub:** [@amareshmaity](https://github.com/amareshmaity)
* **Email:** [contacttoamaresh@gmail.com](mailto:contacttoamaresh@gmail.com)
