# Data Preparation


In [1]:
import os 
import random
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("../data/raw/Telco_Customer_Churn_Dataset  (1).csv")  
df.head()
df.info()
df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

## Here’s the analysis of what we see so far:

### ✅ Dataset has 7043 rows × 21 columns.

### ✅ No missing values reported (0 in all columns).

### ⚠️ But TotalCharges is stored as object (string) even though it should be numeric (since it’s charges). → We’ll need to convert it to numeric.

### SeniorCitizen is already int64 but it’s actually categorical (0 = No, 1 = Yes).

In [9]:
# Inspect unique values and try to convert
print(df["TotalCharges"].head(10))
print(df["TotalCharges"].dtype)

# Convert to numeric (coerce errors to NaN for bad values)
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

# Check how many NaNs we got after conversion
df["TotalCharges"].isna().sum()

0      29.85
1    1889.50
2     108.15
3    1840.75
4     151.65
5     820.50
6    1949.40
7     301.90
8    3046.05
9    3487.95
Name: TotalCharges, dtype: float64
float64


np.int64(0)

In [10]:
# Convert TotalCharges to numeric
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

# Check the result
print(df["TotalCharges"].dtype)
print(df["TotalCharges"].head(10))

float64
0      29.85
1    1889.50
2     108.15
3    1840.75
4     151.65
5     820.50
6    1949.40
7     301.90
8    3046.05
9    3487.95
Name: TotalCharges, dtype: float64


In [11]:
# Count missing values in TotalCharges
missing_total = df["TotalCharges"].isna().sum()
print("Missing values in TotalCharges:", missing_total)


Missing values in TotalCharges: 0


In [12]:
df = df.dropna(subset=["TotalCharges"])
print("Shape after dropping missing TotalCharges:", df.shape)


Shape after dropping missing TotalCharges: (7032, 21)


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7032 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7032 non-null   object 
 1   gender            7032 non-null   object 
 2   SeniorCitizen     7032 non-null   int64  
 3   Partner           7032 non-null   object 
 4   Dependents        7032 non-null   object 
 5   tenure            7032 non-null   int64  
 6   PhoneService      7032 non-null   object 
 7   MultipleLines     7032 non-null   object 
 8   InternetService   7032 non-null   object 
 9   OnlineSecurity    7032 non-null   object 
 10  OnlineBackup      7032 non-null   object 
 11  DeviceProtection  7032 non-null   object 
 12  TechSupport       7032 non-null   object 
 13  StreamingTV       7032 non-null   object 
 14  StreamingMovies   7032 non-null   object 
 15  Contract          7032 non-null   object 
 16  PaperlessBilling  7032 non-null   object 
 17  

In [14]:
for col in df.select_dtypes(include='object').columns:
    print(f"{col}: {df[col].unique()}\n")


customerID: ['7590-VHVEG' '5575-GNVDE' '3668-QPYBK' ... '4801-JZAZL' '8361-LTMKD'
 '3186-AJIEK']

gender: ['Female' 'Male']

Partner: ['Yes' 'No']

Dependents: ['No' 'Yes']

PhoneService: ['No' 'Yes']

MultipleLines: ['No phone service' 'No' 'Yes']

InternetService: ['DSL' 'Fiber optic' 'No']

OnlineSecurity: ['No' 'Yes' 'No internet service']

OnlineBackup: ['Yes' 'No' 'No internet service']

DeviceProtection: ['No' 'Yes' 'No internet service']

TechSupport: ['No' 'Yes' 'No internet service']

StreamingTV: ['No' 'Yes' 'No internet service']

StreamingMovies: ['No' 'Yes' 'No internet service']

Contract: ['Month-to-month' 'One year' 'Two year']

PaperlessBilling: ['Yes' 'No']

PaymentMethod: ['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']

Churn: ['No' 'Yes']



| Column                                                                                                                                                                 | Type               | Notes                          | Encoding Strategy  |
| :--------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :----------------- | :----------------------------- | :----------------- |
| `customerID`                                                                                                                                                           | Identifier         | Not useful for prediction      | Drop               |
| `gender`, `Partner`, `Dependents`, `PhoneService`, `PaperlessBilling`, `Churn`                                                                                         | Binary categorical | “Yes/No” or “Male/Female”      | Label encode (0/1) |
| `MultipleLines`, `InternetService`, `OnlineSecurity`, `OnlineBackup`, `DeviceProtection`, `TechSupport`, `StreamingTV`, `StreamingMovies`, `Contract`, `PaymentMethod` | Multi-category     | 3+ levels or “No service” type | One-hot encode     |
| `SeniorCitizen`, `tenure`, `MonthlyCharges`, `TotalCharges`                                                                                                            | Numeric            | Already numeric                | Keep as is         |


In [15]:
df = df.drop('customerID', axis=1)
df.head(3)


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes


### Now encode the binary categorical columns — those with only two values like Yes/No or Male/Female.

#### I'll map them as follows:

#### Yes → 1

#### No → 0

#### Female → 0

#### Male → 1

In [16]:
# Encode binary categorical columns
binary_cols = ['gender', 'Partner', 'Dependents', 'PhoneService', 
               'PaperlessBilling', 'Churn']

# Mapping Yes/No to 1/0 and Female/Male to 0/1
df[binary_cols] = df[binary_cols].replace({'Yes': 1, 'No': 0, 'Female': 0, 'Male': 1})

# Verify changes
df[binary_cols].head()


  df[binary_cols] = df[binary_cols].replace({'Yes': 1, 'No': 0, 'Female': 0, 'Male': 1})


Unnamed: 0,gender,Partner,Dependents,PhoneService,PaperlessBilling,Churn
0,0,1,0,0,1,0
1,1,0,0,1,0,0
2,1,0,0,1,1,1
3,1,0,0,0,0,0
4,0,0,0,1,1,1


In [17]:
# Identify multi-category columns (exclude numeric and binary ones)
multi_cat_cols = ['MultipleLines', 'InternetService', 'OnlineSecurity', 
                  'OnlineBackup', 'DeviceProtection', 'TechSupport', 
                  'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod']

# Apply one-hot encoding
df_encoded = pd.get_dummies(df, columns=multi_cat_cols, drop_first=True)

# Check the new shape and first few rows
print("Shape after one-hot encoding:", df_encoded.shape)
df_encoded.head()


Shape after one-hot encoding: (7032, 31)


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,...,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,0,1,0,1,0,1,29.85,29.85,0,...,False,False,False,False,False,False,False,False,True,False
1,1,0,0,0,34,1,0,56.95,1889.5,0,...,False,False,False,False,False,True,False,False,False,True
2,1,0,0,0,2,1,1,53.85,108.15,1,...,False,False,False,False,False,False,False,False,False,True
3,1,0,0,0,45,0,0,42.3,1840.75,0,...,True,False,False,False,False,True,False,False,False,False
4,0,0,0,0,2,1,1,70.7,151.65,1,...,False,False,False,False,False,False,False,False,True,False


In [18]:
# Drop customerID column
df_encoded = df_encoded.drop('customerID', axis=1, errors='ignore')

# Separate features (X) and target (y)
X = df_encoded.drop('Churn', axis=1)
y = df_encoded['Churn']

# Check final shapes
print("Feature set shape:", X.shape)
print("Target shape:", y.shape)


Feature set shape: (7032, 30)
Target shape: (7032,)


In [19]:
# Save processed dataset to a new CSV file
processed_path = "../data/processed/telecom_churn_processed.csv"
df.to_csv(processed_path, index=False)

print(f"✅ Processed data saved successfully at: {processed_path}")
print(df.shape)


✅ Processed data saved successfully at: ../data/processed/telecom_churn_processed.csv
(7032, 20)
