# Data Preparation


In [3]:
import os 
import random
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Load the raw Telco churn export. The filename really does contain two spaces
# before "(1)"; the relative path is resolved against the kernel's working directory.
RAW_CSV_PATH = "../data/raw/Telco_Customer_Churn_Dataset  (1).csv"
df = pd.read_csv(RAW_CSV_PATH)

# A bare `df.head()` in the middle of a cell is evaluated and then thrown away
# (only a cell's last expression is rendered), so show the preview explicitly.
# `display` is injected as a builtin by the IPython kernel; no import is needed.
display(df.head())

# Column dtypes and non-null counts; pandas prints this summary itself.
df.info()

# Per-column NaN counts. As the last expression, this is rendered as the cell output.
df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

## Here’s the analysis of what we see so far:

### ✅ Dataset has 7043 rows × 21 columns.

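### ✅ No missing values reported by `isna()` (0 in all columns). Note that `isna()` only counts true NaN/None entries, so blank or whitespace-only strings in `object` columns would not be flagged here.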

### ⚠️ However, `TotalCharges` is stored as `object` (string) even though it holds monetary amounts and should be numeric. → We’ll need to convert it to a numeric dtype.

### `SeniorCitizen` is stored as `int64`, but it is really a binary categorical flag (0 = No, 1 = Yes).

In [5]:
# Peek at the first ten raw TotalCharges entries, followed by the column's current dtype
print(df["TotalCharges"].iloc[:10], df["TotalCharges"].dtype, sep="\n")

# Parse the column as numbers; errors="coerce" replaces any entry that cannot be parsed with NaN
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

# Number of NaN entries in the column after the conversion (rendered as the cell's output)
df["TotalCharges"].isnull().sum()

0      29.85
1     1889.5
2     108.15
3    1840.75
4     151.65
5      820.5
6     1949.4
7      301.9
8    3046.05
9    3487.95
Name: TotalCharges, dtype: object
object


np.int64(11)

In [6]:
# Coerce TotalCharges to a numeric dtype (entries that cannot be parsed become NaN).
# NOTE(review): the preceding cell already applies this exact conversion, so this
# call looks redundant -- confirm before removing it.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

# Show the resulting dtype, then the first ten converted values
print(df["TotalCharges"].dtype, df["TotalCharges"].iloc[:10], sep="\n")

float64
0      29.85
1    1889.50
2     108.15
3    1840.75
4     151.65
5     820.50
6    1949.40
7     301.90
8    3046.05
9    3487.95
Name: TotalCharges, dtype: float64


In [7]:
# Tally the NaN entries in TotalCharges and report the count
missing_total = pd.isna(df["TotalCharges"]).sum()
print("Missing values in TotalCharges:", missing_total)


Missing values in TotalCharges: 11


In [8]:
# Keep only the rows that have a TotalCharges value; rows where it is NaN are removed.
# The surviving rows keep their original index labels (the index is not reset here).
df = df.dropna(axis="index", how="any", subset=["TotalCharges"])

# Report the (rows, columns) shape of the filtered frame
print("Shape after dropping missing TotalCharges:", df.shape)


Shape after dropping missing TotalCharges: (7032, 21)
