# The goal here is to understand churn patterns and data issues (missing values, types, distributions).

In [1]:
import sys
from pathlib import Path

PROJECT_ROOT = Path("..").resolve()
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))


In [None]:
from src.data_loader import load_telco_csv
from src.preprocessing import make_xy, clean_telco

In [5]:
df = load_telco_csv("../data/raw/WA_Fn-UseC_-Telco-Customer-Churn.csv")
X, y = make_xy(df)

# Basic churn rate

In [6]:
y.value_counts()
y.value_counts(normalize=True)


Churn
No     0.73463
Yes    0.26537
Name: proportion, dtype: float64

# Check data types + missingness

In [7]:
X.info()
X.isna().sum().sort_values(ascending=False).head(20)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


gender              0
DeviceProtection    0
MonthlyCharges      0
PaymentMethod       0
PaperlessBilling    0
Contract            0
StreamingMovies     0
StreamingTV         0
TechSupport         0
OnlineBackup        0
SeniorCitizen       0
OnlineSecurity      0
InternetService     0
MultipleLines       0
PhoneService        0
tenure              0
Dependents          0
Partner             0
TotalCharges        0
dtype: int64

# Cleaning the data:  Here we have to convert those numerical value  of column TotalCharges to float cause right now, there type is 'object'
### 1 - Attempting to convert each value to a numeric type (float) and if a value cannot be converted, replace it with NaN.

### 2 - creating a mask because we want to remove rows where TotalCharges is missing from the dataset (X) and also from the target (y)

In [8]:
X, y = clean_telco(X, y)