# Projecto MLOPS

In [7]:
import pandas as pd 
import numpy as np
import os
from sklearn.impute import SimpleImputer

In [3]:
df = pd.read_csv("./archive/WA_Fn-UseC_-Telco-Customer-Churn.csv", sep = ',')
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


Ahora vamos a mirar cuantos valores nulos tenemos para cada columna

In [8]:
df.isnull().sum()

# 2️⃣ Drop columns with more than 50% missing values
threshold = 0.5
missing_fraction = df.isnull().mean()
cols_to_drop = missing_fraction[missing_fraction > threshold].index
df = df.drop(columns=cols_to_drop)
print(f"Dropped columns: {list(cols_to_drop)}")

# 3️⃣ Separate numerical and categorical columns
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
cat_cols = df.select_dtypes(include=['object']).columns

# 4️⃣ Impute numerical columns (mean or median)
num_imputer = SimpleImputer(strategy='median')  # or strategy='mean'
df[num_cols] = num_imputer.fit_transform(df[num_cols])

# 5️⃣ Impute categorical columns (mode or placeholder)
cat_imputer = SimpleImputer(strategy='most_frequent')  # or fill with 'Unknown'
df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

# 6️⃣ Verify that missing values are handled
print("Missing values after imputation:")
print(df.isnull().sum())



Dropped columns: []
Missing values after imputation:
customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


In [9]:
# 1. Asegurar que columnas numéricas sean numéricas (float/int)
# 'TotalCharges' suele cargarse como objeto porque tiene espacios vacíos " "
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Llenar los nulos generados (espacios vacíos) con 0 o la media (opcional, pero recomendado)
df['TotalCharges'] = df['TotalCharges'].fillna(0)

# 2. Asegurar que columnas categóricas sean objeto/string
# 'SeniorCitizen' viene como 0/1 (int), pero conceptualmente es categórica
df['SeniorCitizen'] = df['SeniorCitizen'].astype(str)

# Verificar tipos de datos
print("Tipos de datos corregidos:")
print(df.dtypes)

# 3. Eliminar duplicados exactos (filas completas repetidas)
duplicados_totales = df.duplicated().sum()
print(f"\nFilas duplicadas eliminadas: {duplicados_totales}")
df.drop_duplicates(inplace=True)

# 4. Verificar duplicados en CustomerID
# Esto es crítico: un ID no debería tener dos filas distintas
ids_duplicados = df[df.duplicated(subset=['customerID'], keep=False)]

if not ids_duplicados.empty:
    print(f"\n¡Atención! Se encontraron {len(ids_duplicados)} entradas con customerID duplicado:")
    print(ids_duplicados[['customerID', 'Tenure', 'MonthlyCharges']].head())
else:
    print("\nNo existen customerID duplicados. La integridad es correcta.")

Tipos de datos corregidos:
customerID           object
gender               object
SeniorCitizen        object
Partner              object
Dependents           object
tenure              float64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object

Filas duplicadas eliminadas: 0

No existen customerID duplicados. La integridad es correcta.
