# **Análisis de Churn de Clientes**

In [95]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# 📌 Extracción

In [96]:
# Location of the raw TelecomX customer dataset (JSON, hosted on GitHub).
url = 'https://raw.githubusercontent.com/ingridcristh/challenge2-data-science/main/TelecomX_Data.json'

# Read the remote JSON file into the raw DataFrame used by the rest of the notebook.
dcom = pd.read_json(path_or_buf=url)

In [97]:
# Count missing values per column. Only real NaN/None entries are counted;
# blank strings are not treated as missing by this check.
dcom.isna().sum()

customerID    0
Churn         0
customer      0
phone         0
internet      0
account       0
dtype: int64

In [98]:
# Preview the first rows of the raw frame; the customer, phone, internet and
# account columns still hold nested dictionaries at this point.
dcom.head()

Unnamed: 0,customerID,Churn,customer,phone,internet,account
0,0002-ORFBO,No,"{'gender': 'Female', 'SeniorCitizen': 0, 'Part...","{'PhoneService': 'Yes', 'MultipleLines': 'No'}","{'InternetService': 'DSL', 'OnlineSecurity': '...","{'Contract': 'One year', 'PaperlessBilling': '..."
1,0003-MKNFE,No,"{'gender': 'Male', 'SeniorCitizen': 0, 'Partne...","{'PhoneService': 'Yes', 'MultipleLines': 'Yes'}","{'InternetService': 'DSL', 'OnlineSecurity': '...","{'Contract': 'Month-to-month', 'PaperlessBilli..."
2,0004-TLHLJ,Yes,"{'gender': 'Male', 'SeniorCitizen': 0, 'Partne...","{'PhoneService': 'Yes', 'MultipleLines': 'No'}","{'InternetService': 'Fiber optic', 'OnlineSecu...","{'Contract': 'Month-to-month', 'PaperlessBilli..."
3,0011-IGKFF,Yes,"{'gender': 'Male', 'SeniorCitizen': 1, 'Partne...","{'PhoneService': 'Yes', 'MultipleLines': 'No'}","{'InternetService': 'Fiber optic', 'OnlineSecu...","{'Contract': 'Month-to-month', 'PaperlessBilli..."
4,0013-EXCHZ,Yes,"{'gender': 'Female', 'SeniorCitizen': 1, 'Part...","{'PhoneService': 'Yes', 'MultipleLines': 'No'}","{'InternetService': 'Fiber optic', 'OnlineSecu...","{'Contract': 'Month-to-month', 'PaperlessBilli..."


# 🔧 Transformación

In [99]:
# Print a summary of the raw frame: columns, non-null counts, dtypes and memory usage.
dcom.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7267 entries, 0 to 7266
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   customerID  7267 non-null   object
 1   Churn       7267 non-null   object
 2   customer    7267 non-null   object
 3   phone       7267 non-null   object
 4   internet    7267 non-null   object
 5   account     7267 non-null   object
dtypes: object(6)
memory usage: 340.8+ KB


In [100]:
# Show the dtype of every column in the raw frame.
dcom.dtypes

customerID    object
Churn         object
customer      object
phone         object
internet      object
account       object
dtype: object

In [101]:
# Flatten each nested-dictionary column of the raw frame into its own
# DataFrame, one flat column per key.
cliente, telefono, internet, cuenta = (
    pd.json_normalize(dcom[columna])
    for columna in ('customer', 'phone', 'internet', 'account')
)

In [102]:
# Build one flat table: the identifier and Churn label first, followed by the
# flattened customer, phone, internet and account columns, joined side by side.
datacom = pd.concat(
    [dcom[['customerID', 'Churn']], cliente, telefono, internet, cuenta],
    axis=1,
)

In [103]:
# Preview the first rows of the flattened table.
datacom.head()

Unnamed: 0,customerID,Churn,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,...,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,Charges.Monthly,Charges.Total
0,0002-ORFBO,No,Female,0,Yes,Yes,9,Yes,No,DSL,...,Yes,No,Yes,Yes,No,One year,Yes,Mailed check,65.6,593.3
1,0003-MKNFE,No,Male,0,No,No,9,Yes,Yes,DSL,...,No,No,No,No,Yes,Month-to-month,No,Mailed check,59.9,542.4
2,0004-TLHLJ,Yes,Male,0,No,No,4,Yes,No,Fiber optic,...,No,Yes,No,No,No,Month-to-month,Yes,Electronic check,73.9,280.85
3,0011-IGKFF,Yes,Male,1,Yes,No,13,Yes,No,Fiber optic,...,Yes,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,98.0,1237.85
4,0013-EXCHZ,Yes,Female,1,Yes,No,3,Yes,No,Fiber optic,...,No,No,Yes,Yes,No,Month-to-month,Yes,Mailed check,83.9,267.4


In [104]:
# Before going further, count how many customers left (Yes) and how many stayed (No).
datacom.loc[:, 'Churn'].value_counts()

Churn
No     5174
Yes    1869
        224
Name: count, dtype: int64

In [105]:
# Clean the cancellation column (Churn).

# Entries that are empty or contain only whitespace carry no label, so they are
# replaced with NaN to make them visible as missing values.
datacom['Churn'] = datacom['Churn'].replace(r'^\s*$', np.nan, regex=True)

# Drop the rows (customers) whose Churn value is missing. The explicit .copy()
# makes `datacom` an independent frame, so later column assignments do not
# trigger SettingWithCopyWarning.
datacom = datacom.dropna(subset=['Churn']).copy()

# Print the value distribution of every column (NaN included) to spot
# unexpected or inconsistent categories.
for col in datacom.columns:
    print(datacom[col].value_counts(dropna=False))
    print("\n")



customerID
9995-HOTOH    1
0002-ORFBO    1
0003-MKNFE    1
9938-PRCVK    1
9938-TKDGL    1
             ..
0013-SMEOE    1
0014-BMAQU    1
0015-UOCOJ    1
0016-QLJIS    1
0017-DINOC    1
Name: count, Length: 7043, dtype: int64


Churn
No     5174
Yes    1869
Name: count, dtype: int64


gender
Male      3555
Female    3488
Name: count, dtype: int64


SeniorCitizen
0    5901
1    1142
Name: count, dtype: int64


Partner
No     3641
Yes    3402
Name: count, dtype: int64


Dependents
No     4933
Yes    2110
Name: count, dtype: int64


tenure
1     613
72    362
2     238
3     200
4     176
     ... 
28     57
39     56
44     51
36     50
0      11
Name: count, Length: 73, dtype: int64


PhoneService
Yes    6361
No      682
Name: count, dtype: int64


MultipleLines
No                  3390
Yes                 2971
No phone service     682
Name: count, dtype: int64


InternetService
Fiber optic    3096
DSL            2421
No             1526
Name: count, dtype: int64


OnlineSecurity
No   

## Limpiaremos y convertiremos la columna 'Charges.Total', y luego agruparemos sus valores numéricos en categorías (rangos) para facilitar el análisis, visualización y/o segmentaciones.

In [106]:
# Convert 'Charges.Total' to numeric. Blank or whitespace-only strings (and any
# other unparseable text) become NaN. Coercing directly, instead of going
# through the .str accessor, keeps this cell re-runnable once the column is
# already numeric.
cargos_totales = pd.to_numeric(datacom['Charges.Total'], errors='coerce')

# Keep only the rows with a valid total. The explicit .copy() makes `datacom`
# an independent frame so the assignments below do not trigger
# SettingWithCopyWarning.
datacom = datacom[cargos_totales.notna()].copy()

# Store the numeric values; the assignment aligns on the index, so only the
# retained rows receive a value.
datacom['Charges.Total'] = cargos_totales

# Custom bin edges for grouping 'Charges.Total'. The last bin is open-ended
# (np.inf), so the edges stay strictly increasing whatever the column maximum is.
rangos = [0, 500, 1000, 1500, 2000, 3000, 5000, np.inf]
etiquetas = [
    '0-500', '501-1000', '1001-1500',
    '1501-2000', '2001-3000', '3001-5000', '5000+'
]

# Assign every total to its range; include_lowest=True makes the first bin
# include its left edge (0).
datacom['ChargesTotal'] = pd.cut(
    datacom['Charges.Total'],
    bins=rangos,
    labels=etiquetas,
    include_lowest=True
)


In [108]:
# Summary statistics of the numeric columns after cleaning 'Charges.Total'.
datacom.describe()

Unnamed: 0,SeniorCitizen,tenure,Charges.Monthly,Charges.Total
count,7032.0,7032.0,7032.0,7032.0
mean,0.1624,32.421786,64.798208,2283.300441
std,0.368844,24.54526,30.085974,2266.771362
min,0.0,1.0,18.25,18.8
25%,0.0,9.0,35.5875,401.45
50%,0.0,29.0,70.35,1397.475
75%,0.0,55.0,89.8625,3794.7375
max,1.0,72.0,118.75,8684.8


In [109]:
# Add the 'Cuentas_Diarias' column: an approximate charge per day.

# Number of days assumed per month when turning tenure into days.
DIAS_POR_MES = 30

# Make sure tenure is numeric; unparseable values become NaN.
tenure_meses = pd.to_numeric(datacom['tenure'], errors='coerce')

# Keep only rows with a positive tenure to avoid dividing by zero. NaN compares
# as False, so rows with an invalid tenure are dropped as well. The explicit
# .copy() makes `datacom` an independent frame so the assignments below do not
# trigger SettingWithCopyWarning.
datacom = datacom[tenure_meses > 0].copy()
datacom['tenure'] = tenure_meses

# Approximate daily charge: total charges spread over the tenure expressed in days.
datacom['Cuentas_Diarias'] = datacom['Charges.Total'] / (datacom['tenure'] * DIAS_POR_MES)


In [110]:
# Summary statistics of the numeric columns, now including 'Cuentas_Diarias'.
datacom.describe()

Unnamed: 0,SeniorCitizen,tenure,Charges.Monthly,Charges.Total,Cuentas_Diarias
count,7032.0,7032.0,7032.0,7032.0,7032.0
mean,0.1624,32.421786,64.798208,2283.300441,2.159981
std,0.368844,24.54526,30.085974,2266.771362,1.006196
min,0.0,1.0,18.25,18.8,0.459167
25%,0.0,9.0,35.5875,401.45,1.205996
50%,0.0,29.0,70.35,1397.475,2.345775
75%,0.0,55.0,89.8625,3794.7375,3.005985
max,1.0,72.0,118.75,8684.8,4.046667


# 📊 Carga y análisis

# 📄 Informe final