In [108]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix,classification_report
import warnings
warnings.filterwarnings('ignore')

In [109]:
df = pd.read_csv(r'..\data\processed\teleco-clean.csv')

In [110]:
df.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [111]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   object 
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


In [112]:
cat_cols = ['gender','SeniorCitizen', 'Partner','Dependents', 'PhoneService', 'MultipleLines','OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies','PaperlessBilling','Churn']

le = LabelEncoder()

for col in cat_cols:
    df[col] = le.fit_transform(df[col])

In [113]:
df_ob = ['Contract','PaymentMethod','InternetService']
df = pd.get_dummies(df, columns=df_ob, drop_first=True,dtype=int)

In [114]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

X = df.drop(columns=['Churn','StreamingTV','StreamingMovies','PhoneService','InternetService_No','InternetService_Fiber optic','MonthlyCharges'])
X = add_constant(X)

vif_data = pd.DataFrame()
vif_data["Variable"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

vif_data.sort_values(by='VIF', ascending=False)

Unnamed: 0,Variable,VIF
0,const,12.56992
12,TotalCharges,6.035152
5,tenure,5.512351
14,Contract_Two year,2.391684
16,PaymentMethod_Electronic check,1.928817
17,PaymentMethod_Mailed check,1.797045
15,PaymentMethod_Credit card (automatic),1.559963
13,Contract_One year,1.533629
3,Partner,1.461898
9,DeviceProtection,1.450634


In [115]:
X = df.drop(columns='Churn')
y = df['Churn']

In [116]:
y.value_counts()

Churn
0    5174
1    1869
Name: count, dtype: int64

In [117]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20,random_state=42)

In [118]:
# Upsample clase minoritaria
from sklearn.utils import resample

df_train = pd.concat([X_train, y_train], axis=1)
df_minority = df_train[df_train['Churn'] == 1]
df_majority = df_train[df_train['Churn'] == 0]

df_minority_upsampled = resample(df_minority, 
                                 replace=True,
                                 n_samples=len(df_majority),
                                 random_state=42)

df_upsampled = pd.concat([df_majority, df_minority_upsampled]).sample(frac=1, random_state=42)

X_train_bal = df_upsampled.drop(columns='Churn')
y_train_bal = df_upsampled['Churn']

In [119]:
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=42)
X_train_res, y_train_res = rus.fit_resample(X_train, y_train)

In [120]:
lr = LogisticRegression()
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
lr.fit(X_train_res, y_train_res)

y_prob = lr.predict_proba(X_test)[:, 1]
y_pred_lr = (y_prob > 0.35).astype(int) 

f1_scores = cross_val_score(lr, X_train_res, y_train_res, cv=skf, scoring='f1')

print(f"F1-score promedio en validación cruzada: {f1_scores.mean():.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_lr))
print(confusion_matrix(y_test, y_pred_lr))

F1-score promedio en validación cruzada: 0.7642
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.57      0.72      1036
           1       0.44      0.94      0.60       373

    accuracy                           0.67      1409
   macro avg       0.70      0.75      0.66      1409
weighted avg       0.82      0.67      0.69      1409

[[595 441]
 [ 24 349]]


In [121]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# rf
rf = RandomForestClassifier(class_weight='balanced', random_state=42, n_jobs=-1) 

# cross validation
f1_scores = cross_val_score(rf, X_train_bal, y_train_bal, cv=skf, scoring='f1')

print(f"F1-score promedio en validación cruzada: {f1_scores.mean():.4f}")

rf.fit(X_train_bal, y_train_bal)

y_prob = rf.predict_proba(X_test)[:, 1]
y_pred_rf = (y_prob > 0.75).astype(int) 

print(classification_report(y_test,y_pred_rf))

F1-score promedio en validación cruzada: 0.9087
              precision    recall  f1-score   support

           0       0.79      0.96      0.87      1036
           1       0.74      0.28      0.41       373

    accuracy                           0.78      1409
   macro avg       0.77      0.62      0.64      1409
weighted avg       0.78      0.78      0.75      1409



In [125]:
scale_pos_weight = y_train.value_counts()[0] / y_train.value_counts()[1]

xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                    scale_pos_weight=scale_pos_weight, random_state=42)
xgb.fit(X_train, y_train)

# Probabilidades y predicción con threshold 0.35
y_prob_xgb = xgb.predict_proba(X_test)[:, 1]
y_pred_xgb = (y_prob_xgb > 0.30).astype(int)

# F1 en validación cruzada
f1_xgb = cross_val_score(xgb, X_train, y_train, cv=skf, scoring='f1')

print(f"\n[XGBoost] F1-score promedio CV: {f1_xgb.mean():.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred_xgb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))


[XGBoost] F1-score promedio CV: 0.5877
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.69      0.79      1036
           1       0.49      0.83      0.62       373

    accuracy                           0.73      1409
   macro avg       0.71      0.76      0.70      1409
weighted avg       0.81      0.73      0.74      1409

Confusion Matrix:
 [[718 318]
 [ 64 309]]


In [126]:
from sklearn.linear_model import RidgeClassifier
ridge = RidgeClassifier()
ridge.fit(X_train, y_train)

# Ridge no permite threshold -> usamos predict directamente
y_pred_ridge = ridge.predict(X_test)

print("\n[RidgeClassifier]")
print(classification_report(y_test, y_pred_ridge))
print(confusion_matrix(y_test, y_pred_ridge))


[RidgeClassifier]
              precision    recall  f1-score   support

           0       0.86      0.92      0.88      1036
           1       0.71      0.57      0.63       373

    accuracy                           0.82      1409
   macro avg       0.78      0.74      0.76      1409
weighted avg       0.82      0.82      0.82      1409

[[948  88]
 [160 213]]


In [128]:
from sklearn.linear_model import LogisticRegression

# Lasso (L1)
lr_l1 = LogisticRegression(penalty='l1', solver='liblinear', random_state=42)
lr_l1.fit(X_train_res, y_train_res)
y_prob_l1 = lr_l1.predict_proba(X_test)[:, 1]
y_pred_l1 = (y_prob_l1 > 0.35).astype(int)

print("\n[LogReg L1]")
print(classification_report(y_test, y_pred_l1))
print(confusion_matrix(y_test, y_pred_l1))

# Ridge (L2)
lr_l2 = LogisticRegression(penalty='l2', solver='liblinear', random_state=42)
lr_l2.fit(X_train_res, y_train_res)
y_prob_l2 = lr_l2.predict_proba(X_test)[:, 1]
y_pred_l2 = (y_prob_l2 > 0.35).astype(int)

print("\n[LogReg L2]")
print(classification_report(y_test, y_pred_l2))
print(confusion_matrix(y_test, y_pred_l2))

# ElasticNet
lr_en = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5, random_state=42)
lr_en.fit(X_train_res, y_train_res)
y_prob_en = lr_en.predict_proba(X_test)[:, 1]
y_pred_en = (y_prob_en > 0.35).astype(int)

print("\n[ElasticNet]")
print(classification_report(y_test, y_pred_en))
print(confusion_matrix(y_test, y_pred_en))



[LogReg L1]
              precision    recall  f1-score   support

           0       0.96      0.58      0.72      1036
           1       0.44      0.94      0.60       373

    accuracy                           0.67      1409
   macro avg       0.70      0.76      0.66      1409
weighted avg       0.83      0.67      0.69      1409

[[596 440]
 [ 23 350]]

[LogReg L2]
              precision    recall  f1-score   support

           0       0.96      0.58      0.72      1036
           1       0.45      0.94      0.60       373

    accuracy                           0.67      1409
   macro avg       0.70      0.76      0.66      1409
weighted avg       0.82      0.67      0.69      1409

[[601 435]
 [ 24 349]]

[ElasticNet]
              precision    recall  f1-score   support

           0       0.93      0.39      0.54      1036
           1       0.35      0.92      0.51       373

    accuracy                           0.53      1409
   macro avg       0.64      0.65      0.5

In [130]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(max_depth=5, class_weight='balanced', random_state=42)
dt.fit(X_train, y_train)

y_prob_dt = dt.predict_proba(X_test)[:, 1]
y_pred_dt = (y_prob_dt > 0.30).astype(int)

print("\n[DecisionTreeClassifier]")
print(classification_report(y_test, y_pred_dt))
print(confusion_matrix(y_test, y_pred_dt))



[DecisionTreeClassifier]
              precision    recall  f1-score   support

           0       0.97      0.43      0.60      1036
           1       0.38      0.97      0.54       373

    accuracy                           0.57      1409
   macro avg       0.68      0.70      0.57      1409
weighted avg       0.81      0.57      0.58      1409

[[446 590]
 [ 13 360]]
