In [77]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,classification_report
import warnings
warnings.filterwarnings('ignore')

In [78]:
df = pd.read_csv(r'..\data\processed\teleco-ML.csv')

In [79]:
df.head()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,Churn,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_Yes,InternetService_Fiber optic,...,StreamingTV_Yes,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,SeniorCitizen_Yes,has_addons
0,1,29.85,29.85,0,0,1,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1
1,34,56.95,1889.5,0,1,0,0,1,0,0,...,0,0,1,0,0,0,0,1,0,2
2,2,53.85,108.15,1,1,0,0,1,0,0,...,0,0,0,0,1,0,0,1,0,2
3,45,42.3,1840.75,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,3
4,2,70.7,151.65,1,0,0,0,1,0,1,...,0,0,0,0,1,0,1,0,0,0


In [80]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

X = df.select_dtypes(include=['int64', 'float64'])
X = df.drop(columns='Churn')
X = X[['tenure', 'MonthlyCharges', 'Contract_Two year', 'Contract_One year',
 'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check',
 'PaperlessBilling_Yes', 'Partner_Yes', 'Dependents_Yes',
 'SeniorCitizen_Yes', 'has_addons', 'TotalCharges']
]
X = add_constant(X)

vif_data = pd.DataFrame()
vif_data["Variable"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

vif_data.sort_values(by='VIF', ascending=False)

Unnamed: 0,Variable,VIF
0,const,22.112367
12,TotalCharges,10.37109
1,tenure,7.315998
2,MonthlyCharges,4.836927
11,has_addons,3.156771
3,Contract_Two year,2.446325
4,Contract_One year,1.569608
8,Partner_Yes,1.459491
5,PaymentMethod_Electronic check,1.454582
6,PaymentMethod_Mailed check,1.438552


In [81]:
X = df.drop(columns='Churn')
y = df['Churn']

In [82]:
y.value_counts()

Churn
0    5174
1    1869
Name: count, dtype: int64

In [83]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20,random_state=42)

In [84]:
# Upsample clase minoritaria
from sklearn.utils import resample

df_train = pd.concat([X_train, y_train], axis=1)
df_minority = df_train[df_train['Churn'] == 1]
df_majority = df_train[df_train['Churn'] == 0]

df_minority_upsampled = resample(df_minority, 
                                 replace=True,
                                 n_samples=len(df_majority),
                                 random_state=42)

df_upsampled = pd.concat([df_majority, df_minority_upsampled]).sample(frac=1, random_state=42)

X_train_bal = df_upsampled.drop(columns='Churn')
y_train_bal = df_upsampled['Churn']

In [85]:
lr = LogisticRegression(class_weight='balanced')
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
lr.fit(X_train_bal, y_train_bal)

y_prob = lr.predict_proba(X_test)[:, 1]
y_pred_lr = (y_prob > 0.75).astype(int) 

f1_scores = cross_val_score(lr, X_train_bal, y_train_bal, cv=skf, scoring='f1')

print(f"F1-score promedio en validación cruzada: {f1_scores.mean():.4f}")
# Evaluar el modelo
print("Classification Report:")
print(classification_report(y_test, y_pred_lr))

F1-score promedio en validación cruzada: 0.7682
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.92      0.88      1036
           1       0.71      0.55      0.62       373

    accuracy                           0.82      1409
   macro avg       0.78      0.73      0.75      1409
weighted avg       0.81      0.82      0.81      1409



In [86]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# rf
rf = RandomForestClassifier(class_weight='balanced', random_state=42, n_jobs=-1) 

# cross validation
f1_scores = cross_val_score(rf, X_train_bal, y_train_bal, cv=skf, scoring='f1')

print(f"F1-score promedio en validación cruzada: {f1_scores.mean():.4f}")

rf.fit(X_train_bal, y_train_bal)

y_prob = rf.predict_proba(X_test)[:, 1]
y_pred_rf = (y_prob > 0.75).astype(int) 

print(classification_report(y_test,y_pred_rf))

F1-score promedio en validación cruzada: 0.9094
              precision    recall  f1-score   support

           0       0.78      0.97      0.86      1036
           1       0.73      0.25      0.37       373

    accuracy                           0.78      1409
   macro avg       0.75      0.61      0.62      1409
weighted avg       0.77      0.78      0.73      1409

