In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local `churn` package) are picked up without restarting the kernel
%reload_ext autoreload
%autoreload 2

In [None]:
# Import libraries
import pandas as pd
import churn.config as cfg
from ydata_profiling import ProfileReport
from churn.paths import create_directories, DATA_DIR
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from churn.preprocessing import load_data
from churn.plot import plot_barcharts, plot_boxplots
import plotly.express as px
import numpy as np
from pathlib import Path
from churn.analytics import (
    cramers_v_for_unique_pairs, 
    correlation_matrix, 
    relationships_cat_vs_num, 
    analyze_features, 
    print_best_results, 
    fit_best_discretizers, 
    create_binned_dataset,
    compare_variances
)
%matplotlib inline

In [None]:
# Run the project helper that sets up the working directories
# NOTE(review): presumably creates DATA_DIR and sibling folders — confirm in churn.paths
create_directories()

# Location of the raw churn dataset inside the data directory
file_path = Path(DATA_DIR) / 'churn.parquet'

In [None]:
# Load the raw churn dataset from the file located at file_path
raw = load_data(file_path) 
# Preview the first rows to sanity-check the load
raw.head()

In [None]:
# Destination of the HTML profiling report
path = Path(DATA_DIR) / 'churn_dataset_report.html'

# Profile the raw dataset (before any type fixes) and write the report to disk
profile = ProfileReport(raw, title="Churn Dataset Report")
profile.to_file(path)

In [None]:
# Clean the raw frame in a single chain:
#   1. fix the misspelled 'customer_hapiness' column name,
#   2. cast the nominal variables (including the target) to categorical,
#   3. round minutes/calls to whole numbers and charges to two decimals.
raw = (
    raw
    .rename(columns={'customer_hapiness': 'customer_happiness'})
    .astype({
        'area_code': 'category',
        'plan': 'category',
        'churn': 'category',
    })
    .round({
        'total_day_minutes': 0,
        'total_day_calls': 0,
        'total_day_charge': 2,
        'total_eve_minutes': 0,
        'total_eve_calls': 0,
        'total_eve_charge': 2,
        'total_night_minutes': 0,
        'total_night_calls': 0,
        'total_night_charge': 2,
    })
)

## Dividir los datos de partida en dos muestras entrenamiento y test
 - Semilla: 123
 - Split para la muestra de test: 25%
 - División estratificada

In [None]:
# Stratified train/test split on the target 'churn'.
# Test fraction and random seed are taken from the project config (cfg).
X_train, X_test, y_train, y_test = train_test_split(
    raw.drop('churn', axis=1),
    raw['churn'],
    stratify=raw['churn'],
    test_size=cfg.TEST_SIZE,
    random_state=cfg.SEED,
)

In [None]:
# Re-attach the target to each split so predictors and 'churn' live in one frame.
# `.values` is used so the target is attached positionally, row by row.
train_data = X_train.assign(churn=y_train.values)
test_data = X_test.assign(churn=y_test.values)

# Dimensions of the resulting train and test frames
train_data.shape, test_data.shape

In [None]:
# Partition the training columns by dtype: numeric ones and categorical ones
# ('churn' was cast to category earlier, so it lands in the categorical frame)
numeric_data_train = train_data.select_dtypes(include='number')
categorical_data_train = train_data.select_dtypes(include='category')

## Preprocesado:

a) Estudiar la posible multicolinealidad entre las variables predictoras usando la correlación de Pearson

In [None]:
# Plot the Pearson correlation matrix of the numeric training variables
# NOTE(review): the trailing arguments are passed positionally; their meaning is
# defined in churn.analytics.correlation_matrix — consider keyword arguments there
correlation_matrix(
    numeric_data_train,
    'pearson',
    cfg.FIG_SIZE,
    'coolwarm',
    True,
    '.3f',
    8,
    False,
)

In [None]:
# Compute Cramér's V for every unique pair of categorical variables
# NOTE(review): the whole train_data frame (numeric columns included) is passed;
# presumably the helper keeps only the categorical columns — confirm in churn.analytics
cramer_v_categorical = cramers_v_for_unique_pairs(train_data)

# Display the pairwise association results
cramer_v_categorical

In [None]:
# Test every categorical variable against every numeric variable
test_results_df = relationships_cat_vs_num(
    train_data,
    categorical_data_train,
    numeric_data_train,
)

# Keep only the relationships that are significant at the 5% level
significant_results_df = test_results_df.loc[lambda results: results['P-Value'] < 0.05]

# Display the significant relationships
significant_results_df

b) Analizar de forma gráfica la distribución de las variables predictoras frente a la target (tanto numéricas como categóricas)

## Corregir los ejes para que indiquen lo que representan

In [None]:
# Boxplots of the numeric training variables against the target variable
# NOTE(review): the markdown heading above asks for the axes to be labelled — confirm plot_boxplots does so
plot_boxplots(train_data, numeric_data_train)

In [None]:
# Variables to chart against the target: the two categorical predictors plus
# customer_service_rating, which is treated as categorical for this plot
variables_to_plot = train_data.loc[:, ['area_code', 'plan', 'customer_service_rating']]

# Plot bar charts of the selected variables versus 'churn'
plot_barcharts(train_data, variables_to_plot, 'churn')

Selección de características: entrenar un modelo random forest y escoger las 7 variables más importantes para el modelo

 Haz un entrenamiento simple con los siguientes hiperparámetros:
● número estimadores: 50
● máxima profundidad del árbol: 6
● semilla: 123

In [None]:
# Build and train a Random Forest with the hyperparameters defined in the project config
rf_model = RandomForestClassifier(
    n_estimators=cfg.ESTIMATORS,
    max_depth=cfg.DEPTH,
    random_state=cfg.SEED,
).fit(X_train, y_train)

# Importance of every predictor according to the fitted forest
importances = rf_model.feature_importances_

# argsort is ascending, so the last cfg.NUM_FEATURES positions are the most
# important features, ordered from least to most important
indices = importances.argsort()[-cfg.NUM_FEATURES:]

# Column names of the selected features (same ascending-importance order)
features = X_train.columns[indices]

# Display the selected features
features

In [None]:
# Horizontal bar chart of the selected features and their importances,
# sized and titled, with the shared project layout settings from cfg applied
fig = px.bar(
    x=importances[indices],
    y=features,
    orientation='h',
    labels=dict(x='Feature Importance', y='Features'),
    color_discrete_sequence=['cornflowerblue'],
).update_layout(
    height=400,
    width=900,
    title_text='Feature Importance (Random Forest)',
    **cfg.PLOTLY_LAYOUT_CONFIG
)

# Render the chart
fig.show()

Categorizar todas las variables predictoras.
■ Cada variable debe contener, al menos, 4 niveles
■ Utiliza cualquier técnica que consideres adecuada para tramear las variables numéricas (cuantiles, juicio experto, information-value)
Estudiar el grado de asociación entre las variables predictoras (y frente al target). Puedes emplear la chi-square del paquete (scipy, R-básico) o la V-Cramer, su generalización.
Ten en cuenta que el preprocesado debe hacerse, inicialmente, sobre la muestra de entrenamiento. Una vez definidos los niveles de las variables propuestas, estos deben ser también planteados en la muestra de test.

In [None]:
# Restrict both splits to the selected features and re-attach the target,
# positionally, as the 'churn' column
train_features = X_train[features].assign(churn=y_train.values)
test_features = X_test[features].assign(churn=y_test.values)

In [None]:
# Work on a fresh copy of the training features
train_features = train_features.copy()

# Candidate bin sizes and optimization step, both taken from the project config
bin_range, opt_step = cfg.BIN_SIZES, cfg.OPT_STEP

# Search for the best discretizer of each feature with respect to 'churn'
best_results = analyze_features(train_features, 'churn', bin_range, opt_step)

# Print a summary of the best configuration found
print_best_results(best_results)

In [None]:
# Compare the variances of the original dataset with the binned dataset
# NOTE(review): 'total_eve_minutes' is hard-coded; presumably it is among the
# selected features — confirm, or derive it from `features`
variance_comparison = compare_variances(best_results, train_features, 'total_eve_minutes')

# Render the comparison (display is provided by IPython in notebooks)
display(variance_comparison)

In [None]:
# Fit the best discretizers on the training features only
fitted_discretizers = fit_best_discretizers(train_features, best_results, 'churn')

# Apply the fitted discretizers to build the binned training dataset
# (the same fitted_discretizers are reused later on the test features)
train_features_binned = create_binned_dataset(train_features, fitted_discretizers, 'churn')

# Preview the binned features
train_features_binned.head()

Estudiar el grado de asociación entre las variables predictoras (y frente al target). Puedes emplear la chi-square del paquete (scipy, R-básico) o la V-Cramer, su generalización.

In [None]:
# Destination of the raw-vs-binned comparison report
path = Path(DATA_DIR) / 'comparison_raw_binned_features.html'

# Profile the selected features before and after binning
raw_features_report = ProfileReport(train_features, title="Raw_Feat")
binned_features_report = ProfileReport(train_features_binned, title="Binned_Feat")

# Compare both datasets and save the combined report to disk
comparison_report = raw_features_report.compare(binned_features_report)
comparison_report.to_file(path)

In [None]:
# Treat every binned column (and the target) as categorical
train_features_binned = train_features_binned.astype('category')

# Persist the binned training dataset
path = Path(DATA_DIR) / 'train_features_binned.parquet'
train_features_binned.to_parquet(path, index=False)

# Cramér's V for each unique pair of binned variables
# (the helper's result is consumed through .items(), i.e. as a pair -> value mapping)
cramer_v_features = cramers_v_for_unique_pairs(train_features_binned)

# Tabulate the pairs, strongest association first
features_comparison = (
    pd.DataFrame(list(cramer_v_features.items()), columns=['Pair', 'Cramers_V'])
    .sort_values(by='Cramers_V', ascending=False)
)

# Display the pairwise associations
features_comparison

In [None]:
# Write the profile report of the binned training features to disk
# NOTE(review): binned_features_report was built before the columns were cast
# to category — confirm the report reflects the intended dtypes
path = Path(DATA_DIR) / 'train_features_binned_report.html'
binned_features_report.to_file(path)

In [None]:
# Keep only the pairs whose Cramér's V exceeds the threshold set in the project config
features_comparison = features_comparison.loc[
    lambda pairs: pairs['Cramers_V'] > cfg.RELATIONSHIP_THRESHOLD
]

# Display the remaining pairs
features_comparison

Ten en cuenta que el preprocesado debe hacerse, inicialmente, sobre la muestra de entrenamiento. Una vez definidos los niveles de las variables propuestas, estos deben ser también planteados en la muestra de test

In [None]:
# Build the binned test dataset with the discretizers fitted on the training
# sample, then treat every column as categorical
test_features_binned = (
    create_binned_dataset(test_features, fitted_discretizers, 'churn')
    .astype('category')
)

# Persist the binned test dataset
path = Path(DATA_DIR) / 'test_features_binned.parquet'
test_features_binned.to_parquet(path, index=False)