# Setup

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

#Loading and exploring the data

In [None]:
def load_data(filepath):
  """Read the CSV located at ``filepath`` and return it as a DataFrame."""
  return pd.read_csv(filepath)

def explore_data(data):
  """
  Print a quick overview of ``data``.

  Shows, in order: the first rows, the shape, the transposed descriptive
  statistics and the column/dtype summary.

  Parameters:
  - data: DataFrame to inspect.

  Returns:
  - None. Everything is written to standard output.
  """
  print(data.head())
  print(data.shape)
  print(data.describe().T)
  # DataFrame.info() writes its own report and returns None, so wrapping it
  # in print() would add a stray "None" line after the summary.
  data.info()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#EDA

In [None]:
def analizyng_nulls(data):
  """
  Report the percentage of missing values per column and per row.

  First shows the null percentage of every column (highest first), then the
  rows whose null percentage is strictly above 5%, together with how many
  such rows there are.

  Parameters:
  - data: DataFrame to analyse.

  Returns:
  - None. The report is displayed/printed.
  """
  # `display` only exists inside IPython-based notebooks; fall back to print
  # so the function also works when run as a plain script.
  try:
    show = display
  except NameError:
    show = print

  nulos = data.isnull().mean() * 100
  show(nulos.sort_values(ascending=False))

  porcentaje_nulos_por_fila = data.isnull().mean(axis=1) * 100
  # Strictly greater than 5: matches the printed message ("más del 5%") and
  # the rows that manage_nulls discards (it keeps rows with <= 5% nulls).
  porcentajes_filtrados = porcentaje_nulos_por_fila[porcentaje_nulos_por_fila > 5]
  orden_porcentaje_nulos = porcentajes_filtrados.sort_values(ascending=False)

  print("\nNúmero de filas con más del 5% de nulos:", len(orden_porcentaje_nulos))
  show(orden_porcentaje_nulos)


In [None]:
def analyzing_outliers(data):
  """Draw one figure holding a box plot for every column of ``data``."""
  plt.figure(figsize=(15, 8))
  ax = data.boxplot()
  ax.set_title('Boxplot de todas las variables')
  # Tilt the column names so they do not overlap.
  plt.xticks(rotation=45)
  plt.tight_layout()
  plt.show()

#Visualizing data

In [None]:
def visualize_data(data, cols_per_row=4):
  """
  Plot the distribution (histogram with KDE overlay) of every column.

  The plots are laid out on a grid with ``cols_per_row`` plots per row;
  unused cells at the end of the grid are removed.

  Parameters:
  - data: DataFrame whose columns are plotted.
  - cols_per_row: number of plots per grid row (default 4).

  Returns:
  - None. The figure is shown with plt.show().

  Raises:
  - ValueError: if ``cols_per_row`` is smaller than 1.
  """
  if cols_per_row < 1:
    raise ValueError("cols_per_row must be at least 1")

  total_cols = len(data.columns)
  if total_cols == 0:
    # Nothing to plot, and a zero-row grid is not a valid subplot layout.
    return

  # Ceiling division: number of grid rows needed to fit every column.
  rows = (total_cols + cols_per_row - 1) // cols_per_row

  # squeeze=False always returns a 2-D array of axes, so flatten() is safe
  # even when the grid happens to be 1x1.
  fig, axes = plt.subplots(
    rows, cols_per_row,
    figsize=(cols_per_row * 4, rows * 3),
    squeeze=False,
  )
  axes = axes.flatten()

  for ax, col in zip(axes, data.columns):
    sns.histplot(data[col], kde=True, bins=30, color='skyblue', ax=ax)
    ax.set_title(f'Distribución de: {col}')
    ax.set_xlabel('')
    ax.set_ylabel('')

  # Remove the grid cells that were not used by any column.
  for ax in axes[total_cols:]:
    fig.delaxes(ax)

  plt.tight_layout()
  plt.show()

In [None]:
def correlation_analysis(data):
  """Show the pairwise correlation matrix of ``data`` as a heatmap."""
  plt.figure()
  corr_matrix = data.corr()
  heatmap_options = {
    'annot': False,
    'cmap': 'coolwarm',
    'linewidths': 0.5,
    'cbar_kws': {'shrink': 0.5},
  }
  sns.heatmap(corr_matrix, **heatmap_options)
  plt.title('Matriz de Correlaciones')
  # Small, axis-aligned labels keep wide matrices readable.
  plt.xticks(rotation=90, fontsize=8)
  plt.yticks(rotation=0, fontsize=8)

  plt.tight_layout()
  plt.show()

In [None]:
def target_analysis(data):
  """
  Show, for each feature, a box plot of its values grouped by the target.

  The last column of ``data`` is used as the target; every other column
  gets its own figure.

  Parameters:
  - data: DataFrame whose last column is the target variable.

  Returns:
  - None. One figure per feature is shown.
  """
  # No figure is created up front: each feature opens its own figure below,
  # so an extra one here would only show up as a blank plot.
  for column in data.columns[:-1]:
    plt.figure(figsize=(8, 4))
    sns.boxplot(x=data.iloc[:, -1], y=column, data=data)
    plt.title(f'Correlación entre la variable objetivo y {column}')
    plt.show()

#Preprocessing and feature engineering

In [None]:
def manage_duplicates(data):
  """Remove fully duplicated rows from ``data`` in place, keeping the first occurrence."""
  data.drop_duplicates(keep='first', inplace=True)

In [None]:
def manage_nulls(data, max_null_pct=5):
  """
  Return ``data`` without the rows that have too many missing values.

  The input frame is not modified; assign the result, e.g.
  ``data = manage_nulls(data)``.

  Parameters:
  - data: DataFrame to filter.
  - max_null_pct: highest percentage of null cells a row may have and
    still be kept (default 5).

  Returns:
  - A new DataFrame with only the rows whose null percentage is
    <= ``max_null_pct``.
  """
  porcentaje_nulos_por_fila = data.isnull().mean(axis=1) * 100
  # Return the filtered frame: rebinding the local name alone would leave
  # the caller's data untouched.
  return data[porcentaje_nulos_por_fila <= max_null_pct].copy()

In [None]:
def split_data(data, test_size=0.2, random_state=42):
  """
  Split ``data`` into train and test features/labels.

  The last column is used as the target and every other column as a
  feature.

  Parameters:
  - data: DataFrame whose last column is the target variable.
  - test_size: fraction of rows assigned to the test set (default 0.2).
  - random_state: seed for the shuffle, for reproducibility (default 42).

  Returns:
  - X_train, X_test, y_train, y_test
  """
  X = data.iloc[:, :-1]
  y = data.iloc[:, -1]
  # random_state is a parameter so the function no longer relies on a
  # global variable of that name being defined elsewhere.
  X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state
  )
  return X_train, X_test, y_train, y_test

In [None]:
def impute_nulls(X_train):
  """
  Fill the missing values of every column with that column's median.

  ``X_train`` is modified in place and also returned.
  """
  for name in X_train.columns:
    column = X_train[name]
    X_train[name] = column.fillna(column.median())
  return X_train

In [None]:
def impute_outliers(X_train):
  """
  Replace the outliers of every column with that column's median.

  A value is treated as an outlier when it lies more than 1.5 * IQR below
  the first quartile or above the third quartile. ``X_train`` is modified
  in place and also returned.
  """
  for name in X_train.columns:
    values = X_train[name]
    q1, q3 = values.quantile([0.25, 0.75])
    spread = q3 - q1
    low = q1 - 1.5 * spread
    high = q3 + 1.5 * spread

    is_outlier = (values < low) | (values > high)
    X_train.loc[is_outlier, name] = values.median()
  return X_train


In [None]:
def manage_corr(X_train):
  """
  Drop features that are highly correlated with an earlier feature.

  A column is removed when its correlation with any column that precedes
  it is above 0.75. Returns a new DataFrame without those columns.
  """
  corr = X_train.corr()
  # Keep only the entries strictly above the diagonal so each pair is
  # looked at once and self-correlations are ignored.
  above_diagonal = np.triu(np.ones(corr.shape), k=1).astype(bool)
  upper = corr.where(above_diagonal)
  # NOTE(review): only positive correlations are checked (no abs()) -
  # confirm that strongly negative pairs are meant to be kept.
  redundant = [name for name in upper.columns if (upper[name] > 0.75).any()]

  return X_train.drop(columns=redundant)

In [None]:
def scale_features(X_train):
  """Fit a StandardScaler on ``X_train`` and return the scaled values."""
  return StandardScaler().fit_transform(X_train)

In [None]:
def pca_analysis(X_train_scaled):
  """
  Project ``X_train_scaled`` onto the leading principal components.

  A full PCA is fitted first to find how many components are needed for
  the cumulative explained variance to reach 90%; a second PCA with that
  many components is then fitted and its projection returned.
  """
  explained = PCA().fit(X_train_scaled).explained_variance_ratio_
  # argmax returns the first index where the threshold is met; +1 turns
  # that index into a component count.
  n_keep = np.argmax(np.cumsum(explained) >= 0.90) + 1
  return PCA(n_components=n_keep).fit_transform(X_train_scaled)

#Output variable analysis

In [None]:
def target(y_train, X_train_VF=None):
  """
  Print the class distribution of the target and its correlation with the
  transformed training features.

  Parameters:
  - y_train: Series with the training labels.
  - X_train_VF: transformed training features, one row per label in
    ``y_train`` and in the same order. When omitted, a module-level
    variable named ``X_train_VF`` is used if it exists; otherwise only the
    class distribution is printed.

  Returns:
  - None. Results are printed.
  """
  print(f"Distribución de clases: {y_train.value_counts(normalize=True).mul(100).round(1).astype(str) + '%'}")

  if X_train_VF is None:
    # Backward compatibility with notebook usage, where the features lived
    # in a global variable instead of being passed in.
    X_train_VF = globals().get('X_train_VF')
  if X_train_VF is None:
    return

  # Align features and labels by position: y_train usually keeps the
  # shuffled index from the train/test split, while the feature matrix gets
  # a fresh 0..n-1 index, so an index-based concat would mismatch the rows.
  features = pd.DataFrame(X_train_VF).reset_index(drop=True)
  labels = pd.Series(np.asarray(y_train), name='target')
  df = pd.concat([features, labels], axis=1)
  corr = df.corr()['target'].sort_values(ascending=False)
  print("Correlación con la variable dependiente:\n", corr)

#Preprocessing the test set using training set statistics

In [None]:
def preprocess_test_data(X_train, X_test, high_corr_columns, scaler, pca):
  """
  Preprocess the test set using statistics computed on the training set.

  Steps:
  1. Compute the median and the IQR outlier limits of each column of X_train.
  2. Fill null values in X_test with the training median.
  3. Replace outliers in X_test with the training median.
  4. Drop the highly correlated columns.
  5. Scale the data and apply PCA.

  The caller's ``X_test`` is not modified.

  Parameters:
  - X_train: training DataFrame the statistics are taken from.
  - X_test: test DataFrame.
  - high_corr_columns: list of columns to drop (missing ones are ignored).
  - scaler: scaling object (already fitted); its ``transform`` is used.
  - pca: PCA object (already fitted); its ``transform`` is used.

  Returns:
  - X_test_VF: test set after scaling and PCA.
  """
  # Work on a copy so the imputation below does not leak into the caller's
  # DataFrame.
  X_test = X_test.copy()

  for col in X_train.columns:
    if col not in X_test.columns:
      continue

    mediana = X_train[col].median()
    Q1 = X_train[col].quantile(0.25)
    Q3 = X_train[col].quantile(0.75)
    IQR = Q3 - Q1
    limite_inferior = Q1 - 1.5 * IQR
    limite_superior = Q3 + 1.5 * IQR

    valores = X_test[col].fillna(mediana)
    # Vectorised replacement of everything outside the training limits.
    fuera_de_rango = (valores < limite_inferior) | (valores > limite_superior)
    X_test[col] = valores.mask(fuera_de_rango, mediana)

  X_test = X_test.drop(columns=high_corr_columns, errors='ignore')

  X_test_scaled = scaler.transform(X_test)
  X_test_VF = pca.transform(X_test_scaled)

  return X_test_VF

#Training and evaluating models

In [None]:
def train_and_evaluate_model(model, X_train_VF, y_train, X_test_VF, y_test, random_state=42):
  """
  Train ``model`` on a SMOTE-balanced training set and evaluate it on the
  test set.

  Steps:
  1. Apply SMOTE to balance the classes of the training set.
  2. Fit the model on the resampled data.
  3. Predict on the test set.
  4. Show the confusion matrix and print the classification report.

  Parameters:
  - model: estimator exposing ``fit`` and ``predict``.
  - X_train_VF: training features (already transformed).
  - y_train: training labels.
  - X_test_VF: test features (already transformed).
  - y_test: test labels.
  - random_state: seed used by SMOTE, for reproducibility (default 42).

  Returns:
  - model: the fitted model.
  - y_pred: predictions on the test set.
  """

  smote = SMOTE(random_state=random_state)
  X_train_resampled, y_train_resampled = smote.fit_resample(X_train_VF, y_train)

  model.fit(X_train_resampled, y_train_resampled)

  y_pred = model.predict(X_test_VF)

  cm = confusion_matrix(y_test, y_pred)
  # NOTE(review): the tick labels assume a binary 0/1 target - confirm.
  sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
              xticklabels=['Pred 0', 'Pred 1'],
              yticklabels=['Real 0', 'Real 1'])
  plt.title('Matriz de Confusión')
  plt.show()

  # Label the report with the estimator actually used; the function accepts
  # any model, not only logistic regression.
  print(f"\nReporte de clasificación - {type(model).__name__}:")
  print(classification_report(y_test, y_pred, zero_division=0))

  return model, y_pred
