# **IAA - PRÀCTICA: MAIN**

### **Instal·lar llibreries necessàries**

In [None]:
# Install the packages listed in the project's requirements file (%pip targets the kernel's environment)
%pip install -r ../assets/requirements.txt 

### **Importar llibreries**

In [1]:
def import_dependencies():
	"""
	Import the libraries used throughout the notebook and publish them as globals.

	Binds pandas, numpy, matplotlib.pyplot, seaborn and sklearn to the module-level
	names `pd`, `np`, `plt`, `sns` and `skl` so that the other cells can use them.

	NOTE(review): several later cells annotate parameters with `pd.DataFrame`, which is
	evaluated when their `def` statement runs, so this function presumably has to be
	called before those cells are executed — confirm the intended execution order.
	"""
	global pd, np, plt, sns, skl

	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import seaborn as sns
	import sklearn as skl

#import_dependencies()

### **Llegir les dades (Cirrhosis Dataset)**

In [2]:
def load_dataset(save_to_csv: bool = True):
	"""
	Fetch the cirrhosis dataset (UCI repository id 878) and store it in the global `data`.

	Args:
		save_to_csv: when True, the raw dataset is also written to
			'../assets/data/raw_cirrhosis.csv' so that it can be inspected in full.
	"""
	global data
	from ucimlrepo import fetch_ucirepo

	# Download the repository entry and keep its original (unsplit) table
	repository_entry = fetch_ucirepo(id=878)
	data = pd.DataFrame(repository_entry.data.original)

	if not save_to_csv:
		return

	# Persist the raw dataset
	data.to_csv('../assets/data/raw_cirrhosis.csv', index=False)

#load_dataset(save_to_csv=True)

### **Informació del dataset inicial**

In [None]:
# (rows, columns) of `data`
data.shape

In [None]:
# A negative n makes head() return every row except the last 10
data.head(-10)

In [None]:
# Column names, non-null counts and dtypes of `data`
data.info()

### **Preprocessing inicial**

In [3]:
def initial_preprocessing(data: pd.DataFrame, save_to_csv: bool = True):
	"""
	Clean the raw dataset in place: normalise missing values, set column dtypes and
	give some category labels more readable names.

	Steps:
		1. Replace the 'NaNN' placeholder, empty strings and pd.NA with NaN.
		2. Cast every column to its intended dtype (Int64, float64 or category).
		3. Record the dtype of every column in the global `original_column_types`.
		4. Rename the categories of 'Status', 'Edema' and the Y/N variables
		   ('Ascites', 'Hepatomegaly', 'Spiders', which become 1/0).

	Args:
		data: dataset to preprocess; it is modified in place.
		save_to_csv: when True, the result is written to
			'../assets/data/initial_preprocessing_cirrhosis.csv'.
	"""
	global original_column_types

	# Replace the 'NaNN' string (and other empty markers) with NaN
	data.replace(to_replace=['NaNN', '', pd.NA], value=np.nan, inplace=True)

	# Assign the correct dtype to every column
	int64_variables = ['N_Days', 'Age', 'Cholesterol', 'Copper', 'Tryglicerides', 'Platelets']
	float64_variables = ['Bilirubin', 'Albumin', 'Alk_Phos', 'SGOT', 'Prothrombin']
	category_variables = ['ID', 'Status', 'Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema', 'Stage']
	boolean_variables = ['Ascites', 'Hepatomegaly', 'Spiders']

	data[int64_variables] = data[int64_variables].astype('Int64')
	data[float64_variables] = data[float64_variables].astype('float64')
	data[category_variables] = data[category_variables].astype('category')

	# Remember the dtype of every column (recorded before the categories are renamed)
	original_column_types = {col: str(data[col].dtype) for col in data.columns}

	# Rename category labels for readability. `cat.rename_categories` is used instead of
	# `replace`, whose use for changing the categories of a categorical is deprecated in pandas.
	# Labels that are absent from a column are simply ignored by the mapping.
	data['Status'] = data['Status'].cat.rename_categories({'D': 'Dead', 'C': 'Alive', 'CL': 'LiverTransplant'})
	for col in boolean_variables:
		data[col] = data[col].cat.rename_categories({'Y': 1, 'N': 0})
	data['Edema'] = data['Edema'].cat.rename_categories({'N': 'NoEdema', 'S': 'EdemaResolved', 'Y': 'EdemaPersistent'})

	if save_to_csv:
		# Persist the preprocessed dataset
		data.to_csv('../assets/data/initial_preprocessing_cirrhosis.csv', index=False)

#initial_preprocessing(data=data, save_to_csv=True)

In [None]:
# A negative n makes head() return every row except the last 10
data.head(-10)

In [None]:
# Column names, non-null counts and dtypes of `data`
data.info()

### **Anàlisi inicial de les variables**

In [None]:
# A negative n makes head() return every row except the last 10
data.head(-10)

In [None]:
# Number of missing values per column, largest first
data.isna().sum().sort_values(ascending=False)

In [None]:
# Summary statistics of the numeric variables
data.describe()

In [None]:
# Summary statistics of the categorical variables
data.describe(include='category')

In [None]:
def numerical_vars_histograms(data: pd.DataFrame):
    """
    Draw, in a single two-column figure, one histogram (with KDE curve) for every
    numeric ('Int64' or 'float64') column of `data`.
    """
    numeric_names = list(data.select_dtypes(include=['Int64', 'float64']).columns)

    # Two plots per row, rounding the number of rows up
    grid_rows = int(np.ceil(len(numeric_names) / 2))
    fig = plt.figure(figsize=(10, grid_rows * 4))

    for position, name in enumerate(numeric_names, start=1):
        ax = fig.add_subplot(grid_rows, 2, position)

        # No `ax` is passed: the histogram and the tick settings go to the current axes,
        # which is the subplot that has just been added
        sns.histplot(data[name], edgecolor="k", linewidth=1.5, kde=True)
        plt.xticks(rotation=45, ha='right')

        ax.set_title(f'Distribució de la variable numèrica {name}')
        ax.set_xlabel(name)
        ax.set_ylabel('Freqüència')

    plt.tight_layout()
    plt.show()

#numerical_vars_histograms(data=data)

In [None]:
def categorical_vars_countplots(data: pd.DataFrame):
    """
    Draw, in a single two-column figure, one count plot for every categorical column
    of `data` except 'ID'.
    """
    category_names = list(data.select_dtypes(include=['category']).columns.drop(['ID']))

    # Two plots per row, rounding the number of rows up
    grid_rows = int(np.ceil(len(category_names) / 2))
    fig = plt.figure(figsize=(10, grid_rows * 4))

    for position, name in enumerate(category_names, start=1):
        ax = fig.add_subplot(grid_rows, 2, position)

        # One bar per class, coloured by the class itself and without a legend
        sns.countplot(data=data, x=name, ax=ax, hue=name, legend=False)
        plt.xticks(rotation=45, ha='right')

        ax.set_title(f'Distribució de la variable categòrica {name}')
        ax.set_xlabel(name)
        ax.set_ylabel('Quantitat')

    plt.tight_layout()
    plt.show()

#categorical_vars_countplots(data=data)

### **Tractament d'outliers**

In [None]:
def compare_iqr_factors(data: pd.DataFrame, factors: list = [1.5, 1.75, 2, 2.25, 2.5, 2.75, 3, 3.25, 3.5, 3.75, 4, 4.25, 4.5, 4.75, 5]):
	"""
	Plot how the percentage of outliers changes with the multiplier applied to the IQR.

	For every numeric column and every factor, a value is an outlier when it lies below
	Q1 - factor * IQR or above Q3 + factor * IQR. One line is drawn per column, plus a
	dashed 'Total' line with the percentage of rows flagged in at least one column.

	Args:
		data: dataset whose numeric ('Int64' / 'float64') columns are analysed.
		factors: IQR multipliers to compare (x axis of the plot).
	"""
	numerical_columns = data.select_dtypes(include=['Int64', 'float64']).columns

	# Per-column outlier percentages, and per-factor set of row labels flagged in any column
	percentages_by_column = {col: [] for col in numerical_columns}
	flagged_rows_by_factor = [set() for _ in factors]

	for col in numerical_columns:
		first_quartile = data[col].quantile(0.25)
		third_quartile = data[col].quantile(0.75)
		iqr = third_quartile - first_quartile

		for position, factor in enumerate(factors):
			is_outlier = ((data[col] < (first_quartile - factor * iqr)) | (data[col] > (third_quartile + factor * iqr)))
			flagged_rows_by_factor[position].update(data.index[is_outlier])
			percentages_by_column[col].append(np.mean(is_outlier) * 100)

	total_percentages = [(len(rows) / len(data)) * 100 for rows in flagged_rows_by_factor]

	# Plot the results
	plt.figure(figsize=(10, 6))

	for col, percentages in percentages_by_column.items():
		plt.plot(factors, percentages, label=col)
	plt.plot(factors, total_percentages, label='Total', linestyle='--', color='black')

	plt.xlabel('Factor multiplicatiu del IQR')
	plt.ylabel('Percentage d\'outliers (%)')
	plt.title('Percentatge d\'outliers de cada variable numèrica per a diferents factors multiplicatius del IQR')
	plt.xticks(factors)

	plt.legend()
	plt.grid(True)
	plt.show()

#compare_iqr_factors(data=data, factors=[1.5, 1.75, 2, 2.25, 2.5, 2.75, 3, 3.25, 3.5, 3.75, 4, 4.25, 4.5, 4.75, 5])

In [4]:
def delete_outliers(data: pd.DataFrame, factor: float = 1.5, plots: bool = True, save_to_csv: bool = True):
    """
    Detect, optionally plot, and drop in place the rows holding outliers.

    A value of a numeric ('Int64' / 'float64') column is an outlier when it lies below
    Q1 - factor * IQR or above Q3 + factor * IQR. Every row flagged in at least one
    column is removed from `data`.

    Args:
        data: dataset to clean; rows are dropped in place.
        factor: multiplier applied to the IQR to set the outlier bounds.
        plots: when True, a 2x2 figure (box plots and histograms, with and without
            outliers) is shown for every numeric column.
        save_to_csv: when True, the result is written to
            '../assets/data/no_outliers_cirrhosis.csv'.
    """
    flagged_rows = []

    for col in data.select_dtypes(include=['Int64', 'float64']).columns:
        first_quartile = data[col].quantile(0.25)
        third_quartile = data[col].quantile(0.75)
        iqr = third_quartile - first_quartile
        lower_bound = first_quartile - factor * iqr
        upper_bound = third_quartile + factor * iqr

        outliers_mask = ((data[col] < lower_bound) | (data[col] > upper_bound))
        outliers = data[col][outliers_mask]

        flagged_rows.extend(outliers.index.tolist())

        if plots:
            _plot_outliers_of_column(data, col, factor, lower_bound, upper_bound, outliers_mask, outliers)

    # A row flagged in several columns must only be counted (and dropped) once
    rows_to_drop = set(flagged_rows)
    unique_outliers = len(rows_to_drop)

    print(f"Datset amb outliers: {data.shape[0]} files i {data.shape[1]} columnes.")
    print(f"Nombre total d'outliers únics eliminats: {unique_outliers} ({unique_outliers / data.shape[0] * 100:.2f}% de tot el dataset).")

    # Remove the outliers
    data.drop(list(rows_to_drop), inplace=True)

    print(f"Dataset sense outliers: {data.shape[0]} files i {data.shape[1]} columnes.")

    if save_to_csv:
        # Persist the dataset
        data.to_csv('../assets/data/no_outliers_cirrhosis.csv', index=False)


def _plot_outliers_of_column(data, col, factor, lower_bound, upper_bound, outliers_mask, outliers):
    """Show a 2x2 figure comparing column `col` with and without its outliers."""
    non_outliers = data[col][~outliers_mask]

    fig, axes = plt.subplots(2, 2, figsize=(8, 6))
    box_all, hist_all = axes[0, 0], axes[0, 1]
    box_clean, hist_clean = axes[1, 0], axes[1, 1]

    # Box plot of the whole column, with the outliers highlighted in red
    sns.boxplot(ax=box_all, y=data[col], orient='v')
    box_all.scatter(x=[0]*len(outliers), y=outliers, color='red', marker='o')
    box_all.set_title(f'Boxplot de {col} con outliers ({factor}x IQR)')

    # Histogram of the whole column; a bound is only drawn when some value crosses it
    sns.histplot(ax=hist_all, data=data, x=col, kde=True)
    if (data[col] < lower_bound).any():
        hist_all.axvline(x=lower_bound, color='red', linestyle='dashed')
    if (data[col] > upper_bound).any():
        hist_all.axvline(x=upper_bound, color='red', linestyle='dashed')
    hist_all.set_title(f'Histograma de {col}')
    hist_all.set_xlabel(col)
    hist_all.set_ylabel('Frecuencia')

    # Box plot without the outliers
    sns.boxplot(ax=box_clean, y=non_outliers, orient='v')
    box_clean.set_title(f'Boxplot de {col} sin outliers')

    # Histogram without the outliers
    sns.histplot(ax=hist_clean, data=data[~outliers_mask], x=col, kde=True)
    hist_clean.set_title(f'Histograma de {col} sin outliers')

    # Caption with the number and percentage of outliers of this column
    percent_outliers = len(outliers) / data.shape[0] * 100
    fig.text(x=0.5, y=0, s=f'Outliers de {col} ({factor}x IQR): {len(outliers)} ({percent_outliers:.2f}%)', 
            ha='center', va='center')

    plt.tight_layout()
    plt.show()

#delete_outliers(data=data, factor=3, plots=False, save_to_csv=True)

Datset amb outliers: 418 files i 20 columnes.
Nombre total d'outliers únics eliminats: 70 (16.75% de tot el dataset).
Dataset sense outliers: 348 files i 20 columnes.


### **Recodificació de variables categòriques**

In [5]:
def encode_variables(data: pd.DataFrame, save_to_csv: bool = True):
    """
    One-hot encode, in place, the categorical predictors 'Drug', 'Sex', 'Edema' and 'Stage'.

    The encoding is stored in the global `ohe_mapping` (encoded column name ->
    (original column, category)) and the column order found on entry in the global
    `original_columns_order`, so that the encoding can be undone later.
    Missing values are kept as NaN in every indicator column of the affected variable
    (instead of becoming an extra class) so that they can be imputed afterwards.

    Args:
        data: dataset to encode; it is modified in place.
        save_to_csv: when True, the result is written to '../assets/data/encoded_cirrhosis.csv'.
    """
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.impute import SimpleImputer

    global ohe_mapping, original_columns_order

    original_columns_order = data.columns

    # 'Status' is left out: it is the target and, besides, has no NaN values
    columns_to_encode = ['Drug', 'Sex', 'Edema', 'Stage']

    # Row labels holding a NaN, per column to encode
    missing_rows = {col: set(data[data[col].isna()].index) for col in columns_to_encode}
    # Names of the indicator columns generated from each column to encode
    indicator_names = {col: set() for col in columns_to_encode}

    # Fill the NaNs temporarily so that one-hot encoding does not create extra columns for them;
    # the NaNs are put back further down
    data[columns_to_encode] = SimpleImputer(strategy='most_frequent').fit_transform(data[columns_to_encode])

    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    indicator_values = encoder.fit_transform(data[columns_to_encode])
    encoded_columns = encoder.get_feature_names_out(columns_to_encode)

    # Store the mapping needed to decode the variables
    ohe_mapping = {}
    for col, categories in zip(columns_to_encode, encoder.categories_):
        for category in categories:
            indicator = f"{col}_{category}"
            ohe_mapping[indicator] = (col, category)
            indicator_names[col].add(indicator)

    data[encoded_columns] = indicator_values
    data[encoded_columns] = data[encoded_columns].astype('category')

    # Put the NaNs back so that they can be imputed
    for col, rows in missing_rows.items():
        for indicator in indicator_names[col]:
            for row in rows:
                data.loc[row, indicator] = np.nan

    # Drop the original columns
    data.drop(columns=columns_to_encode, inplace=True)

    if save_to_csv:
        # Persist the dataset
        data.to_csv('../assets/data/encoded_cirrhosis.csv', index=False)

#encode_variables(data=data, save_to_csv=True)

In [7]:
def decode_variables(data: pd.DataFrame, ohe_mapping, original_columns_order, column_types=None):
    """
    Rebuild, in place, the categorical columns that were one-hot encoded.

    For every original column, each row takes the category whose indicator column holds
    the largest positive value (the first one wins on ties). This equals the classic
    "indicator == 1" rule on exact one-hot data and also handles fractional indicators,
    such as the neighbour averages an imputer can produce. Rows whose indicators are
    all NaN or not positive are decoded as NaN.

    Args:
        data: frame holding (some of) the encoded indicator columns; modified in place.
        ohe_mapping: dict mapping each encoded column name to an
            (original column, category) tuple.
        original_columns_order: column names in the order they should end up in. Names
            missing from `data` are skipped; columns of `data` not listed are kept at the end.
        column_types: optional dict mapping original column names to the dtype they are
            cast back to. When omitted, the global `original_column_types` is used if it
            exists. Columns without an entry keep the 'object' dtype.

    Returns:
        The same `data` object, for convenience.
    """
    if column_types is None:
        # Backward-compatible fallback to the mapping kept as a notebook global
        column_types = globals().get('original_column_types', {})

    # Group the encoded columns present in `data` by the column they were derived from
    encoded_groups = {}
    for encoded_column, (original_column, category) in ohe_mapping.items():
        if encoded_column in data.columns:
            encoded_groups.setdefault(original_column, []).append((encoded_column, category))

    # Build the reconstructed columns
    reconstructed_columns = {}
    for original_column, members in encoded_groups.items():
        member_names = [name for name, _ in members]
        member_categories = [category for _, category in members]

        indicators = data[member_names].astype('float64').to_numpy()
        # NaNs must never win the argmax
        indicators = np.where(np.isnan(indicators), -np.inf, indicators)
        strongest = indicators.argmax(axis=1)
        has_category = indicators.max(axis=1) > 0

        decoded = [member_categories[pos] if found else np.nan for pos, found in zip(strongest, has_category)]
        reconstructed_columns[original_column] = pd.Series(decoded, index=data.index, dtype='object')

    # Drop the encoded columns
    data.drop(columns=[col for col in ohe_mapping if col in data.columns], inplace=True)

    # Insert the reconstructed columns, restoring their dtype when it is known
    for col, values in reconstructed_columns.items():
        data[col] = values
        if col in column_types:
            data[col] = data[col].astype(column_types[col])

    # Restore the original column order in place. Popping and re-inserting moves a column
    # to the end, so doing it in the target order yields that order. (Rebinding the local
    # name to a reindexed copy would never reach the caller's frame.)
    ordered = [col for col in original_columns_order if col in data.columns]
    already_placed = set(ordered)
    trailing = [col for col in data.columns if col not in already_placed]
    for col in ordered + trailing:
        data[col] = data.pop(col)

    return data

#decode_variables(data=data, ohe_mapping=ohe_mapping, original_columns_order=original_columns_order)

### **Escalar variables numèriques**

In [8]:
def scale_variables(data: pd.DataFrame, scaler: str = 'standard', save_to_csv: bool = True):
	"""
	Scale, in place, the numeric ('Int64' / 'float64') variables of `data`.

	Args:
		data: dataset whose numeric columns are replaced by their scaled values.
		scaler: 'standard' to use StandardScaler or 'minmax' to use MinMaxScaler.
		save_to_csv: when True, the result is written to '../assets/data/scaled_cirrhosis.csv'.

	Raises:
		AssertionError: if `scaler` is neither 'standard' nor 'minmax'.
	"""
	# The docstring must be the first statement of the body: placed after the assert it was
	# only a discarded string expression and the function had no documentation
	assert scaler in ['standard', 'minmax'], "El paràmetre 'scaler' ha de ser 'standard' o 'minmax'."

	from sklearn.preprocessing import StandardScaler, MinMaxScaler

	numerical_columns = data.select_dtypes(include=['Int64', 'float64']).columns

	sc = StandardScaler() if scaler == 'standard' else MinMaxScaler()

	data[numerical_columns] = sc.fit_transform(data[numerical_columns])

	if save_to_csv:
		# Persist the dataset
		data.to_csv('../assets/data/scaled_cirrhosis.csv', index=False)

# scale_variables(data=data, scaler='standard', save_to_csv=True)
#scale_variables(data=data, scaler='minmax', save_to_csv=True)

### **Partició del dataset en train/test**

In [9]:
def split_dataset(data: pd.DataFrame, test_size: float = 0.15, stratify: bool = True, random_state: int = 42):
	"""
	Split the dataset into train and test partitions and publish them as globals.

	Sets the globals `train` and `test` (full partitions) and `X_train`, `y_train`,
	`X_test`, `y_test` (predictors and the 'Status' target of each partition).

	Args:
		data: dataset to split; it must contain the 'Status' column.
		test_size: proportion of rows assigned to the test partition.
		stratify: when True, the split is stratified on 'Status'.
		random_state: seed of the split.
	"""
	global train, test, X_train, y_train, X_test, y_test

	from sklearn.model_selection import train_test_split

	# Passing None as `stratify` is the default of train_test_split, i.e. a plain split
	stratify_labels = data['Status'] if stratify else None
	train, test = train_test_split(data, test_size=test_size, random_state=random_state, stratify=stratify_labels)

	print(f"Train shape: {train.shape}")
	print(f"Test shape: {test.shape}")

	# 'Status' is the target variable
	target = 'Status'
	X_train, y_train = train.drop(columns=[target]), train[target]
	X_test, y_test = test.drop(columns=[target]), test[target]

#split_dataset(data=data, test_size=0.15, random_state=42)

Train shape: (295, 20)
Test shape: (53, 20)


### **Imputar els valors faltants (Missings)**

In [None]:
# Print every variable of `train` that has NaN values, with their count, percentage and dtype
for col_train in train.columns:
	nan_count = train[col_train].isna().sum()
	if nan_count == 0:
		continue
	print(f"{col_train}: {nan_count} NaNs ({nan_count / len(train) * 100:.2f}%) ({train[col_train].dtype})")

In [38]:
def best_imputer(data_is_encoded: bool, X_train: pd.DataFrame, random_state: int = 42, print_scores: bool = True, return_best_imputer: bool = True, proportion_to_test_imputation: float = 0.1):
    """
    Try several imputers on artificially blanked data, optionally print their scores and
    return the best one.

    The rows of `X_train` without any NaN are taken as ground truth; a proportion of the
    values of every column that originally had NaNs is blanked, each imputer fills them
    back in, and the result is compared with the ground truth (R² for numeric columns,
    accuracy for categorical ones). The overall score is the mean of all per-column scores.

    Args:
        data_is_encoded: when True, KNN imputers (1, 3 and 5 neighbours) are also tried.
        X_train: training predictors; must contain an 'ID' column, which is ignored.
        random_state: seed used to choose the rows that are blanked.
        print_scores: when True, the scores of every imputer are printed.
        return_best_imputer: when True, the best imputer is returned.
        proportion_to_test_imputation: fraction of rows blanked per column, strictly
            between 0 and 1.

    Returns:
        (name, imputer, overall score) of the best imputer when `return_best_imputer`
        is True; otherwise None.

    Raises:
        AssertionError: if `proportion_to_test_imputation` is not strictly between 0 and 1.
    """
    assert proportion_to_test_imputation > 0 and proportion_to_test_imputation < 1, "El paràmetre 'proportion_to_test_imputation' ha de ser un valor entre 0 i 1."
    from sklearn.impute import KNNImputer, SimpleImputer
    from sklearn.compose import ColumnTransformer
    from sklearn.metrics import r2_score, accuracy_score

    numerical_columns = X_train.select_dtypes(include=['Int64', 'float64']).columns
    categorical_columns = X_train.select_dtypes(include=['category']).columns.drop(['ID'])
    original_cols_with_na = X_train.columns[X_train.isna().any()]

    # Baseline imputer: mean for the numeric columns, most frequent class for the categorical ones
    MixedImputer = ColumnTransformer([
        ('numerical', SimpleImputer(strategy='mean'), numerical_columns),
        ('categorical', SimpleImputer(strategy='most_frequent'), categorical_columns)
    ])

    imputers: dict = {'mixed': MixedImputer}

    # KNN imputers are only tried on encoded data — presumably because they need numeric input
    if data_is_encoded:
        imputers['knn-1'] = KNNImputer(n_neighbors=1)
        imputers['knn-3'] = KNNImputer(n_neighbors=3)
        imputers['knn-5'] = KNNImputer(n_neighbors=5)

    # from sklearn.experimental import enable_iterative_imputer
    # from sklearn.impute import IterativeImputer
    # imputers['iterative-10'] = IterativeImputer(max_iter=10, random_state=random_state)
    # imputers['iterative-20'] = IterativeImputer(max_iter=20, random_state=random_state)

    scores = {}
    best_score = float('-inf')
    # Placeholder until some imputer beats `best_score`; this local name shadows the function inside its own body
    best_imputer = (None, None)
    best_imputer_name = 'None'

    # Keep only the rows without NaNs, and drop the 'ID' variable (it carries no information)
    X_train_complete = X_train.dropna().drop(columns=['ID'])

    # Build a dataset with artificial NaNs to impute
    # NOTE(review): the same frame, fraction and random_state are used for every column, so
    # presumably the same rows are blanked in each of them — confirm this is intended
    X_train_incomplete = X_train_complete.copy()
    for col in original_cols_with_na:
        X_train_incomplete.loc[X_train_incomplete.sample(frac=proportion_to_test_imputation, random_state=random_state).index, col] = np.nan

    # Impute and compute the metrics
    for name_imputer, imputer in imputers.items():
        # Impute
        imputed_data = imputer.fit_transform(X_train_incomplete)

        # Convert to a DataFrame, making sure that the column names match
        if isinstance(imputer, ColumnTransformer):
            # Take the column names from the transformers, in the order in which they are listed
            transformed_columns = [col for name, trans, cols in imputer.transformers if trans != 'drop' for col in cols]
            X_train_imputed = pd.DataFrame(imputed_data, columns=transformed_columns, index=X_train_incomplete.index)
        else:
            # For the other imputers, simply reuse the original columns
            X_train_imputed = pd.DataFrame(imputed_data, columns=X_train_incomplete.columns, index=X_train_incomplete.index)

        # Compute the metrics
        # NOTE(review): each metric is computed over every row of the column, not only over
        # the rows that were blanked — verify that this is the intended score
        r2_scores = {} # For the numeric variables
        acc_scores = {} # For the categorical variables
        for col in original_cols_with_na:
            if col in numerical_columns:
                r2 = r2_score(X_train_complete[col], X_train_imputed[col])
                r2_scores[col] = r2
            elif col in categorical_columns:
                # Imputed values are rounded to the nearest integer before being compared as classes
                acc = accuracy_score(X_train_complete[col].astype('category'), np.round(X_train_imputed[col].astype('float64')).astype('category'))
                acc_scores[col] = acc

        overall_score = np.mean(list(r2_scores.values()) + list(acc_scores.values()))
        scores[name_imputer] = {'categorical': acc_scores, 'numerical': r2_scores, 'overall': overall_score}

        # Keep the best imputer
        if overall_score > best_score:
            best_score = overall_score
            best_imputer = imputer
            best_imputer_name = name_imputer

    # Print the results
    if print_scores:
        for name_imputer, scores_imputer in scores.items():
            print(f"IMPUTER [{name_imputer}]: {scores_imputer['overall']} (overall score)")
            print(f"\t*Variables numèriques (R²): {np.mean(list(scores_imputer['numerical'].values()))}")
            for col, s in scores_imputer['numerical'].items():
                print(f"\t\t*{col}: {s}")
            print(f"\t*Variables categòriques (Accuracy): {np.mean(list(scores_imputer['categorical'].values()))}")
            for col, s in scores_imputer['categorical'].items():
                print(f"\t\t*{col}: {s}")
            print()

        print(f"MILLOR IMPUTER OVERALL --> {best_imputer_name} ({best_score})")
    
    # Return the best imputer
    if return_best_imputer:
        return best_imputer_name, best_imputer, best_score

#best_imputer(X_train=X_train, random_state=42, print_scores=True, return_best_imputer=False)

In [39]:
def impute_data(data_to_impute: pd.DataFrame, imputer = 'best', save_to_csv: bool = True, random_state: int = 42, encode: bool = True, decode: bool = True, proportion_to_test_imputation: float = 0.1):
	"""
	Impute the NaN values of a dataset and return the imputed DataFrame.

	The imputed values live in a NEW DataFrame, which is returned; the frame passed in
	is not replaced by it (it is only modified in place by the optional encoding step).
	Callers must therefore use the return value, e.g. `X = impute_data(X, ...)`.

	Args:
		data_to_impute: dataset with missing values.
		imputer: 'best' to select the imputer with `best_imputer`, or an imputer object
			exposing `fit_transform`.
		save_to_csv: when True, the result is written to '../assets/data/imputed_cirrhosis.csv'.
		random_state: seed forwarded to `best_imputer`.
		encode: with imputer='best', one-hot encode `data_to_impute` in place first.
		decode: undo the one-hot encoding on the imputed frame (uses the globals
			`ohe_mapping` and `original_columns_order`).
		proportion_to_test_imputation: forwarded to `best_imputer`.

	Returns:
		The imputed DataFrame, or `data_to_impute` itself when it has no NaN.

	Raises:
		Exception: if NaNs remain after the imputation.
	"""
	# Nothing to impute when there is no NaN at all
	if not data_to_impute.isna().values.any():
		print("No hi ha cap NaN al dataset.")
		return data_to_impute

	# Imported only once there is something to impute
	from sklearn.compose import ColumnTransformer

	if imputer == 'best':
		if encode:
			# Encode the categorical variables
			encode_variables(data=data_to_impute, save_to_csv=False)

		# NOTE(review): the selection always runs on the global X_train, whatever frame is being imputed
		name_imputer, imputer, score_imputer = best_imputer(data_is_encoded=encode, X_train=X_train, random_state=random_state, print_scores=False, return_best_imputer=True, proportion_to_test_imputation=proportion_to_test_imputation)	

		print(f"IMPUTADOR SELECCIONAT: {name_imputer} ({score_imputer} overall score imputant en X_train)")
	
	imputed_values = imputer.fit_transform(data_to_impute)

	if isinstance(imputer, ColumnTransformer):
		# Take the column names from the transformers, in the order in which they are listed
		transformed_columns = [col for name, trans, cols in imputer.transformers if trans != 'drop' for col in cols]
		imputed_frame = pd.DataFrame(imputed_values, columns=transformed_columns, index=data_to_impute.index)
	else:
		# For the other imputers, simply reuse the original columns
		imputed_frame = pd.DataFrame(imputed_values, columns=data_to_impute.columns, index=data_to_impute.index)

	# Check that no NaN is left
	if imputed_frame.isna().values.any():
		raise Exception("Per algun motiu desconegut, encara hi ha NaNs al dataset imputat.")
	
	if decode:
		# Decode the categorical variables (in place, on the imputed frame)
		decode_variables(data=imputed_frame, ohe_mapping=ohe_mapping, original_columns_order=original_columns_order)
	
	if save_to_csv:
		# Persist the dataset
		imputed_frame.to_csv('../assets/data/imputed_cirrhosis.csv', index=False)

	# Previously the result was only bound to a local name and lost when the function returned
	return imputed_frame

#impute_data(data_to_impute=X_train, imputer='best', save_to_csv=True, random_state=42, encode=True, decode=True)

In [41]:
# Pipeline that runs one complete experiment end to end
import random
load_dataset(save_to_csv=False)
initial_preprocessing(data=data, save_to_csv=False)
delete_outliers(data=data, factor=3, plots=False, save_to_csv=False)
#encode_variables(data=data, save_to_csv=False)
scale_variables(data=data, scaler='minmax', save_to_csv=False)
# NOTE(review): the split seed is drawn at random on every run, so the train/test partition is not reproducible
split_dataset(data=data, test_size=0.15, random_state=random.randint(0, 1000))
# NOTE(review): the return value of impute_data is not captured here, and X_train / X_test are only
# modified in place by its encoding step — verify that the imputed values actually reach these frames
impute_data(data_to_impute=X_train, imputer='best', save_to_csv=False, random_state=42, encode=True, decode=True, proportion_to_test_imputation=0.5)
impute_data(data_to_impute=X_test, imputer='best', save_to_csv=False, random_state=42, encode=True, decode=True, proportion_to_test_imputation=0.5)
#decode_variables(data=X_train, ohe_mapping=ohe_mapping, original_columns_order=original_columns_order)
#decode_variables(data=X_test, ohe_mapping=ohe_mapping, original_columns_order=original_columns_order)

Datset amb outliers: 418 files i 20 columnes.
Nombre total d'outliers únics eliminats: 70 (16.75% de tot el dataset).
Dataset sense outliers: 348 files i 20 columnes.
Train shape: (295, 20)
Test shape: (53, 20)
IMPUTADOR SELECCIONAT: knn-5 (0.6441688206303966 overall score imputant en X_train)
IMPUTADOR SELECCIONAT: knn-5 (0.6441688206303966 overall score imputant en X_train)


In [None]:
def cross_validation_con_imputacion(X, y):
    """
    Sketch of a 5-fold loop that picks an imputer for each training fold and fits it.

    NOTE(review): unfinished — `mejor_imputacion` is not defined in the visible part of
    the notebook, `cross_val_score`, `X_test`, `y_test` and `X_train_imputed` are never
    used, and nothing is returned.

    Args:
        X: predictors; indexed positionally with `.iloc`.
        y: target; indexed positionally with `.iloc`.
    """
    from sklearn.model_selection import cross_val_score, KFold

    kf = KFold(n_splits=5)  # Adjust as needed

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # presumably element [0][1] of the result is the imputer object — TODO confirm against `mejor_imputacion`
        imputacion = mejor_imputacion(X_train, y_train)[0][1]
        # Apply the best imputation and train the model
        
        X_train_imputed = imputacion.fit_transform(X_train)

        # Evaluate the model on X_test, y_test...

### **Correlacions entre variables numèriques**

In [None]:
def numerical_vars_correlations(data: pd.DataFrame):
	"""
	Show a heatmap of the pairwise correlations between the numeric
	('Int64' / 'float64') variables of `data`.
	"""
	numeric_names = data.select_dtypes(include=['Int64', 'float64']).columns
	correlation_matrix = data[numeric_names].corr()

	plt.figure(figsize=(10, 6))

	# Colour scale fixed to the full [-1, 1] range of a correlation coefficient
	sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)

	plt.title('Correlació entre les variables numèriques')
	plt.tight_layout()
	plt.show()


# Draw the correlation heatmap for the current contents of `data`
numerical_vars_correlations(data=data)

**1r Model: K-Nearest Neighbors (KNN)**

**2n Model: Decision Tree**

**3r Model: Support Vector Machine (SVM)**