In [73]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import joblib
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

import warnings
warnings.filterwarnings('ignore')

In [74]:
# Load the pre-cleaned heart dataset from the working directory
# and preview its first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer='heart_cleaned.csv')
df.head(5)

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,1,140,289.0,0,172,0,0.0,False,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,49,0,160,180.0,0,156,0,1.0,True,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,37,1,130,283.0,0,98,0,0.0,False,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,48,0,138,214.0,0,108,1,1.5,True,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,54,1,150,195.0,0,122,0,0.0,False,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [75]:
def introduce_missing_values(
    df: pd.DataFrame,
    columns: list,
    percentage: float,
    random_state=None,
) -> pd.DataFrame:
    """
    Return a copy of ``df`` in which a fraction of the currently non-missing
    values of each listed column has been replaced with NaN.

    Rows are sampled independently for every column, without replacement,
    among the cells that are not already missing. The input frame is copied
    first and is never modified.

    Args:
        df (pd.DataFrame): Input DataFrame.
        columns (list): Names of the columns in which NaNs are injected
            (any iterable of column labels works).
        percentage (float): Fraction of the non-missing values to blank out,
            between 0 and 1 inclusive (e.g. 0.10 for 10%). The per-column
            count is truncated towards zero.
        random_state: Optional seed or ``numpy.random.Generator``. When left
            as ``None`` the global NumPy random state is used, so a prior
            ``np.random.seed(...)`` call still controls the draws.

    Returns:
        pd.DataFrame: A new DataFrame containing the injected missing values.

    Raises:
        ValueError: If ``percentage`` is outside the [0, 1] interval.
        KeyError: If a name in ``columns`` is not a column of ``df``.
    """
    if not 0.0 <= percentage <= 1.0:
        raise ValueError(
            f"percentage must be between 0 and 1 inclusive, got {percentage!r}"
        )

    # Both the legacy global module and a Generator expose `.choice`.
    rng = np.random if random_state is None else np.random.default_rng(random_state)

    df_degraded = df.copy()
    n_rows = len(df_degraded)

    for col in columns:
        # Work with row POSITIONS rather than index labels: with a non-unique
        # index, label-based assignment would blank every row sharing a label
        # and overshoot the requested fraction.
        candidate_positions = np.flatnonzero(df_degraded[col].notna().to_numpy())
        n_to_make_missing = int(len(candidate_positions) * percentage)

        # Always draw (even when the count is 0) so the random stream is
        # consumed the same way regardless of the requested count.
        chosen_positions = rng.choice(
            candidate_positions,
            size=n_to_make_missing,
            replace=False,
        )
        if chosen_positions.size == 0:
            continue

        blank_out = np.zeros(n_rows, dtype=bool)
        blank_out[chosen_positions] = True

        # Series.mask upcasts integer/boolean columns as needed instead of
        # relying on an incompatible-dtype setitem, which recent pandas
        # versions deprecate.
        df_degraded[col] = df_degraded[col].mask(blank_out)

    return df_degraded

# --- Practical usage example ---

# Fix the global NumPy seed so the experiment is reproducible
np.random.seed(42)

# 2. PARAMETER DEFINITION
# Every column except the target variable is degraded
features_to_degrade = df.columns.drop("HeartDisease")

# Progressive missing-value rates from 5% to 50% in 5-point steps
missing_percentages = np.round(np.arange(0.05, 0.51, 0.05), 2)

# 3. BUILD THE DEGRADED DATASETS IN A LOOP
# Dictionary mapping each missing-value rate to its degraded dataset
degraded_datasets = dict()

for p in missing_percentages:
    print(f"Generando dataset con {int(p*100)}% di valori mancanti...")
    degraded_datasets[p] = introduce_missing_values(df, features_to_degrade, p)

print("\nGenerazione completata! ✅")

Generando dataset con 5% di valori mancanti...
Generando dataset con 10% di valori mancanti...
Generando dataset con 15% di valori mancanti...
Generando dataset con 20% di valori mancanti...
Generando dataset con 25% di valori mancanti...
Generando dataset con 30% di valori mancanti...
Generando dataset con 35% di valori mancanti...
Generando dataset con 40% di valori mancanti...
Generando dataset con 45% di valori mancanti...
Generando dataset con 50% di valori mancanti...

Generazione completata! ✅
