# Limpieza Radiómica

1. Librerías

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import os
import re
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import seaborn as sns
import matplotlib.pyplot as plt


2. Funciones 

In [None]:
def extract_original_ref(image_name):
    """
    Return the original reference ID encoded in an image name.

    A single trailing augmentation suffix of the form ``_aug_<digits>`` is
    removed (e.g. 'RefID_aug_0' -> 'RefID'). Names without that suffix are
    returned as their string form. When the pattern does not match at all
    (for example an empty string) the input is returned untouched.
    """
    name_text = str(image_name)
    # Lazy prefix followed by an optional, end-anchored "_aug_<n>" group:
    # the prefix stops right before the final suffix when one is present.
    found = re.match(r'(.+?)(_aug_\d+)?$', name_text)
    if found is None:
        return image_name
    return found.group(1)

In [None]:
def clean_and_merge_features_with_clinical(features_csv_path, clinical_data_excel_path, output_merged_csv_path):
    """
    Load the radiomic features, attach the clinical label and return the result.

    Processing steps:
      1. Read the features CSV, remove one hard-coded image ID (reported in
         the log as a misclassified image) and discard any ``Clinical_Type``
         column already present in the features.
      2. Read the clinical Excel sheet, strip whitespace from its headers,
         rename ``Referencia`` to ``Original_Reference``, exclude rows whose
         ``Clinical_Type`` equals 5 and normalise the references (surrounding
         whitespace and a trailing ``.jpg`` are removed).
      3. Derive ``Original_Reference`` for every feature row from its
         ``Image`` name via ``extract_original_ref`` and left-merge the
         clinical label onto the features.
      4. Drop rows that did not receive a ``Clinical_Type`` and remove the
         helper merge key.

    Args:
        features_csv_path: Path of the CSV with radiomic features; it must
            contain an ``Image`` column.
        clinical_data_excel_path: Path of the Excel file with the clinical
            data (``Referencia`` and ``Clinical_Type`` columns).
        output_merged_csv_path: Destination CSV for the merged table, or
            ``None`` to skip writing it.

    Returns:
        The merged DataFrame, or ``None`` when the features file is missing
        or the clinical sheet cannot be loaded/prepared.
    """
    print(f"\n--- Procesando: {os.path.basename(features_csv_path)} ---")

    try:
        df_features = pd.read_csv(features_csv_path)
    except FileNotFoundError:
        print(f"[ERROR] No se encontró el archivo de características: {features_csv_path}")
        return None

    print(f"Features cargadas: {len(df_features)} filas")
    # Work on an explicit copy so the column assignment further down does not
    # operate on a filtered slice (avoids SettingWithCopyWarning).
    df_features = df_features[df_features['Image'] != '521325_300481_20-5-2022_DSC_0245'].copy()
    print("Imagen mal clasificada eliminada si existía.")
    df_features = df_features.drop(columns=['Clinical_Type'], errors='ignore')

    required_cols = ['Original_Reference', 'Clinical_Type']
    try:
        df_clinical = pd.read_excel(clinical_data_excel_path)
        df_clinical.columns = df_clinical.columns.str.strip()
        df_clinical = df_clinical.rename(columns={'Referencia': 'Original_Reference'})

        # Validate before touching the columns; otherwise a missing column
        # surfaces as a generic KeyError instead of this explicit message.
        if not all(col in df_clinical.columns for col in required_cols):
            print(f"[ERROR] Faltan columnas requeridas en el Excel clínico. Encontradas: {df_clinical.columns.tolist()}")
            return None

        df_clinical_reduced = df_clinical.loc[df_clinical['Clinical_Type'] != 5, required_cols].copy()
        # Strip first so a reference such as "abc.jpg " still loses its
        # extension, then strip again in case whitespace preceded ".jpg".
        df_clinical_reduced['Original_Reference'] = (
            df_clinical_reduced['Original_Reference']
            .str.strip()
            .str.replace(r'\.jpg$', '', case=False, regex=True)
            .str.strip()
        )
    except Exception as e:
        # Deliberate catch-all at this I/O boundary: callers rely on a None
        # return (not an exception) when the clinical sheet is unusable.
        print(f"[ERROR] Error al cargar o preparar el Excel clínico: {e}")
        return None

    df_features['Original_Reference'] = df_features['Image'].apply(extract_original_ref).astype(str).str.strip()
    print("Referencia original extraída y limpiada.")

    df_merged = pd.merge(df_features, df_clinical_reduced, on='Original_Reference', how='left')
    print(f"Merge realizado. Filas resultantes: {len(df_merged)}")

    # Feature rows without a matching clinical record carry no label.
    df_merged = df_merged.dropna(subset=['Clinical_Type'])
    print(f"Filas restantes tras eliminar nulos: {len(df_merged)}")

    df_merged = df_merged.drop(columns=['Original_Reference'])

    # to_csv(None) would only build the CSV text in memory and return it, so
    # skip the call (and the "saved" message) when no path was given.
    if output_merged_csv_path is not None:
        df_merged.to_csv(output_merged_csv_path, index=False)
        print(f"Archivo guardado: {output_merged_csv_path}")

    return df_merged

In [None]:
def prepare_dataframe_for_modeling(df_input):
    """
    Build a shuffled modelling table from ``df_input`` without modifying it.

    The identifier columns 'Image', 'Image_Name' and 'Original_Reference'
    are removed when present, the remaining columns and first rows are
    printed, and the rows are returned in a fixed pseudo-random order
    (``random_state=42``) with a fresh 0..n-1 index.
    """
    identifier_cols = ('Image', 'Image_Name', 'Original_Reference')
    present_identifiers = [name for name in identifier_cols if name in df_input.columns]
    # drop() returns a new frame, so the caller's DataFrame stays untouched.
    df_model = df_input.drop(columns=present_identifiers)

    column_names = df_model.columns.tolist()
    print(f"\nDataFrame listo para modelado. Columnas: {column_names}")
    print("Primeras filas:")
    print(df_model.head())

    # frac=1 keeps every row; the fixed seed makes the order reproducible.
    shuffled_df = df_model.sample(frac=1, random_state=42)
    shuffled_df = shuffled_df.reset_index(drop=True)
    print("\nDataFrame mezclado.")
    return shuffled_df

In [None]:
def analyze_class_distribution(df):
    """
    Print how many rows of ``df`` fall into each 'Clinical_Type' value.

    When the 'Clinical_Type' column is absent only a warning is printed.
    Nothing is returned.
    """
    if 'Clinical_Type' in df.columns:
        print("\n--- Distribución de clases ---")
        class_counts = df['Clinical_Type'].value_counts()
        print(class_counts)
    else:
        print("[ADVERTENCIA] No se pueden analizar clases. Faltan columnas.")

In [None]:
def calculate_correlation_and_remove(df_features_numeric, threshold=0.9):
    """
    Remove features that are highly correlated with an earlier feature.

    NaNs are replaced by the column mean, the correlation matrix is drawn as
    a heatmap, and every column whose absolute correlation with any column
    to its left exceeds ``threshold`` is dropped. A second heatmap shows the
    correlations of the surviving features.

    Args:
        df_features_numeric: DataFrame of numeric feature columns.
        threshold: Absolute-correlation cut-off; a column is dropped when it
            is strictly above this value against a preceding column.

    Returns:
        Tuple ``(df_final, features_to_drop)``: the (NaN-filled) frame
        without the dropped columns, and the list of dropped column names.
        A frame without columns is returned as a copy with an empty list and
        no plots are produced.
    """
    # Nothing to correlate (and nothing a heatmap could draw).
    if df_features_numeric.shape[1] == 0:
        print("[!] No hay características numéricas; no se calcula la correlación.")
        return df_features_numeric.copy(), []

    if df_features_numeric.isna().any().any():
        print("[!] NaNs detectados. Se rellenan con la media.")
        df_features_numeric = df_features_numeric.fillna(df_features_numeric.mean())

    # Computed once and reused for the plot, the selection and the
    # post-clean plot.
    corr_signed = df_features_numeric.corr()

    plt.figure(figsize=(15, 12))
    sns.heatmap(corr_signed, cmap='coolwarm', linewidths=0.5)
    plt.title('Heatmap Inicial')
    plt.show()

    corr_matrix = corr_signed.abs()
    # Keep only the strict upper triangle so each pair is inspected once and
    # the first feature of a correlated pair is the one that survives.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)

    # Masked cells are NaN and compare as False, so they never trigger a drop.
    features_to_drop = upper.columns[(upper > threshold).any(axis=0)].tolist()
    print(f"\nEliminando {len(features_to_drop)} características correlacionadas (>{threshold}): {features_to_drop}")

    df_final = df_features_numeric.drop(columns=features_to_drop)

    # Correlations are pairwise, so the matrix of the remaining features is
    # the initial matrix without the dropped rows/columns.
    corr_remaining = corr_signed.drop(index=features_to_drop, columns=features_to_drop)

    plt.figure(figsize=(15, 12))
    sns.heatmap(corr_remaining, cmap='coolwarm', linewidths=0.5)
    plt.title('Heatmap Post-Clean')
    plt.show()

    return df_final, features_to_drop


In [None]:
def save_final_df_and_verify(shuffled_df, selected_features_df, output_path):
    """
    Write the selected features plus the class label to ``output_path``.

    The 'Clinical_Type' column of ``shuffled_df`` (when it exists) is
    appended to ``selected_features_df`` column-wise, the result is saved as
    CSV without the index, and a short preview is printed.
    """
    label_cols = ['Clinical_Type'] if 'Clinical_Type' in shuffled_df.columns else []
    labels = shuffled_df[label_cols]

    # Column-wise concatenation; rows are paired by index label.
    df_clean = pd.concat([selected_features_df, labels], axis=1)
    df_clean.to_csv(output_path, index=False)

    print(f"\nGuardado en {output_path}")
    print("Vista previa:")
    preview = df_clean.head()
    print(preview)

In [None]:
# Input and output locations
csv_path = '/home/anna/TFM/Radiomica/Radiomica/Características/caracteristicas_radiomicas.csv'
excel_path = '/home/anna/TFM/Radiomica/NO SUPERVISADO/Casos_cancer_red_241104_anon.xlsx'
output_final_features_path = '/home/anna/TFM/Radiomica/NO SUPERVISADO/radiomica_limpia.csv'

print("\n--- Iniciando limpieza, fusión y preparación de TODO el conjunto ---")
# None is passed as the path for the intermediate merged CSV.
df_merged = clean_and_merge_features_with_clinical(csv_path, excel_path, output_merged_csv_path=None)

if df_merged is None:
    # The merge step signals failure by returning None.
    print("[⚠️] No se pudo continuar con el procesamiento.")
else:
    df_prepared = prepare_dataframe_for_modeling(df_merged)
    analyze_class_distribution(df_prepared)

    # Candidate features: every column except the label, limited to the
    # float/int dtypes.
    # NOTE(review): include=[float, int] may skip other numeric dtypes
    # (e.g. unsigned or 16-bit integers) -- confirm whether 'number' is wanted.
    feature_candidates = df_prepared.drop(columns=['Clinical_Type'], errors='ignore')
    X_numeric = feature_candidates.select_dtypes(include=[float, int])
    X_cleaned, dropped_features = calculate_correlation_and_remove(X_numeric, threshold=0.9)

    save_final_df_and_verify(df_prepared, X_cleaned, output_final_features_path)
