In [1244]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [1245]:
df = pd.read_csv('../data/raw/clinical_data_lung.zip')

In [1246]:
df.columns

Index(['Age recode with <1 year olds and 90+',
       'Race recode (White, Black, Other)', 'Primary Site - labeled',
       'Histologic Type ICD-O-3', 'Grade Clinical (2018+)',
       'Grade Recode (thru 2017)',
       'Combined Summary Stage with Expanded Regional Codes (2004+)',
       'Derived EOD 2018 Stage Group Recode (2018+)',
       '7th Edition Stage Group Recode (2016-2017)',
       'Derived AJCC Stage Group, 7th ed (2010-2015)',
       'CS tumor size (2004-2015)', 'Tumor Size Summary (2016+)',
       'Survival months', 'Survival months flag',
       'Vital status recode (study cutoff used)',
       'SEER cause-specific death classification', 'Year of diagnosis',
       'Median household income inflation adj to 2023',
       'RX Summ--Surg Prim Site (1998+)', 'Radiation recode',
       'Chemotherapy recode (yes, no/unk)', 'Sex',
       'Rural-Urban Continuum Code', 'Reason no cancer-directed surgery',
       'Total number of in situ/malignant tumors for patient',
       'Tota

In [1247]:
df.rename(columns={'Race recode (White, Black, Other)': 'Race', 'Rural-Urban Continuum Code': 'Rural Code',
                   'SEER cause-specific death classification': 'vital_status', 'Primary Site - labeled': 'Primary Site'}, inplace=True)

## Eliminar duplicados

In [1248]:
df.duplicated().sum()

np.int64(4977)

In [1249]:
df.drop_duplicates(inplace=True)

## Target `survival_months`

In [1250]:
df["Survival months flag"].value_counts()

Survival months flag
Complete dates are available and there are more than 0 days of survival      514288
Not calculated because a Death Certificate Only or Autopsy Only case           8706
Incomplete dates are available and there cannot be zero days of follow-up      6452
Complete dates are available and there are 0 days of survival                  2058
Incomplete dates are available and there could be zero days of follow-up        937
Name: count, dtype: int64

In [1251]:
df["Survival months"].unique()

array(['0059', '0027', '0000', '0052', '0012', '0014', '0005', '0016',
       '0040', '0010', '0009', '0001', '0020', '0003', '0002', '0021',
       '0025', '0130', '0007', '0013', '0060', '0017', '0111', '0033',
       '0004', '0090', '0037', '0062', '0024', '0008', '0006', '0076',
       '0075', '0034', '0098', '0050', '0053', '0015', '0094', '0038',
       '0078', '0026', '0128', '0125', 'Unknown', '0042', '0058', '0091',
       '0051', '0079', '0018', '0036', '0113', '0019', '0011', '0086',
       '0129', '0030', '0092', '0101', '0032', '0070', '0084', '0126',
       '0056', '0082', '0116', '0043', '0054', '0063', '0044', '0045',
       '0121', '0123', '0102', '0074', '0057', '0081', '0028', '0023',
       '0099', '0039', '0022', '0046', '0029', '0072', '0088', '0115',
       '0064', '0061', '0108', '0073', '0035', '0120', '0031', '0068',
       '0055', '0047', '0041', '0077', '0071', '0066', '0093', '0067',
       '0065', '0083', '0117', '0085', '0100', '0118', '0105', '0069',
   

In [1252]:
# 1. Definimos quiénes entran al modelo de excelencia
# Solo pacientes con fechas completas y vida mayor a 0
filtro_excelencia = df['Survival months flag'] == 'Complete dates are available and there are more than 0 days of survival'

# 2. Separamos los datasets
df = df[filtro_excelencia].copy()
df_excluidos = df[~filtro_excelencia].copy() # El ~ significa "lo que no cumple el filtro"

df = df[df['Survival months'] != 'Unknown'].copy()

# 1. Aseguramos que sea entero
df['survival_months_int'] = pd.to_numeric(df['Survival months'], errors='coerce').fillna(0).astype(int)

# 2. Definir la nueva función con rangos mixtos (de 5 en 5 y luego de 10 en 10)
def agrupar_supervivencia_ajustado(meses):
    if meses == 0:
        return 0
    elif 1 <= meses <= 6:
        return 6
    elif 6 < meses <= 54:
        # Incrementos de 5 en 5 hasta los 5 años
        return int(np.ceil(meses / 6.0) * 6)
    else:
        # Más de 60 meses (más de 5 años)
        return 60

# 3. Aplicamos la transformación
df['Target_Meses'] = df['survival_months_int'].apply(agrupar_supervivencia_ajustado)

# 4. Verificación para confirmar los saltos que pediste
print("Rangos de supervivencia generados:")
print(sorted(df['Target_Meses'].unique()))

# Eliminar columnas antiguas
df = df.drop(["Survival months", "Survival months flag", "survival_months_int"], axis=1)

print("\nConteo por rango:")
print(df['Target_Meses'].value_counts().sort_index())


  df_excluidos = df[~filtro_excelencia].copy() # El ~ significa "lo que no cumple el filtro"


Rangos de supervivencia generados:
[np.int64(0), np.int64(6), np.int64(12), np.int64(18), np.int64(24), np.int64(30), np.int64(36), np.int64(42), np.int64(48), np.int64(54), np.int64(60)]

Conteo por rango:
Target_Meses
0      56502
6     151558
12     73834
18     46851
24     32512
30     24543
36     19123
42     16709
48     14189
54     11985
60     66482
Name: count, dtype: int64


> IMPORTANCIA DE LOS 5 AÑOS DE SUPERVIVENCIA

In [1253]:
# Ejemplo de pesos: Duplicar la importancia de registros con supervivencia >= 60 meses
# df['sample_weight'] = df['Survival months'].apply(lambda x: 2.0 if x >= 60 else 1.0)

## Variables 'Vital status'

In [1254]:
# 1. Creamos un filtro de 'Muerte por Cáncer' o 'Sigue Vivo'
# Excluimos a los que murieron por causas ajenas para no confundir al modelo
filtro_causa_especifica = (df['vital_status'] == 'Dead (attributable to this cancer dx)') | \
                          (df['vital_status'] == 'Alive or dead of other cause')

# Aplicamos el filtro para tener un dataframe puramente oncólogico
df = df[filtro_causa_especifica].copy()

In [1255]:
# Si alguien está vivo pero tiene pocos meses, es un caso de diagnóstico reciente
# Si alguien está muerto pero tiene 20 trimestres, es un éxito de supervivencia a largo plazo
print(df.groupby('Vital status recode (study cutoff used)')['Target_Meses'].mean())

Vital status recode (study cutoff used)
Alive    35.337714
Dead     15.416786
Name: Target_Meses, dtype: float64


In [1256]:
# Se elimina las variasbles de 'respuesta'.
df = df.drop(["vital_status", "Vital status recode (study cutoff used)"], axis=1)

## Variable AGE

In [1257]:
df["Age recode with <1 year olds and 90+"].value_counts()

Age recode with <1 year olds and 90+
70-74 years    93119
65-69 years    87202
75-79 years    81446
60-64 years    66911
80-84 years    57730
55-59 years    43943
85-89 years    31549
50-54 years    21227
90+ years      12920
45-49 years     8374
40-44 years     3343
35-39 years     1477
30-34 years      725
25-29 years      363
20-24 years      191
15-19 years       82
01-04 years       36
10-14 years       34
00 years          20
05-09 years       17
Name: count, dtype: int64

In [1258]:
df = df[~df['Age recode with <1 year olds and 90+'].isin(['00 years', '01-04 years', '05-09 years', '10-14 years', '15-19 years', '20-24 years', '25-29 years', '30-34 years'])]

In [1259]:
df["Age recode with <1 year olds and 90+"].value_counts()

Age recode with <1 year olds and 90+
70-74 years    93119
65-69 years    87202
75-79 years    81446
60-64 years    66911
80-84 years    57730
55-59 years    43943
85-89 years    31549
50-54 years    21227
90+ years      12920
45-49 years     8374
40-44 years     3343
35-39 years     1477
Name: count, dtype: int64

##### Nota: Trabajamos con las caracteristicas mayores de 40 años por la cantidad de datos, si agrupamos a las edades comprendidas entre 20 y 39 estariamos asumiendo que uno de 20 y uno de 39 se comportan igual, perderiamos `granularidad`.
> He agrupado a los de 35-39 años con los de 40-44 años para tener mas datos. Ahora comparten el valor medio de 40-44 (42.0)

### Convertir a Valor Numérico Continúo (Midpoint Mapping)

In [1260]:
def transform_age_recode(df, column_name):
    
    # Define the mapping based on your specific distribution
    age_mapping = {
        '35-39 years': 42.0,
        '40-44 years': 42.0,
        '45-49 years': 47.0,
        '50-54 years': 52.0,
        '55-59 years': 57.0,
        '60-64 years': 62.0,
        '65-69 years': 67.0,
        '70-74 years': 72.0,
        '75-79 years': 77.0,
        '80-84 years': 82.0,
        '85-89 years': 87.0,
        '90+ years': 92.5  # Standard clinical estimate for 90+
    }
    
    # Clean the string to ensure matching (strip extra spaces)
    df[column_name] = df[column_name].str.strip()
    
    # Map values and create a new numeric column
    df['age_numeric'] = df[column_name].map(age_mapping)
    
    return df


df = transform_age_recode(df, 'Age recode with <1 year olds and 90+')

In [1261]:
#Borra la antigua columna de registros de edad
df = df.drop("Age recode with <1 year olds and 90+", axis=1)

## Variable Age para el otro dataframe


## Variable Race

In [1262]:
df["Race"].value_counts()

Race
White                                                        412654
Black                                                         54432
Other (American Indian/AK Native, Asian/Pacific Islander)     40615
Unknown                                                        1540
Name: count, dtype: int64

In [1263]:
# Cambiar el nombre de valor (Other (American Indian/AK Native, Asian/Pacific Islander) a (Other) y añadir los valores (Unknown) a (Other).
df["Race"] = df["Race"].replace({
    "Other (American Indian/AK Native, Asian/Pacific Islander)": "Other",
    "Unknown" : "Other"}
)

df["Race"].value_counts()

Race
White    412654
Black     54432
Other     42155
Name: count, dtype: int64

## Variable `Primary site`
> Lugar donde se originó el tumor primario (origen de la metástasis).

> - NOS: significa que no ha sido especificado "Not Otherwise Specified"
> - En medicina, una "lesión que se solapa" (overlapping lesion) ocurre cuando el tumor primario es tan grande o está ubicado de tal manera que invade dos o más lóbulos del pulmón (por ejemplo, parte del lóbulo superior y parte del medio) y no es posible determinar en cuál de los dos comenzó originalmente.

In [1264]:
df["Primary Site"] = df["Primary Site"].replace({"C34.1-Upper lobe, lung": "Upper lobe",
                                                 "C34.3-Lower lobe, lung": "Lower lobe",
                                                 "C34.9-Lung, NOS": "Unspecified",
                                                 "C34.2-Middle lobe, lung": "Middle lobe",
                                                 "C34.0-Main bronchus": "Main bronchus",
                                                 "C34.8-Overlapping lesion of lung": "Overlapping"
})
df["Primary Site"].value_counts()

Primary Site
Upper lobe       259117
Lower lobe       137989
Unspecified       64092
Middle lobe       22676
Main bronchus     20320
Overlapping        5047
Name: count, dtype: int64

## Variable `Histologic Type ICD-O-3`

In [1265]:
df["Histologic Type ICD-O-3"].unique()

array([8140, 8041, 8550, 8070, 8144, 8000, 8046, 8240, 8010, 8560, 8246,
       8255, 8551, 8012, 8254, 8083, 8252, 8230, 8480, 8073, 8071, 8260,
       8250, 8022, 8020, 8072, 8033, 8980, 8430, 8032, 8265, 8249, 8013,
       8481, 8253, 8310, 8043, 8507, 8800, 8200, 8045, 8244, 9120, 9041,
       8323, 8082, 8490, 8001, 8031, 8802, 8890, 8044, 8574, 8256, 8042,
       8123, 8084, 8830, 8562, 8004, 8251, 8052, 8075, 8575, 8074, 8470,
       9064, 8933, 8050, 9133, 8333, 9101, 8805, 9040, 8005, 8801, 9043,
       8680, 9540, 8023, 9137, 8825, 8257, 9364, 9080, 8341, 8576, 8940,
       8900, 8720, 8021, 8815, 8891, 8811, 8014, 8030, 8580, 8245, 8972,
       8581, 8290, 9071, 8525, 9100, 8441, 8810, 8120, 8936, 8201, 9180,
       8803, 9473, 8901, 9044, 8211, 9260, 8982, 8920, 8570, 8500, 8440,
       8002, 8804, 8247, 8410, 8850, 8051, 8912, 8896, 8851, 8894, 8854,
       8401, 8963, 8510, 9015, 9571, 9580, 9220, 8910, 8094, 9240, 8584,
       8320, 9130, 8040, 8210, 8743, 8973, 8714, 93

In [1266]:
def agrupar_histologia(codigo):
    try:
        c = int(codigo)
    except:
        return "Other/NOS"

    # 1. Adenocarcinomas (Incluye variantes papilares y mucinosas comunes)
    if (8140 <= c <= 8384) or (8440 <= c <= 8551):
        return "Adenocarcinoma"
    
    # 2. Escamosos (Squamous Cell)
    elif 8050 <= c <= 8084:
        return "Squamous Cell Carcinoma"
    
    # 3. Small Cell (Altamente agresivos - Crítico para predicción de vida)
    elif 8041 <= c <= 8045:
        return "Small Cell Carcinoma"
    
    # 4. Large Cell
    elif 8012 <= c <= 8014 or c == 8021:
        return "Large Cell Carcinoma"
    
    # 5. Neuroendocrinos (Excluyendo Small Cell)
    elif 8240 <= c <= 8249:
        return "Neuroendocrine Tumors"
    
    # 6. Carcinomas No Especificados (8000-8011)
    elif 8000 <= c <= 8011:
        return "Carcinoma NOS"
    
    else:
        return "Other/Specific Rare"
    
    # Aplicamos la función para crear una columna de texto
df['histology_type'] = df['Histologic Type ICD-O-3'].apply(agrupar_histologia)

# Eliminamos la columna original.
df = df.drop("Histologic Type ICD-O-3", axis=1)

df['histology_type'].value_counts()

histology_type
Adenocarcinoma             253962
Squamous Cell Carcinoma    106075
Small Cell Carcinoma        57993
Carcinoma NOS               51143
Other/Specific Rare         34417
Large Cell Carcinoma         5651
Name: count, dtype: int64

## Unificar Variable `Grade Clinical (2018+)` y `Grade Recode (thru 2017)`

In [1267]:
df["Grade Clinical (2018+)"].value_counts()

Grade Clinical (2018+)
Blank(s)    278959
9           163568
3            33017
2            22545
1             8733
4             2353
C               30
D               28
H                3
A                3
B                2
Name: count, dtype: int64

In [1268]:
df["Grade Recode (thru 2017)"].value_counts()

Grade Recode (thru 2017)
Blank(s)                                  230282
Unknown                                   151339
Poorly differentiated; Grade III           60542
Moderately differentiated; Grade II        44477
Well differentiated; Grade I               16082
Undifferentiated; anaplastic; Grade IV      6519
Name: count, dtype: int64

In [1269]:
def unificar_grados_clinicos(row):
    # Extraemos valores y limpiamos espacios
    g_new = str(row['Grade Clinical (2018+)']).strip()
    g_old = str(row['Grade Recode (thru 2017)']).strip()
    
    # Prioridad 1: Grado I / Bien diferenciado / 1 o A
    if 'Grade I' in g_old or g_new in ['1', 'A']:
        return '1: Well Differentiated'
    
    # Prioridad 2: Grado II / Moderadamente diferenciado / 2 o B
    if 'Grade II' in g_old or g_new in ['2', 'B']:
        return '2: Moderately Differentiated'
    
    # Prioridad 3: Grado III / Pobremente diferenciado / 3 o C
    if 'Grade III' in g_old or g_new in ['3', 'C', 'H']:
        return '3: Poorly Differentiated'
    
    # Prioridad 4: Grado IV / Anaplásico / 4 o D
    if 'Grade IV' in g_old or g_new in ['4', 'D']:
        return '4: Undifferentiated/Anaplastic'
    
    # Si no cae en ninguno, es Desconocido o Blank
    return 'Unknown/Blank'

# Aplicamos a todo el dataframe
df['grade_clinical'] = df.apply(unificar_grados_clinicos, axis=1)

# Se eliminan las columnas antiguas
df = df.drop(["Grade Clinical (2018+)","Grade Recode (thru 2017)"], axis=1)

# Verificamos el éxito de la unificación
df['grade_clinical'].value_counts()

grade_clinical
Unknown/Blank                     314907
1: Well Differentiated            136356
3: Poorly Differentiated           33050
2: Moderately Differentiated       22547
4: Undifferentiated/Anaplastic      2381
Name: count, dtype: int64

In [1270]:
# # Mapeo ordinal respetando la jerarquía clínica
# grado_mapping = {
#     '1: Well Differentiated': 1,
#     '2: Moderately Differentiated': 2,
#     '3: Poorly Differentiated': 3,
#     '4: Undifferentiated/Anaplastic': 4,
#     'Unknown/Blank': np.nan
# }
# df['grade_numeric'] = df['grade_clinical'].map(grado_mapping)

#### NOTA
> - MIRAR MAS ADELANTE SI TRATAR LOS VALORES UNKNOWS DE UNA MANERA DISTINTA A LA ACTUAL

> - EL CODIGO DE ABAJO

In [1271]:
# import numpy as np

# # Creamos un mapeo numérico ordinal
# # Dejamos fuera los 'Unknown/Blank' para que se conviertan en NaN automáticamente
# mapeo_ordinal = {
#     '1: Well Differentiated': 1,
#     '2: Moderately Differentiated': 2,
#     '3: Poorly Differentiated': 3,
#     '4: Undifferentiated/Anaplastic': 4
# }

# # Aplicamos el mapeo. Los que no están en el diccionario (los Unknown) se vuelven NaN
# df['grade_numeric'] = df['grade_final'].map(mapeo_ordinal)

# # Verificamos que ahora tenemos NaNs reales
# print(f"Total de valores nulos en grado: {df['grade_numeric'].isna().sum()}")

In [1272]:
# Comparamos la media de supervivencia en meses por cada grado
print(df.groupby('grade_clinical')['Target_Meses'].mean().sort_values())

grade_clinical
4: Undifferentiated/Anaplastic    12.267115
3: Poorly Differentiated          16.384448
Unknown/Blank                     17.624099
2: Moderately Differentiated      20.288553
1: Well Differentiated            30.548447
Name: Target_Meses, dtype: float64


## Variable `Combined Summary Stage with Expanded Regional Codes (2004+)`

In [1273]:
df["Combined Summary Stage with Expanded Regional Codes (2004+)"].value_counts()

Combined Summary Stage with Expanded Regional Codes (2004+)
Distant site(s)/node(s) involved                                258250
Localized only                                                  125683
Regional lymph nodes involved only                               44089
Regional by direct extension only                                34344
Regional by both direct extension and lymph node involvement     29613
Unknown/unstaged/unspecified/DCO                                 17259
In situ                                                              3
Name: count, dtype: int64

In [1274]:
def unificar_estadio_total_final(row):
    # 1. Extraemos las 4 fuentes temporales
    s_2004 = str(row['Combined Summary Stage with Expanded Regional Codes (2004+)']).lower()
    s_2010 = str(row['Derived AJCC Stage Group, 7th ed (2010-2015)']).upper()
    s_2016 = str(row['7th Edition Stage Group Recode (2016-2017)']).upper()
    s_2018 = str(row['Derived EOD 2018 Stage Group Recode (2018+)']).upper()
    
    # --- LÓGICA DE UNIFICACIÓN JERÁRQUICA ---
    
    # NIVEL 3: DISTANTE (Metástasis / Estadio IV)
    # Buscamos 'distant', 'IV' o código '4'
    if 'distant' in s_2004 or 'IV' in s_2010 or 'IV' in s_2016 or '4' in s_2018:
        return 3
    
    # NIVEL 2: REGIONAL (Extensión a ganglios / Estadios II y III)
    # Buscamos 'regional', romanos 'II'/'III' o códigos '2'/'3'
    elif 'regional' in s_2004 or \
         any(x in s_2010 for x in ['II', 'III']) or \
         any(x in s_2016 for x in ['II', 'III']) or \
         any(x in s_2018 for x in ['2A', '2B', '3A', '3B', '3C']):
        return 2
    
    # NIVEL 1: LOCALIZADO / IN SITU (Estadio I o 0)
    # Buscamos 'localized', 'situ', romano 'I' o código '1'
    elif 'localized' in s_2004 or 'situ' in s_2004 or \
         'I' in s_2010 or 'I' in s_2016 or \
         any(x in s_2018 for x in ['1A', '1B', '0S', '0C']):
        return 1
        
    # Si todo falla, enviamos a NaN para que XGBoost use su lógica de nulos
    return np.nan

# Aplicamos la columna definitiva
df['stage_numeric'] = df.apply(unificar_estadio_total_final, axis=1)

# Se eliminan las columnas antiguas
df = df.drop(["Combined Summary Stage with Expanded Regional Codes (2004+)",
              "Derived EOD 2018 Stage Group Recode (2018+)",
              "7th Edition Stage Group Recode (2016-2017)",
              "Derived AJCC Stage Group, 7th ed (2010-2015)"], axis=1)

df['stage_numeric'].value_counts()

stage_numeric
3.0    258597
2.0    120026
1.0    113860
Name: count, dtype: int64

In [1275]:
# Analizamos la supervivencia trimestral media por estadio unificado
analisis = df.groupby('stage_numeric')['Target_Meses'].agg(['mean', 'count']).sort_index()
print(analisis)

                    mean   count
stage_numeric                   
1.0            34.388284  113860
2.0            26.397581  120026
3.0            13.166092  258597


In [1276]:
df.columns

Index(['Race', 'Primary Site', 'CS tumor size (2004-2015)',
       'Tumor Size Summary (2016+)', 'Year of diagnosis',
       'Median household income inflation adj to 2023',
       'RX Summ--Surg Prim Site (1998+)', 'Radiation recode',
       'Chemotherapy recode (yes, no/unk)', 'Sex', 'Rural Code',
       'Reason no cancer-directed surgery',
       'Total number of in situ/malignant tumors for patient',
       'Total number of benign/borderline tumors for patient',
       'Sequence number', 'Type of Reporting Source', 'Target_Meses',
       'age_numeric', 'histology_type', 'grade_clinical', 'stage_numeric'],
      dtype='object')

In [1277]:
import numpy as np

def consolidar_tumor_size(row):
    # Extraemos valores de ambas épocas
    ts_2004 = str(row['CS tumor size (2004-2015)']).strip().upper()
    ts_2016 = str(row['Tumor Size Summary (2016+)']).strip().upper()
    
    def limpiar_valor(val):
        # Manejo de nulos administrativos
        if val in ['BLANK(S)', '999', 'NAN', '990']: 
            return np.nan
        
        try:
            v_float = float(val)
            # 000 es microinvasión, lo asignamos como 0.5mm para diferenciarlo de 0
            if v_float == 0: return 0.5
            # Valores por encima de 988 son códigos de error o desconocidos en SEER
            if v_float > 988: return np.nan
            return v_float
        except ValueError:
            return np.nan

    # Prioridad: Intentamos con la columna de 2016+, si falla, vamos a la de 2004
    final_size = limpiar_valor(ts_2016)
    if np.isnan(final_size):
        final_size = limpiar_valor(ts_2004)
        
    return final_size

# Creamos la variable definitiva
df['tumor_size_mm'] = df.apply(consolidar_tumor_size, axis=1)

# Techo clinico
df['tumor_size_mm'] = df['tumor_size_mm'].clip(upper=200)

# Se elimina las columnas originales
df = df.drop(["CS tumor size (2004-2015)","Tumor Size Summary (2016+)"], axis=1)

df['tumor_size_mm'].value_counts()


tumor_size_mm
20.0     13499
15.0     13035
30.0     12508
25.0     12488
40.0     10375
         ...  
197.0        3
183.0        2
196.0        2
191.0        2
194.0        1
Name: count, Length: 201, dtype: int64

In [1278]:
df['tumor_size_mm'].describe()

count    420122.000000
mean         40.843826
std          27.348741
min           0.500000
25%          20.000000
50%          34.000000
75%          55.000000
max         200.000000
Name: tumor_size_mm, dtype: float64

In [1279]:
df.isnull().sum()

Race                                                        0
Primary Site                                                0
Year of diagnosis                                           0
Median household income inflation adj to 2023               0
RX Summ--Surg Prim Site (1998+)                             0
Radiation recode                                            0
Chemotherapy recode (yes, no/unk)                           0
Sex                                                         0
Rural Code                                                  0
Reason no cancer-directed surgery                           0
Total number of in situ/malignant tumors for patient        0
Total number of benign/borderline tumors for patient        0
Sequence number                                             0
Type of Reporting Source                                    0
Target_Meses                                                0
age_numeric                                                 0
histolog

In [1280]:
df.columns

Index(['Race', 'Primary Site', 'Year of diagnosis',
       'Median household income inflation adj to 2023',
       'RX Summ--Surg Prim Site (1998+)', 'Radiation recode',
       'Chemotherapy recode (yes, no/unk)', 'Sex', 'Rural Code',
       'Reason no cancer-directed surgery',
       'Total number of in situ/malignant tumors for patient',
       'Total number of benign/borderline tumors for patient',
       'Sequence number', 'Type of Reporting Source', 'Target_Meses',
       'age_numeric', 'histology_type', 'grade_clinical', 'stage_numeric',
       'tumor_size_mm'],
      dtype='object')

## Variable `Year of diagnosis`

In [1281]:
df["Year of diagnosis"].value_counts()

Year of diagnosis
2019    48257
2017    47672
2018    46890
2016    46750
2015    46732
2014    46320
2021    46147
2022    45813
2013    45785
2012    45700
2020    43175
Name: count, dtype: int64

In [1282]:
# ¿Ha aumentado la supervivencia media con los años?
df.groupby('Year of diagnosis')['Target_Meses'].mean()

Year of diagnosis
2012    22.170460
2013    22.690838
2014    23.435104
2015    24.442181
2016    25.251465
2017    26.198272
2018    26.000384
2019    22.952981
2020    18.243752
2021    13.345700
2022     6.635060
Name: Target_Meses, dtype: float64

> Transformar el año en tres "Eras Médicas"

In [1283]:
def agrupar_eras_medicas(year):
    # Agrupamos por eras de tratamiento/tecnología
    if year <= 2016: return 1 # Era antigua
    if year <= 2021: return 2 # Era media
    return 3 # Era reciente (alta censura)

df['medical_era'] = df['Year of diagnosis'].apply(agrupar_eras_medicas)

# Eliminar la columna original
df.drop('Year of diagnosis', axis=1, inplace=True)

In [1284]:
# Verificación de volumen por Era
df['medical_era'].value_counts().sort_index()

medical_era
1    231287
2    232141
3     45813
Name: count, dtype: int64

## Variable `Median household income inflation adj to 2023`

In [1285]:
df["Median household income inflation adj to 2023"].value_counts()

Median household income inflation adj to 2023
$75,000 - $79,999                         49603
$85,000 - $89,999                         48490
$100,000 - $109,999                       45757
$80,000 - $84,999                         45628
$70,000 - $74,999                         40811
$65,000 - $69,999                         40179
$60,000 - $64,999                         36148
$120,000+                                 36002
$55,000 - $59,999                         26850
$95,000 - $99,999                         26648
$50,000 - $54,999                         25913
$110,000 - $119,999                       25419
$90,000 - $94,999                         22078
$45,000 - $49,999                         19239
$40,000 - $44,999                         11619
< $40,000                                  8846
Unknown/missing/no match/Not 1990-2023       11
Name: count, dtype: int64

In [1286]:
def mapear_ingresos(valor):
    v = str(valor).strip()
    
    # Diccionario de mapeo ordinal (de menor a mayor ingreso)
    mapeo = {
        '< $40,000': 1,
        '$40,000 - $44,999': 2,
        '$45,000 - $49,999': 3,
        '$50,000 - $54,999': 4,
        '$55,000 - $59,999': 5,
        '$60,000 - $64,999': 6,
        '$65,000 - $69,999': 7,
        '$70,000 - $74,999': 8,
        '$75,000 - $79,999': 9,
        '$80,000 - $84,999': 10,
        '$85,000 - $89,999': 11,
        '$90,000 - $94,999': 12,
        '$95,000 - $99,999': 13,
        '$100,000 - $109,999': 14,
        '$110,000 - $119,999': 15,
        '$120,000+': 16
    }
    
    return mapeo.get(v, np.nan) # Los 'Unknown/missing' devuelven NaN

df['income_ordinal'] = df['Median household income inflation adj to 2023'].apply(mapear_ingresos)

In [1287]:
# Comprobamos si el ingreso afecta el promedio de supervivencia trimestral
print(df.groupby('income_ordinal')['Target_Meses'].mean().sort_index())

income_ordinal
1.0     18.063758
2.0     18.377485
3.0     19.684391
4.0     19.038475
5.0     19.039777
6.0     20.596326
7.0     19.597103
8.0     22.006420
9.0     22.142774
10.0    23.105286
11.0    19.115735
12.0    21.730229
13.0    22.210222
14.0    22.160019
15.0    24.331406
16.0    21.472474
Name: Target_Meses, dtype: float64


In [1288]:
# Se elimina la columna original.
df.drop('Median household income inflation adj to 2023', axis=1, inplace=True)

In [1289]:
df.columns

Index(['Race', 'Primary Site', 'RX Summ--Surg Prim Site (1998+)',
       'Radiation recode', 'Chemotherapy recode (yes, no/unk)', 'Sex',
       'Rural Code', 'Reason no cancer-directed surgery',
       'Total number of in situ/malignant tumors for patient',
       'Total number of benign/borderline tumors for patient',
       'Sequence number', 'Type of Reporting Source', 'Target_Meses',
       'age_numeric', 'histology_type', 'grade_clinical', 'stage_numeric',
       'tumor_size_mm', 'medical_era', 'income_ordinal'],
      dtype='object')

## Variable `RX Summ--Surg Prim Site (1998+)`

In [1290]:
df["RX Summ--Surg Prim Site (1998+)"].value_counts()

RX Summ--Surg Prim Site (1998+)
0     395835
33     66023
21     19456
30      8740
22      6123
99      3378
56      2144
45      1891
12       824
23       751
46       745
55       614
90       582
20       511
24       376
15       328
13       260
19       192
80       113
25        90
47        79
48        66
70        57
66        36
65        27
Name: count, dtype: int64

In [1291]:
def mapear_cirugia(valor):
    try:
        v = int(valor)
        if v == 0:
            return 0  # Sin cirugía
        elif 10 <= v <= 19:
            return 1  # Destrucción local
        elif 20 <= v <= 27:
            return 2  # Resección parcial
        elif 30 <= v <= 80:
            return 3  # Cirugía radical/total
        else:
            return np.nan # Códigos 90, 99 o errores
    except:
        return np.nan

df['surgery_intensity'] = df['RX Summ--Surg Prim Site (1998+)'].apply(mapear_cirugia)

## Variable `Radiation recode`

In [1292]:
df["Radiation recode"].value_counts()

Radiation recode
None/Unknown                                             301259
Beam radiation                                           191707
Refused (1988+)                                            9737
Recommended, unknown if administered                       4371
Radiation, NOS  method or source not specified             1451
Radioactive implants (includes brachytherapy) (1988+)       429
Combination of beam with implants or isotopes               168
Radioisotopes (1988+)                                       119
Name: count, dtype: int64

In [1293]:
def limpiar_radioterapia(valor):
    v = str(valor).lower()
    # Si contiene 'radiation' o 'isotopes' o 'implants', asumimos que recibió
    if 'radiation' in v or 'isotopes' in v or 'implants' in v:
        return 1
    return 0

df['radiation_binary'] = df['Radiation recode'].apply(limpiar_radioterapia)

df['radiation_binary'].value_counts()

radiation_binary
0    315367
1    193874
Name: count, dtype: int64

## Variable `Reason no cancer-directed surgery`

In [1294]:
df["Reason no cancer-directed surgery"].value_counts()

Reason no cancer-directed surgery
Not recommended                                                                 357708
Surgery performed                                                               110028
Not recommended, contraindicated due to other cond; autopsy only (1973-2002)     24249
Recommended but not performed, patient refused                                    7903
Recommended but not performed, unknown reason                                     3923
Unknown; death certificate; or autopsy only (2003+)                               3330
Recommended, unknown if performed                                                 1113
Not performed, patient died prior to recommended surgery                           987
Name: count, dtype: int64

In [1295]:
def mapear_motivo_no_cirugia(valor):
    v = str(valor).strip()
    
    # 0: Se realizó la cirugía (sin barrera)
    if 'Surgery performed' in v:
        return 0
    
    # 1: Barrera Clínica (Estado de salud muy pobre / Contraindicado)
    if 'contraindicated' in v or 'died prior' in v:
        return 1
    
    # 2: Barrera de Decisión (El paciente o familia dijo no)
    if 'patient refused' in v:
        return 2
    
    # 3: Barrera de Gravedad (No recomendada por el médico)
    if 'Not recommended' in v:
        return 3
        
    return np.nan # Otros casos o desconocidos

df['surgical_barrier'] = df['Reason no cancer-directed surgery'].apply(mapear_motivo_no_cirugia)

df['surgical_barrier'].value_counts()

surgical_barrier
3.0    357708
0.0    110028
1.0     25236
2.0      7903
Name: count, dtype: int64

## Variable `"Chemotherapy recode (yes, no/unk)"`

In [1296]:
df["Chemotherapy recode (yes, no/unk)"].value_counts()

Chemotherapy recode (yes, no/unk)
No/Unknown    307417
Yes           201824
Name: count, dtype: int64

In [1297]:
df['chemo_binary'] = df['Chemotherapy recode (yes, no/unk)'].map({'Yes': 1, 'No/Unknown': 0})
df["chemo_binary"].value_counts()

chemo_binary
0    307417
1    201824
Name: count, dtype: int64

In [1298]:
# Lista de columnas originales que ya procesamos y podemos eliminar
cols_to_drop = [
    'Radiation recode', 
    'Chemotherapy recode (yes, no/unk)', 
    'Reason no cancer-directed surgery',
    'RX Summ--Surg Prim Site (1998+)'
]

df = df.drop(columns=cols_to_drop)

In [1299]:
df.columns

Index(['Race', 'Primary Site', 'Sex', 'Rural Code',
       'Total number of in situ/malignant tumors for patient',
       'Total number of benign/borderline tumors for patient',
       'Sequence number', 'Type of Reporting Source', 'Target_Meses',
       'age_numeric', 'histology_type', 'grade_clinical', 'stage_numeric',
       'tumor_size_mm', 'medical_era', 'income_ordinal', 'surgery_intensity',
       'radiation_binary', 'surgical_barrier', 'chemo_binary'],
      dtype='object')

## Variable `Sex`

In [1300]:
df["Sex"].value_counts()

Sex
Male      259095
Female    250146
Name: count, dtype: int64

In [1301]:
# Mapeo binario: Male -> 0, Female -> 1
# (El orden es arbitrario, pero ser consistente es clave)
df['sex_binary'] = df['Sex'].map({'Male': 1, 'Female': 0})

# Se elimina la columna original
df = df.drop('Sex', axis=1)

# Verificamos 
df['sex_binary'].value_counts()

sex_binary
1    259095
0    250146
Name: count, dtype: int64

In [1302]:
df.columns

Index(['Race', 'Primary Site', 'Rural Code',
       'Total number of in situ/malignant tumors for patient',
       'Total number of benign/borderline tumors for patient',
       'Sequence number', 'Type of Reporting Source', 'Target_Meses',
       'age_numeric', 'histology_type', 'grade_clinical', 'stage_numeric',
       'tumor_size_mm', 'medical_era', 'income_ordinal', 'surgery_intensity',
       'radiation_binary', 'surgical_barrier', 'chemo_binary', 'sex_binary'],
      dtype='object')

## Variable `Rural Code`

In [1303]:
df["Rural Code"].value_counts()

Rural Code
Counties in metropolitan areas ge 1 million pop                 275408
Counties in metropolitan areas of 250,000 to 1 million pop      107955
Nonmetropolitan counties adjacent to a metropolitan area         47392
Counties in metropolitan areas of lt 250 thousand pop            44531
Nonmetropolitan counties not adjacent to a metropolitan area     33141
Unknown/missing/no match (Alaska or Hawaii - Entire State)         803
Unknown/missing/no match/Not 1990-2023                              11
Name: count, dtype: int64

In [1304]:
def mapear_rural_code(valor):
    v = str(valor).lower()
    
    # 1: Metrópolis grandes (>= 1 millón)
    if 'ge 1 million' in v:
        return 1
    # 2: Metrópolis medianas (250k - 1M)
    elif '250,000 to 1 million' in v:
        return 2
    # 3: Metrópolis pequeñas (< 250k)
    elif 'lt 250 thousand' in v:
        return 3
    # 4: No metropolitano adyacente a metro
    elif 'adjacent to a metropolitan' in v:
        return 4
    # 5: No metropolitano NO adyacente (Rural puro)
    elif 'not adjacent' in v:
        return 5
    
    return np.nan # Para los Unknown/Missing

df['urbanization_index'] = df['Rural Code'].apply(mapear_rural_code)

In [1305]:
df.groupby('urbanization_index')['Target_Meses'].mean().sort_values()

urbanization_index
4.0    18.841891
3.0    19.938335
2.0    21.029707
1.0    21.979797
Name: Target_Meses, dtype: float64

In [1306]:
df = df.drop('Rural Code', axis=1)

In [1307]:
df.columns

Index(['Race', 'Primary Site',
       'Total number of in situ/malignant tumors for patient',
       'Total number of benign/borderline tumors for patient',
       'Sequence number', 'Type of Reporting Source', 'Target_Meses',
       'age_numeric', 'histology_type', 'grade_clinical', 'stage_numeric',
       'tumor_size_mm', 'medical_era', 'income_ordinal', 'surgery_intensity',
       'radiation_binary', 'surgical_barrier', 'chemo_binary', 'sex_binary',
       'urbanization_index'],
      dtype='object')

## Variable `Total number of in situ/malignant tumors for patient`

In [1308]:
df["Total number of in situ/malignant tumors for patient"].value_counts()

Total number of in situ/malignant tumors for patient
1     347944
2     116806
3      33094
4       8526
5       2067
6        550
7        164
8         39
9         20
10        10
11         6
21         2
13         2
14         2
32         2
20         2
30         2
34         1
18         1
23         1
Name: count, dtype: int64

In [1309]:
# 1. Procesamos los tumores Malignos/In Situ
df['total_malignant_tumors'] = df['Total number of in situ/malignant tumors for patient'].clip(upper=5)

# Se elimina la columna original
df= df.drop("Total number of in situ/malignant tumors for patient", axis=1)

df["total_malignant_tumors"].value_counts()

total_malignant_tumors
1    347944
2    116806
3     33094
4      8526
5      2871
Name: count, dtype: int64

## Variable `Total number of benign/borderline tumors for patient`

In [1310]:
df["Total number of benign/borderline tumors for patient"].value_counts()

Total number of benign/borderline tumors for patient
0    503181
1      5866
2       183
3        10
4         1
Name: count, dtype: int64

In [1311]:
# 1. Procesamos los tumores Benignos
df['total_benign_tumors'] = df['Total number of benign/borderline tumors for patient'].clip(upper=5)

# Se elimina la columna original
df= df.drop("Total number of benign/borderline tumors for patient", axis=1)

df["total_benign_tumors"].value_counts()

total_benign_tumors
0    503181
1      5866
2       183
3        10
4         1
Name: count, dtype: int64

In [1312]:
df.columns

Index(['Race', 'Primary Site', 'Sequence number', 'Type of Reporting Source',
       'Target_Meses', 'age_numeric', 'histology_type', 'grade_clinical',
       'stage_numeric', 'tumor_size_mm', 'medical_era', 'income_ordinal',
       'surgery_intensity', 'radiation_binary', 'surgical_barrier',
       'chemo_binary', 'sex_binary', 'urbanization_index',
       'total_malignant_tumors', 'total_benign_tumors'],
      dtype='object')

## Variable `Sequence number`

In [1313]:
df["Sequence number"].value_counts()

Sequence number
One primary only                344611
2nd of 2 or more primaries      104603
1st of 2 or more primaries       28251
3rd of 3 or more primaries       24697
4th of 4 or more primaries        5496
5th of 5 or more primaries        1175
6th of 6 or more primaries         283
7th of 7 or more primaries          79
8th of 8 or more primaries          17
9th of 9 or more primaries           8
10th of 10 or more primaries         5
21st of 21 or more primaries         2
13th of 13 or more primaries         2
14th of 14 or more primaries         2
11th of 11 or more primaries         2
32nd of 32 or more primaries         2
30th of 30 or more primaries         2
20th of 20 or more primaries         2
17th of 17 or more primaries         1
34th of 34 or more primaries         1
Name: count, dtype: int64

In [1314]:
def mapear_secuencia(valor):
    v = str(valor).lower()
    
    # Caso más simple: Un solo tumor en la vida
    if 'one primary' in v:
        return 1
    
    # Extraemos el número de la posición (1st, 2nd, 3rd...)
    if '1st' in v: return 1
    if '2nd' in v: return 2
    if '3rd' in v: return 3
    if '4th' in v: return 4
    
    # Para casos de 5 o más, los agrupamos (Clamping)
    # Al igual que con el conteo total, el riesgo se satura en valores altos
    return 5

df['sequence_numeric'] = df['Sequence number'].apply(mapear_secuencia)

#Se elimina la columna original 
df = df.drop(["Sequence number"], axis=1)

In [1315]:
df.columns

Index(['Race', 'Primary Site', 'Type of Reporting Source', 'Target_Meses',
       'age_numeric', 'histology_type', 'grade_clinical', 'stage_numeric',
       'tumor_size_mm', 'medical_era', 'income_ordinal', 'surgery_intensity',
       'radiation_binary', 'surgical_barrier', 'chemo_binary', 'sex_binary',
       'urbanization_index', 'total_malignant_tumors', 'total_benign_tumors',
       'sequence_numeric'],
      dtype='object')

## Variable `Type of Reporting Source`

In [1316]:
df["Type of Reporting Source"].value_counts()

Type of Reporting Source
Hospital inpatient/outpatient or clinic                     488029
Other hospital outpatient unit or surgery center (2006+)      5296
Radiation treatment or medical oncology center (2006+)        5280
Physicians office/private medical practitioner (LMD)          4531
Laboratory only (hospital or private)                         4316
Nursing/convalescent home/hospice                             1789
Name: count, dtype: int64

### Agrupación por Calidad de Información

In [1317]:
def mapear_fuente_reporte(valor):
    v = str(valor).lower()
    
    # Nivel 1: Centros con capacidad quirúrgica y hospitalaria
    if 'hospital' in v or 'surgery center' in v:
        return 1
    # Nivel 2: Especialistas y clínicas externas
    if 'physician' in v or 'radiation' in v or 'medical oncology' in v:
        return 2
    # Nivel 3: Fuentes limitadas o de cuidados paliativos
    if 'laboratory' in v or 'nursing' in v or 'hospice' in v:
        return 3
        
    return np.nan

df['reporting_source_quality'] = df['Type of Reporting Source'].apply(mapear_fuente_reporte)

#Se elimina la columna original 
df = df.drop(["Type of Reporting Source"], axis=1)


In [1318]:
df.columns

Index(['Race', 'Primary Site', 'Target_Meses', 'age_numeric', 'histology_type',
       'grade_clinical', 'stage_numeric', 'tumor_size_mm', 'medical_era',
       'income_ordinal', 'surgery_intensity', 'radiation_binary',
       'surgical_barrier', 'chemo_binary', 'sex_binary', 'urbanization_index',
       'total_malignant_tumors', 'total_benign_tumors', 'sequence_numeric',
       'reporting_source_quality'],
      dtype='object')

In [1319]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 509241 entries, 0 to 537417
Data columns (total 20 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Race                      509241 non-null  object 
 1   Primary Site              509241 non-null  object 
 2   Target_Meses              509241 non-null  int64  
 3   age_numeric               509241 non-null  float64
 4   histology_type            509241 non-null  object 
 5   grade_clinical            509241 non-null  object 
 6   stage_numeric             492483 non-null  float64
 7   tumor_size_mm             420122 non-null  float64
 8   medical_era               509241 non-null  int64  
 9   income_ordinal            509230 non-null  float64
 10  surgery_intensity         505281 non-null  float64
 11  radiation_binary          509241 non-null  int64  
 12  surgical_barrier          500875 non-null  float64
 13  chemo_binary              509241 non-null  int64 

## Codificacion de las variables

### One-Hot Encoding

In [1320]:
df["Race"].value_counts()

Race
White    412654
Black     54432
Other     42155
Name: count, dtype: int64

In [1321]:
df = pd.get_dummies(df, columns=['Race'], prefix='race')

### LabelEncoder

In [1322]:
df["histology_type"].value_counts()

histology_type
Adenocarcinoma             253962
Squamous Cell Carcinoma    106075
Small Cell Carcinoma        57993
Carcinoma NOS               51143
Other/Specific Rare         34417
Large Cell Carcinoma         5651
Name: count, dtype: int64

In [1323]:
df["Primary Site"].value_counts()

Primary Site
Upper lobe       259117
Lower lobe       137989
Unspecified       64092
Middle lobe       22676
Main bronchus     20320
Overlapping        5047
Name: count, dtype: int64

In [1324]:
le = LabelEncoder()

# df['histology_numeric'] = le.fit_transform(df['histology_type'].astype(str))
df['site_numeric'] = le.fit_transform(df['Primary Site'].astype(str))

## Creación de Interacciones Clave

In [1325]:
# 1. Agresividad por Tamaño: ¿Es un tumor pequeño pero avanzado, o grande y temprano?
df['size_stage_ratio'] = df['tumor_size_mm'] * df['stage_numeric']

# 2. Fragilidad por Edad: La etapa impacta diferente según la edad del paciente
df['age_stage_interaction'] = df['age_numeric'] * df['stage_numeric']

# 3. Carga Tumoral Relativa: Malignidad total ajustada por la etapa actual
df['total_burden_stage'] = df['total_malignant_tumors'] * df['stage_numeric']

# Llenamos posibles nulos generados si había nulos en stage o size
df[['size_stage_ratio', 'age_stage_interaction', 'total_burden_stage']] = \
    df[['size_stage_ratio', 'age_stage_interaction', 'total_burden_stage']].fillna(0)

print("Nuevas variables creadas exitosamente.")

Nuevas variables creadas exitosamente.


In [1326]:
df.columns

Index(['Primary Site', 'Target_Meses', 'age_numeric', 'histology_type',
       'grade_clinical', 'stage_numeric', 'tumor_size_mm', 'medical_era',
       'income_ordinal', 'surgery_intensity', 'radiation_binary',
       'surgical_barrier', 'chemo_binary', 'sex_binary', 'urbanization_index',
       'total_malignant_tumors', 'total_benign_tumors', 'sequence_numeric',
       'reporting_source_quality', 'race_Black', 'race_Other', 'race_White',
       'site_numeric', 'size_stage_ratio', 'age_stage_interaction',
       'total_burden_stage'],
      dtype='object')

In [1327]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# A. Convertir Grado Clínico a números (Ordinal)
grado_mapping = {
    '1: Well Differentiated': 1,
    '2: Moderately Differentiated': 2,
    '3: Poorly Differentiated': 3,
    '4: Undifferentiated/Anaplastic': 4,
    'Unknown/Blank': np.nan # XGBoost maneja los NaN automáticamente
}
df['grade_clinical'] = df['grade_clinical'].map(grado_mapping)

# C. Convertir Histología y Sitio a números (Label Encoding)
le = LabelEncoder()
df['histology_type'] = le.fit_transform(df['histology_type'].astype(str))
df['Primary Site'] = le.fit_transform(df['Primary Site'].astype(str))

# D. Asegurar que todo sea numérico (Esto evita el error de nuevo)
print(df.dtypes.value_counts())

int64      12
float64    11
bool        3
Name: count, dtype: int64


## Split

In [1328]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# 1. Definir X e y
X = df.drop('Target_Meses', axis=1)
y = df['Target_Meses']

# 2. Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Modelo con parámetros para evitar sobreajuste
regressor = xgb.XGBRegressor(
    n_estimators=2000,
    learning_rate=0.03,
    max_depth=10,
    objective='reg:squarederror',
    tree_method='hist',
    random_state=42
)

regressor.fit(X_train, y_train, eval_set=[(X_test, y_test)])

[0]	validation_0-rmse:19.63461
[1]	validation_0-rmse:19.36696
[2]	validation_0-rmse:19.11122
[3]	validation_0-rmse:18.86765
[4]	validation_0-rmse:18.63523
[5]	validation_0-rmse:18.41354
[6]	validation_0-rmse:18.20298
[7]	validation_0-rmse:18.00191
[8]	validation_0-rmse:17.81144
[9]	validation_0-rmse:17.62909
[10]	validation_0-rmse:17.45538
[11]	validation_0-rmse:17.29072
[12]	validation_0-rmse:17.13410
[13]	validation_0-rmse:16.98508
[14]	validation_0-rmse:16.84356
[15]	validation_0-rmse:16.70895
[16]	validation_0-rmse:16.58095
[17]	validation_0-rmse:16.45944
[18]	validation_0-rmse:16.34379
[19]	validation_0-rmse:16.23473
[20]	validation_0-rmse:16.13060
[21]	validation_0-rmse:16.03226
[22]	validation_0-rmse:15.93926
[23]	validation_0-rmse:15.85116
[24]	validation_0-rmse:15.76733
[25]	validation_0-rmse:15.68759
[26]	validation_0-rmse:15.61217
[27]	validation_0-rmse:15.54088
[28]	validation_0-rmse:15.47347
[29]	validation_0-rmse:15.40937
[30]	validation_0-rmse:15.34878
[31]	validation_0-

0,1,2
,"objective  objective: typing.Union[str, xgboost.sklearn._SklObjWProto, typing.Callable[[typing.Any, typing.Any], typing.Tuple[numpy.ndarray, numpy.ndarray]], NoneType] Specify the learning task and the corresponding learning objective or a custom objective function to be used. For custom objective, see :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more information, along with the end note for function signatures.",'reg:squarederror'
,"base_score  base_score: typing.Union[float, typing.List[float], NoneType] The initial prediction score of all instances, global bias.",
,booster,
,"callbacks  callbacks: typing.Optional[typing.List[xgboost.callback.TrainingCallback]] List of callback functions that are applied at end of each iteration. It is possible to use predefined callbacks by using :ref:`Callback API `. .. note::  States in callback are not preserved during training, which means callback  objects can not be reused for multiple training sessions without  reinitialization or deepcopy. .. code-block:: python  for params in parameters_grid:  # be sure to (re)initialize the callbacks before each run  callbacks = [xgb.callback.LearningRateScheduler(custom_rates)]  reg = xgboost.XGBRegressor(**params, callbacks=callbacks)  reg.fit(X, y)",
,colsample_bylevel  colsample_bylevel: typing.Optional[float] Subsample ratio of columns for each level.,
,colsample_bynode  colsample_bynode: typing.Optional[float] Subsample ratio of columns for each split.,
,colsample_bytree  colsample_bytree: typing.Optional[float] Subsample ratio of columns when constructing each tree.,
,"device  device: typing.Optional[str] .. versionadded:: 2.0.0 Device ordinal, available options are `cpu`, `cuda`, and `gpu`.",
,"early_stopping_rounds  early_stopping_rounds: typing.Optional[int] .. versionadded:: 1.6.0 - Activates early stopping. Validation metric needs to improve at least once in  every **early_stopping_rounds** round(s) to continue training. Requires at  least one item in **eval_set** in :py:meth:`fit`. - If early stopping occurs, the model will have two additional attributes:  :py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the  :py:meth:`predict` and :py:meth:`apply` methods to determine the optimal  number of trees during inference. If users want to access the full model  (including trees built after early stopping), they can specify the  `iteration_range` in these inference methods. In addition, other utilities  like model plotting can also use the entire model. - If you prefer to discard the trees after `best_iteration`, consider using the  callback function :py:class:`xgboost.callback.EarlyStopping`. - If there's more than one item in **eval_set**, the last entry will be used for  early stopping. If there's more than one metric in **eval_metric**, the last  metric will be used for early stopping.",
,enable_categorical  enable_categorical: bool See the same parameter of :py:class:`DMatrix` for details.,False


In [1329]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Realizar predicciones sobre el set de prueba
y_pred = regressor.predict(X_test)

# Calcular métricas
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"--- Métricas del Modelo ---")
print(f"MAE (Error Absoluto Medio): {mae:.2f} meses")
print(f"RMSE (Error Cuadrático Medio): {rmse:.2f} meses")
print(f"R² (Coeficiente de Determinación): {r2:.4f}")

--- Métricas del Modelo ---
MAE (Error Absoluto Medio): 10.52 meses
RMSE (Error Cuadrático Medio): 14.37 meses
R² (Coeficiente de Determinación): 0.4791


## ENTRENAMIENTO SIN LA ERA 3, 2018-2022

In [1330]:
# Filtramos para quedarnos solo con pacientes con seguimiento de largo plazo
# Era 1 y 2 tienen datos históricos más erass.
df_eras = df[df['medical_era'].isin([1, 2])].copy()

print(f"Registros originales: {len(df)}")
print(f"Registros tras eliminar Era 3: {len(df_eras)}")

Registros originales: 509241
Registros tras eliminar Era 3: 463428


## Variable AGE para df_eras

In [1331]:
# import pandas as pd

# # Definir los cortes por décadas (de 0 a 100+ años)
# bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 110]
# labels = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] # Representan las décadas

# # Crear la nueva variable de década
# df_eras['age_decade'] = pd.cut(df_eras['age_numeric'], bins=bins, labels=labels, right=False)

# # Convertir a tipo entero (importante para XGBoost)
# df_eras['age_decade'] = df_eras['age_decade'].astype(int)

# # Opcional: Podemos mantener la edad numérica original o eliminarla. 
# # A veces dejar ambas ayuda al modelo a ver el detalle y la tendencia general.
# print(df_eras[['age_numeric', 'age_decade']].head())

In [1332]:
# 1. Definir X e y para el set eras
X_eras = df_eras.drop('Target_Meses', axis=1)
y_eras = df_eras['Target_Meses']

# 2. Split
X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(
    X_eras, y_eras, test_size=0.2, random_state=42
)

# 3. Modelo XGBoost de alta capacidad
regressor_eras = xgb.XGBRegressor(
    n_estimators=2000,
    learning_rate=0.03,
    max_depth=12,
    tree_method='hist',
    random_state=42
)

# 4. Fit con Early Stopping
regressor_eras.fit(
    X_train_m, y_train_m, 
    eval_set=[(X_test_m, y_test_m)],
    verbose=100
)

[0]	validation_0-rmse:19.98401
[100]	validation_0-rmse:14.87523
[200]	validation_0-rmse:14.87642
[300]	validation_0-rmse:14.90033
[400]	validation_0-rmse:14.92744
[500]	validation_0-rmse:14.95201
[600]	validation_0-rmse:14.97166
[700]	validation_0-rmse:14.99886
[800]	validation_0-rmse:15.02168
[900]	validation_0-rmse:15.04873
[1000]	validation_0-rmse:15.07772
[1100]	validation_0-rmse:15.11040
[1200]	validation_0-rmse:15.14195
[1300]	validation_0-rmse:15.17223
[1400]	validation_0-rmse:15.20604
[1500]	validation_0-rmse:15.23495
[1600]	validation_0-rmse:15.26335
[1700]	validation_0-rmse:15.29087
[1800]	validation_0-rmse:15.31724
[1900]	validation_0-rmse:15.34547
[1999]	validation_0-rmse:15.37580


0,1,2
,"objective  objective: typing.Union[str, xgboost.sklearn._SklObjWProto, typing.Callable[[typing.Any, typing.Any], typing.Tuple[numpy.ndarray, numpy.ndarray]], NoneType] Specify the learning task and the corresponding learning objective or a custom objective function to be used. For custom objective, see :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more information, along with the end note for function signatures.",'reg:squarederror'
,"base_score  base_score: typing.Union[float, typing.List[float], NoneType] The initial prediction score of all instances, global bias.",
,booster,
,"callbacks  callbacks: typing.Optional[typing.List[xgboost.callback.TrainingCallback]] List of callback functions that are applied at end of each iteration. It is possible to use predefined callbacks by using :ref:`Callback API `. .. note::  States in callback are not preserved during training, which means callback  objects can not be reused for multiple training sessions without  reinitialization or deepcopy. .. code-block:: python  for params in parameters_grid:  # be sure to (re)initialize the callbacks before each run  callbacks = [xgb.callback.LearningRateScheduler(custom_rates)]  reg = xgboost.XGBRegressor(**params, callbacks=callbacks)  reg.fit(X, y)",
,colsample_bylevel  colsample_bylevel: typing.Optional[float] Subsample ratio of columns for each level.,
,colsample_bynode  colsample_bynode: typing.Optional[float] Subsample ratio of columns for each split.,
,colsample_bytree  colsample_bytree: typing.Optional[float] Subsample ratio of columns when constructing each tree.,
,"device  device: typing.Optional[str] .. versionadded:: 2.0.0 Device ordinal, available options are `cpu`, `cuda`, and `gpu`.",
,"early_stopping_rounds  early_stopping_rounds: typing.Optional[int] .. versionadded:: 1.6.0 - Activates early stopping. Validation metric needs to improve at least once in  every **early_stopping_rounds** round(s) to continue training. Requires at  least one item in **eval_set** in :py:meth:`fit`. - If early stopping occurs, the model will have two additional attributes:  :py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the  :py:meth:`predict` and :py:meth:`apply` methods to determine the optimal  number of trees during inference. If users want to access the full model  (including trees built after early stopping), they can specify the  `iteration_range` in these inference methods. In addition, other utilities  like model plotting can also use the entire model. - If you prefer to discard the trees after `best_iteration`, consider using the  callback function :py:class:`xgboost.callback.EarlyStopping`. - If there's more than one item in **eval_set**, the last entry will be used for  early stopping. If there's more than one metric in **eval_metric**, the last  metric will be used for early stopping.",
,enable_categorical  enable_categorical: bool See the same parameter of :py:class:`DMatrix` for details.,False


In [1333]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Realizar predicciones sobre el set de prueba
y_pred = regressor_eras.predict(X_test)

# Calcular métricas
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"--- Métricas del Modelo ---")
print(f"MAE (Error Absoluto Medio): {mae:.2f} meses")
print(f"RMSE (Error Cuadrático Medio): {rmse:.2f} meses")
print(f"R² (Coeficiente de Determinación): {r2:.4f}")

--- Métricas del Modelo ---
MAE (Error Absoluto Medio): 7.78 meses
RMSE (Error Cuadrático Medio): 11.35 meses
R² (Coeficiente de Determinación): 0.6755


## Modelo BASE, HIPERPARAMETRIZACIÓN

In [1334]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV

# 1. Configuración de alto rendimiento para CPU (AMD compatible)
xgb_reg = xgb.XGBRegressor(
    objective='reg:squarederror', # Mucho más rápido para converger
    tree_method='hist',           
    device='cpu',                 # Estabilidad total en Windows/AMD
    n_jobs=4,                     # Deja algunos hilos libres para que el sistema no se congele
    random_state=42
)

param_dist = {
    'n_estimators': [1000, 2000], 
    'max_depth': [7, 8, 9, 10],   # Rangos más seguros para evitar Memory Error
    'learning_rate': [0.01, 0.03, 0.05],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8],
    'gamma': [0.1, 0.5, 1]        # Ayuda a que el R2 no se infle artificialmente
}

random_search = RandomizedSearchCV(
    estimator=xgb_reg,
    param_distributions=param_dist,
    n_iter=10,                    # 10 iteraciones * 3 folds = 30 entrenamientos
    scoring='neg_mean_absolute_error',
    cv=3,
    verbose=3,
    n_jobs=-1,                    # Paraleliza la búsqueda de parámetros
    random_state=42
)

# 2. Ejecutar (Asegúrate de que X_train no tenga las columnas 'object')
random_search.fit(X_train, y_train)

print(f"Mejor MAE encontrado: {-random_search.best_score_}")
print(f"Mejores Parámetros: {random_search.best_params_}")

Fitting 3 folds for each of 10 candidates, totalling 30 fits


KeyboardInterrupt: 

## Guarda el mejor modelo

In [None]:
# Extraemos el mejor modelo de la búsqueda
mejor_modelo2 = random_search.best_estimator_

# Guardar en formato JSON (recomendado para XGBoost moderno)
mejor_modelo2.save_model('../models/mejormodelo_xgb_supervivencia2.json')

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np

# Realizar predicciones
y_pred = mejor_modelo2.predict(X_test)

# Calcular métricas
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"--- RENDIMIENTO DEL MODELO ---")
print(f"Error Promedio (MAE): {mae:.2f} meses")
print(f"Precisión (R2 Score): {r2:.4f}")

--- RENDIMIENTO DEL MODELO ---
Error Promedio (MAE): 10.35 meses
Precisión (R2 Score): 0.5163
