In [6864]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [6865]:
df = pd.read_csv('../data/raw/clinical_data_lung.zip')

In [6866]:
df.columns

Index(['Age recode with <1 year olds and 90+',
       'Race recode (White, Black, Other)', 'Primary Site - labeled',
       'Histologic Type ICD-O-3', 'Grade Clinical (2018+)',
       'Grade Recode (thru 2017)',
       'Combined Summary Stage with Expanded Regional Codes (2004+)',
       'Derived EOD 2018 Stage Group Recode (2018+)',
       '7th Edition Stage Group Recode (2016-2017)',
       'Derived AJCC Stage Group, 7th ed (2010-2015)',
       'CS tumor size (2004-2015)', 'Tumor Size Summary (2016+)',
       'Survival months', 'Survival months flag',
       'Vital status recode (study cutoff used)',
       'SEER cause-specific death classification', 'Year of diagnosis',
       'Median household income inflation adj to 2023',
       'RX Summ--Surg Prim Site (1998+)', 'Radiation recode',
       'Chemotherapy recode (yes, no/unk)', 'Sex',
       'Rural-Urban Continuum Code', 'Reason no cancer-directed surgery',
       'Total number of in situ/malignant tumors for patient',
       'Tota

In [6867]:
df.rename(columns={'Race recode (White, Black, Other)': 'Race', 'Rural-Urban Continuum Code': 'Rural Code',
                   'SEER cause-specific death classification': 'vital_status', 'Primary Site - labeled': 'Primary Site'}, inplace=True)

## Eliminar duplicados

In [6868]:
df.duplicated().sum()

np.int64(4977)

In [6869]:
df.drop_duplicates(inplace=True)

## Target `survival_months`

In [6870]:
df["Survival months flag"].value_counts()

Survival months flag
Complete dates are available and there are more than 0 days of survival      514288
Not calculated because a Death Certificate Only or Autopsy Only case           8706
Incomplete dates are available and there cannot be zero days of follow-up      6452
Complete dates are available and there are 0 days of survival                  2058
Incomplete dates are available and there could be zero days of follow-up        937
Name: count, dtype: int64

In [6871]:
df["Survival months"].unique()

array(['0059', '0027', '0000', '0052', '0012', '0014', '0005', '0016',
       '0040', '0010', '0009', '0001', '0020', '0003', '0002', '0021',
       '0025', '0130', '0007', '0013', '0060', '0017', '0111', '0033',
       '0004', '0090', '0037', '0062', '0024', '0008', '0006', '0076',
       '0075', '0034', '0098', '0050', '0053', '0015', '0094', '0038',
       '0078', '0026', '0128', '0125', 'Unknown', '0042', '0058', '0091',
       '0051', '0079', '0018', '0036', '0113', '0019', '0011', '0086',
       '0129', '0030', '0092', '0101', '0032', '0070', '0084', '0126',
       '0056', '0082', '0116', '0043', '0054', '0063', '0044', '0045',
       '0121', '0123', '0102', '0074', '0057', '0081', '0028', '0023',
       '0099', '0039', '0022', '0046', '0029', '0072', '0088', '0115',
       '0064', '0061', '0108', '0073', '0035', '0120', '0031', '0068',
       '0055', '0047', '0041', '0077', '0071', '0066', '0093', '0067',
       '0065', '0083', '0117', '0085', '0100', '0118', '0105', '0069',
   

In [6872]:
# 1. Definimos quiénes entran al modelo de excelencia
# Solo pacientes con fechas completas y vida mayor a 0
filtro_excelencia = df['Survival months flag'] == 'Complete dates are available and there are more than 0 days of survival'

# 2. Separamos los datasets
df = df[filtro_excelencia].copy()
df_excluidos = df[~filtro_excelencia].copy() # El ~ significa "lo que no cumple el filtro"

df = df[df['Survival months'] != 'Unknown'].copy()

# 1. Aseguramos que sea entero
df['survival_months_int'] = pd.to_numeric(df['Survival months'], errors='coerce').fillna(0).astype(int)

# 2. Definir la nueva función con rangos mixtos (de 5 en 5 y luego de 10 en 10)
def agrupar_supervivencia_ajustado(meses):
    if meses == 0:
        return 0
    elif 1 <= meses <= 6:
        return 6
    elif 6 < meses <= 54:
        # Incrementos de 5 en 5 hasta los 5 años
        return int(np.ceil(meses / 6.0) * 6)
    else:
        # Más de 60 meses (más de 5 años)
        return 60

# 3. Aplicamos la transformación
df['Target_Meses'] = df['survival_months_int'].apply(agrupar_supervivencia_ajustado)

# 4. Verificación para confirmar los saltos que pediste
print("Rangos de supervivencia generados:")
print(sorted(df['Target_Meses'].unique()))

# Eliminar columnas antiguas
df = df.drop(["Survival months", "Survival months flag", "survival_months_int"], axis=1)

print("\nConteo por rango:")
print(df['Target_Meses'].value_counts().sort_index())


  df_excluidos = df[~filtro_excelencia].copy() # El ~ significa "lo que no cumple el filtro"


Rangos de supervivencia generados:
[np.int64(0), np.int64(6), np.int64(12), np.int64(18), np.int64(24), np.int64(30), np.int64(36), np.int64(42), np.int64(48), np.int64(54), np.int64(60)]

Conteo por rango:
Target_Meses
0      56502
6     151558
12     73834
18     46851
24     32512
30     24543
36     19123
42     16709
48     14189
54     11985
60     66482
Name: count, dtype: int64


> IMPORTANCIA DE LOS 5 AÑOS DE SUPERVIVENCIA

In [6873]:
# Ejemplo de pesos: Duplicar la importancia de registros con supervivencia >= 60 meses
# df['sample_weight'] = df['Survival months'].apply(lambda x: 2.0 if x >= 60 else 1.0)

## Variables 'Vital status'

In [6874]:
# 1. Creamos un filtro de 'Muerte por Cáncer' o 'Sigue Vivo'
# Excluimos a los que murieron por causas ajenas para no confundir al modelo
filtro_causa_especifica = (df['vital_status'] == 'Dead (attributable to this cancer dx)') | \
                          (df['vital_status'] == 'Alive or dead of other cause')

# Aplicamos el filtro para tener un dataframe puramente oncólogico
df = df[filtro_causa_especifica].copy()

In [6875]:
# Si alguien está vivo pero tiene pocos meses, es un caso de diagnóstico reciente
# Si alguien está muerto pero tiene 20 trimestres, es un éxito de supervivencia a largo plazo
print(df.groupby('Vital status recode (study cutoff used)')['Target_Meses'].mean())

Vital status recode (study cutoff used)
Alive    35.337714
Dead     15.416786
Name: Target_Meses, dtype: float64


In [6876]:
# Se elimina las variasbles de 'respuesta'.
df = df.drop(["vital_status", "Vital status recode (study cutoff used)"], axis=1)

## Variable AGE

In [6877]:
df["Age recode with <1 year olds and 90+"].value_counts()

Age recode with <1 year olds and 90+
70-74 years    93119
65-69 years    87202
75-79 years    81446
60-64 years    66911
80-84 years    57730
55-59 years    43943
85-89 years    31549
50-54 years    21227
90+ years      12920
45-49 years     8374
40-44 years     3343
35-39 years     1477
30-34 years      725
25-29 years      363
20-24 years      191
15-19 years       82
01-04 years       36
10-14 years       34
00 years          20
05-09 years       17
Name: count, dtype: int64

In [6878]:
df = df[~df['Age recode with <1 year olds and 90+'].isin(['00 years', '01-04 years', '05-09 years', '10-14 years', '15-19 years', '20-24 years', '25-29 years', '30-34 years'])]

In [6879]:
df["Age recode with <1 year olds and 90+"].value_counts()

Age recode with <1 year olds and 90+
70-74 years    93119
65-69 years    87202
75-79 years    81446
60-64 years    66911
80-84 years    57730
55-59 years    43943
85-89 years    31549
50-54 years    21227
90+ years      12920
45-49 years     8374
40-44 years     3343
35-39 years     1477
Name: count, dtype: int64

##### Nota: Trabajamos con las caracteristicas mayores de 40 años por la cantidad de datos, si agrupamos a las edades comprendidas entre 20 y 39 estariamos asumiendo que uno de 20 y uno de 39 se comportan igual, perderiamos `granularidad`.
> He agrupado a los de 35-39 años con los de 40-44 años para tener mas datos. Ahora comparten el valor medio de 40-44 (42.0)

### Convertir a Valor Numérico Continúo (Midpoint Mapping)

In [6880]:
def transform_age_recode(df, column_name):
    
    # Define the mapping based on your specific distribution
    age_mapping = {
        '35-39 years': 42.0,
        '40-44 years': 42.0,
        '45-49 years': 47.0,
        '50-54 years': 52.0,
        '55-59 years': 57.0,
        '60-64 years': 62.0,
        '65-69 years': 67.0,
        '70-74 years': 72.0,
        '75-79 years': 77.0,
        '80-84 years': 82.0,
        '85-89 years': 87.0,
        '90+ years': 92.5  # Standard clinical estimate for 90+
    }
    
    # Clean the string to ensure matching (strip extra spaces)
    df[column_name] = df[column_name].str.strip()
    
    # Map values and create a new numeric column
    df['age_numeric'] = df[column_name].map(age_mapping)
    
    return df


df = transform_age_recode(df, 'Age recode with <1 year olds and 90+')

In [6881]:
# # agregar una columna de grupo de edades 

# # 1. Definimos los rangos para cada grupo
# grupo_1 = ['00 years', '01-04 years', '05-09 years', '10-14 years', '15-19 years']
# grupo_2 = ['20-24 years', '25-29 years', '30-34 years', '35-39 years']
# grupo_3 = ['40-44 years','45-49 years', '50-54 years', '55-59 years']
# grupo_4 =['60-64 years','65-69 years', '70-74 years']
# # El resto (65+) caerá en el grupo 5

# # 2. Función lógica para asignar el grupo
# def asignar_grupo(edad):
#     if edad in grupo_1: return 1
#     if edad in grupo_2: return 2
#     if edad in grupo_3: return 3
#     if edad in grupo_4: return 4
#     return 5  # Todo lo que no esté en los anteriores (80+)

# # 3. Creamos la columna
# df['age_group'] = df['Age recode with <1 year olds and 90+'].apply(asignar_grupo)

# # 4. Verificamos los conteos
# df['age_group'].value_counts().sort_index()

In [6882]:
#Borra la antigua columna de registros de edad
df = df.drop("Age recode with <1 year olds and 90+", axis=1)

## Variable Age para el otro dataframe


## Variable Race

In [6883]:
df["Race"].value_counts()

Race
White                                                        412654
Black                                                         54432
Other (American Indian/AK Native, Asian/Pacific Islander)     40615
Unknown                                                        1540
Name: count, dtype: int64

In [6884]:
# Cambiar el nombre de valor (Other (American Indian/AK Native, Asian/Pacific Islander) a (Other) y añadir los valores (Unknown) a (Other).
df["Race"] = df["Race"].replace({
    "Other (American Indian/AK Native, Asian/Pacific Islander)": "Other",
    "Unknown" : "Other"}
)

df["Race"].value_counts()

Race
White    412654
Black     54432
Other     42155
Name: count, dtype: int64

## Variable `Primary site`
> Lugar donde se originó el tumor primario (origen de la metástasis).

> - NOS: significa que no ha sido especificado "Not Otherwise Specified"
> - En medicina, una "lesión que se solapa" (overlapping lesion) ocurre cuando el tumor primario es tan grande o está ubicado de tal manera que invade dos o más lóbulos del pulmón (por ejemplo, parte del lóbulo superior y parte del medio) y no es posible determinar en cuál de los dos comenzó originalmente.

In [6885]:
df["Primary Site"] = df["Primary Site"].replace({"C34.1-Upper lobe, lung": "Upper lobe",
                                                 "C34.3-Lower lobe, lung": "Lower lobe",
                                                 "C34.9-Lung, NOS": "Unspecified",
                                                 "C34.2-Middle lobe, lung": "Middle lobe",
                                                 "C34.0-Main bronchus": "Main bronchus",
                                                 "C34.8-Overlapping lesion of lung": "Overlapping"
})
df["Primary Site"].value_counts()

Primary Site
Upper lobe       259117
Lower lobe       137989
Unspecified       64092
Middle lobe       22676
Main bronchus     20320
Overlapping        5047
Name: count, dtype: int64

In [6886]:
df = df[df["Primary Site"] != "Overlapping"]
df["Primary Site"].value_counts()

Primary Site
Upper lobe       259117
Lower lobe       137989
Unspecified       64092
Middle lobe       22676
Main bronchus     20320
Name: count, dtype: int64

## Variable `Histologic Type ICD-O-3`

In [6887]:
df["Histologic Type ICD-O-3"].unique()

array([8140, 8041, 8550, 8070, 8144, 8000, 8046, 8240, 8010, 8560, 8246,
       8255, 8551, 8012, 8254, 8083, 8252, 8230, 8480, 8073, 8071, 8260,
       8250, 8022, 8020, 8072, 8033, 8980, 8430, 8032, 8265, 8249, 8013,
       8481, 8253, 8310, 8043, 8507, 8800, 8200, 8045, 8244, 9120, 9041,
       8323, 8082, 8490, 8001, 8031, 8802, 8890, 8044, 8574, 8256, 8042,
       8123, 8084, 8830, 8562, 8004, 8251, 8052, 8075, 8575, 8074, 8470,
       9064, 8933, 8050, 9133, 8333, 9101, 8805, 9040, 8005, 8801, 9043,
       8680, 9540, 8023, 9137, 8825, 8257, 9364, 9080, 8341, 8576, 8940,
       8720, 8021, 8815, 8891, 8811, 8014, 8030, 8580, 8245, 8972, 8581,
       8290, 8525, 9100, 8441, 8810, 8120, 8936, 8201, 9180, 8803, 9473,
       8901, 9044, 8900, 8211, 9260, 8982, 8920, 8570, 8500, 8440, 8002,
       8804, 8247, 8410, 8850, 8051, 8912, 8896, 8851, 8894, 8854, 8401,
       8963, 8510, 9015, 9571, 9580, 9220, 8910, 8094, 9240, 8584, 8320,
       9130, 8040, 8210, 8743, 8973, 8714, 9370, 80

In [6888]:
def agrupar_histologia(codigo):
    try:
        c = int(codigo)
    except:
        return "Other/NOS"

    # 1. Adenocarcinomas (Incluye variantes papilares y mucinosas comunes)
    if (8140 <= c <= 8384) or (8440 <= c <= 8551):
        return "Adenocarcinoma"
    
    # 2. Escamosos (Squamous Cell)
    elif 8050 <= c <= 8084:
        return "Squamous Cell Carcinoma"
    
    # 3. Small Cell (Altamente agresivos - Crítico para predicción de vida)
    elif 8041 <= c <= 8045:
        return "Small Cell Carcinoma"
    
    # 4. Large Cell
    elif 8012 <= c <= 8014 or c == 8021:
        return "Large Cell Carcinoma"
    
    # 5. Neuroendocrinos (Excluyendo Small Cell)
    elif 8240 <= c <= 8249:
        return "Neuroendocrine Tumors"
    
    # 6. Carcinomas No Especificados (8000-8011)
    elif 8000 <= c <= 8011:
        return "Carcinoma NOS"
    
    else:
        return "Other/Specific Rare"
    
    # Aplicamos la función para crear una columna de texto
df['histology_type'] = df['Histologic Type ICD-O-3'].apply(agrupar_histologia)

# Eliminamos la columna original.
df = df.drop("Histologic Type ICD-O-3", axis=1)

df['histology_type'].value_counts()

histology_type
Adenocarcinoma             251731
Squamous Cell Carcinoma    104916
Small Cell Carcinoma        57169
Carcinoma NOS               50740
Other/Specific Rare         34049
Large Cell Carcinoma         5589
Name: count, dtype: int64

In [6889]:
df = df[df["histology_type"] != "Other/Specific Rare"]

## Unificar Variable `Grade Clinical (2018+)` y `Grade Recode (thru 2017)`

In [6890]:
df["Grade Clinical (2018+)"].value_counts()

Grade Clinical (2018+)
Blank(s)    255775
9           152412
3            29109
2            22055
1             8609
4             2185
Name: count, dtype: int64

In [6891]:
df["Grade Recode (thru 2017)"].value_counts()

Grade Recode (thru 2017)
Blank(s)                                  214370
Unknown                                   138184
Poorly differentiated; Grade III           52934
Moderately differentiated; Grade II        43061
Well differentiated; Grade I               15821
Undifferentiated; anaplastic; Grade IV      5775
Name: count, dtype: int64

In [6892]:
def unificar_grados_clinicos(row):
    # Extraemos valores y limpiamos espacios
    g_new = str(row['Grade Clinical (2018+)']).strip()
    g_old = str(row['Grade Recode (thru 2017)']).strip()
    
    # Prioridad 1: Grado I / Bien diferenciado / 1 o A
    if 'Grade I' in g_old or g_new in ['1', 'A']:
        return '1: Well Differentiated'
    
    # Prioridad 2: Grado II / Moderadamente diferenciado / 2 o B
    if 'Grade II' in g_old or g_new in ['2', 'B']:
        return '2: Moderately Differentiated'
    
    # Prioridad 3: Grado III / Pobremente diferenciado / 3 o C
    if 'Grade III' in g_old or g_new in ['3', 'C', 'H']:
        return '3: Poorly Differentiated'
    
    # Prioridad 4: Grado IV / Anaplásico / 4 o D
    if 'Grade IV' in g_old or g_new in ['4', 'D']:
        return '4: Undifferentiated/Anaplastic'
    
    # Si no cae en ninguno, es Desconocido o Blank
    return np.nan
# 'Unknown/Blank'

# Aplicamos a todo el dataframe
df['grade_clinical'] = df.apply(unificar_grados_clinicos, axis=1)

# Se eliminan las columnas antiguas
df = df.drop(["Grade Clinical (2018+)","Grade Recode (thru 2017)"], axis=1)

# Verificamos el éxito de la unificación
df['grade_clinical'].value_counts()

grade_clinical
1: Well Differentiated            126200
3: Poorly Differentiated           29109
2: Moderately Differentiated       22055
4: Undifferentiated/Anaplastic      2185
Name: count, dtype: int64

In [6893]:
df['grade_clinical'].isna().sum()

np.int64(290596)

In [6894]:
# # Mapeo ordinal respetando la jerarquía clínica
# grado_mapping = {
#     '1: Well Differentiated': 1,
#     '2: Moderately Differentiated': 2,
#     '3: Poorly Differentiated': 3,
#     '4: Undifferentiated/Anaplastic': 4,
#     'Unknown/Blank': np.nan
# }
# df['grade_numeric'] = df['grade_clinical'].map(grado_mapping)

#### NOTA
> - MIRAR MAS ADELANTE SI TRATAR LOS VALORES UNKNOWS DE UNA MANERA DISTINTA A LA ACTUAL

> - EL CODIGO DE ABAJO

In [6895]:
# Comparamos la media de supervivencia en meses por cada grado
print(df.groupby('grade_clinical')['Target_Meses'].mean().sort_values())

grade_clinical
4: Undifferentiated/Anaplastic    12.280092
3: Poorly Differentiated          16.613006
2: Moderately Differentiated      20.320018
1: Well Differentiated            31.174754
Name: Target_Meses, dtype: float64


## Variable `Combined Summary Stage with Expanded Regional Codes (2004+)`

In [6896]:
df["Combined Summary Stage with Expanded Regional Codes (2004+)"].value_counts()

Combined Summary Stage with Expanded Regional Codes (2004+)
Distant site(s)/node(s) involved                                236509
Localized only                                                  117956
Regional lymph nodes involved only                               40832
Regional by direct extension only                                31709
Regional by both direct extension and lymph node involvement     27037
Unknown/unstaged/unspecified/DCO                                 16099
In situ                                                              3
Name: count, dtype: int64

In [6897]:
def unificar_estadio_total_final(row):
    # 1. Extraemos las 4 fuentes temporales
    s_2004 = str(row['Combined Summary Stage with Expanded Regional Codes (2004+)']).lower()
    s_2010 = str(row['Derived AJCC Stage Group, 7th ed (2010-2015)']).upper()
    s_2016 = str(row['7th Edition Stage Group Recode (2016-2017)']).upper()
    s_2018 = str(row['Derived EOD 2018 Stage Group Recode (2018+)']).upper()
    
    # --- LÓGICA DE UNIFICACIÓN JERÁRQUICA ---
    
    # NIVEL 3: DISTANTE (Metástasis / Estadio IV)
    # Buscamos 'distant', 'IV' o código '4'
    if 'distant' in s_2004 or 'IV' in s_2010 or 'IV' in s_2016 or '4' in s_2018:
        return 3
    
    # NIVEL 2: REGIONAL (Extensión a ganglios / Estadios II y III)
    # Buscamos 'regional', romanos 'II'/'III' o códigos '2'/'3'
    elif 'regional' in s_2004 or \
         any(x in s_2010 for x in ['II', 'III']) or \
         any(x in s_2016 for x in ['II', 'III']) or \
         any(x in s_2018 for x in ['2A', '2B', '3A', '3B', '3C']):
        return 2
    
    # NIVEL 1: LOCALIZADO / IN SITU (Estadio I o 0)
    # Buscamos 'localized', 'situ', romano 'I' o código '1'
    elif 'localized' in s_2004 or 'situ' in s_2004 or \
         'I' in s_2010 or 'I' in s_2016 or \
         any(x in s_2018 for x in ['1A', '1B', '0S', '0C']):
        return 1
        
    # Si todo falla, enviamos a NaN para que XGBoost use su lógica de nulos
    return np.nan

# Aplicamos la columna definitiva
df['stage_numeric'] = df.apply(unificar_estadio_total_final, axis=1)

# Se eliminan las columnas antiguas
df = df.drop(["Combined Summary Stage with Expanded Regional Codes (2004+)",
              "Derived EOD 2018 Stage Group Recode (2018+)",
              "7th Edition Stage Group Recode (2016-2017)",
              "Derived AJCC Stage Group, 7th ed (2010-2015)"], axis=1)

df['stage_numeric'].value_counts()

stage_numeric
3.0    236826
2.0    110927
1.0    106757
Name: count, dtype: int64

In [6898]:
df['stage_numeric'].isna().sum()

np.int64(15635)

In [6899]:
df = df.dropna(subset="stage_numeric")
df['stage_numeric'].isna().sum()

np.int64(0)

In [6900]:
# Analizamos la supervivencia trimestral media por estadio unificado
analisis = df.groupby('stage_numeric')['Target_Meses'].agg(['mean', 'count']).sort_index()
print(analisis)

                    mean   count
stage_numeric                   
1.0            34.632033  106757
2.0            26.597600  110927
3.0            13.287021  236826


In [6901]:
df.columns

Index(['Race', 'Primary Site', 'CS tumor size (2004-2015)',
       'Tumor Size Summary (2016+)', 'Year of diagnosis',
       'Median household income inflation adj to 2023',
       'RX Summ--Surg Prim Site (1998+)', 'Radiation recode',
       'Chemotherapy recode (yes, no/unk)', 'Sex', 'Rural Code',
       'Reason no cancer-directed surgery',
       'Total number of in situ/malignant tumors for patient',
       'Total number of benign/borderline tumors for patient',
       'Sequence number', 'Type of Reporting Source', 'Target_Meses',
       'age_numeric', 'histology_type', 'grade_clinical', 'stage_numeric'],
      dtype='object')

In [6902]:
def consolidar_tumor_size(row):
    # Extraemos valores de ambas épocas
    ts_2004 = str(row['CS tumor size (2004-2015)']).strip().upper()
    ts_2016 = str(row['Tumor Size Summary (2016+)']).strip().upper()
    
    def limpiar_valor(val):
        # Manejo de nulos administrativos
        if val in ['BLANK(S)', '999', 'NAN', '990']: 
            return np.nan
        
        try:
            v_float = float(val)
            # 000 es microinvasión, lo asignamos como 0.5mm para diferenciarlo de 0
            if v_float == 0: return 0.5
            # Valores por encima de 988 son códigos de error o desconocidos en SEER
            if v_float > 988: return np.nan
            return v_float
        except ValueError:
            return np.nan

    # Prioridad: Intentamos con la columna de 2016+, si falla, vamos a la de 2004
    final_size = limpiar_valor(ts_2016)
    if np.isnan(final_size):
        final_size = limpiar_valor(ts_2004)
        
    return final_size

# Creamos la variable definitiva
df['tumor_size_mm'] = df.apply(consolidar_tumor_size, axis=1)

# Techo clinico
df['tumor_size_mm'] = df['tumor_size_mm'].clip(upper=200)

# Se elimina las columnas originales
df = df.drop(["CS tumor size (2004-2015)","Tumor Size Summary (2016+)"], axis=1)

df['tumor_size_mm'].value_counts()


tumor_size_mm
20.0     12620
15.0     12272
25.0     11723
30.0     11615
40.0      9478
         ...  
184.0        2
197.0        2
183.0        2
196.0        1
191.0        1
Name: count, Length: 200, dtype: int64

In [6903]:
df['tumor_size_mm'].describe()

count    385337.000000
mean         40.259969
std          26.935277
min           0.500000
25%          20.000000
50%          33.000000
75%          55.000000
max         200.000000
Name: tumor_size_mm, dtype: float64

In [6904]:
df['tumor_size_mm'].isnull().sum()

np.int64(69173)

In [6905]:
df.isnull().sum()

Race                                                         0
Primary Site                                                 0
Year of diagnosis                                            0
Median household income inflation adj to 2023                0
RX Summ--Surg Prim Site (1998+)                              0
Radiation recode                                             0
Chemotherapy recode (yes, no/unk)                            0
Sex                                                          0
Rural Code                                                   0
Reason no cancer-directed surgery                            0
Total number of in situ/malignant tumors for patient         0
Total number of benign/borderline tumors for patient         0
Sequence number                                              0
Type of Reporting Source                                     0
Target_Meses                                                 0
age_numeric                                            

In [6906]:
df.columns

Index(['Race', 'Primary Site', 'Year of diagnosis',
       'Median household income inflation adj to 2023',
       'RX Summ--Surg Prim Site (1998+)', 'Radiation recode',
       'Chemotherapy recode (yes, no/unk)', 'Sex', 'Rural Code',
       'Reason no cancer-directed surgery',
       'Total number of in situ/malignant tumors for patient',
       'Total number of benign/borderline tumors for patient',
       'Sequence number', 'Type of Reporting Source', 'Target_Meses',
       'age_numeric', 'histology_type', 'grade_clinical', 'stage_numeric',
       'tumor_size_mm'],
      dtype='object')

## Variable `Year of diagnosis`

In [6907]:
df["Year of diagnosis"].value_counts()

Year of diagnosis
2019    43507
2017    42645
2018    42529
2016    41848
2015    41783
2021    41488
2022    41224
2014    41150
2013    40143
2012    39732
2020    38461
Name: count, dtype: int64

In [6908]:
# ¿Ha aumentado la supervivencia media con los años?
df.groupby('Year of diagnosis')['Target_Meses'].mean()

Year of diagnosis
2012    22.825581
2013    23.323817
2014    24.033536
2015    25.040375
2016    25.823743
2017    26.777348
2018    26.441675
2019    23.394029
2020    18.539768
2021    13.539626
2022     6.698913
Name: Target_Meses, dtype: float64

> Transformar el año en tres "Eras Médicas"

In [6909]:
def agrupar_eras_medicas(year):
    # Agrupamos por eras de tratamiento/tecnología
    if year <= 2016: return 1 # Era antigua
    if year <= 2020: return 2 # Era media
    return 3 # Era reciente (alta censura)

df['medical_era'] = df['Year of diagnosis'].apply(agrupar_eras_medicas)

# Eliminar la columna original
df.drop('Year of diagnosis', axis=1, inplace=True)

In [6910]:
# Verificación de volumen por Era
df['medical_era'].value_counts().sort_index()

medical_era
1    204656
2    167142
3     82712
Name: count, dtype: int64

## Variable `Median household income inflation adj to 2023`

In [6911]:
df["Median household income inflation adj to 2023"].value_counts()

Median household income inflation adj to 2023
$75,000 - $79,999                         44096
$85,000 - $89,999                         43285
$100,000 - $109,999                       41209
$80,000 - $84,999                         40637
$70,000 - $74,999                         36119
$65,000 - $69,999                         35845
$120,000+                                 32861
$60,000 - $64,999                         31940
$95,000 - $99,999                         23989
$55,000 - $59,999                         23851
$110,000 - $119,999                       23109
$50,000 - $54,999                         22833
$90,000 - $94,999                         19728
$45,000 - $49,999                         16978
$40,000 - $44,999                         10281
< $40,000                                  7743
Unknown/missing/no match/Not 1990-2023        6
Name: count, dtype: int64

In [6912]:
def mapear_ingresos(valor):
    v = str(valor).strip()
    
    # Diccionario de mapeo ordinal (de menor a mayor ingreso)
    mapeo = {
        '< $40,000': 1,
        '$40,000 - $44,999': 2,
        '$45,000 - $49,999': 3,
        '$50,000 - $54,999': 4,
        '$55,000 - $59,999': 5,
        '$60,000 - $64,999': 6,
        '$65,000 - $69,999': 7,
        '$70,000 - $74,999': 8,
        '$75,000 - $79,999': 9,
        '$80,000 - $84,999': 10,
        '$85,000 - $89,999': 11,
        '$90,000 - $94,999': 12,
        '$95,000 - $99,999': 13,
        '$100,000 - $109,999': 14,
        '$110,000 - $119,999': 15,
        '$120,000+': 16
    }
    
    return mapeo.get(v, np.nan) # Los 'Unknown/missing' devuelven NaN

df['income_ordinal'] = df['Median household income inflation adj to 2023'].apply(mapear_ingresos)
df["income_ordinal"].value_counts().sort_values()

income_ordinal
1.0      7743
2.0     10281
3.0     16978
12.0    19728
4.0     22833
15.0    23109
5.0     23851
13.0    23989
6.0     31940
16.0    32861
7.0     35845
8.0     36119
10.0    40637
14.0    41209
11.0    43285
9.0     44096
Name: count, dtype: int64

In [6913]:
# Comprobamos si el ingreso afecta el promedio de supervivencia trimestral
print(df.groupby('income_ordinal')['Target_Meses'].mean().sort_index())

income_ordinal
1.0     18.582720
2.0     18.751094
3.0     20.163506
4.0     19.458153
5.0     19.441197
6.0     21.128303
7.0     20.047984
8.0     22.602951
9.0     22.628855
10.0    23.523833
11.0    19.456856
12.0    22.131083
13.0    22.663137
14.0    22.559004
15.0    24.826431
16.0    21.841088
Name: Target_Meses, dtype: float64


In [6914]:
# Se elimina la columna original.
df.drop('Median household income inflation adj to 2023', axis=1, inplace=True)

In [6915]:
df.columns

Index(['Race', 'Primary Site', 'RX Summ--Surg Prim Site (1998+)',
       'Radiation recode', 'Chemotherapy recode (yes, no/unk)', 'Sex',
       'Rural Code', 'Reason no cancer-directed surgery',
       'Total number of in situ/malignant tumors for patient',
       'Total number of benign/borderline tumors for patient',
       'Sequence number', 'Type of Reporting Source', 'Target_Meses',
       'age_numeric', 'histology_type', 'grade_clinical', 'stage_numeric',
       'tumor_size_mm', 'medical_era', 'income_ordinal'],
      dtype='object')

## Variable `RX Summ--Surg Prim Site (1998+)`

In [6916]:
df["RX Summ--Surg Prim Site (1998+)"].value_counts()

RX Summ--Surg Prim Site (1998+)
0     349555
33     62939
21     18543
30      8098
22      5906
56      1790
45      1634
99      1117
12       757
23       657
46       630
55       517
90       499
20       451
24       338
15       291
13       236
19       177
80        83
25        81
47        69
48        53
70        43
66        26
65        20
Name: count, dtype: int64

In [6917]:
def mapear_cirugia(valor):
    try:
        v = int(valor)
        if v == 0:
            return 0  # Sin cirugía
        elif 10 <= v <= 19:
            return 1  # Destrucción local
        elif 20 <= v <= 27:
            return 2  # Resección parcial
        elif 30 <= v <= 80:
            return 3  # Cirugía radical/total
        else:
            return np.nan # Códigos 90, 99 o errores
    except:
        return np.nan

df['surgery_intensity'] = df['RX Summ--Surg Prim Site (1998+)'].apply(mapear_cirugia)

In [6918]:
df["surgery_intensity"].isnull().sum()

np.int64(1616)

In [6919]:
df = df.dropna(subset="surgery_intensity")
df["surgery_intensity"].isnull().sum()

np.int64(0)

## Variable `Radiation recode`

In [6920]:
df["Radiation recode"].value_counts()

Radiation recode
None/Unknown                                             265780
Beam radiation                                           172551
Refused (1988+)                                            8753
Recommended, unknown if administered                       3906
Radiation, NOS  method or source not specified             1260
Radioactive implants (includes brachytherapy) (1988+)       393
Combination of beam with implants or isotopes               142
Radioisotopes (1988+)                                       109
Name: count, dtype: int64

In [6921]:
def limpiar_radioterapia(valor):
    v = str(valor).lower()
    # Si contiene 'radiation' o 'isotopes' o 'implants', asumimos que recibió
    if 'radiation' in v or 'isotopes' in v or 'implants' in v:
        return 1
    return 0

df['radiation_binary'] = df['Radiation recode'].apply(limpiar_radioterapia)

df['radiation_binary'].value_counts()

radiation_binary
0    278439
1    174455
Name: count, dtype: int64

## Variable `Reason no cancer-directed surgery`

In [6922]:
df["Reason no cancer-directed surgery"].value_counts()

Reason no cancer-directed surgery
Not recommended                                                                 316628
Surgery performed                                                               103339
Not recommended, contraindicated due to other cond; autopsy only (1973-2002)     21628
Recommended but not performed, patient refused                                    6947
Recommended but not performed, unknown reason                                     2514
Recommended, unknown if performed                                                  986
Not performed, patient died prior to recommended surgery                           852
Name: count, dtype: int64

In [6923]:
df = df[~df["Reason no cancer-directed surgery"].isin(["Not recommended, contraindicated due to other cond; autopsy only (1973-2002)",
                                                        "Not performed, patient died prior to recommended surgery",
                                                        "Recommended but not performed, patient refused","Recommended but not performed, unknown reason",
                                                        "Recommended, unknown if performed"])]
df["Reason no cancer-directed surgery"].value_counts()

Reason no cancer-directed surgery
Not recommended      316628
Surgery performed    103339
Name: count, dtype: int64

In [6924]:
df['surgery_binary'] = df['Reason no cancer-directed surgery'].astype('category')

In [6925]:
# # Es más profesional y seguro definirlo tú mismo
# df['surgery_binary'] = df['Reason no cancer-directed surgery'].map({"Surgery performed": 1, "Not recommended": 0})
df["surgery_binary"].value_counts()

surgery_binary
Not recommended      316628
Surgery performed    103339
Name: count, dtype: int64

In [6926]:
# def mapear_motivo_no_cirugia(valor):
#     v = str(valor).strip()
    
#     # 0: Se realizó la cirugía (sin barrera)
#     if 'Surgery performed' in v:
#         return 0
    
#     # 1: Barrera Clínica (Estado de salud muy pobre / Contraindicado)
#     if 'died prior' in v:
#         return 1
    
#     # 2: Barrera de Decisión (El paciente o familia dijo no)
#     if 'patient refused' in v:
#         return 2
    
#     # 3: Barrera de Gravedad (No recomendada por el médico)
#     if 'Not recommended' in v:
#         return 3
        
#     return np.nan # Otros casos o desconocidos

# df['surgical_barrier'] = df['Reason no cancer-directed surgery'].apply(mapear_motivo_no_cirugia)

# df['surgical_barrier'].value_counts()

## Variable `"Chemotherapy recode (yes, no/unk)"`

In [6927]:
df["Chemotherapy recode (yes, no/unk)"].value_counts()

Chemotherapy recode (yes, no/unk)
No/Unknown    244140
Yes           175827
Name: count, dtype: int64

In [6928]:
df["chemo_binary"] = df["Chemotherapy recode (yes, no/unk)"].astype("category")
df["chemo_binary"].value_counts()

chemo_binary
No/Unknown    244140
Yes           175827
Name: count, dtype: int64

In [6929]:
# df['chemo_binary'] = df['Chemotherapy recode (yes, no/unk)'].map({'Yes': 1, 'No/Unknown': 0})
# df["chemo_binary"].value_counts()

In [6930]:
# Lista de columnas originales que ya procesamos y podemos eliminar
cols_to_drop = [
    'Radiation recode', 
    'Chemotherapy recode (yes, no/unk)', 
    'Reason no cancer-directed surgery',
    'RX Summ--Surg Prim Site (1998+)'
]

df = df.drop(columns=cols_to_drop)

In [6931]:
df.columns

Index(['Race', 'Primary Site', 'Sex', 'Rural Code',
       'Total number of in situ/malignant tumors for patient',
       'Total number of benign/borderline tumors for patient',
       'Sequence number', 'Type of Reporting Source', 'Target_Meses',
       'age_numeric', 'histology_type', 'grade_clinical', 'stage_numeric',
       'tumor_size_mm', 'medical_era', 'income_ordinal', 'surgery_intensity',
       'radiation_binary', 'surgery_binary', 'chemo_binary'],
      dtype='object')

## Variable `Sex`

In [6932]:
df["Sex"].value_counts()

Sex
Male      211422
Female    208545
Name: count, dtype: int64

In [6933]:
df["Sex"] = df["Sex"].astype("category")

In [6934]:
# # Mapeo binario: Male -> 0, Female -> 1
# # (El orden es arbitrario, pero ser consistente es clave)
# df['sex_binary'] = df['Sex'].map({'Male': 1, 'Female': 0})

# # Se elimina la columna original
# df = df.drop('Sex', axis=1)

# # Verificamos 
# df['sex_binary'].value_counts()

In [6935]:
df.columns

Index(['Race', 'Primary Site', 'Sex', 'Rural Code',
       'Total number of in situ/malignant tumors for patient',
       'Total number of benign/borderline tumors for patient',
       'Sequence number', 'Type of Reporting Source', 'Target_Meses',
       'age_numeric', 'histology_type', 'grade_clinical', 'stage_numeric',
       'tumor_size_mm', 'medical_era', 'income_ordinal', 'surgery_intensity',
       'radiation_binary', 'surgery_binary', 'chemo_binary'],
      dtype='object')

## Variable `Rural Code`

In [6936]:
df["Rural Code"].value_counts()

Rural Code
Counties in metropolitan areas ge 1 million pop                 230028
Counties in metropolitan areas of 250,000 to 1 million pop       88017
Nonmetropolitan counties adjacent to a metropolitan area         38509
Counties in metropolitan areas of lt 250 thousand pop            36638
Nonmetropolitan counties not adjacent to a metropolitan area     26212
Unknown/missing/no match (Alaska or Hawaii - Entire State)         559
Unknown/missing/no match/Not 1990-2023                               4
Name: count, dtype: int64

In [6937]:
def mapear_rural_code(valor):
    v = str(valor).lower()
    
    # 1: Metrópolis grandes (>= 1 millón)
    if 'ge 1 million' in v:
        return 1
    # 2: Metrópolis medianas (250k - 1M)
    elif '250,000 to 1 million' in v:
        return 2
    # 3: Metrópolis pequeñas (< 250k)
    elif 'lt 250 thousand' in v:
        return 3
    # 4: No metropolitano adyacente a metro
    elif 'adjacent to a metropolitan' in v:
        return 4
    # 5: No metropolitano NO adyacente (Rural puro)
    elif 'not adjacent' in v:
        return 5
    
    return np.nan # Para los Unknown/Missing

df['urbanization_index'] = df['Rural Code'].apply(mapear_rural_code)

In [6938]:
df.groupby('urbanization_index')['Target_Meses'].mean().sort_values()

urbanization_index
4.0    19.379089
3.0    20.357225
2.0    21.453287
1.0    22.546090
Name: Target_Meses, dtype: float64

In [6939]:
df = df.drop('Rural Code', axis=1)

In [6940]:
df.columns

Index(['Race', 'Primary Site', 'Sex',
       'Total number of in situ/malignant tumors for patient',
       'Total number of benign/borderline tumors for patient',
       'Sequence number', 'Type of Reporting Source', 'Target_Meses',
       'age_numeric', 'histology_type', 'grade_clinical', 'stage_numeric',
       'tumor_size_mm', 'medical_era', 'income_ordinal', 'surgery_intensity',
       'radiation_binary', 'surgery_binary', 'chemo_binary',
       'urbanization_index'],
      dtype='object')

## Variable `Total number of in situ/malignant tumors for patient`

In [6941]:
df["Total number of in situ/malignant tumors for patient"].value_counts()

Total number of in situ/malignant tumors for patient
1     287371
2      96079
3      27167
4       6961
5       1719
6        456
7        141
8         31
9         17
10         8
11         6
32         2
13         2
20         2
21         1
14         1
34         1
18         1
30         1
Name: count, dtype: int64

In [6942]:
# # 1. Procesamos los tumores Malignos/In Situ
df['total_malignant_tumors'] = df['Total number of in situ/malignant tumors for patient'].clip(upper=5)

# # Se elimina la columna original
df= df.drop("Total number of in situ/malignant tumors for patient", axis=1)

df["total_malignant_tumors"].value_counts()

total_malignant_tumors
1    287371
2     96079
3     27167
4      6961
5      2389
Name: count, dtype: int64

## Variable `Total number of benign/borderline tumors for patient`

In [6943]:
df["Total number of benign/borderline tumors for patient"].value_counts()

Total number of benign/borderline tumors for patient
0    414835
1      4963
2       160
3         8
4         1
Name: count, dtype: int64

In [6944]:
# 1. Procesamos los tumores Benignos
df['total_benign_tumors'] = df['Total number of benign/borderline tumors for patient'].clip(upper=1)

# Se elimina la columna original
df= df.drop("Total number of benign/borderline tumors for patient", axis=1)

df["total_benign_tumors"].value_counts()

total_benign_tumors
0    414835
1      5132
Name: count, dtype: int64

In [6945]:
df.columns

Index(['Race', 'Primary Site', 'Sex', 'Sequence number',
       'Type of Reporting Source', 'Target_Meses', 'age_numeric',
       'histology_type', 'grade_clinical', 'stage_numeric', 'tumor_size_mm',
       'medical_era', 'income_ordinal', 'surgery_intensity',
       'radiation_binary', 'surgery_binary', 'chemo_binary',
       'urbanization_index', 'total_malignant_tumors', 'total_benign_tumors'],
      dtype='object')

## Variable `Sequence number`

In [6946]:
df["Sequence number"].value_counts()

Sequence number
One primary only                284473
2nd of 2 or more primaries       85710
1st of 2 or more primaries       24025
3rd of 3 or more primaries       20042
4th of 4 or more primaries        4419
5th of 5 or more primaries         965
6th of 6 or more primaries         229
7th of 7 or more primaries          68
8th of 8 or more primaries          13
9th of 9 or more primaries           7
10th of 10 or more primaries         3
13th of 13 or more primaries         2
32nd of 32 or more primaries         2
11th of 11 or more primaries         2
20th of 20 or more primaries         2
21st of 21 or more primaries         1
14th of 14 or more primaries         1
34th of 34 or more primaries         1
17th of 17 or more primaries         1
30th of 30 or more primaries         1
Name: count, dtype: int64

In [6947]:
def mapear_secuencia(valor):
    v = str(valor).lower()
    
    # Caso más simple: Un solo tumor en la vida
    if 'one primary' in v:
        return 1
    
    # Extraemos el número de la posición (1st, 2nd, 3rd...)
    # if '1st' in v: return 1
    if '2nd' in v: return 2
    if '3rd' in v: return 3
    if '4th' in v: return 4
    
    # Para casos de 5 o más, los agrupamos (Clamping)
    # Al igual que con el conteo total, el riesgo se satura en valores altos
    return np.nan

df['sequence_numeric'] = df['Sequence number'].apply(mapear_secuencia)

#Se elimina la columna original 
df = df.drop(["Sequence number"], axis=1)

df["sequence_numeric"].value_counts()

sequence_numeric
1.0    284473
2.0     85712
3.0     20042
4.0      4421
Name: count, dtype: int64

In [6948]:
# df["sequence_numeric"] = df["sequence_numeric"].clip(upper=4)
# df["sequence_numeric"].value_counts()

In [6949]:
df.columns

Index(['Race', 'Primary Site', 'Sex', 'Type of Reporting Source',
       'Target_Meses', 'age_numeric', 'histology_type', 'grade_clinical',
       'stage_numeric', 'tumor_size_mm', 'medical_era', 'income_ordinal',
       'surgery_intensity', 'radiation_binary', 'surgery_binary',
       'chemo_binary', 'urbanization_index', 'total_malignant_tumors',
       'total_benign_tumors', 'sequence_numeric'],
      dtype='object')

## Variable `Type of Reporting Source`

In [6950]:
df["Type of Reporting Source"].value_counts()

Type of Reporting Source
Hospital inpatient/outpatient or clinic                     409304
Other hospital outpatient unit or surgery center (2006+)      4111
Radiation treatment or medical oncology center (2006+)        3871
Laboratory only (hospital or private)                         1376
Physicians office/private medical practitioner (LMD)          1121
Nursing/convalescent home/hospice                              184
Name: count, dtype: int64

### Agrupación por Calidad de Información

In [6951]:
def mapear_fuente_reporte(valor):
    v = str(valor).lower()
    
    # Nivel 1: Centros con capacidad quirúrgica y hospitalaria
    if 'hospital' in v or 'surgery center' in v:
        return 1
    # Nivel 2: Especialistas y clínicas externas
    if 'physician' in v or 'radiation' in v or 'medical oncology' in v:
        return 2
    # Nivel 3: Fuentes limitadas o de cuidados paliativos
    if 'laboratory' in v or 'nursing' in v or 'hospice' in v:
        return 3
        
    return np.nan

df['reporting_source_quality'] = df['Type of Reporting Source'].apply(mapear_fuente_reporte)

#Se elimina la columna original 
df = df.drop(["Type of Reporting Source"], axis=1)


In [6952]:
df.columns

Index(['Race', 'Primary Site', 'Sex', 'Target_Meses', 'age_numeric',
       'histology_type', 'grade_clinical', 'stage_numeric', 'tumor_size_mm',
       'medical_era', 'income_ordinal', 'surgery_intensity',
       'radiation_binary', 'surgery_binary', 'chemo_binary',
       'urbanization_index', 'total_malignant_tumors', 'total_benign_tumors',
       'sequence_numeric', 'reporting_source_quality'],
      dtype='object')

In [6953]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 419967 entries, 0 to 537414
Data columns (total 20 columns):
 #   Column                    Non-Null Count   Dtype   
---  ------                    --------------   -----   
 0   Race                      419967 non-null  object  
 1   Primary Site              419967 non-null  object  
 2   Sex                       419967 non-null  category
 3   Target_Meses              419967 non-null  int64   
 4   age_numeric               419967 non-null  float64 
 5   histology_type            419967 non-null  object  
 6   grade_clinical            164309 non-null  object  
 7   stage_numeric             419967 non-null  float64 
 8   tumor_size_mm             355362 non-null  float64 
 9   medical_era               419967 non-null  int64   
 10  income_ordinal            419963 non-null  float64 
 11  surgery_intensity         419967 non-null  float64 
 12  radiation_binary          419967 non-null  int64   
 13  surgery_binary            419967 n

## Codificacion de las variables

### One-Hot Encoding

In [6954]:
df["Race"].value_counts()

Race
White    339636
Black     44663
Other     35668
Name: count, dtype: int64

In [6955]:
df = pd.get_dummies(df, columns=['Race'], prefix='race')

### LabelEncoder

In [6956]:
df["Primary Site"].value_counts()

Primary Site
Upper lobe       217446
Lower lobe       116685
Unspecified       48993
Middle lobe       19322
Main bronchus     17521
Name: count, dtype: int64

In [6957]:
le = LabelEncoder()
df['site_numeric'] = le.fit_transform(df['Primary Site'].astype(str))
df['site_numeric'].value_counts()

site_numeric
4    217446
0    116685
3     48993
2     19322
1     17521
Name: count, dtype: int64

## Creación de Interacciones Clave

In [6958]:
# 1. Agresividad por Tamaño: ¿Es un tumor pequeño pero avanzado, o grande y temprano?
df['size_stage_ratio'] = df['tumor_size_mm'] * df['stage_numeric']

# 2. Fragilidad por Edad: La etapa impacta diferente según la edad del paciente
df['age_stage_interaction'] = df['age_numeric'] * df['stage_numeric']

# 3. Carga Tumoral Relativa: Malignidad total ajustada por la etapa actual
df['total_burden_stage'] = df['total_malignant_tumors'] * df['stage_numeric']

# Llenamos posibles nulos generados si había nulos en stage o size
df[['size_stage_ratio', 'age_stage_interaction', 'total_burden_stage']] = \
    df[['size_stage_ratio', 'age_stage_interaction', 'total_burden_stage']].fillna(0)

print("Nuevas variables creadas exitosamente.")

Nuevas variables creadas exitosamente.


In [6959]:
df.columns

Index(['Primary Site', 'Sex', 'Target_Meses', 'age_numeric', 'histology_type',
       'grade_clinical', 'stage_numeric', 'tumor_size_mm', 'medical_era',
       'income_ordinal', 'surgery_intensity', 'radiation_binary',
       'surgery_binary', 'chemo_binary', 'urbanization_index',
       'total_malignant_tumors', 'total_benign_tumors', 'sequence_numeric',
       'reporting_source_quality', 'race_Black', 'race_Other', 'race_White',
       'site_numeric', 'size_stage_ratio', 'age_stage_interaction',
       'total_burden_stage'],
      dtype='object')

In [6960]:
df["urbanization_index"].value_counts()

urbanization_index
1.0    230028
2.0     88017
4.0     64721
3.0     36638
Name: count, dtype: int64

In [6961]:
df["histology_type"].value_counts()

Flushing oldest 200 entries.
  warn('Output cache limit (currently {sz} entries) hit.\n'


histology_type
Adenocarcinoma             231840
Squamous Cell Carcinoma     91584
Small Cell Carcinoma        53907
Carcinoma NOS               37366
Large Cell Carcinoma         5270
Name: count, dtype: int64

In [6962]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 419967 entries, 0 to 537414
Data columns (total 26 columns):
 #   Column                    Non-Null Count   Dtype   
---  ------                    --------------   -----   
 0   Primary Site              419967 non-null  object  
 1   Sex                       419967 non-null  category
 2   Target_Meses              419967 non-null  int64   
 3   age_numeric               419967 non-null  float64 
 4   histology_type            419967 non-null  object  
 5   grade_clinical            164309 non-null  object  
 6   stage_numeric             419967 non-null  float64 
 7   tumor_size_mm             355362 non-null  float64 
 8   medical_era               419967 non-null  int64   
 9   income_ordinal            419963 non-null  float64 
 10  surgery_intensity         419967 non-null  float64 
 11  radiation_binary          419967 non-null  int64   
 12  surgery_binary            419967 non-null  category
 13  chemo_binary              419967 n

In [6963]:
# A. Convertir Grado Clínico a números (Ordinal)
# grado_mapping = {
#     '1: Well Differentiated': 1,
#     '2: Moderately Differentiated': 2,
#     '3: Poorly Differentiated': 3,
#     '4: Undifferentiated/Anaplastic': 4,
#     'Unknown/Blank': np.nan # XGBoost maneja los NaN automáticamente
# }
# df['grade_clinical'] = df['grade_clinical'].map(grado_mapping)

df["grade_clinical"] = df["grade_clinical"].astype("category")
df["histology_type"] = df["histology_type"].astype("category")
df["Primary Site"] = df["Primary Site"].astype("category")

# C. Convertir Histología y Sitio a números (Label Encoding)
le = LabelEncoder()
# df['histology_type'] = le.fit_transform(df['histology_type'].astype(str))
# df['Primary Site'] = le.fit_transform(df['Primary Site'].astype(str))

print(df.dtypes.value_counts())

float64     10
int64        7
bool         3
category     1
category     1
category     1
category     1
category     1
category     1
Name: count, dtype: int64


In [6964]:
# # Filtramos para quedarnos solo con pacientes con seguimiento de largo plazo
# # Era 1 y 2 tienen datos históricos más erass.
# df = df[df['medical_era'].isin([1, 2])].copy()

# print(f"Registros originales: {len(df)}")
# print(f"Registros tras eliminar Era 3: {len(df)}")

## Split

In [6965]:
df.columns

Index(['Primary Site', 'Sex', 'Target_Meses', 'age_numeric', 'histology_type',
       'grade_clinical', 'stage_numeric', 'tumor_size_mm', 'medical_era',
       'income_ordinal', 'surgery_intensity', 'radiation_binary',
       'surgery_binary', 'chemo_binary', 'urbanization_index',
       'total_malignant_tumors', 'total_benign_tumors', 'sequence_numeric',
       'reporting_source_quality', 'race_Black', 'race_Other', 'race_White',
       'site_numeric', 'size_stage_ratio', 'age_stage_interaction',
       'total_burden_stage'],
      dtype='object')

In [6966]:
# 1. Definir X e y
X = df.drop(['Target_Meses', 'urbanization_index'], axis=1)
y = df['Target_Meses']

# 2. Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Modelo con parámetros para evitar sobreajuste
regressor = xgb.XGBRegressor(
    n_estimators=250,
    learning_rate=0.03,
    enable_categorical=True, # <--- AQUÍ LE DICES QUE MANEJE LAS CATEGORÍAS
    max_depth=8,
    objective='reg:squarederror',
    tree_method='hist',
    random_state=42
)

regressor.fit(X_train, y_train, eval_set=[(X_test, y_test)])

[0]	validation_0-rmse:19.85695
[1]	validation_0-rmse:19.56234
[2]	validation_0-rmse:19.28140
[3]	validation_0-rmse:19.01280
[4]	validation_0-rmse:18.75594
[5]	validation_0-rmse:18.51140
[6]	validation_0-rmse:18.27779
[7]	validation_0-rmse:18.05459
[8]	validation_0-rmse:17.84232
[9]	validation_0-rmse:17.64030
[10]	validation_0-rmse:17.44704
[11]	validation_0-rmse:17.26332
[12]	validation_0-rmse:17.08853
[13]	validation_0-rmse:16.92174
[14]	validation_0-rmse:16.76363
[15]	validation_0-rmse:16.61280
[16]	validation_0-rmse:16.46990
[17]	validation_0-rmse:16.33331
[18]	validation_0-rmse:16.20394
[19]	validation_0-rmse:16.08118
[20]	validation_0-rmse:15.96437
[21]	validation_0-rmse:15.85323
[22]	validation_0-rmse:15.74791
[23]	validation_0-rmse:15.64697
[24]	validation_0-rmse:15.55206
[25]	validation_0-rmse:15.46168
[26]	validation_0-rmse:15.37515
[27]	validation_0-rmse:15.29371
[28]	validation_0-rmse:15.21502
[29]	validation_0-rmse:15.14174
[30]	validation_0-rmse:15.07171
[31]	validation_0-

0,1,2
,"objective  objective: typing.Union[str, xgboost.sklearn._SklObjWProto, typing.Callable[[typing.Any, typing.Any], typing.Tuple[numpy.ndarray, numpy.ndarray]], NoneType] Specify the learning task and the corresponding learning objective or a custom objective function to be used. For custom objective, see :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more information, along with the end note for function signatures.",'reg:squarederror'
,"base_score  base_score: typing.Union[float, typing.List[float], NoneType] The initial prediction score of all instances, global bias.",
,booster,
,"callbacks  callbacks: typing.Optional[typing.List[xgboost.callback.TrainingCallback]] List of callback functions that are applied at end of each iteration. It is possible to use predefined callbacks by using :ref:`Callback API `. .. note::  States in callback are not preserved during training, which means callback  objects can not be reused for multiple training sessions without  reinitialization or deepcopy. .. code-block:: python  for params in parameters_grid:  # be sure to (re)initialize the callbacks before each run  callbacks = [xgb.callback.LearningRateScheduler(custom_rates)]  reg = xgboost.XGBRegressor(**params, callbacks=callbacks)  reg.fit(X, y)",
,colsample_bylevel  colsample_bylevel: typing.Optional[float] Subsample ratio of columns for each level.,
,colsample_bynode  colsample_bynode: typing.Optional[float] Subsample ratio of columns for each split.,
,colsample_bytree  colsample_bytree: typing.Optional[float] Subsample ratio of columns when constructing each tree.,
,"device  device: typing.Optional[str] .. versionadded:: 2.0.0 Device ordinal, available options are `cpu`, `cuda`, and `gpu`.",
,"early_stopping_rounds  early_stopping_rounds: typing.Optional[int] .. versionadded:: 1.6.0 - Activates early stopping. Validation metric needs to improve at least once in  every **early_stopping_rounds** round(s) to continue training. Requires at  least one item in **eval_set** in :py:meth:`fit`. - If early stopping occurs, the model will have two additional attributes:  :py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the  :py:meth:`predict` and :py:meth:`apply` methods to determine the optimal  number of trees during inference. If users want to access the full model  (including trees built after early stopping), they can specify the  `iteration_range` in these inference methods. In addition, other utilities  like model plotting can also use the entire model. - If you prefer to discard the trees after `best_iteration`, consider using the  callback function :py:class:`xgboost.callback.EarlyStopping`. - If there's more than one item in **eval_set**, the last entry will be used for  early stopping. If there's more than one metric in **eval_metric**, the last  metric will be used for early stopping.",
,enable_categorical  enable_categorical: bool See the same parameter of :py:class:`DMatrix` for details.,True


In [6967]:
# Realizar predicciones sobre el set de prueba
y_pred = regressor.predict(X_test)

# Calcular métricas
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"--- Métricas del Modelo ---")
print(f"MAE (Error Absoluto Medio): {mae:.2f} meses")
print(f"RMSE (Error Cuadrático Medio): {rmse:.2f} meses")
print(f"R² (Coeficiente de Determinación): {r2:.4f}")

--- Métricas del Modelo ---
MAE (Error Absoluto Medio): 10.06 meses
RMSE (Error Cuadrático Medio): 13.66 meses
R² (Coeficiente de Determinación): 0.5411


In [6968]:
df.shape

(419967, 26)

In [6969]:
import optuna
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import root_mean_squared_error

def objective(trial):
    # Definimos el espacio de búsqueda de hiperparámetros
    param = {
        'verbosity': 0,
        'objective': 'reg:squarederror',
        'tree_method': 'hist',
        'enable_categorical': True,
        'random_state': 42,
        # Hiperparámetros a optimizar
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 100, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 100, log=True),
    }

    # Implementamos Cross-Validation para asegurar robustez
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    rmse_scores = []

    for train_idx, val_idx in kf.split(X_train, y_train):
        X_t, X_v = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_t, y_v = y_train.iloc[train_idx], y_train.iloc[val_idx]

        model = xgb.XGBRegressor(**param)
        model.fit(
            X_t, y_t,
            eval_set=[(X_v, y_v)],
            verbose=False
        )
        
        preds = model.predict(X_v)
        rmse = root_mean_squared_error(y_v, preds)
        rmse_scores.append(rmse)

    return sum(rmse_scores) / len(rmse_scores)

# Ejecución del estudio
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100) # Puedes subir n_trials si tienes tiempo

print(f"Mejor RMSE: {study.best_value}")
print(f"Mejores parámetros: {study.best_params}")

[32m[I 2026-01-30 18:53:45,894][0m A new study created in memory with name: no-name-6d6e1b0e-b264-4567-b138-d0929735b7f1[0m
[32m[I 2026-01-30 18:54:50,630][0m Trial 0 finished with value: 13.699126052856446 and parameters: {'n_estimators': 778, 'learning_rate': 0.012773962334643249, 'max_depth': 8, 'subsample': 0.6039909417758929, 'colsample_bytree': 0.8069408810766714, 'gamma': 1.0626596258148858e-07, 'min_child_weight': 5, 'reg_alpha': 1.1945331970221003e-07, 'reg_lambda': 1.0612369745307367e-05}. Best is trial 0 with value: 13.699126052856446.[0m
[32m[I 2026-01-30 18:55:25,371][0m Trial 1 finished with value: 13.734976196289063 and parameters: {'n_estimators': 652, 'learning_rate': 0.026313823421642603, 'max_depth': 5, 'subsample': 0.7289906355362319, 'colsample_bytree': 0.5329060283558065, 'gamma': 3.592312801642995e-08, 'min_child_weight': 8, 'reg_alpha': 7.7817517787774735, 'reg_lambda': 0.11040931563416073}. Best is trial 0 with value: 13.699126052856446.[0m
[32m[I 202

Mejor RMSE: 13.694617462158202
Mejores parámetros: {'n_estimators': 750, 'learning_rate': 0.013257755847886066, 'max_depth': 8, 'subsample': 0.7921548971557303, 'colsample_bytree': 0.696008278981775, 'gamma': 0.04105171391979062, 'min_child_weight': 4, 'reg_alpha': 55.368393819792644, 'reg_lambda': 2.3379846191432518e-07}


In [6980]:
import sys
import plotly

print(f"Versión de Plotly: {plotly.__version__}")
print(f"Ruta del ejecutable: {sys.executable}")

Versión de Plotly: 6.5.2
Ruta del ejecutable: c:\Users\Usuario\miniconda3\envs\env_312project\python.exe


In [6982]:
import plotly.io as pio
pio.renderers.default = "notebook"

import optuna
# Fuerza a optuna a reconocer que plotly ya está cargado
optuna.visualization._plotly_imports._imports.check()

ImportError: Tried to import 'plotly' but failed. Please make sure that the package is installed correctly to use this feature. Actual error: No module named 'plotly'.

In [6981]:
optuna.visualization.plot_contour(study)

ImportError: Tried to import 'plotly' but failed. Please make sure that the package is installed correctly to use this feature. Actual error: No module named 'plotly'.

In [None]:

optuna.visualization.plot_optimization_history(study)

ImportError: Tried to import 'plotly' but failed. Please make sure that the package is installed correctly to use this feature. Actual error: No module named 'plotly'.

In [6976]:
import optuna.visualization as vis

# 1. Historial de mejora: ¿Cuántos trials necesitamos realmente?
# Ayuda a ver si el modelo llegó a una meseta (plateau)
vis.plot_optimization_history(study)

ImportError: Tried to import 'plotly' but failed. Please make sure that the package is installed correctly to use this feature. Actual error: No module named 'plotly'.

In [None]:
# 2. Importancia de Hiperparámetros: ¿Qué está moviendo la aguja?
# Crucial para entender qué parámetros ajustar en futuras iteraciones
vis.plot_param_importances(study)

In [None]:
# 3. Gráfico de Rebanada (Slice Plot): Relación individual
# Te permite ver la "densidad" de éxito para cada parámetro por separado
vis.plot_slice(study, params=['learning_rate', 'max_depth', 'n_estimators'])

Entrenar el mejor modelo

In [6984]:
from sklearn.metrics import root_mean_squared_error

# 1. Recuperamos los mejores parámetros del estudio
best_params = study.best_params

# 2. Añadimos/Actualizamos los parámetros fijos
best_params.update({
    'objective': 'reg:squarederror',
    'tree_method': 'hist',
    'enable_categorical': True,
    'random_state': 42
})

# 3. Entrenamos el modelo final
# Nota: Usamos early_stopping_rounds aquí también para mayor seguridad
final_model = xgb.XGBRegressor(**best_params)

final_model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    verbose=100 # Te avisará cada 100 árboles cómo va el error
)

# 4. Evaluación final corregida (sin el argumento 'squared')
final_preds = final_model.predict(X_test)
final_rmse = root_mean_squared_error(y_test, final_preds)

print(f"\n✅ Proceso terminado")
print(f"Mejor RMSE en Test: {final_rmse}")

[0]	validation_0-rmse:20.03545
[100]	validation_0-rmse:14.62414
[200]	validation_0-rmse:13.85819
[300]	validation_0-rmse:13.72081
[400]	validation_0-rmse:13.67762
[500]	validation_0-rmse:13.65904
[600]	validation_0-rmse:13.64980
[700]	validation_0-rmse:13.64494
[749]	validation_0-rmse:13.64305

✅ Proceso terminado
Mejor RMSE en Test: 13.643054008483887


In [6987]:
from sklearn.metrics import mean_absolute_error, r2_score

# 1. Calculamos el MAE
final_mae = mean_absolute_error(y_test, final_preds)

# 2. Calculamos el R2
final_r2 = r2_score(y_test, final_preds)

print(f"✅ Análisis de métricas terminado")
print(f"MAE: {final_mae:.4f} meses")
print(f"R² Score: {final_r2:.4f}")

✅ Análisis de métricas terminado
MAE: 10.0577 meses
R² Score: 0.5422


In [6985]:
final_preds

array([ 6.2775426, 27.091871 , 23.038103 , ..., 38.746124 ,  6.8351455,
       38.39293  ], shape=(83994,), dtype=float32)

In [6986]:
import joblib

# Guardar el modelo
joblib.dump(final_model, 'mejor_modelo_carlos.joblib')

# Para cargarlo:
# modelo_cargado = joblib.load('mejor_modelo_carlos.joblib')

['mejor_modelo_carlos.joblib']

## ENTRENAMIENTO SIN LA ERA 3, 2018-2022

In [None]:
# # Filtramos para quedarnos solo con pacientes con seguimiento de largo plazo
# # Era 1 y 2 tienen datos históricos más erass.
# df_eras = df[df['medical_era'].isin([1, 2])].copy()

# print(f"Registros originales: {len(df)}")
# print(f"Registros tras eliminar Era 3: {len(df_eras)}")

## Variable AGE para df_eras

In [None]:
# import pandas as pd

# # Definir los cortes por décadas (de 0 a 100+ años)
# bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 110]
# labels = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] # Representan las décadas

# # Crear la nueva variable de década
# df_eras['age_decade'] = pd.cut(df_eras['age_numeric'], bins=bins, labels=labels, right=False)

# # Convertir a tipo entero (importante para XGBoost)
# df_eras['age_decade'] = df_eras['age_decade'].astype(int)

# # Opcional: Podemos mantener la edad numérica original o eliminarla. 
# # A veces dejar ambas ayuda al modelo a ver el detalle y la tendencia general.
# print(df_eras[['age_numeric', 'age_decade']].head())

Split con eras

In [None]:
# # 1. Definir X e y para el set eras
# X_eras = df_eras.drop('Target_Meses', axis=1)
# y_eras = df_eras['Target_Meses']

# # 2. Split
# X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(
#     X_eras, y_eras, test_size=0.2, random_state=42
# )

# # 3. Modelo XGBoost de alta capacidad
# regressor_eras = xgb.XGBRegressor(
#     n_estimators=2000,
#     learning_rate=0.03,
#     max_depth=12,
#     tree_method='hist',
#     random_state=42
# )

# # 4. Fit con Early Stopping
# regressor_eras.fit(
#     X_train_m, y_train_m, 
#     eval_set=[(X_test_m, y_test_m)],
#     verbose=100
# )

In [None]:
# # Realizar predicciones sobre el set de prueba
# y_pred = regressor_eras.predict(X_test)

# # Calcular métricas
# mae = mean_absolute_error(y_test, y_pred)
# rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# r2 = r2_score(y_test, y_pred)

# print(f"--- Métricas del Modelo ---")
# print(f"MAE (Error Absoluto Medio): {mae:.2f} meses")
# print(f"RMSE (Error Cuadrático Medio): {rmse:.2f} meses")
# print(f"R² (Coeficiente de Determinación): {r2:.4f}")

## Modelo BASE, HIPERPARAMETRIZACIÓN

In [None]:
# import xgboost as xgb
# from sklearn.model_selection import RandomizedSearchCV

# # 1. Configuración de alto rendimiento para CPU (AMD compatible)
# xgb_reg = xgb.XGBRegressor(
#     objective='reg:squarederror', 
#     tree_method='hist',           
#     device='cpu',                 # Estabilidad total en Windows/AMD
#     n_jobs=4,                     # Deja algunos hilos libres para que el sistema no se congele
#     random_state=42
# )

# param_dist = {
#     'n_estimators': [1000, 2000], 
#     'max_depth': [7, 8, 9, 10],   # Rangos más seguros para evitar Memory Error
#     'learning_rate': [0.01, 0.03, 0.05],
#     'subsample': [0.7, 0.8, 0.9],
#     'colsample_bytree': [0.7, 0.8],
#     'gamma': [0.1, 0.5, 1]        # Ayuda a que el R2 no se infle artificialmente
# }

# random_search = RandomizedSearchCV(
#     estimator=xgb_reg,
#     param_distributions=param_dist,
#     n_iter=10,                    # 10 iteraciones * 3 folds = 30 entrenamientos
#     scoring='neg_mean_absolute_error',
#     cv=3,
#     verbose=3,
#     n_jobs=-1,                    # Paraleliza la búsqueda de parámetros
#     random_state=42
# )

# # 2. Ejecutar (Asegúrate de que X_train no tenga las columnas 'object')
# random_search.fit(X_train, y_train)

# print(f"Mejor MAE encontrado: {-random_search.best_score_}")
# print(f"Mejores Parámetros: {random_search.best_params_}")

## Guarda el mejor modelo

In [None]:
# # Extraemos el mejor modelo de la búsqueda
# mejor_modelo2 = random_search.best_estimator_

# # Guardar en formato JSON (recomendado para XGBoost moderno)
# mejor_modelo2.save_model('../models/mejormodelo_xgb_supervivencia2.json')

In [None]:
# from sklearn.metrics import mean_absolute_error, r2_score
# import numpy as np

# # Realizar predicciones
# y_pred = mejor_modelo2.predict(X_test)

# # Calcular métricas
# mae = mean_absolute_error(y_test, y_pred)
# r2 = r2_score(y_test, y_pred)

# print(f"--- RENDIMIENTO DEL MODELO ---")
# print(f"Error Promedio (MAE): {mae:.2f} meses")
# print(f"Precisión (R2 Score): {r2:.4f}")