In [17]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

In [40]:
# Read the labelled training table from the working directory.
# The trailing expression shows (n_rows, n_columns) as the cell output.
df = pd.read_csv(filepath_or_buffer='train.csv')
df.shape

(225000, 325)

In [42]:
# Columns kept from the raw survey table: 49 candidate predictors, then the label.
# Tags: [cat] = handled as categorical, [num] = handled as numeric.
# The order of this list becomes the column order of the selected frame.
selected_vars = [
    # -- respondent and self-rated health --
    'SEXVAR',    # [cat] respondent's sex
    'GENHLTH',   # [cat] general health, self-rated
    'PHYSHLTH',  # [num] days on which physical health was not good
    'MENTHLTH',  # [num] days on which mental health was not good
    'POORHLTH',  # [num] days on which poor physical/mental health limited activities

    # -- access to care --
    'PRIMINSR',  # [cat] primary type of health insurance
    'MEDCOST1',  # [cat] skipped seeing a doctor because of cost
    'CHECKUP1',  # [cat] time interval since last routine checkup

    # -- activity and sleep --
    'EXERANY2',  # [cat] any physical activity during the past month
    'SLEPTIM1',  # [num] average hours slept per 24h

    # -- diagnosed conditions --
    'CVDSTRK3',  # [cat] ever told: stroke (CVD-related)
    'ASTHMA3',   # [cat] ever told: asthma
    'CHCCOPD3',  # [cat] ever told: COPD
    'ADDEPEV3',  # [cat] ever told: depressive disorder
    'CHCKDNY2',  # [cat] ever told: kidney disease
    'HAVARTH4',  # [cat] ever told: arthritis
    'DIABETE4',  # [cat] diabetes status
    'DIABAGE4',  # [num] age at diabetes diagnosis

    # -- socio-demographics --
    'MARITAL',   # [cat] marital status
    'EDUCA',     # [cat] level of education
    'EMPLOY1',   # [cat] employment status
    'INCOME3',   # [cat] household income bracket

    # -- body measurements --
    'WEIGHT2',   # [num] body weight
    'HEIGHT3',   # [num] height

    # -- functional difficulties --
    'DIFFWALK',  # [cat] difficulty walking or climbing stairs
    'DIFFDRES',  # [cat] difficulty dressing or bathing
    'DIFFALON',  # [cat] difficulty running errands alone

    # -- tobacco and alcohol --
    'SMOKE100',  # [cat] at least 100 cigarettes smoked in lifetime
    'SMOKDAY2',  # [cat] how often currently smoking
    'ECIGNOW2',  # [cat] current use of e-cigarettes
    'ALCDAY4',   # [num] drinking frequency over the past 30 days
    'DRNK3GE5',  # [cat] binge drinking over the past 30 days

    # -- vaccination and testing --
    'FLUSHOT7',  # [cat] flu vaccine within the past 12 months
    'PNEUVAC4',  # [cat] ever had a pneumonia vaccine
    'HIVTST7',   # [cat] ever tested for HIV
    'COVIDPOS',  # [cat] ever tested positive for COVID-19
    'COVIDVA1',  # [cat] COVID-19 vaccine received

    # -- COPD symptom severity --
    'COPDCOGH',  # [cat] chronic cough
    'COPDFLEM',  # [cat] chronic phlegm
    'COPDBRTH',  # [cat] shortness of breath

    # -- cancer, childhood exposures, wellbeing --
    'CNCRTYP2',  # [cat] type of cancer diagnosed, if any
    'ACEDEPRS',  # [cat] depression in the childhood household
    'ACEADNED',  # [cat] indicators of childhood neglect
    'LSATISFY',  # [num/cat] life-satisfaction scale

    # -- social determinants --
    'SDHFOOD1',  # [cat] food insecurity
    'SDHBILLS',  # [cat] trouble paying bills
    'SDHSTRE1',  # [cat] stress tied to social determinants

    # -- derived body-mass fields --
    '_BMI5',     # [num] computed BMI, scaled by 100
    '_BMI5CAT',  # [cat] BMI category

    # -- label --
    'TARGET',    # [num/cat] target variable (CVD risk)
]

# Independent copy restricted to the chosen columns, so later cleaning
# never writes back into the raw frame. Displays (n_rows, n_columns).
df_selected = df.loc[:, selected_vars].copy()
df_selected.shape


(225000, 50)

In [45]:
# Declared modelling type for every selected column: 'cat' or 'num'.
# Insertion order is significant: lists built by iterating this dict inherit it.
var_types = {
    'SEXVAR': 'cat', 'GENHLTH': 'cat',
    'PHYSHLTH': 'num', 'MENTHLTH': 'num', 'POORHLTH': 'num',
    'PRIMINSR': 'cat', 'MEDCOST1': 'cat', 'CHECKUP1': 'cat', 'EXERANY2': 'cat',
    'SLEPTIM1': 'num',
    'CVDSTRK3': 'cat', 'ASTHMA3': 'cat', 'CHCCOPD3': 'cat', 'ADDEPEV3': 'cat',
    'CHCKDNY2': 'cat', 'HAVARTH4': 'cat', 'DIABETE4': 'cat',
    'DIABAGE4': 'num',
    'MARITAL': 'cat', 'EDUCA': 'cat', 'EMPLOY1': 'cat', 'INCOME3': 'cat',
    'WEIGHT2': 'num', 'HEIGHT3': 'num',
    'DIFFWALK': 'cat', 'DIFFDRES': 'cat', 'DIFFALON': 'cat',
    'SMOKE100': 'cat', 'SMOKDAY2': 'cat', 'ECIGNOW2': 'cat',
    'ALCDAY4': 'num',
    'DRNK3GE5': 'cat', 'FLUSHOT7': 'cat', 'PNEUVAC4': 'cat', 'HIVTST7': 'cat',
    'COVIDPOS': 'cat', 'COVIDVA1': 'cat', 'COPDCOGH': 'cat', 'COPDFLEM': 'cat',
    'COPDBRTH': 'cat', 'CNCRTYP2': 'cat', 'ACEDEPRS': 'cat', 'ACEADNED': 'cat',
    'LSATISFY': 'num',
    'SDHFOOD1': 'cat', 'SDHBILLS': 'cat', 'SDHSTRE1': 'cat',
    '_BMI5': 'num',
    '_BMI5CAT': 'cat',
    'TARGET': 'cat',  # the label; typed here but not a feature
}

# Label column, captured before the feature columns are cleaned below.
target = df_selected['TARGET']

# Feature groups consumed by the preprocessing pipeline. TARGET is typed 'cat'
# in var_types but is the label, so it is kept out of the categorical features.
cat_features = [col for col, t in var_types.items() if t == 'cat' and col != 'TARGET']
num_features = [col for col, t in var_types.items() if t == 'num']

# Guard: a selected column with no var_types entry would land in neither list
# and would then be discarded without warning by the ColumnTransformer, which
# is built with remainder='drop'. Fail loudly here instead.
assert set(df_selected.columns) <= set(var_types), (
    "columns without an entry in var_types: "
    f"{sorted(set(df_selected.columns) - set(var_types))}"
)

# Blank out the survey's special response codes so the imputers treat them as missing.
# NOTE(review): the same codes are applied to every column of a group. The
# recorded head() output shows PRIMINSR == 10.0, so at least that column has
# multi-digit codes in which 7/8/9 are presumably ordinary categories; likewise
# 77/88/99 may be real values (or mean "none" rather than "unknown") in some
# numeric columns. Confirm per column against the survey codebook - TODO.
# The test-set cell applies this identical rule; change both together or not at all.
df_selected[cat_features] = df_selected[cat_features].replace({7: np.nan, 8: np.nan, 9: np.nan})
df_selected[num_features] = df_selected[num_features].replace({77: np.nan, 88: np.nan, 99: np.nan})

# Cast all categorical features in one call instead of column by column.
df_selected = df_selected.astype({col: 'category' for col in cat_features})

df_selected.head()


Unnamed: 0,SEXVAR,GENHLTH,PHYSHLTH,MENTHLTH,POORHLTH,PRIMINSR,MEDCOST1,CHECKUP1,EXERANY2,SLEPTIM1,...,CNCRTYP2,ACEDEPRS,ACEADNED,LSATISFY,SDHFOOD1,SDHBILLS,SDHSTRE1,_BMI5,_BMI5CAT,TARGET
0,2.0,3.0,30.0,,,3.0,2.0,1.0,1.0,7.0,...,,,,1.0,5.0,2.0,5.0,3018.0,4.0,True
1,2.0,3.0,,15.0,14.0,10.0,2.0,3.0,2.0,6.0,...,,,,,,,,2615.0,3.0,False
2,1.0,2.0,,,,1.0,2.0,1.0,1.0,8.0,...,,2.0,5.0,1.0,5.0,2.0,4.0,3975.0,4.0,False
3,1.0,3.0,,,,1.0,2.0,1.0,1.0,10.0,...,,,,,,,,3323.0,4.0,False
4,1.0,3.0,,3.0,,,2.0,1.0,1.0,,...,,,,,,,,2375.0,2.0,False


In [46]:
# Categorical branch: missing values are filled with the constant 999 and the
# result is one-hot encoded, so "missing" ends up with an indicator column of
# its own. Categories first seen at predict time are ignored rather than raising.
# NOTE(review): assumes 999 is not a genuine code in any categorical column - TODO confirm.
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=999)),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric branch: median imputation only (no scaling step).
num_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))])

# Route each feature group to its branch; any column in neither list is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_features),
        ('num', num_transformer, num_features),
    ],
    remainder='drop',
)

In [47]:
# Features are every selected column except the label.
X = df_selected.drop(columns='TARGET')
y = df_selected['TARGET']

# 80/20 hold-out split, stratified on the label and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Convert both label splits to integers (y itself is left as loaded).
y_train, y_test = y_train.astype(int), y_test.astype(int)


In [48]:
# End-to-end model: the shared preprocessor (imputation + one-hot encoding)
# feeds a random forest, so un-encoded feature frames can be passed straight in.
rf_model = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('classifier', RandomForestClassifier(
        random_state=42,
        n_jobs=-1,
        n_estimators=300,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
    )),
])

# Train on the training split.
rf_model.fit(X_train, y_train)

# Hold-out predictions: hard labels, plus the probability of the positive class.
y_pred = rf_model.predict(X_test)
y_prob = rf_model.predict_proba(X_test)[:, 1]

# NOTE(review): in the recorded run the positive class reaches recall 0.04
# (support 4028 of 45000), so accuracy is dominated by the majority class.
# Consider class weighting or tuning the decision threshold - TODO decide.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print("AUC:", roc_auc_score(y_true=y_test, y_score=y_prob))
print(classification_report(y_true=y_test, y_pred=y_pred))

Accuracy: 0.9118222222222222
AUC: 0.8200071734992608
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     40972
           1       0.62      0.04      0.07      4028

    accuracy                           0.91     45000
   macro avg       0.77      0.52      0.51     45000
weighted avg       0.89      0.91      0.87     45000



In [57]:
# Read the unlabelled test table and keep its ID column aside: the frame is
# narrowed to model inputs later, but the IDs are needed for the output file.
df_test = pd.read_csv(filepath_or_buffer='test.csv')
ids = df_test.loc[:, 'ID']
df_test.shape


(75000, 324)

In [58]:
# Input columns for scoring the test table: the 49 predictors, with no label column.
# NOTE(review): hand-maintained copy of the training column list minus TARGET;
# keep the two in step when either changes.
selected_vars2 = [
    # -- respondent and self-rated health --
    'SEXVAR',    # categorical - respondent's sex
    'GENHLTH',   # categorical - general health, self-rated
    'PHYSHLTH',  # numeric - days on which physical health was not good
    'MENTHLTH',  # numeric - days on which mental health was not good
    'POORHLTH',  # numeric - days on which poor physical/mental health limited activities

    # -- access to care --
    'PRIMINSR',  # categorical - primary type of health insurance
    'MEDCOST1',  # categorical - skipped seeing a doctor because of cost
    'CHECKUP1',  # categorical - time interval since last routine checkup

    # -- activity and sleep --
    'EXERANY2',  # categorical - any physical activity during the past month
    'SLEPTIM1',  # numeric - average hours slept per 24h

    # -- diagnosed conditions --
    'CVDSTRK3',  # categorical - ever told: stroke (CVD-related)
    'ASTHMA3',   # categorical - ever told: asthma
    'CHCCOPD3',  # categorical - ever told: COPD
    'ADDEPEV3',  # categorical - ever told: depressive disorder
    'CHCKDNY2',  # categorical - ever told: kidney disease
    'HAVARTH4',  # categorical - ever told: arthritis
    'DIABETE4',  # categorical - diabetes status
    'DIABAGE4',  # numeric - age at diabetes diagnosis

    # -- socio-demographics --
    'MARITAL',   # categorical - marital status
    'EDUCA',     # categorical - level of education
    'EMPLOY1',   # categorical - employment status
    'INCOME3',   # categorical - household income bracket

    # -- body measurements --
    'WEIGHT2',   # numeric - body weight
    'HEIGHT3',   # numeric - height

    # -- functional difficulties --
    'DIFFWALK',  # categorical - difficulty walking or climbing stairs
    'DIFFDRES',  # categorical - difficulty dressing or bathing
    'DIFFALON',  # categorical - difficulty running errands alone

    # -- tobacco and alcohol --
    'SMOKE100',  # categorical - at least 100 cigarettes smoked in lifetime
    'SMOKDAY2',  # categorical - how often currently smoking
    'ECIGNOW2',  # categorical - current use of e-cigarettes
    'ALCDAY4',   # numeric - drinking frequency over the past 30 days
    'DRNK3GE5',  # categorical - binge drinking over the past 30 days

    # -- vaccination and testing --
    'FLUSHOT7',  # categorical - flu vaccine within the past 12 months
    'PNEUVAC4',  # categorical - ever had a pneumonia vaccine
    'HIVTST7',   # categorical - ever tested for HIV
    'COVIDPOS',  # categorical - ever tested positive for COVID-19
    'COVIDVA1',  # categorical - COVID-19 vaccine received

    # -- COPD symptom severity --
    'COPDCOGH',  # categorical - chronic cough
    'COPDFLEM',  # categorical - chronic phlegm
    'COPDBRTH',  # categorical - shortness of breath

    # -- cancer, childhood exposures, wellbeing --
    'CNCRTYP2',  # categorical - type of cancer diagnosed, if any
    'ACEDEPRS',  # categorical - depression in the childhood household
    'ACEADNED',  # categorical - indicators of childhood neglect
    'LSATISFY',  # numeric/categorical - life-satisfaction scale

    # -- social determinants --
    'SDHFOOD1',  # categorical - food insecurity
    'SDHBILLS',  # categorical - trouble paying bills
    'SDHSTRE1',  # categorical - stress tied to social determinants

    # -- derived body-mass fields --
    '_BMI5',     # numeric - computed BMI, scaled by 100
    '_BMI5CAT',  # categorical - BMI category
]

In [60]:
# Narrow the test table to the model's input columns (independent copy).
df_test = df_test.loc[:, selected_vars2].copy()

# Apply the same special-code blanking used on the training frame so both
# tables reach the pipeline in the same form.
# NOTE(review): one code set per feature group is applied to every column;
# whether 7/8/9 and 77/88/99 are non-answers in each individual column should
# be confirmed against the survey codebook - TODO. Keep in sync with training.
df_test[cat_features] = df_test[cat_features].replace(dict.fromkeys((7, 8, 9), np.nan))
df_test[num_features] = df_test[num_features].replace(dict.fromkeys((77, 88, 99), np.nan))

# Cast every categorical feature to pandas' category dtype in a single call.
df_test = df_test.astype({name: 'category' for name in cat_features})

df_test.shape


(75000, 49)

In [None]:
# Score the prepared test frame with the fitted pipeline and express the
# predicted labels as booleans.
predictions_bool = rf_model.predict(df_test).astype(bool)

# Two-column result: integer ID alongside its predicted label.
df_out = ids.astype(int).to_frame(name='ID').assign(TARGET_PRED=predictions_bool)

# Write the result without the pandas row index.
df_out.to_csv('predictions.csv', index=False)