In [4]:
# Upgrade kaleido and plotly from inside the notebook. pip's own output below
# notes that the kernel may need a restart before the new versions are used.
# NOTE(review): presumably kaleido is needed for the PNG files that
# visualization.plot_missing_plotly is asked to write — confirm.
# NOTE(review): neither package is pinned, so a re-run may install different
# versions than the ones that produced the saved outputs.
%pip install -U kaleido
%pip install --upgrade plotly

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [5]:
import numpy as np
import pandas as pd

import warnings 
warnings.filterwarnings('ignore', category=ResourceWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

import data_cleaning
import visualization
from pipeline_function import run_pipeline

# Read the train and test CSV files (train first, then test) into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./datasets/train.csv', './datasets/test.csv')
)

# Column groups handed to run_pipeline as `label_vars` and `onehot_vars`.
LABEL_VARS = ['sex', 'fbs', 'exang', 'slope']
ONEHOT_VARS = ['cp', 'restecg']

# Values shared by both solver grids; copied into each grid below so the two
# dictionaries never share a list object.
_C_VALUES = (0.01, 0.05, 0.1, 0.2, 0.5, 1.0, 5.0, 10.0)
_CLASS_WEIGHTS = (None, 'balanced')

# One grid per solver: liblinear is searched with l1 and l2 penalties, lbfgs
# with l2 only. Together they enumerate 32 + 16 = 48 parameter combinations.
param_grid = [
    dict(
        solver=[solver_name],
        penalty=penalties,
        C=list(_C_VALUES),
        class_weight=list(_CLASS_WEIGHTS),
    )
    for solver_name, penalties in (
        ('liblinear', ['l1', 'l2']),
        ('lbfgs', ['l2']),
    )
]

In [6]:
# Missing-value audit of the training split: measures, per column, the share
# of "-9" entries and the share of NaN entries, and plots both.
df = pd.read_csv('./datasets/train.csv')

INT_COLS = ['sex', 'cp', 'restecg', 'fbs', 'exang', 'slope', 'ca', 'thal']
FLOAT_COLS = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
ALL_COLS = INT_COLS + FLOAT_COLS

# Turn the textual missing markers into NaN, then coerce every feature column
# to numeric; any other non-numeric token also becomes NaN at this step.
df[ALL_COLS] = df[ALL_COLS].replace(['?', '<NA>', 'nan'], np.nan)
df[ALL_COLS] = df[ALL_COLS].apply(pd.to_numeric, errors='coerce')

# Nullable Int64 keeps the integer-coded columns integral despite missing cells.
df[INT_COLS] = df[INT_COLS].astype('Int64')
df[FLOAT_COLS] = df[FLOAT_COLS].astype(float)

# Percentage of rows equal to -9 in each column.
# On the nullable Int64 columns `== -9` yields <NA> for missing cells, and
# mean() would skip those cells, shrinking the denominator for those columns
# only. Mapping <NA> to False first makes every column a share of all rows,
# which is how the float columns are already measured.
is_minus_nine = df[ALL_COLS].eq(-9).fillna(False).astype(bool)
minus_nine = is_minus_nine.mean() * 100
minus_nine = minus_nine[minus_nine > 0].sort_values(ascending=False)

# Percentage of NaN cells per column, over every column of the frame. After the
# coercion above this covers '?', '<NA>', 'nan' and any other value that could
# not be parsed as a number.
question_mark = (df.isna().mean() * 100).sort_values(ascending=False)
question_mark = question_mark[question_mark > 0]

visualization.plot_missing_plotly(minus_nine, 'Porcentaje de "-9" por Variable', 'Bupu', filename='./images/menos_9.png')
visualization.plot_missing_plotly(question_mark, 'Porcentaje de "?" por Variable', 'Bupu', filename='./images/interrogantes.png')

### Valores -9: imputación por moda / mediana

Imputación de los valores "?" evaluada en esta sección:

- mediana/moda
- KNN
- MICE

In [7]:
# Baseline run: "-9" and "?" are both handled by the median/mode helpers.
# This is the only call that requests a submission file.
results = run_pipeline(
    train,
    test,
    impute_minus_nine_func=data_cleaning.median_mode_imputation_minus_nine,
    impute_question_func=data_cleaning.median_mode_imputation_nan,
    label_vars=LABEL_VARS,
    onehot_vars=ONEHOT_VARS,
    model_params={'C': 0.1, 'max_iter': 1000, 'random_state': 42, 'solver': 'lbfgs'},
    param_grid=param_grid,
    grid_search=True,
    submission=True,
    submission_path='./datasets/submission.csv',
)

# Positions 2, 4, 5 and 6 of the returned sequence feed the diagnostics below.
X_train_scaled, y_train, model, feature_names = (
    results[position] for position in (2, 4, 5, 6)
)
# NOTE(review): only the first row of coef_ is used. If coef_ holds one row per
# class (the saved prediction counts show four classes), the table below
# reflects a single class's weights — confirm this is intended.
coefs = model.coef_[0]

visualization.plot_learning_curve_bias_variance(model, X_train_scaled, y_train, cv=5)
visualization.display_feature_importances(feature_names, coefs)

# Same "-9" handling, now with the KNN and the iterative imputers for "?".
for question_imputer in (
    data_cleaning.knn_imputation_nan,
    data_cleaning.iterative_imputation_nan,
):
    predictions, accuracy, *_ = run_pipeline(
        train,
        test,
        impute_minus_nine_func=data_cleaning.median_mode_imputation_minus_nine,
        impute_question_func=question_imputer,
        label_vars=LABEL_VARS,
        onehot_vars=ONEHOT_VARS,
        model_params={'C': 0.1, 'max_iter': 1000, 'random_state': 42, 'solver': 'lbfgs'},
        param_grid=param_grid,
        grid_search=True,
    )

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Mejores parámetros: {'C': 0.1, 'class_weight': None, 'penalty': 'l2', 'solver': 'lbfgs'}
Accuracy Media (CV): 0.5451
Rango de Rendimiento (CV): 0.5451 +/- 0.0262 


 Submission guardado en: ./datasets/submission.csv

Distribución de predicciones:
0    101
1     41
2     14
3     28
Name: count, dtype: int64


Importancia de Características:


Unnamed: 0,Característica,Peso
14,combined_risk,-0.417501
7,oldpeak,-0.413683
1,sex,-0.27253
17,cp_2.0,0.268471
19,cp_4.0,-0.249916
13,chest_pain_severity,0.232254
3,chol,0.228059
5,thalach,0.196752
12,age_chol_interaction,0.187059
4,fbs,-0.181413


Fitting 5 folds for each of 48 candidates, totalling 240 fits
Mejores parámetros: {'C': 1.0, 'class_weight': None, 'penalty': 'l2', 'solver': 'lbfgs'}
Accuracy Media (CV): 0.5355
Rango de Rendimiento (CV): 0.5355 +/- 0.0257 

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Mejores parámetros: {'C': 0.1, 'class_weight': None, 'penalty': 'l2', 'solver': 'lbfgs'}
Accuracy Media (CV): 0.5505
Rango de Rendimiento (CV): 0.5505 +/- 0.0141 



### Valores -9: tratados como nueva categoría

Imputación de los valores "?" evaluada en esta sección:

- mediana/moda
- KNN
- MICE

In [8]:
# "-9" is handled by data_cleaning.categorize_minus_nine; the pipeline is run
# once per "?" imputer (median/mode, KNN, iterative), in that order.
# NOTE(review): the output saved for this cell is identical (accuracy, spread
# and best parameters) to the output saved for the cell that uses
# healthy_values_minus_nine — verify the two helpers really differ.
for question_imputer in (
    data_cleaning.median_mode_imputation_nan,
    data_cleaning.knn_imputation_nan,
    data_cleaning.iterative_imputation_nan,
):
    predictions, accuracy, *_ = run_pipeline(
        train,
        test,
        impute_minus_nine_func=data_cleaning.categorize_minus_nine,
        impute_question_func=question_imputer,
        label_vars=LABEL_VARS,
        onehot_vars=ONEHOT_VARS,
        model_params={'C': 0.1, 'max_iter': 1000, 'random_state': 42, 'solver': 'lbfgs'},
        param_grid=param_grid,
        grid_search=True,
    )

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Mejores parámetros: {'C': 0.1, 'class_weight': None, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy Media (CV): 0.5437
Rango de Rendimiento (CV): 0.5437 +/- 0.0218 

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Mejores parámetros: {'C': 1.0, 'class_weight': None, 'penalty': 'l2', 'solver': 'lbfgs'}
Accuracy Media (CV): 0.5301
Rango de Rendimiento (CV): 0.5301 +/- 0.0312 

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Mejores parámetros: {'C': 0.05, 'class_weight': None, 'penalty': 'l2', 'solver': 'lbfgs'}
Accuracy Media (CV): 0.5383
Rango de Rendimiento (CV): 0.5383 +/- 0.0222 



### Valores -9: sustituidos por valores sanos

Imputación de los valores "?" evaluada en esta sección:

- mediana/moda
- KNN
- MICE

In [9]:
# "-9" is handled by data_cleaning.healthy_values_minus_nine; the pipeline is
# run once per "?" imputer (median/mode, KNN, iterative), in that order.
# NOTE(review): the output saved for this cell is identical (accuracy, spread
# and best parameters) to the output saved for the cell that uses
# categorize_minus_nine — verify the two helpers really differ.
for question_imputer in (
    data_cleaning.median_mode_imputation_nan,
    data_cleaning.knn_imputation_nan,
    data_cleaning.iterative_imputation_nan,
):
    predictions, accuracy, *_ = run_pipeline(
        train,
        test,
        impute_minus_nine_func=data_cleaning.healthy_values_minus_nine,
        impute_question_func=question_imputer,
        label_vars=LABEL_VARS,
        onehot_vars=ONEHOT_VARS,
        model_params={'C': 0.1, 'max_iter': 1000, 'random_state': 42, 'solver': 'lbfgs'},
        param_grid=param_grid,
        grid_search=True,
    )

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Mejores parámetros: {'C': 0.1, 'class_weight': None, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy Media (CV): 0.5437
Rango de Rendimiento (CV): 0.5437 +/- 0.0218 

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Mejores parámetros: {'C': 1.0, 'class_weight': None, 'penalty': 'l2', 'solver': 'lbfgs'}
Accuracy Media (CV): 0.5301
Rango de Rendimiento (CV): 0.5301 +/- 0.0312 

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Mejores parámetros: {'C': 0.05, 'class_weight': None, 'penalty': 'l2', 'solver': 'lbfgs'}
Accuracy Media (CV): 0.5383
Rango de Rendimiento (CV): 0.5383 +/- 0.0222 



### Valores -9: imputación con KNN

Imputación de los valores "?" evaluada en esta sección:

- mediana/moda
- KNN
- MICE

In [10]:
# "-9" is handled by data_cleaning.knn_imputation_minus_nine; the pipeline is
# run once per "?" imputer (median/mode, KNN, iterative), in that order.
for question_imputer in (
    data_cleaning.median_mode_imputation_nan,
    data_cleaning.knn_imputation_nan,
    data_cleaning.iterative_imputation_nan,
):
    predictions, accuracy, *_ = run_pipeline(
        train,
        test,
        impute_minus_nine_func=data_cleaning.knn_imputation_minus_nine,
        impute_question_func=question_imputer,
        label_vars=LABEL_VARS,
        onehot_vars=ONEHOT_VARS,
        model_params={'C': 0.1, 'max_iter': 1000, 'random_state': 42, 'solver': 'lbfgs'},
        param_grid=param_grid,
        grid_search=True,
    )

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Mejores parámetros: {'C': 0.1, 'class_weight': None, 'penalty': 'l2', 'solver': 'lbfgs'}
Accuracy Media (CV): 0.5451
Rango de Rendimiento (CV): 0.5451 +/- 0.0270 

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Mejores parámetros: {'C': 0.05, 'class_weight': None, 'penalty': 'l2', 'solver': 'lbfgs'}
Accuracy Media (CV): 0.5382
Rango de Rendimiento (CV): 0.5382 +/- 0.0185 

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Mejores parámetros: {'C': 0.05, 'class_weight': None, 'penalty': 'l2', 'solver': 'lbfgs'}
Accuracy Media (CV): 0.5464
Rango de Rendimiento (CV): 0.5464 +/- 0.0189 



### Valores -9: imputación con MICE

Imputación de los valores "?" evaluada en esta sección:

- mediana/moda
- KNN
- MICE

In [11]:
# "-9" is handled by data_cleaning.iterative_imputation_minus_nine; the
# pipeline is run once per "?" imputer (median/mode, KNN, iterative), in that
# order.
for question_imputer in (
    data_cleaning.median_mode_imputation_nan,
    data_cleaning.knn_imputation_nan,
    data_cleaning.iterative_imputation_nan,
):
    predictions, accuracy, *_ = run_pipeline(
        train,
        test,
        impute_minus_nine_func=data_cleaning.iterative_imputation_minus_nine,
        impute_question_func=question_imputer,
        label_vars=LABEL_VARS,
        onehot_vars=ONEHOT_VARS,
        model_params={'C': 0.1, 'max_iter': 1000, 'random_state': 42, 'solver': 'lbfgs'},
        param_grid=param_grid,
        grid_search=True,
    )

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Mejores parámetros: {'C': 10.0, 'class_weight': None, 'penalty': 'l2', 'solver': 'lbfgs'}
Accuracy Media (CV): 0.5410
Rango de Rendimiento (CV): 0.5410 +/- 0.0282 

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Mejores parámetros: {'C': 0.05, 'class_weight': None, 'penalty': 'l2', 'solver': 'lbfgs'}
Accuracy Media (CV): 0.5287
Rango de Rendimiento (CV): 0.5287 +/- 0.0102 

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Mejores parámetros: {'C': 0.01, 'class_weight': None, 'penalty': 'l2', 'solver': 'lbfgs'}
Accuracy Media (CV): 0.5396
Rango de Rendimiento (CV): 0.5396 +/- 0.0177 

