In [61]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder, OneHotEncoder, MinMaxScaler, FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score, make_scorer
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import xgboost

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [7]:
# Data download
data_file_path = "./heart_disease_uci.csv"
df = pd.read_csv(data_file_path)

# Check data types
df.info()

As we can see, there are numeric & categorical features in the dataframe

We will impliment:
* Imputation
* Feature engineering
* Scaling
* PCA

In [10]:
# For numerical features - SimpleImputer, then StandardScaler
numerical_pipeline_transformer = Pipeline(
    steps = [
        ("imputer", SimpleImputer(strategy = "mean")),
        ("scaler", StandardScaler())
    ]
)

In [12]:
# For categorical - SimpleImputer & OneHotEncoder
categorical_pipeline_transformer = Pipeline(
    steps = [
        ("imputer", SimpleImputer(strategy = "most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown = "ignore"))
    ]
)

In [46]:
# Define function for handling missing values
def process_binaries(data):
    src_columns = data.columns
    for col in src_columns:
        if data[col].isna().sum() > 0:
            data[col+'_missing'] = data[col].isna()
        data[col].fillna(False, inplace=True)
        data = pd.concat([data, pd.get_dummies(data[col], prefix=col)], axis=1)
    data = data.drop(columns=src_columns)
    return data

binary_pipeline_transformer = FunctionTransformer(process_binaries)

In [47]:
trash_bins = ["id", "num"]
num_cols = [col for col in df.select_dtypes(include = ["int", "float"]).columns if col not in trash_bins]
cat_cols = [col for col in df.select_dtypes(exclude = ["int", "float"]).columns if col not in trash_bins and df[col].nunique() > 2]
bin_cols = [col for col in df.select_dtypes(exclude = ["int", "float"]).columns if col not in trash_bins and df[col].nunique() == 2]

In [48]:
# Union all transformers together
data_pipeline_transformer = ColumnTransformer(
    transformers = [
        ("numerical", numerical_pipeline_transformer, num_cols),
        ("categorical", categorical_pipeline_transformer, cat_cols),
        ("binary", binary_pipeline_transformer, bin_cols)
    ],
    remainder = "drop"
)

In [49]:
# Create full pipeline
preprocessor = Pipeline(
    steps = [
        ("data_transformer", data_pipeline_transformer)
    ]
)

# Create pipeline with model
classifier_pipeline = Pipeline(
    steps = [
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(multi_class = "multinomial", solver = "newton-cg"))
    ]
)

Let's visualize it

In [50]:
classifier_pipeline

In [51]:
# Create train & test splits for training
data_train, data_test = train_test_split(df, test_size = 0.3, stratify = df["num"])

In [52]:
# Fit pipeline and get predictions
classifier_pipeline.fit(data_train, data_train["num"])

preds = classifier_pipeline.predict(data_test)
print(classification_report(data_test["num"], preds))

              precision    recall  f1-score   support

           0       0.72      0.92      0.81       123
           1       0.48      0.45      0.46        80
           2       0.33      0.12      0.18        33
           3       0.25      0.25      0.25        32
           4       0.00      0.00      0.00         8

    accuracy                           0.58       276
   macro avg       0.36      0.35      0.34       276
weighted avg       0.53      0.58      0.55       276



### Cross-validation

In [55]:
f1 = make_scorer(f1_score, average = "macro")
scores = cross_val_score(
    classifier_pipeline, 
    data_train, 
    data_train["num"], 
    scoring = f1,
    cv = 4,
    n_jobs = -1)

In [56]:
scores

array([0.39773737, 0.40774128, 0.37874434, 0.36213871])

### Hyperparameter optimization

In [58]:
param_grid = {
    "classifier__solver": ["sag", "saga", "newton-cg", "lbfgs"],
    "classifier__C": np.logspace(-5, 2, 100),
    "classifier__penalty": ["l1", "l2"],
    "classifier__class_weight": ["balanced", None],
    "preprocessor__data_transformer__numerical__imputer__strategy": ["median", "mean"]
}

search = GridSearchCV(
    classifier_pipeline,
    param_grid,
    verbose = True,
    n_jobs = -1,
    cv = 3, 
    scoring = f1
)

search.fit(data_train.drop("num", axis = 1), data_train["num"])

Fitting 3 folds for each of 3200 candidates, totalling 9600 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [59]:
search.best_params_

{'classifier__C': 0.8902150854450375,
 'classifier__class_weight': 'balanced',
 'classifier__penalty': 'l2',
 'classifier__solver': 'saga',
 'preprocessor__data_transformer__numerical__imputer__strategy': 'mean'}

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [60]:
search.best_score_

0.4109820743925199

### More models

In [62]:
clf1 = LogisticRegression(multi_class = "multinomial", random_state = 1234, solver = "saga")
clf2 = RandomForestClassifier(n_estimators = 50, random_state = 1234)
clf3 = GaussianNB()
clf4 = xgboost.XGBClassifier(random_state = 1234)

blending_classifier = VotingClassifier(
    estimators = [
        ("log_regression", clf1),
        ("random_forest", clf2),
        ("gnb", clf3),
        ("xgb", clf4)
    ]
)

classifier_pipeline = Pipeline(
    steps = [
        ("preprocessor", preprocessor),
        ("classifier", blending_classifier)
    ]
)

classifier_pipeline.fit(data_train, data_train["num"])

