# Imports and general settings

In [1]:
import time
from statistics import mean

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from src.persisters import DataPersister, ModelPersister
from src.readers import DatasetReader
from src.transformers import DataframeTransformer
from src.utils import get_model_log_string

# Notebook-wide pandas display configuration: no fixed display width and at
# most 20 columns shown when a dataframe is rendered.
pd.options.display.width = None
pd.options.display.max_columns = 20

# Wall-clock reference read by the "Elapsed time" cells further down.
start = time.time()

# Read symptoms
Reads the sintomas.csv file and transforms its data into a usable dataframe.

In [None]:
# Load the symptoms file and pass it through DataframeTransformer.split_symptoms
# to obtain the frame that is later merged with the measures on 'ID'.
symptoms_path = '/data/sintomas.csv'
symptoms_df = DatasetReader.read_symptoms(symptoms_path)
extended_symptoms_df = DataframeTransformer.split_symptoms(symptoms_df)

print('Symptoms')
extended_symptoms_df.head()

# Read diabetes.csv file
Reads the diabetes.csv file and transforms its data into a usable dataframe.

In [None]:
# Load the measures recorded for the diabetes cohort.
diabetes_df = DatasetReader.read_data('/data/diabetes.csv')

# Gets only first measures by date of each patient
# NOTE(review): the second argument (1) looks like the value written to the
# `diabetes` target column -- confirm against DataframeTransformer.
measures_diabetes_df = DataframeTransformer.split_dataframe_first_measures(diabetes_df, 1)

# Get variable dummies from column 'Sexo'
measures_diabetes_df = DataframeTransformer.get_dummies(measures_diabetes_df, ['Sexo'])

# Drop rows with any missing value; reassigning (rather than inplace=True)
# keeps the cell safe to re-run and avoids mutating a possibly shared frame.
measures_diabetes_df = measures_diabetes_df.dropna()

# len() counts rows. DataFrame.size is rows * columns, which overstated N.
print(f'\nN diabetes (diabetes=1): {len(measures_diabetes_df)}')
measures_diabetes_df.head()

# Read no_diabetes.csv file
Reads the no_diabetes.csv file and transforms its data into a usable dataframe.

In [None]:
# Load the measures recorded for the non-diabetes cohort.
no_diabetes_df = DatasetReader.read_data('/data/no_diabetes.csv')

# Gets only last measures by date of each patient
# NOTE(review): the second argument (0) looks like the value written to the
# `diabetes` target column -- confirm against DataframeTransformer.
measures_no_diabetes_df = DataframeTransformer.split_dataframe_last_measures(no_diabetes_df, 0)

# Get variable dummies from column 'Sexo'
measures_no_diabetes_df = DataframeTransformer.get_dummies(measures_no_diabetes_df, ['Sexo'])

# Drop rows with any missing value; reassigning (rather than inplace=True)
# keeps the cell safe to re-run and avoids mutating a possibly shared frame.
measures_no_diabetes_df = measures_no_diabetes_df.dropna()

# len() counts rows. DataFrame.size is rows * columns, which overstated N.
print(f'\nN no diabetes (diabetes=0): {len(measures_no_diabetes_df)}')
measures_no_diabetes_df.head()

# Test/train final dataframe creation

In [None]:
# Concatenates diabetes and non-diabetes dataframes into a single one.
measures_df = pd.concat([measures_diabetes_df, measures_no_diabetes_df], ignore_index=True)

# Joins created dataframe with each patient symptoms
measures_df = DataframeTransformer.df_merge_left_on_column(measures_df, extended_symptoms_df, on='ID')

# Replaces any NaN with the number 0 and removes the ID column, which is not
# needed for model training. Chained so the frame is rebuilt, not mutated.
measures_df = measures_df.fillna(0).drop(columns=['ID'])

# Save processed data
DataPersister.save(measures_df, 'train_test_dataset_001.csv')

# Report the row count of the frame that was just built. The previous version
# printed the size (rows * columns) of the non-diabetes frame by mistake.
print(f'\nN processed dataset: {len(measures_df)}')
measures_df.head()

# Correlations
Check whether any pair of features is strongly correlated.

In [None]:
# Pairwise correlation between all columns of the processed dataset.
corr_matrix = measures_df.corr()

# Keep the (signed) coefficients whose magnitude is at least 0.7. Filtering on
# the absolute value also surfaces strong negative correlations, which a plain
# `>= 0.7` mask hides. Cells below the threshold are shown as NaN.
corr_matrix[corr_matrix.abs() >= 0.7]

It can be observed that *Colesterol* and *LDL-Calculado* features are highly correlated.

# Split into train/test datasets and standardize data
Split and standardize data.
Apply PCA if selected.

In [7]:
# Split train/test datasets (stratified on the target so both keep its ratio)
train, test = train_test_split(measures_df, test_size=0.2, random_state=5, stratify=measures_df.diabetes)

# Work on explicit copies: the column assignments below would otherwise write
# into frames derived from `measures_df` and can raise SettingWithCopyWarning.
train = train.copy()
test = test.copy()

# Standardize variables only with train data: the scaler is fitted on train
# and merely applied to test, so no test statistics leak into training.
columns_to_standardize = ['Edad', 'Colesterol', 'LDL-Calculado', 'Hb-Glicosilada', 'Trigliceridos', 'HDL-Colesterol']
sc = StandardScaler()
train[columns_to_standardize] = sc.fit_transform(train[columns_to_standardize])
test[columns_to_standardize] = sc.transform(test[columns_to_standardize])

# Features: every column except the target. Each frame is filtered by its own
# columns (the test frame was previously masked with `train.columns`).
train_x = train.drop(columns=['diabetes'])
test_x = test.drop(columns=['diabetes'])
# Target variables
train_y = train.diabetes
test_y = test.diabetes

apply_pca = True
if apply_pca:
    pca = PCA(.95) # Get n PCA components that fit 95% of the variance explanation
    # Components are learned from the train features only, then applied to test.
    train_x = pca.fit_transform(train_x)
    test_x = pca.transform(test_x)
    print(f'{pca.n_components_} components explain 95% of the variance.')

10 components explain 95% of the variance.


# Train chosen models

In [8]:
# Seed for every estimator that has a stochastic component, so that the model
# ranking produced below is reproducible on Restart & Run All. Same value as
# the train/test split seed.
MODEL_SEED = 5

# Candidate models as (estimator, description) pairs; the description
# distinguishes several configurations of the same estimator class.
classifiers = [
    (AdaBoostClassifier(random_state=MODEL_SEED), ''),
    (DecisionTreeClassifier(random_state=MODEL_SEED), ''),
    (GaussianNB(), ''),
    (KNeighborsClassifier(), ''),
    (MLPClassifier(max_iter=5000, random_state=MODEL_SEED), ''),  # 5000 iterations to ensure convergence
    (QuadraticDiscriminantAnalysis(), ''),
    (RandomForestClassifier(random_state=MODEL_SEED), ''),
    (SGDClassifier(max_iter=1000, random_state=MODEL_SEED), ''),  # 1000 iterations to ensure convergence
    (SVC(kernel='linear'), 'linear'),
    (SVC(gamma=2), 'gamma_2'),
]

In [None]:
# Mean cross-validated accuracy of each candidate, in the same order as
# `classifiers`, so a position here maps back to a (model, description) pair.
classifiers_scores = []
for model, description in classifiers:
    # Save model
    # NOTE(review): the estimator is persisted before any fit, and
    # cross_val_score fits clones, so the saved object looks unfitted --
    # confirm this is what ModelPersister is meant to store.
    ModelPersister.save(model, description)

    fold_accuracies = cross_val_score(model, train_x, train_y, scoring='accuracy')
    score = mean(fold_accuracies)
    classifiers_scores.append(score)

    print(get_model_log_string(model, description))
    print(f'Score: {score}')

# Select best model based on accuracy score
Select best model and predict test dataset.

In [None]:
# Best model: the candidate with the highest mean cross-validated accuracy,
# refitted on the whole training set.
best_train_score = max(classifiers_scores)
best_model_idx = classifiers_scores.index(best_train_score)
best_classifier, best_description = classifiers[best_model_idx]
best_model = best_classifier.fit(train_x, train_y)

print('\nBest model', get_model_log_string(best_classifier, best_description))
print(f'\nTrain data {50*"#"}')
# This is the mean cross-validation accuracy measured on the training data.
print(f'Accuracy score: {best_train_score}')

pred_y = best_model.predict(test_x)

# ROC AUC needs a continuous score per sample. Feeding it hard 0/1 predictions
# collapses the curve to a single threshold. Use the positive-class probability
# when the estimator provides it, otherwise the decision function (e.g. SVC
# without probability=True, SGDClassifier with hinge loss).
if hasattr(best_model, 'predict_proba'):
    score_y = best_model.predict_proba(test_x)[:, 1]
else:
    score_y = best_model.decision_function(test_x)

print(f'\nTest data {50*"#"}')
print(classification_report(test_y, pred_y, target_names=['no diabetes', 'diabetes'], labels=[0, 1], digits=3))
print(f'AUC score: {roc_auc_score(test_y, score_y)}')

In [11]:
# Seconds elapsed since `start` was taken in the settings cell.
elapsed_seconds = time.time() - start
print(f'\nElapsed time {elapsed_seconds} seconds')


Elapsed time 156.83001899719238 seconds


# Search for the best parameters configuration for the best model found

The best model found is the *RandomForestClassifier*.

In [None]:
# Hyper-parameter grid explored exhaustively for the random forest.
parameters = {
    'n_estimators': [10, 50, 100, 200, 500],
    'max_depth': [5, 10, 50, 100, None],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    # 'sqrt' is what 'auto' meant for classifiers; 'auto' was deprecated in
    # scikit-learn 1.1 and removed in 1.3, where it is rejected as invalid.
    'max_features': [None, 'sqrt'],
    'criterion': ['gini', 'entropy'],
}
# Fixed seed (same value as the train/test split) so the selected parameters
# are reproducible across runs.
grid_search_cv = GridSearchCV(RandomForestClassifier(random_state=5), parameters)
grid_search_cv.fit(train_x, train_y)

In [None]:
print('\nModel Random Forest')
print(f'\nTrain data {50*"#"}')
print(f'Best performing parameters: {grid_search_cv.best_params_}')
# best_score_ is the mean cross-validated score of the best parameter set.
print(f'Accuracy score: {grid_search_cv.best_score_}')


# Predictions come from the refitted best estimator held by the search object.
grid_search_pred_y = grid_search_cv.predict(test_x)

# ROC AUC is computed from the positive-class probability. Hard 0/1 labels
# would reduce the ROC curve to a single operating point.
grid_search_score_y = grid_search_cv.predict_proba(test_x)[:, 1]

print(f'\nTest data {50*"#"}')
print(classification_report(test_y, grid_search_pred_y, target_names=['no diabetes', 'diabetes'], labels=[0, 1], digits=3))
print(f'AUC score: {roc_auc_score(test_y, grid_search_score_y)}')

In [None]:
# Total seconds elapsed since `start` was taken in the settings cell.
elapsed_seconds = time.time() - start
print(f'\nElapsed time {elapsed_seconds} seconds')