# Imports and general settings

In [26]:
import time
from statistics import mean

import pandas as pd
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from src.persisters import DataPersister, ModelPersister
from src.readers import DatasetReader
from src.transformers import DataframeTransformer
from src.utils import get_model_log_string

pd.set_option('display.width', None)
pd.set_option('display.max_columns', 20)
start = time.time()

# Read symptoms
Reads sintomas.csv file and transforms its data into an usable dataframe.

In [27]:
symptoms_df = DatasetReader.read_symptoms('/data/sintomas.csv')
extended_symptoms_df = DataframeTransformer.split_symptoms(symptoms_df)
print('Symptoms')
extended_symptoms_df.head()

Symptoms


Unnamed: 0,ID,obesidad,hipotension,vision borrosa,sin sintomas,hiperlipidemia,hipertension,sedentarismo,fatiga,anemia,symptom_diabetes,sed habitual y necesidad de beber,sobrepeso,sin diagnostico
0,UOC00035209,0,0,0,0,1,0,1,0,0,0,0,0,0
1,UOC00058738,0,0,0,0,0,1,0,0,0,0,0,0,0
2,UOC00059812,0,0,0,0,0,0,1,0,0,0,1,0,0
3,UOC00067451,0,0,0,0,0,0,0,0,0,1,0,0,0
4,UOC00120895,0,0,0,0,0,0,0,0,0,0,0,0,1


# Read diabetes.csv file
Reads diabetes.csv file and transforms its data into an usable dataframe.

In [28]:
diabetes_df = DatasetReader.read_data('/data/diabetes.csv')

# Gets only first measures by date of each patient
measures_diabetes_df = DataframeTransformer.split_dataframe_first_measures(diabetes_df, 1)

# Get variable dummies from column 'Sexo'
measures_diabetes_df = DataframeTransformer.get_dummies(measures_diabetes_df, ['Sexo'])

measures_diabetes_df.dropna(inplace=True)
print(f'\nN diabetes (diabetes=1): {measures_diabetes_df.size}')
measures_diabetes_df.head()


N diabetes (diabetes=1): 23830


Unnamed: 0,ID,Edad,Colesterol,LDL-Calculado,Hb-Glicosilada,Trigliceridos,HDL-Colesterol,diabetes,Sexo_Hombre,Sexo_Mujer
0,UOC00035209,54,181.0,90.0,7.6,129.0,65.0,1,0,1
1,UOC00058738,51,257.0,159.0,9.5,311.0,36.0,1,1,0
2,UOC00059812,42,186.0,123.0,6.9,60.0,51.0,1,1,0
3,UOC00067451,80,183.0,101.0,6.4,244.0,33.0,1,1,0
5,UOC00140051,85,182.0,76.0,6.2,331.0,40.0,1,0,1


# Read no_diabetes.csv file
Reads no_diabetes.csv file and transforms its data into an usable dataframe.

In [29]:
no_diabetes_df = DatasetReader.read_data('/data/no_diabetes.csv')

# Gets only last measures by date of each patient
measures_no_diabetes_df = DataframeTransformer.split_dataframe_last_measures(no_diabetes_df, 0)

# Get variable dummies from column 'Sexo'
measures_no_diabetes_df = DataframeTransformer.get_dummies(measures_no_diabetes_df, ['Sexo'])

measures_no_diabetes_df.dropna(inplace=True)
print(f'\nN no diabetes (diabetes=0): {measures_no_diabetes_df.size}')
measures_no_diabetes_df.head()


N no diabetes (diabetes=0): 15130


Unnamed: 0,ID,Edad,Colesterol,LDL-Calculado,Hb-Glicosilada,Trigliceridos,HDL-Colesterol,diabetes,Sexo_Hombre,Sexo_Mujer
2,UOC00028665,60,318.0,219.0,5.7,264.0,46.0,0,1,0
4,UOC00030620,48,264.0,163.0,6.1,169.0,67.0,0,0,1
6,UOC00031047,62,225.0,154.0,5.1,99.0,51.0,0,1,0
8,UOC00037517,49,210.0,134.0,5.4,72.0,62.0,0,0,1
15,UOC00043642,92,155.0,88.0,5.5,106.0,46.0,0,1,0


# Test/train final dataframe creation

In [30]:
# Concatenates diabetes and non-diabetes dataframes into a single one.
measures_df = pd.concat([measures_diabetes_df, measures_no_diabetes_df], ignore_index=True)

# Joins created dataframe with each patient symptoms
measures_df = DataframeTransformer.df_merge_left_on_column(measures_df, extended_symptoms_df, on='ID')

# Replaces any NaN with the number 0
measures_df.fillna(0, inplace=True)

# Column ID is not needed for model training
measures_df.drop(['ID'], axis='columns', inplace=True)

# Save processed data
DataPersister.save(measures_df, 'train_test_dataset_001.csv')

print(f'\nProcessed dataset: {measures_no_diabetes_df.size}')
measures_df.head()


Processed dataset: 15130


Unnamed: 0,Edad,Colesterol,LDL-Calculado,Hb-Glicosilada,Trigliceridos,HDL-Colesterol,diabetes,Sexo_Hombre,Sexo_Mujer,obesidad,...,sin sintomas,hiperlipidemia,hipertension,sedentarismo,fatiga,anemia,symptom_diabetes,sed habitual y necesidad de beber,sobrepeso,sin diagnostico
0,54,181.0,90.0,7.6,129.0,65.0,1,0,1,0,...,0,1,0,1,0,0,0,0,0,0
1,51,257.0,159.0,9.5,311.0,36.0,1,1,0,0,...,0,0,1,0,0,0,0,0,0,0
2,42,186.0,123.0,6.9,60.0,51.0,1,1,0,0,...,0,0,0,1,0,0,0,1,0,0
3,80,183.0,101.0,6.4,244.0,33.0,1,1,0,0,...,0,0,0,0,0,0,1,0,0,0
4,85,182.0,76.0,6.2,331.0,40.0,1,0,1,0,...,0,0,0,0,0,0,1,0,0,0


# Split into train/test datasets and standardize data

In [31]:
# Split
train, test = train_test_split(measures_df, test_size=0.2, random_state=5, stratify=measures_df.diabetes)
train_x = train.loc[:, train.columns != 'diabetes']
test_x = test.loc[:, train.columns != 'diabetes']

# Standardize
train_x = StandardScaler().fit_transform(train_x)
test_x = StandardScaler().fit_transform(test_x)

# Target variables
train_y = train.diabetes
test_y = test.diabetes

# Train chosen models

In [32]:
classifiers = [
    (KNeighborsClassifier(), ''),
    (SVC(kernel='linear'), 'linear'),
    (SVC(gamma=2), 'gamma_2'),
    (DecisionTreeClassifier(), ''),
    (RandomForestClassifier(), ''),
    (AdaBoostClassifier(), ''),
    (GaussianNB(), ''),
    (QuadraticDiscriminantAnalysis(), ''),
    (SGDClassifier(), ''),
    (MLPClassifier(), ''),
]

In [33]:
classifiers_scores = []
for model, description in classifiers:
    # Save model
    ModelPersister.save(model, description)
    score = mean(cross_val_score(model, train_x, train_y))
    classifiers_scores.append(score)
    print(get_model_log_string(model, description))
    print(f'Score: {score}')


Model: KNeighborsClassifier
Score: 0.9364638638515043

Model: SVC (linear)
Score: 0.958923941227312

Model: SVC (gamma_2)
Score: 0.8064853685640202

Model: DecisionTreeClassifier
Score: 0.9297207474173766

Model: RandomForestClassifier
Score: 0.9560357245750504

Model: AdaBoostClassifier
Score: 0.9589223978268923

Model: GaussianNB
Score: 0.8299157303370787

Model: QuadraticDiscriminantAnalysis
Score: 0.8122659176029963

Model: SGDClassifier
Score: 0.9464074782895008





Model: MLPClassifier
Score: 0.958283430053093




# Select best model based on accuracy score
Select best model and predict test dataset.

In [42]:
# Best model
best_train_score = max(classifiers_scores)
best_model_idx = classifiers_scores.index(best_train_score)
best_model = classifiers[best_model_idx][0].fit(train_x, train_y)
pred_y = best_model.predict(test_x)

print('\nBest model\n', get_model_log_string(classifiers[best_model_idx][0], classifiers[best_model_idx][1]))
print(f'\nTrain accuracy score: {best_train_score}')
print(f'\nTest accuracy score: {accuracy_score(test_y, pred_y)}')
print(f'Test AUC score: {roc_auc_score(test_y, pred_y)}')


Best model
 
Model: SVC (linear)

Train accuracy score: 0.958923941227312

Test accuracy score: 0.9564102564102565
Test AUC score: 0.9571372231562778


In [35]:
print(f'\nElapsed time {time.time() - start} seconds')


Elapsed time 55.41949462890625 seconds
