Autor: Anderson Morillo Diaz

Departamento de ingeniería de sistemas, Universidad tecnológica de Bolívar, Parque Industrial y Tecnológico Carlos Vélez Pombo Km 1 Vía Turbaco, Cartagena, Colombia

# Content
- 1 Python Libraries

- 2 Data Content

- 3 Read and Analyse Data

- 4 Dependent Variable Analysis

- 5 Correlation Between Features

- 6 Distribution of Features

- 7 Preprocessing: Missing Value Problem

- 8 Preprocessing: Train-Test Split and Normalization

- 9 Modelling: comparing models

- 10 Metrics Accuracy, Precision, Recall, F1, MSE

- 11 Nonparametric statistics

- 12 Conclusion


# Python Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

## For Preprocessing
from sklearn.preprocessing import StandardScaler

from collections import Counter
####################
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV


## Machine Learning Algorithm
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier

## To evaluate models
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import classification_report
from scikitplot.metrics import plot_roc
from numpy import argmax
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Load the training set.
# NOTE(review): absolute, machine-specific path; point it at the local copy of the CSV.
data = pd.read_csv('D:/Anderson/Downloads/data/train_diabetes.csv')

# DATA CONTENT

Columns Description:

Número de identificación (ID) de la persona

Number of times pregnant

Plasma glucose concentration 2 hours in an oral glucose tolerance test

Diastolic blood pressure (mm Hg)

Triceps skin fold thickness (mm)

2-Hour serum insulin (mu U/ml)

Body mass index (weight in kg/(height in m)^2)

Diabetes pedigree function

Age (years)

Class variable (0 or 1)

In [None]:
# Drop the p_id column: it only identifies the participant.
data = data.drop('p_id', axis=1)
rows, columns = data.shape[0], data.shape[1]
print('Rows--> ', rows)
print('Columns--> ', columns)

DATA ANALYSIS
¿Cómo está compuesto el data set?


In [None]:
# Preview the first rows to see how the data set is laid out.
data.head()

In [None]:
# Summary statistics of the columns.
data.describe()

Por la naturaleza de los datos, las variables "glucose_concentration", "blood_pressure", "skin_fold_thickness", "serum_insulin" y "bmi" no pueden ser 0, dado que sería un valor atípico y sin sentido, salvo que el 0 se haya usado para reemplazar los datos nulos durante la toma de datos. Por ello, estos ceros se toman como nulos y luego se sustituyen por la media de la columna calculada sin ellos.

In [None]:
# Zeros in Glucose, Blood Pressure, Skin Thickness, Insulin and BMI are treated
# as missing-value placeholders and turned into NaN.
cols = ["glucose_concentration", "blood_pressure", "skin_fold_thickness", "serum_insulin", "bmi"]
# Assign the result back instead of calling replace(..., inplace=True) on a
# column selection: with pandas copy-on-write the chained in-place call edits
# a temporary object and leaves `data` unchanged. np.nan is used because the
# np.NaN alias was removed in NumPy 2.0.
data[cols] = data[cols].replace(0, np.nan)

# now we can see missing values
data.isnull().sum()

In [None]:
# Summary statistics again, now that the placeholder zeros have been turned into NaN.
data.describe()

In [None]:
# Replace the missing values of every column with that column's mean.
# NOTE(review): the means are computed on the whole data set, before any
# train/test split — confirm this is acceptable for the evaluation.
data = data.fillna(data.mean())
data.isnull().sum()

In [None]:
# Donut chart of the class balance of the target column.
# The frame is built with explicit column names: the name pandas gives to the
# value_counts() result differs between versions, so values="diabetes" cannot
# rely on it. Slice names are derived from the class value itself rather than
# from the order in which value_counts() happens to return the classes.
class_counts = data["diabetes"].value_counts()
d = pd.DataFrame({
    "label": class_counts.index.map({0: "no diabetes", 1: "diabetes"}),
    "diabetes": class_counts.to_numpy(),
})
fig = px.pie(d, values="diabetes", names="label", hole=0.35, opacity=0.8,
             labels={"label": "diabetes", "diabetes": "Number of Samples"})
# The previous title referred to a "Potability" feature that does not exist here.
fig.update_layout(title=dict(text="Pie Chart of Diabetes Feature"))
fig.update_traces(textposition="outside", textinfo="percent+label")
fig.show()

Dependent Variable Analysis:

Probar la dependencia de las variables y si cumplen con los requisitos de normalización

In [None]:
# Clustered heat map of the column-to-column correlation matrix.
sns.clustermap(
    data.corr(),
    cmap="vlag",
    dendrogram_ratio=(0.1, 0.2),
    annot=True,
    linewidths=0.8,
    figsize=(9, 10),
)
plt.show()

In [None]:
# Split the samples by outcome so each column can be compared across classes.
non_diabetes = data[data["diabetes"] == 0]
diabetes = data[data["diabetes"] == 1]

# One density panel per column on a 3x3 grid, both classes overlaid.
# NOTE(review): if the frame has exactly nine columns at this point, the last
# panel is the target itself, which is constant within each class — confirm
# that panel is wanted.
plt.figure(figsize=(15, 15))
for position, col in enumerate(data.columns[:9], start=1):
    plt.subplot(3, 3, position)
    plt.title(col)
    for subset, label in ((non_diabetes, "No diabetes"), (diabetes, "diabetes")):
        sns.kdeplot(x=subset[col], label=label)
    plt.legend()
plt.tight_layout()

Los datos presentan una correlación débil con el resultado de diabetes, pero a simple vista presentan una distribución aproximadamente normal.

# Preprocessing:

In [None]:
# Separate the label column from the features; pop() removes it from `data`
# in place and returns it.
target = data.pop('diabetes')

# Standardize every remaining column.
# NOTE(review): the scaler is fitted on the full data set, before the
# train/test split further down — test rows influence the scaling statistics.
scale = StandardScaler()
scale.fit(data)
newData = pd.DataFrame(scale.transform(data), columns=data.columns)

In [None]:
# Preview the standardized features.
newData.head()

In [None]:
# Preview the target labels.
target.head()

In [None]:
# Train/test split: 30% of the samples are held out, with a fixed seed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    newData,
    target,
    test_size=0.3,
    random_state=3,
)

# Class counts of each split.
for split_name, split_labels in (("Training", y_train), ("Testing", y_test)):
    print(f"{split_name} target statistics: {Counter(split_labels)}")



In [None]:
# min-max normalization, using statistics of the training split only so that
# the test split is rescaled with the same constants.
# DataFrame.max/min(axis=0) is used instead of np.max/np.min: the NumPy
# wrappers forward axis=None, which pandas >= 2.0 treats as a reduction over
# the whole frame. That yields one scalar, so all features would share a
# single global range instead of each being mapped to [0, 1].
x_train_max = X_train.max(axis=0)
x_train_min = X_train.min(axis=0)
X_train = (X_train - x_train_min)/(x_train_max-x_train_min)
X_test = (X_test - x_train_min)/(x_train_max-x_train_min)

# modeling

In [None]:

# Candidate classifiers, all with their default hyper-parameters.
models = [
    ('LR', LogisticRegression()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('RF', RandomForestClassifier()),
    ('GB', GradientBoostingClassifier()),
    ("LightGBM", LGBMClassifier()),
]

# Evaluate each model in turn with 10-fold cross-validated accuracy on the
# training split, keeping the per-fold scores for later comparison.
kfold = KFold(n_splits=10)
results, names = [], []
for name, model in models:
    cv_results = cross_val_score(model, X_train, y_train, scoring="accuracy", cv=kfold)
    names.append(name)
    results.append(cv_results)
    msg = f"{name}: {cv_results.mean():f} std:({cv_results.std():f})"
    print(msg)

El modelo que presenta mejor comportamiento es la regresión logística, con la mayor exactitud (accuracy), 0.7529, y una desviación estándar menor que la de todos los otros modelos.

In [None]:
models = [('LR', LogisticRegression())]


def build_and_test(X_tr, X_te, y_tr, y_te, class_weight=None, threshold=False):
    """Fit every estimator in the module-level ``models`` list and report test metrics.

    For each ``(name, model)`` pair the model is fitted on the training data
    and evaluated on the test data: precision, recall, F1 and accuracy are
    printed, the ROC curves are plotted, and a classification report is
    printed.

    Args:
        X_tr: Training features.
        X_te: Test features.
        y_tr: Training labels.
        y_te: Test labels.
        class_weight: Optional class weighting. When given, it is set on every
            estimator that exposes a ``class_weight`` parameter; ``None``
            leaves the estimators as configured.
        threshold: When true, choose the ROC cut-off that maximises Youden's J
            (TPR - FPR) and classify with it instead of the estimator's own
            ``predict``.

    Returns:
        Tuple ``(roc_auc, fpr, tpr, best_threshold)`` computed for the last
        model in ``models``. ``best_threshold`` is ``None`` unless
        ``threshold`` is true.
    """
    for _, model in models:
        # Honour the class_weight argument, which used to be silently ignored.
        if class_weight is not None and 'class_weight' in model.get_params():
            model.set_params(class_weight=class_weight)
        model.fit(X_tr, y_tr)

        # Column 1 of predict_proba is the score of model.classes_[1], which
        # is used as the positive class for the ROC analysis.
        y_score = model.predict_proba(X_te)
        fpr0, tpr0, thresholds = roc_curve(y_te, y_score[:, 1])
        roc_auc0 = auc(fpr0, tpr0)

        # Calculate the best threshold and derive the predictions from it.
        best_threshold = None
        if threshold:
            J = tpr0 - fpr0
            ix = argmax(J)  # take the value which maximizes the J variable
            best_threshold = thresholds[ix]
            # Compare the positive-class score with the cut-off. The scores
            # themselves are left untouched so the ROC plot stays meaningful.
            # NOTE(review): the cut-off is tuned on the same data it is
            # evaluated on, so the reported metrics are optimistic.
            y_pred = np.where(y_score[:, 1] >= best_threshold,
                              model.classes_[1], model.classes_[0])
        else:
            y_pred = model.predict(X_te)

        # Test the model
        print(f'Precision score {model} {precision_score(y_te, y_pred)}')
        print(f'Recall score {model} {recall_score(y_te, y_pred)}')
        print(f'F1-score score {model} {f1_score(y_te, y_pred)}')
        print(f'Accuracy score {model} {accuracy_score(y_te, y_pred)}')

        # Plot metrics
        plot_roc(y_te, y_score)
        plt.show()

        # Print a classification report
        print(classification_report(y_te, y_pred))
    return roc_auc0, fpr0, tpr0, best_threshold

# Oversample the smallest class

In [None]:
# TODO: run this for a small dataset — needs fixing (author's note, translated).
# Load the separate test file and drop its participant-id column.
# NOTE(review): absolute, machine-specific path.
dt_test = pd.read_csv('D:/Anderson/Downloads/data/test_diabetes.csv')
dt_test = dt_test.drop(columns=['p_id'])

#target_t = dt_test['diabetes']
#data.drop(columns='diabetes', axis=1, inplace=True)

# NOTE(review): a new StandardScaler is fitted on the test file itself, so the
# result is not on the same scale as the training features — presumably the
# training scaler should be reused; confirm the test file's columns first.
scale_t = StandardScaler()
newData_t = pd.DataFrame(scale_t.fit_transform(dt_test), columns=dt_test.columns)



'''model.fit(X_tr, y_tr)
         # Test the model
y_pred = model.predict(X_te)
'''

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Resample the training split with a seeded RandomOverSampler; the test split
# is left untouched.
over_sampler = RandomOverSampler(random_state=42)
X_res, y_res = over_sampler.fit_resample(X_train, y_train)

# Class counts after resampling (training) and of the unchanged test split.
for split_name, split_labels in (("Training", y_res), ("Testing", y_test)):
    print(f"{split_name} target statistics: {Counter(split_labels)}")

In [None]:
# Train and evaluate on the oversampled training set; the returned threshold is discarded.
roc_auc_ros,fpr_ros,tpr_ros, _ = build_and_test(X_res, X_test, y_res, y_test)

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Undersample the training split with a seeded RandomUnderSampler; the test
# split is left untouched.
under_sampler = RandomUnderSampler(random_state=42)
X_res, y_res = under_sampler.fit_resample(X_train, y_train)

# Class counts after resampling (training) and of the unchanged test split.
for split_name, split_labels in (("Training", y_res), ("Testing", y_test)):
    print(f"{split_name} target statistics: {Counter(split_labels)}")

In [None]:
# Train and evaluate on the undersampled training set; the returned threshold is discarded.
roc_auc_rus,fpr_rus,tpr_rus , _ = build_and_test(X_res, X_test, y_res, y_test)

# Bibliografia:

[1] ‘Documentation scikit-learn’, https://scikit-learn.org/0.21/documentation.html, accessed 19 abril 2022

[2] ‘DiabetesClassificationProject’, https://www.kaggle.com/code/alibabaei78/diabetesclassificationproject, accessed 19 abril 2022

[3] ‘Diabetes Prediction Using Classification Models’, https://www.kaggle.com/code/simgeerek/diabetes-prediction-using-classification-models, accessed 19 abril 2022

[4] ‘How to balance a dataset in Python’,https://towardsdatascience.com/how-to-balance-a-dataset-in-python-36dff9d12704#:~:text=A%20balanced%20dataset%20is%20a,class%20weight, accessed 19 abril 2022