# Disease Symptom Prediction

En este notebook cargaremos el dataframe totalmente limpio y preprocesado que obtuvimos en el notebook anterior y realizaremos los diferentes modelos de Machine Learning, desempeño y validación.

### Recopilación de datos

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import datasets
from sklearn.preprocessing import LabelEncoder
# Normalizacion
from sklearn.preprocessing import MinMaxScaler

# Train, Test
from sklearn.model_selection import train_test_split

# Metricas
from sklearn.metrics import jaccard_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

#Clasificadores
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

# Métodos de Validación
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

from pickle import load
from pickle import dump

In [2]:
import warnings

warnings.filterwarnings('ignore')

In [3]:
# Cargamos el DataFrame preprocesado del notebook anterior

df1 = load(open('DataFrame limpio.pkl', 'rb'))

In [4]:
df1.head()

Unnamed: 0,Disease,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,Fungal infection,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Fungal infection,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Fungal infection,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Fungal infection,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Fungal infection,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Modelo de Machine Learning y desempeño

Una vez procesados y evaluados todos los datos, se procede a entrenar el modelo de clasificación. Se utilizarán los siguientes métodos, mostrando en un DataFrame sus métricas para seleccionar el más adecuado.

- KNeighbors Classifier
- Radius Neighbors Classifier
- Nearest Centroid Classifier
- Gaussian Naive Bayes Classifier
- Logistic Regression
- Decision Tree Classifier
- Random Forest Classifier
- SVC
- AdaBoost Classifier
- Gradient Boosting Classifier

In [5]:
Disease_label_encoding = LabelEncoder()

Disease_label_encoding.fit(df1["Disease"].values)

Disease = Disease_label_encoding.transform(df1["Disease"].values)

df1["Disease"] = Disease

In [6]:
df1 = df1.drop_duplicates()

In [7]:
# Ahora vamos a separar la columna "objetivo" del resto de columnas
df_disease = df1["Disease"].copy()

# Como ya tenemos la columna "objetivo" en otra variable vamos a eliminarla del DataFrame original
df_no_col_obj = df1.drop("Disease", axis = 1)


In [8]:
X = np.asarray(df_no_col_obj)
y = np.asarray(df_disease)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

In [9]:
X.shape, y.shape

((304, 132), (304,))

In [10]:
models = [KNeighborsClassifier(), NearestCentroid(), GaussianNB(), LogisticRegression(),
         DecisionTreeClassifier(), RandomForestClassifier(), SVC(), GradientBoostingClassifier(), AdaBoostClassifier()]

In [11]:
datos_metricas = list()

for model in models:
    model.fit(X_train, y_train.ravel())
    
    yhat = model.predict(X_test)
    
    Jaccard_Index = jaccard_score(y_test, yhat, average = "macro")
    Exactitud = accuracy_score(y_test, yhat)
    Precision = precision_score(y_test, yhat, average = "macro")
    Sensibilidad = recall_score(y_test, yhat, average = "macro")
    F1_score = f1_score(y_test, yhat, average = "macro")
    
    datos_metricas.append([model, str(model).rstrip("()"), Jaccard_Index, Exactitud, Precision, Sensibilidad, F1_score])
    
df_metricas = pd.DataFrame(data = datos_metricas, columns = ["Model", "Modelo", "Jaccard_Index", "Exactitud", "Precision", 
                                                             "Sensibilidad", "F1_score"])


In [12]:
#for col in df1.columns:
    
#    sns.countplot(x = df1[col], hue = df1["Disease"])
#    plt.show()

In [13]:
mejor_modelo = df_metricas.sort_values("F1_score", ascending = False)["Model"].iloc[0]

mejor_modelo

In [14]:
df_metricas.drop("Model", axis = 1, inplace = True)
df_metricas.sort_values("F1_score", ascending = False)

Unnamed: 0,Modelo,Jaccard_Index,Exactitud,Precision,Sensibilidad,F1_score
1,NearestCentroid,1.0,1.0,1.0,1.0,1.0
2,GaussianNB,1.0,1.0,1.0,1.0,1.0
3,LogisticRegression,1.0,1.0,1.0,1.0,1.0
5,RandomForestClassifier,1.0,1.0,1.0,1.0,1.0
6,SVC,1.0,1.0,1.0,1.0,1.0
0,KNeighborsClassifier,0.924242,0.95082,0.924242,0.939394,0.929293
7,GradientBoostingClassifier,0.629902,0.819672,0.656863,0.666667,0.652941
4,DecisionTreeClassifier,0.521429,0.688525,0.614286,0.52619,0.558776
8,AdaBoostClassifier,0.365625,0.491803,0.485417,0.411458,0.410349


## Métodos de validación

In [15]:
# Stratified k-Fold

skfold = StratifiedKFold(n_splits = 10)
y_test_real, yhat = list(), list()


for train_index, test_index in skfold.split(X, y): 
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Modelo
    modelo = mejor_modelo
    modelo.fit(X_train, y_train)
    
    # Prediccion
    yhat1 = modelo.predict(X_test)
    yhat.extend(yhat1)
    
    # Valores reales
    y_test_real.extend(y_test)

print("Accuracy:", accuracy_score(y_test_real, yhat)) 

Accuracy: 1.0


In [16]:
# HOLD-OUT

lista_acc = list()

for i in range(500):
    X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, test_size = 0.3, stratify = y)
    
    modelo = mejor_modelo
    modelo.fit(X_train, y_train)
    yhat = modelo.predict(X_test)
    
    #print("Accuracy:", accuracy_score(y_test, yhat))
    lista_acc.append(accuracy_score(y_test, yhat))

In [17]:
lista_acc

[1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0

## Creación de archivos: label encoder, modelo

In [18]:
dump(Disease_label_encoding, open('Disease_label_encoding.pkl', 'wb'))

In [19]:
dump(mejor_modelo, open('mejor_modelo.pkl', 'wb'))