# Práctica de Clasificación. BD: Diabetes

Predecir con una precisión (accuracy) mayor al 95 % los casos de diabetes en el conjunto de test. Preprocesar los datos de las tablas para poder entrenar varios modelos distintos. Tener en cuenta que la edad puede estar expresada en días o en años.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the four raw tables: for each of the train and test splits there is an
# "analysis" table (lab values, habits, pressure, diabetes label) and an
# "info" table (age, height, weight, gender). Both kinds carry an `id` column.
df_ana = pd.read_csv('data/diabetes_v2/diabetes_train_analysis.csv')
df_inf = pd.read_csv('data/diabetes_v2/diabetes_train_info.csv')
df_ana_test = pd.read_csv('data/diabetes_v2/diabetes_test_analysis.csv')
df_inf_test = pd.read_csv('data/diabetes_v2/diabetes_test_info.csv')

In [3]:
# Preview the first rows of the training analysis table.
df_ana.head()

Unnamed: 0,id,cholesterol,gluc,smoke,alco,active,pressure,diabetes
0,62538,low,low,0,0,1,100/80,0
1,49159,low,low,0,0,1,120/82,0
2,60683,low,low,0,0,1,120/80,0
3,42924,low,low,0,0,0,120\80,0
4,52888,low,low,0,0,0,120/80,0


In [4]:
# Preview the first rows of the training info table.
df_inf.head()

Unnamed: 0,id,age,height,weight,gender
0,0,50,168,62.0,f
1,1,55,156,85.0,m
2,2,18857,165,64.0,male
3,3,17623,169,82.0,f
4,4,47,156,56.0,m


In [5]:
# Join each analysis table with its matching info table on the shared `id`
# column (default inner join), giving one row per patient and split.
df_train = df_ana.merge(df_inf, on="id")
df_test = df_ana_test.merge(df_inf_test, on="id")

## Preprocesamiento de datos

In [6]:
def Transform(df):
    """Clean a merged analysis/info table and one-hot encode its categoricals.

    The input frame is left untouched; all work happens on a copy.

    Steps performed:
      * normalise the ``pressure`` separator (backslash -> slash) and split it
        into the float columns ``pressure1`` and ``pressure2``;
      * drop the raw ``pressure`` column and the ``id`` column;
      * collapse the gender spellings "female"/"male" to "f"/"m";
      * treat ages below 150 as years and convert them to days (x 365);
      * fill missing ``weight`` values with the mean weight of this frame;
      * one-hot encode ``cholesterol``, ``gluc`` and ``gender``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``id``, ``pressure``, ``gender``, ``age``,
        ``weight``, ``cholesterol`` and ``gluc``.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame.
    """
    # Work on a copy so the caller's frame is never modified in place.
    df = df.copy()

    # Some readings use a backslash instead of a slash as separator.
    pressure = df["pressure"].str.replace("\\", "/", regex=False)
    df[["pressure1", "pressure2"]] = (
        pressure.str.split("/", expand=True).astype(float)
    )
    df = df.drop(columns=["pressure", "id"])

    # "female" has to be collapsed first: "male" is a substring of "female".
    df["gender"] = (
        df["gender"]
        .str.replace("female", "f", regex=False)
        .str.replace("male", "m", regex=False)
    )

    # Age may be given in years or in days; anything below 150 is taken to be
    # years and converted to days.
    in_years = df["age"] < 150
    df.loc[in_years, "age"] = df.loc[in_years, "age"] * 365

    # Plain assignment instead of a chained ``inplace=True`` call, which acts
    # on a temporary Series and is not guaranteed to update the frame.
    df["weight"] = df["weight"].fillna(df["weight"].mean())

    return pd.get_dummies(df, columns=["cholesterol", "gluc", "gender"])

In [7]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Clean the training table, then standardise its continuous columns.
# The scaler is fitted on the training data only and kept in `scaler` so the
# very same scaling can be applied to other data afterwards.
numeric_cols = ["age", "height", "weight", "pressure1", "pressure2"]
df_train = Transform(df_train)
scaler = StandardScaler()
scaler.fit(df_train[numeric_cols])
df_train[numeric_cols] = scaler.transform(df_train[numeric_cols])
df_train.head()

Unnamed: 0,smoke,alco,active,diabetes,age,height,weight,pressure1,pressure2,cholesterol_high,cholesterol_low,cholesterol_medium,gluc_high,gluc_low,gluc_medium,gender_f,gender_m
0,0,0,1,0,0.11991,0.567395,0.12628,-0.175302,-0.086953,0,1,0,0,1,0,1,0
1,0,0,1,0,-0.618194,0.079476,-0.63953,-0.054742,-0.076428,0,1,0,0,1,0,0,1
2,0,0,1,0,1.030711,0.689375,-1.266102,-0.054742,-0.086953,0,1,0,0,1,0,0,1
3,0,0,0,0,0.353273,0.567395,-0.848387,-0.054742,-0.086953,0,1,0,0,1,0,0,1
4,0,0,0,0,-1.298867,0.201456,-0.500292,-0.054742,-0.086953,0,1,0,0,1,0,0,1


In [8]:
# Separate the target column from the predictors of the training table.
y_train = df_train["diabetes"]
X_train = df_train.drop(columns="diabetes")

In [9]:
# Clean the test table and scale its continuous columns with the scaler that
# was fitted on the training data (the scaler is not re-fitted here).
numeric_cols = ["age", "height", "weight", "pressure1", "pressure2"]
df_test = Transform(df_test)
df_test[numeric_cols] = scaler.transform(df_test[numeric_cols])
df_test.head()

Unnamed: 0,smoke,alco,active,diabetes,age,height,weight,pressure1,pressure2,cholesterol_high,cholesterol_low,cholesterol_medium,gluc_high,gluc_low,gluc_medium,gender_f,gender_m
0,0,0,0,1,1.153256,0.079476,1.100946,-0.054742,-0.086953,0,1,0,0,0,1,1,0
1,0,0,1,0,-1.954061,-0.286463,-1.683816,-0.175302,-0.139576,0,1,0,0,1,0,0,1
2,0,0,0,1,1.607847,-1.018342,-0.012958,0.065819,0.018293,0,1,0,0,1,0,0,1
3,0,0,1,0,-0.913435,-0.286463,1.031327,-0.054742,-0.03433,0,1,0,0,1,0,0,1
4,0,0,0,0,-0.41476,-0.286463,-0.152197,0.005539,-0.086953,0,1,0,0,1,0,1,0


In [10]:
# Separate the target column from the predictors of the test table.
y_test = df_test["diabetes"]
X_test = df_test.drop(columns="diabetes")

## Cross Validation

## Probemos ahora distintos clasificadores

In [11]:
from sklearn.metrics import accuracy_score

def tryClassifier(clf):
    """Fit ``clf`` on the training split and print train and test accuracy.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    The estimator is fitted in place and returned so the caller can inspect it.
    """

    def report(label, features, target):
        # Score the fitted estimator on one split and print the result.
        print(label, accuracy_score(target, clf.predict(features)))

    clf.fit(X_train, y_train)
    report("Train accuracy:", X_train, y_train)
    report("Test accuracy:", X_test, y_test)
    return clf

In [17]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with the solver's iteration cap set to 300.
# The trailing semicolon keeps the notebook from echoing the returned estimator.
tryClassifier(LogisticRegression(max_iter=300));

Train accuracy: 0.9906166666666667
Test accuracy: 0.9907


In [13]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest neighbours with the library's default hyperparameters.
# The trailing semicolon keeps the notebook from echoing the returned estimator.
tryClassifier(KNeighborsClassifier());

Train accuracy: 0.9716
Test accuracy: 0.9594


In [14]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree with default hyperparameters. The fitted estimator is kept in
# `clf` so its per-feature importances can be displayed as the cell output.
clf = tryClassifier(DecisionTreeClassifier())
clf.feature_importances_

Train accuracy: 1.0
Test accuracy: 0.9816


array([0.00040626, 0.00069686, 0.08193203, 0.14296917, 0.00623408,
       0.15126961, 0.00898837, 0.11910039, 0.02480277, 0.03551205,
       0.00499759, 0.        , 0.29851917, 0.02067721, 0.06895002,
       0.03494443])

In [15]:
from sklearn.ensemble import HistGradientBoostingClassifier
# Histogram-based gradient boosting with the library's default hyperparameters.
# The trailing semicolon keeps the notebook from echoing the returned estimator.
tryClassifier(HistGradientBoostingClassifier());

Train accuracy: 0.9962
Test accuracy: 0.9905


In [16]:
from sklearn.svm import LinearSVC
# Linear support-vector classifier with the iteration cap set to 10000.
# The trailing semicolon keeps the notebook from echoing the returned estimator.
tryClassifier(LinearSVC(max_iter=10000));

Train accuracy: 0.9967
Test accuracy: 0.9968
