**1. Carga de módulos**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

**2. Carga de bases de datos**

In [None]:
# Load the four CSV files used by the notebook (paths are relative to the
# working directory the kernel was started in).
bd_diabetic_data = pd.read_csv('data/diabetic_data.csv')
# The next three tables are later joined onto bd_diabetic_data by their id column.
bd_AdmissionSource = pd.read_csv('data/admission_source_id.csv')
bd_admission_type_id = pd.read_csv('data/admission_type_id.csv')
bd_discharge_disposition_id = pd.read_csv('data/discharge_disposition_id.csv')

*Comprobar correcta lectura de datos*

In [None]:
# Show the first 10 rows of one lookup table as a quick read check.
bd_AdmissionSource.head(10)

*Juntamos las bases de datos*

In [None]:
# Attach the three lookup tables with successive left joins, so rows of the
# left-hand table are kept even when an id has no match in a lookup table.
bd_diabetes1 = bd_diabetic_data.merge(bd_AdmissionSource, how="left", on="admission_source_id")
bd_diabetes2 = bd_diabetes1.merge(bd_admission_type_id, how="left", on="admission_type_id")
bd_full = bd_diabetes2.merge(bd_discharge_disposition_id, how="left", on="discharge_disposition_id")

*Renombramos las variables por buenas prácticas*

In [None]:
# Snake_case names for every column of bd_full, listed in positional order.
# The last three entries are the description columns produced by the merges.
# NOTE(review): assumes each lookup CSV contributes exactly one extra column
# besides its id — confirm against the files.
columnsNameDiabetes = ["encounter_id", "patient_nbr", "race", "gender", "age", "weight", "admission_type_id", "discharge_disposition_id",
                       "admission_source_id", "time_in_hospital", "payer_code", "medical_specialty", "num_lab_procedures", "num_procedures",
                       "num_medications", "number_outpatient", "number_emergency", "number_inpatient", "diag_1", "diag_2", "diag_3",
                       "number_diagnoses", "max_glu_serum", "a1c_result", "metformin", "repaglinide", "nateglinide", "chlorpropamide",
                       "glimepiride", "acetohexamide", "glipizide", "glyburide", "tolbutamide", "pioglitazone", "rosiglitazone", "acarbose",
                       "miglitol", "troglitazone", "tolazamide", "examide", "citoglipton", "insulin", "glyburide_metformin", "glipizide_metformin",
                       "glimepiride_pioglitazone", "metformin_rosiglitazone", "metformin_pioglitazone", "change", "diabetes_med", "readmitted",
                       "description_x", "description_y", "description"]

# Actually apply the names: the list used to be defined but never assigned, while
# later cells index bd_full with these renamed columns (e.g. "a1c_result").
# A positional rename is only safe when the column count matches, so fail loudly
# instead of silently mislabelling columns.
if len(columnsNameDiabetes) != bd_full.shape[1]:
    raise ValueError(
        "columnsNameDiabetes has " + str(len(columnsNameDiabetes)) + " names but bd_full has "
        + str(bd_full.shape[1]) + " columns"
    )
bd_full.columns = columnsNameDiabetes

**3. Analizamos la variable target**

In [None]:
# Report the size of the merged table.
n_filas, n_columnas = bd_full.shape
print(f"Número de filas: {n_filas}")
print(f"Número de columnas: {n_columnas}")

In [None]:
# Print column names, non-null counts and dtypes of the merged table.
bd_full.info()

In [None]:
# Summary of the raw target column.
# NOTE(review): only the last expression of a cell is rendered, so the
# describe() result on the next line is computed but never displayed.
bd_full["readmitted"].describe()
# Number of rows per readmission category (this is the displayed output).
bd_full.groupby("readmitted").size()

In [None]:
# Binary target: "NO" is mapped to 0, while both "<30" and ">30" are mapped to 1.
# Any other value of "readmitted" becomes NaN.
columnTarget = "target"
codigosReadmision = {"NO": 0, "<30": 1, ">30": 1}
bd_full[columnTarget] = bd_full["readmitted"].map(codigosReadmision)

**4. Diferenciamos los tipos de variables por buenas practicas**

In [None]:
# Columns treated as numeric features in the rest of the notebook.
columnsNumeric = ["time_in_hospital", "num_lab_procedures", "num_procedures", "num_medications", "number_outpatient", "number_emergency",
                   "number_inpatient", "number_diagnoses"]
# Columns treated as categorical (string) features; includes the three
# description columns that come from the merged lookup tables.
columnsString = ["race", "gender", "age", "weight", "payer_code", "medical_specialty", "diag_1", "diag_2", "diag_3",
                   "max_glu_serum", "a1c_result", "metformin", "repaglinide", "nateglinide", "chlorpropamide", "glimepiride", "acetohexamide",
                   "glipizide", "glyburide", "tolbutamide", "pioglitazone", "rosiglitazone", "acarbose", "miglitol", "troglitazone", "tolazamide",
                   "examide", "citoglipton", "insulin", "glyburide_metformin", "glipizide_metformin", "glimepiride_pioglitazone", "metformin_rosiglitazone",
                   "metformin_pioglitazone", "change", "diabetes_med", "description_x", "description_y", "description"]
# Name of the binary target column (same value as assigned when the target was built).
columnTarget = "target"

*Estudiamos las variables numericas*

In [None]:
# Descriptive statistics (count, mean, std, quartiles, min/max) of the numeric columns.
bd_full[columnsNumeric].describe()

*Analizamos las varianzas más cercanas a cero*

In [None]:
# Variance of each numeric column, largest first, so near-constant columns end up at the bottom.
bd_full[columnsNumeric].var().sort_values(ascending = False)

*Hacemos un análisis de frecuencia a la variable number_emergency porque tiene la varianza más pequeña*

*Revisamos las frecuencias de las variables numéricas*

In [None]:
def frecuencia (x):
    """Print the relative frequency of every value of column ``x`` of ``bd_full``.

    The counts come from ``groupby(x).size()`` and are divided by the total
    number of rows in the column. The table is printed, followed by a blank
    separator; nothing is returned.
    """
    total = bd_full[x].shape[0]
    conteos = bd_full.groupby(x).size()
    print(conteos / total)
    print("\n")

*A las variables "number_outpatient", "number_emergency" y "number_inpatient" hacemos un analisis de frecuencia*

In [None]:
# frecuencia() prints its own table and returns None, so it is called directly;
# wrapping it in print() only added a stray "None" line after every table.
for columna in ("number_outpatient", "number_emergency", "number_inpatient"):
    frecuencia(columna)

*Viendo presencia de valores perdidos*

In [None]:
# For every numeric column, print its name and how many rows are missing.
for columna in columnsNumeric:
    n_faltantes = bd_full[columna].isna().sum()
    print(columna)
    print(n_faltantes)
    print("\n")

*Viendo presencia de atípicos*

In [None]:
# Box plot of time_in_hospital without notches; outliers are drawn as green diamonds.
plt.boxplot(bd_full["time_in_hospital"], notch=False, sym='gD')

In [None]:
# Outlier detection helper.
def calcularOutliars(x):
  """Return the share of values of ``x`` that lie outside the 1.5 * IQR fences.

  Parameters
  ----------
  x : pandas.Series
      Numeric values to inspect.

  Returns
  -------
  float
      Number of values below ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``
      divided by the total number of elements in ``x``. An empty series
      yields ``0.0`` instead of raising ``ZeroDivisionError``.
  """
  total = x.shape[0]
  if total == 0:
    # Nothing to inspect: avoid dividing by zero below.
    return 0.0
  Q01 = x.quantile(0.25)
  Q03 = x.quantile(0.75)
  IQR = Q03 - Q01
  fueraDeRango = (x < (Q01 - 1.5 * IQR)) | (x > (Q03 + 1.5 * IQR))
  # Summing the boolean mask counts the flagged values directly.
  return int(fueraDeRango.sum()) / total

In [None]:
# Share of time_in_hospital values flagged as outliers by calcularOutliars.
calcularOutliars(bd_full["time_in_hospital"])

In [None]:
# Print the outlier share of every numeric column.
for columna in columnsNumeric:
    proporcion = calcularOutliars(bd_full[columna])
    print(columna)
    print(proporcion)
    print("\n")

In [None]:
#Imputamos los valores outliers en nuevas variables
#==================================================


In [None]:
#Variable number_diagnoses


In [None]:
# Data type of each numeric column.
bd_full[columnsNumeric].dtypes

In [None]:
# Take the two raw columns out of the numeric feature list.
# Rebuilding the list (instead of list.remove) keeps the cell re-runnable:
# a second execution no longer raises ValueError for an already-removed name.
columnsNumeric = [c for c in columnsNumeric if c not in ("number_diagnoses", "num_lab_procedures")]

In [None]:
# Add the outlier-treated versions of the two columns to the numeric features.
# The names used to be appended without the columns ever being created, so any
# later bd_full[columnsNumeric] failed with KeyError.
# NOTE(review): the imputation cells in this notebook are empty; capping at the
# 1.5 * IQR fences (the same rule calcularOutliars uses to flag outliers) is a
# stand-in — confirm the intended imputation.
for _columna in ("number_diagnoses", "num_lab_procedures"):
    _columnaImp = _columna + "_imp"
    if _columnaImp not in bd_full.columns:
        _q1 = bd_full[_columna].quantile(0.25)
        _q3 = bd_full[_columna].quantile(0.75)
        _iqr = _q3 - _q1
        bd_full[_columnaImp] = bd_full[_columna].clip(lower=_q1 - 1.5 * _iqr, upper=_q3 + 1.5 * _iqr)
    # Guarded append keeps the list free of duplicates when the cell is re-run.
    if _columnaImp not in columnsNumeric:
        columnsNumeric = columnsNumeric + [_columnaImp]

*Observamos que se acumulan en 0 pero no en un 90% (no en la gran mayoria), lo tendremos en cuenta en la exploración de datos para descartarlas de ser necesario con más evidencia (mean encoding)*

**Para las variables categoricas vemos sus distribuciones**

In [None]:
# Print the value counts of every categorical column so that values that
# should have been read as NaN can be spotted by eye.
for columna in columnsString:
    conteos = bd_full.groupby(columna).size()
    print(columna)
    print(conteos)
    print("\n")

In [None]:
# Replace the placeholder strings that stand for "missing" with real NaN values.
# One entry per column: the original repeated the medical_specialty line twice.
marcadoresPerdidos = {
    "race": "?",
    "gender": "Unknown/Invalid",
    "weight": "?",
    "payer_code": "?",
    "medical_specialty": "?",
}
for columna, marcador in marcadoresPerdidos.items():
    bd_full[columna] = bd_full[columna].replace(marcador, np.nan)

In [None]:
# Share of missing values in every categorical column.
n_total = bd_full.shape[0]
for columna in columnsString:
    n_faltantes = int(bd_full[columna].isna().sum())
    print(columna)
    print(n_faltantes / n_total)
    print("\n")

In [None]:
#Retiramos las variables con muchos NAs


In [None]:
# Remove the sparse columns BEFORE dropping incomplete rows: dropna() without
# arguments discards every row that has a NaN in any column, so leaving them in
# the frame throws away each row where one of them is missing.
# NOTE(review): these three are the categorical columns that have no "_D"
# counterpart in columnsString_D further down; confirm the choice against the
# missing-value shares printed above.
columnsConMuchosNa = ["weight", "payer_code", "medical_specialty"]
columnsString = [c for c in columnsString if c not in columnsConMuchosNa]

# With the sparse columns gone, drop the remaining rows that have missing values.
# errors="ignore" keeps the cell re-runnable once the columns are already removed.
bd_full = bd_full.drop(columns=columnsConMuchosNa, errors="ignore").dropna()

In [None]:
# One bar chart per categorical column; missing values get their own "--NULL" bar.
for columna in columnsString:
  valores = bd_full[columna].fillna("--NULL")
  plt.title(columna)
  valores.groupby(valores).count().plot(kind = "bar")
  plt.show()

In [None]:
# Move diag_1, diag_2 and diag_3 from the categorical list to the numeric list
# (they are not dropped: they are converted to numbers further down).
# The lists are rebuilt instead of mutated with append/remove so the cell can be
# re-run without raising ValueError or duplicating the names.
columnsDiag = ["diag_1", "diag_2", "diag_3"]
columnsNumeric = [c for c in columnsNumeric if c not in columnsDiag] + columnsDiag
columnsString = [c for c in columnsString if c not in columnsDiag]

*Se observa que las variables diag_1, diag_2 y diag_3 son en su mayoría numéricas; revisamos el porcentaje de valores numéricos, no numéricos y nulos.*

In [None]:
def changeType(x):
  """Classify a raw value by whether it can be read as a number.

  Returns
  -------
  int
      ``1`` if ``float(x)`` succeeds with a non-NaN result, ``-1`` if it
      succeeds but the result is NaN, and ``0`` if the value cannot be
      converted to ``float``.
  """
  try:
    a = float(x)
  except (TypeError, ValueError, OverflowError):
    # Only conversion failures mean "not numeric"; a bare except would also
    # hide unrelated errors and interrupts.
    return 0
  if np.isnan(a):
    return -1
  return 1
  
def diagToNumber(x):
  """Convert a raw value to ``float``, using NaN for anything non-numeric.

  Returns
  -------
  float
      ``float(x)`` when the conversion succeeds (a NaN input stays NaN),
      otherwise ``float("nan")``.
  """
  try:
    # float() already yields NaN for NaN input, so no separate NaN branch is needed.
    return float(x)
  except (TypeError, ValueError, OverflowError):
    # Only conversion failures map to NaN; other errors are allowed to surface.
    return float("nan")

In [None]:
# For each diagnosis column, report the percentage of numeric, non-numeric and
# null values, classifying every value once with changeType.
for columna in ["diag_1", "diag_2", "diag_3"]:
  tipos = bd_full[columna].apply(changeType)
  totalFilas = bd_full.shape[0]
  numNumeric = int((tipos == 1).sum())
  numString = int((tipos == 0).sum())
  numNull = int((tipos == -1).sum())
  print(columna)
  print("Numeros: " + str(np.round(numNumeric / totalFilas * 100, 2)) + "%")
  print("String: " + str(np.round(numString / totalFilas * 100, 2)) + "%")
  print("Nulos: " + str(np.round(numNull / totalFilas * 100, 2)) + "%")
  print("\n")

*Todos los numéricos son mayores a 90%, tomamos estos para transformar a numéricas estas variables*

In [None]:
# Convert the diagnosis columns to float; values that are not numeric become NaN.
# diagToNumber already returns floats, so the extra element-wise apply(float)
# pass is replaced by a single astype(float) that fixes the column dtype.
for columna in ["diag_1", "diag_2", "diag_3"]:
  bd_full[columna] = bd_full[columna].apply(diagToNumber).astype(float)

*Transformamos las variables categoricas para que puedan ser leidas*

In [None]:
# Drop every row that still has a missing value in any column.
# The original note states the share of NaN is below 1%.
# NOTE(review): the diagnosis codes that diagToNumber turned into NaN are
# dropped here as well — confirm the share is still that small.
bd_full =  bd_full.dropna()

In [None]:
#===================================================================
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

In [None]:
# Single LabelEncoder instance, refitted for every categorical column below.
le = preprocessing.LabelEncoder()

In [None]:
# Encode every categorical column as integer labels in a new "<column>_D" column.
# fit_transform replaces the fit + two transform calls (the first transform
# result was discarded). `le` is refitted per column, so after the loop it only
# holds the classes of the last column.
for columna in columnsString:
    bd_full[columna + '_D'] = le.fit_transform(bd_full[columna])

In [None]:
# Names of the label-encoded ("_D") columns used as categorical features downstream.
# NOTE(review): weight_D, payer_code_D and medical_specialty_D are not listed
# here even though columnsString contains weight, payer_code and medical_specialty.
columnsString_D = ['race_D', 'gender_D', 'age_D', 'max_glu_serum_D', 'a1c_result_D', 'metformin_D', 'repaglinide_D', 'nateglinide_D',
 'chlorpropamide_D', 'glimepiride_D', 'acetohexamide_D', 'glipizide_D', 'glyburide_D', 'tolbutamide_D', 'pioglitazone_D', 'rosiglitazone_D',
 'acarbose_D', 'miglitol_D', 'troglitazone_D', 'tolazamide_D', 'examide_D', 'citoglipton_D', 'insulin_D', 'glyburide_metformin_D',
 'glipizide_metformin_D', 'glimepiride_pioglitazone_D', 'metformin_rosiglitazone_D', 'metformin_pioglitazone_D', 'change_D',
 'diabetes_med_D', 'description_x_D', 'description_y_D', 'description_D']

**4. REDUCCIÓN DE DIMENSIONES**

In [None]:
from sklearn.decomposition import PCA

# Fit a PCA that keeps all components on the numeric columns.
# NOTE(review): the columns are not standardised before the fit, so columns
# with a larger scale weigh more in the components — confirm this is intended.
pca = PCA()
pca.fit(bd_full[columnsNumeric])
# Share of the total variance explained by each component (shown as the cell output).
pca.explained_variance_ratio_

In [None]:
# Cumulative explained variance, computed once and reused for the printout and the plot.
varianzaAcumulada = np.cumsum(pca.explained_variance_ratio_)
numComponentes = range(1, len(pca.components_) + 1)

for i, acumulada in enumerate(varianzaAcumulada):
    print('% Var. explicada ('+str(i+1)+' componentes): ', acumulada*100)

# Bars show the variance of each component, the line shows the cumulative share.
plt.bar(numComponentes, pca.explained_variance_ratio_, alpha=.2, color='0')
# alpha must lie in [0, 1]; the previous value of 4 is rejected by matplotlib.
plt.plot(numComponentes, varianzaAcumulada, alpha=.4)
plt.title("Varianza explicada y pareto")
plt.show()

In [None]:
# Pairwise correlation matrix of the numeric columns.
bd_full[columnsNumeric].corr()

**5. ANALISIS CLUSTER**

In [None]:
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.metrics import pairwise_distances_argmin_min 

In [None]:
# Features used for clustering: the numeric columns plus the label-encoded categorical columns.
var_trabajar = columnsNumeric + columnsString_D
var_trabajar

In [None]:
# Feature matrix used by every clustering step below.
x_train = np.array(bd_full[var_trabajar])

# One KMeans model per candidate number of clusters (1..19).
# random_state makes the centroid initialisation, and therefore the curve,
# reproducible across runs.
Nc = range(1, 20)
kmeans = [KMeans(n_clusters=i, random_state=43) for i in Nc]
# Fit each model and keep its score on the training data for the elbow curve.
score = [modelo.fit(x_train).score(x_train) for modelo in kmeans]

plt.plot(Nc,score)
plt.xlabel('Number of Clusters')
plt.ylabel('Score')
plt.title('Elbow Curve')
plt.show()

In [None]:
# Evaluate the indicators used to choose the number of clusters.
# The silhouette is estimated on a 10% sample of the rows to keep it affordable.
ctdDf = int(0.1*bd_full.shape[0])
cluster = [kmeans[i].predict(x_train) for i in range(len(kmeans))]

# Index 0 is skipped: the silhouette call is only made for models beyond the first.
for i in range(1, len(kmeans)):
    print(str(kmeans[i].n_clusters)+' clústeres:')
    print('Inercia: '+str(kmeans[i].inertia_))
    # random_state makes the sampled silhouette reproducible.
    print('Silueta: '+str(metrics.silhouette_score(x_train, cluster[i], metric='euclidean',
                                                   sample_size=ctdDf, random_state=43)))
    # pairwise_distances_argmin_min returns (argmin, distances); element [1]
    # holds the distances, element [0] only the row indices of the closest points.
    print('Distancias: '+str(pairwise_distances_argmin_min(kmeans[i].cluster_centers_, x_train)[1]))
    print("\n")

In [None]:
# Fit the final model with the chosen number of clusters.
# random_state fixes the initialisation so the cluster labels stored later are
# reproducible between runs.
kmeans = KMeans(n_clusters = 3, random_state = 43)
kmeans.fit(x_train)

In [None]:
# Coordinates of the fitted cluster centres, one row per cluster.
kmeans.cluster_centers_

In [None]:
# Store the cluster assigned to each row. A single predict call is enough:
# the result of the previous extra call was discarded.
bd_full["cluster03"] = kmeans.predict(x_train)

In [None]:
# Number of rows assigned to each cluster.
bd_full.groupby('cluster03').size()

In [None]:
# (rows, columns) of the table after cleaning, encoding and clustering.
bd_full.shape

**6. Generamos nuestra base modeler**

In [None]:
# Feature columns for the models. The target is deliberately NOT part of this
# list: columnsEvaluar is what the later cells select as X, so including
# "target" there handed the label to the classifiers as an input feature.
columnsEvaluar = columnsNumeric + ["cluster03"] + columnsString_D

# Modelling table: the features plus the target column, which is read separately as y.
bd_modeler = bd_full[columnsEvaluar + ["target"]]

**7. Particionado de datos: train y Test**

In [None]:
from sklearn.model_selection import train_test_split
# Feature matrix: the columns listed in columnsEvaluar.
# NOTE(review): X must not contain the target column — verify that
# columnsEvaluar excludes "target".
X = bd_modeler[columnsEvaluar]
# Target kept as a one-column DataFrame.
y = bd_modeler[["target"]]

# 80% / 20% train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 43)

In [None]:
# Wrap the four split pieces as DataFrames with explicit column names.
pdX_train = pd.DataFrame(X_train, columns = columnsEvaluar)
pdy_train = pd.DataFrame(y_train, columns = ["target"])
pdX_test = pd.DataFrame(X_test, columns = columnsEvaluar)
pdy_test = pd.DataFrame(y_test, columns = ["target"])

In [None]:
# Logistic regression
# ===================
from sklearn.linear_model import LogisticRegression

# scikit-learn expects a 1-D target; y_train / y_test are one-column
# DataFrames, so they are flattened before fitting and scoring.
clasificador = LogisticRegression()
clasificador.fit(pdX_train[columnsEvaluar], y_train.values.ravel())

# Despite the names, these hold the accuracy returned by score() on each split.
prediction_train = clasificador.score(pdX_train[columnsEvaluar], y_train.values.ravel())
prediction_test = clasificador.score(pdX_test[columnsEvaluar], y_test.values.ravel())

# Display both accuracies as the cell output (they were computed but never shown).
prediction_train, prediction_test

In [None]:
# Decision tree
# =============
from sklearn.tree import DecisionTreeClassifier

# random_state makes the fitted tree reproducible between runs.
clasificador2 = DecisionTreeClassifier(max_depth = 40, random_state = 43)
# scikit-learn expects a 1-D target, so the one-column DataFrames are flattened.
clasificador2.fit(pdX_train[columnsEvaluar], y_train.values.ravel())

# Despite the names, these hold the accuracy returned by score() on each split.
prediction2_train = clasificador2.score(pdX_train[columnsEvaluar], y_train.values.ravel())
prediction2_test = clasificador2.score(pdX_test[columnsEvaluar], y_test.values.ravel())

# Display both accuracies as the cell output (they were computed but never shown).
prediction2_train, prediction2_test

In [None]:
# Gradient boosting
# =================
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# random_state makes the fitted ensemble reproducible between runs.
clasificador3 = GradientBoostingClassifier(n_estimators=200, max_depth = 4, learning_rate = .4, random_state = 43)
# scikit-learn expects a 1-D target, so the one-column DataFrames are flattened.
clasificador3.fit(pdX_train[columnsEvaluar], pdy_train.values.ravel())

# Despite the names, these hold the accuracy returned by score() on each split.
prediction3_train = clasificador3.score(pdX_train[columnsEvaluar], pdy_train.values.ravel())
prediction3_test = clasificador3.score(pdX_test[columnsEvaluar], pdy_test.values.ravel())

# Display both accuracies as the cell output (they were computed but never shown).
prediction3_train, prediction3_test