In [99]:
import pandas as pd
import numpy as np
import io
import gc
import time
import pickle

from ray.tune.sklearn import TuneSearchCV
from pprint import pprint
from datetime import date
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# settings
import warnings
warnings.filterwarnings("ignore")
gc.enable()

In [100]:
# Global Variables
random_state = 50

In [101]:
ROOT_PATH = 'C:\\Users\\Usuario\\Desktop\\Proyecto Final\\src\\data\\'

In [102]:
df_train = pd.read_csv(ROOT_PATH+'processed\\train_preprocessed.csv')
df_test = pd.read_csv(ROOT_PATH+'processed\\test_preprocessed.csv')

In [103]:
df_train.shape

(18047, 32)

In [104]:
df_test.shape

(9465, 30)

In [105]:
df_train.sample(3)

Unnamed: 0,Patient Age,Genes in mother's side,Inherited from father,Maternal gene,Paternal gene,Blood cell count (mcL),Mother's age,Father's age,Status,Respiratory Rate (breaths/min),...,Birth defects,White Blood cell count (thousand per microliter),Blood test result,Symptom 1,Symptom 2,Symptom 3,Symptom 4,Symptom 5,Genetic Disorder,Disorder Subclass
8396,5.0,1.0,0.0,0.0,0.0,5.01586,28.0,24.0,0.0,1.0,...,1.0,3.0,3.0,1.0,0.0,0.0,0.0,1.0,Mitochondrial genetic inheritance disorders,Mitochondrial myopathy
1865,13.0,1.0,0.0,1.0,1.0,4.732814,49.0,39.0,0.0,2.0,...,2.0,8.710243,1.0,0.0,1.0,1.0,1.0,1.0,Multifactorial genetic inheritance disorders,Diabetes
13745,13.0,1.0,0.0,1.0,0.0,4.985695,44.0,55.0,1.0,1.0,...,2.0,3.010961,2.0,1.0,0.0,0.0,0.0,1.0,Single-gene inheritance diseases,Tay-Sachs


In [106]:
df_train['GeneticDisorder-DisorderSubclass'] = df_train['Genetic Disorder'] + '<->' + df_train['Disorder Subclass']

In [107]:
df_train.sample(3)

Unnamed: 0,Patient Age,Genes in mother's side,Inherited from father,Maternal gene,Paternal gene,Blood cell count (mcL),Mother's age,Father's age,Status,Respiratory Rate (breaths/min),...,White Blood cell count (thousand per microliter),Blood test result,Symptom 1,Symptom 2,Symptom 3,Symptom 4,Symptom 5,Genetic Disorder,Disorder Subclass,GeneticDisorder-DisorderSubclass
15103,0.0,0.0,0.0,0.0,0.0,4.907809,34.577225,58.0,0.0,2.0,...,6.521732,3.0,0.0,0.0,0.0,0.0,1.0,Multifactorial genetic inheritance disorders,Cancer,Multifactorial genetic inheritance disorders<-...
9867,1.0,1.0,1.0,1.0,0.0,4.944528,34.581056,34.0,1.0,1.0,...,6.706288,4.0,1.0,1.0,0.0,1.0,0.489092,Single-gene inheritance diseases,Cystic fibrosis,Single-gene inheritance diseases<->Cystic fibr...
10956,5.0,1.0,1.0,1.0,1.0,4.875159,26.0,54.0,1.0,2.0,...,8.221602,2.0,0.0,1.0,0.0,0.0,0.0,Mitochondrial genetic inheritance disorders,Leigh syndrome,Mitochondrial genetic inheritance disorders<->...


### Checando si el dataset esta balanceado

In [108]:
target_count = df_train['GeneticDisorder-DisorderSubclass'].value_counts()
target_count

Mitochondrial genetic inheritance disorders<->Leigh syndrome                         4683
Mitochondrial genetic inheritance disorders<->Mitochondrial myopathy                 3971
Single-gene inheritance diseases<->Cystic fibrosis                                   3145
Single-gene inheritance diseases<->Tay-Sachs                                         2556
Multifactorial genetic inheritance disorders<->Diabetes                              1653
Single-gene inheritance diseases<->Hemochromatosis                                   1228
Mitochondrial genetic inheritance disorders<->Leber's hereditary optic neuropathy     587
Multifactorial genetic inheritance disorders<->Alzheimer's                            133
Multifactorial genetic inheritance disorders<->Cancer                                  91
Name: GeneticDisorder-DisorderSubclass, dtype: int64

### Separando los datos

In [109]:
target_labels = df_train['GeneticDisorder-DisorderSubclass'].values

df_train.drop(['Genetic Disorder','Disorder Subclass', 'GeneticDisorder-DisorderSubclass'], axis=1, inplace=True)

In [110]:
# Split for genetic_disorder_labels
from sklearn.model_selection import train_test_split
X_train, X_cv, y_train, y_cv = train_test_split(df_train, target_labels, test_size=0.1, random_state=random_state)

### Over Sampling usando SMOTE(Synthetic Minority Oversampling Technique)

In [111]:
# https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/
from imblearn.over_sampling import SMOTE
smote_overSampling = SMOTE()
X_train,y_train = smote_overSampling.fit_resample(X_train,y_train)
unique, counts = np.unique(y_train, return_counts=True)
dict(zip(unique, counts))

{"Mitochondrial genetic inheritance disorders<->Leber's hereditary optic neuropathy": 4192,
 'Mitochondrial genetic inheritance disorders<->Leigh syndrome': 4192,
 'Mitochondrial genetic inheritance disorders<->Mitochondrial myopathy': 4192,
 "Multifactorial genetic inheritance disorders<->Alzheimer's": 4192,
 'Multifactorial genetic inheritance disorders<->Cancer': 4192,
 'Multifactorial genetic inheritance disorders<->Diabetes': 4192,
 'Single-gene inheritance diseases<->Cystic fibrosis': 4192,
 'Single-gene inheritance diseases<->Hemochromatosis': 4192,
 'Single-gene inheritance diseases<->Tay-Sachs': 4192}

### Escalando los datos: genetic_disorder

In [112]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_cv_scaled = scaler.transform(X_cv)

X_test_scaled = scaler.transform(df_test)

X_train_scaled

array([[ 1.25052383, -1.35514597, -0.89857408, ..., -1.24530959,
         1.02548754, -1.10482565],
       [ 0.00708236, -1.35514597, -0.89857408, ..., -1.24530959,
         1.02548754,  1.06653074],
       [ 1.74790042,  0.86380498,  1.33092444, ..., -1.24530959,
         1.02548754,  1.06653074],
       ...,
       [ 1.49921212,  0.86380498, -0.89857408, ..., -1.24530959,
        -0.52117055, -1.10482565],
       [-1.19715312,  0.51398496, -0.54709122, ...,  0.97716585,
        -1.16402341,  0.72421405],
       [ 0.25577066,  0.01881699, -0.89857408, ...,  0.97716585,
        -0.33024632, -1.10482565]])

# Elegiendo modelos y ajustando hiperparametros

### **GradientBoostingClassifier**

In [73]:
%%time

from sklearn.ensemble import GradientBoostingClassifier

gb_clas = GradientBoostingClassifier(random_state=random_state)

# GRADIENT BOOSTING
grid_gradient_boosting_param = {"loss": ["deviance"],
                          "learning_rate": [0.1, 0.2],
                          "n_estimators": [100,200], 
                          "max_depth": [4,5], 
                          "max_features": ["sqrt", 3, 4],
                          }
Grid_gbct = RandomizedSearchCV(estimator=gb_clas, param_distributions=grid_gradient_boosting_param, verbose=1, n_iter=200, scoring='f1_weighted', cv=5)
Grid_gbct.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Wall time: 2h 22min 31s


RandomizedSearchCV(cv=5, estimator=GradientBoostingClassifier(random_state=50),
                   n_iter=200,
                   param_distributions={'learning_rate': [0.1, 0.2],
                                        'loss': ['deviance'],
                                        'max_depth': [4, 5],
                                        'max_features': ['sqrt', 3, 4],
                                        'n_estimators': [100, 200]},
                   scoring='f1_weighted', verbose=1)

### **XGBoostClassifier**

In [93]:
%%time
import xgboost


#xgb_clas = xgboost.XGBClassifier(objective='multi:softmax',metric='multiclass',
#                                  eval_metric='mlogloss', num_iterations=1000,random_state=random_state)


xgb_clas = xgboost.XGBClassifier(eval_metric='mlogloss',verbose = 1,random_state=random_state)


# A parameter grid for XGBoost
grid_xgb_params = {
        "learning_rate": [0.1],
        'gamma': [ 1, 2],
        "n_estimators": [600,700],
        'max_depth': [4, 5]
        }


Grid_xgb = RandomizedSearchCV(estimator=xgb_clas, param_distributions=grid_xgb_params, verbose=1, n_iter=100, scoring='f1_weighted', cv=3)
Grid_xgb.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
Parameters: { "verbose" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




KeyboardInterrupt: 

### **LGBMClassifier**

In [None]:
%%time
from lightgbm import LGBMClassifier

lgbm_clas = LGBMClassifier(objective='multi:softmax',random_state=random_state)

# A parameter grid for XGBoost

grid_lgbm_params = {
    'objective': 'multiclass',
    "n_estimators": [600],
    'learning_rate': [0.1],
    'max_depth': [6, 7]
}


Grid_lgbm = RandomizedSearchCV(estimator=lgbm_clas, param_distributions=grid_lgbm_params, verbose=1, n_iter=200, scoring='f1_weighted', cv=5)
Grid_lgbm.fit(X_train_scaled, y_train, eval_set=(X_cv_scaled, y_cv))

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[1]	valid_0's multi_logloss: 2.07648
[2]	valid_0's multi_logloss: 1.98569
[3]	valid_0's multi_logloss: 1.91154
[4]	valid_0's multi_logloss: 1.85016
[5]	valid_0's multi_logloss: 1.79973
[6]	valid_0's multi_logloss: 1.75456
[7]	valid_0's multi_logloss: 1.71524
[8]	valid_0's multi_logloss: 1.6808
[9]	valid_0's multi_logloss: 1.65264
[10]	valid_0's multi_logloss: 1.62577
[11]	valid_0's multi_logloss: 1.60071
[12]	valid_0's multi_logloss: 1.5799
[13]	valid_0's multi_logloss: 1.55994
[14]	valid_0's multi_logloss: 1.54227
[15]	valid_0's multi_logloss: 1.52417
[16]	valid_0's multi_logloss: 1.50876
[17]	valid_0's multi_logloss: 1.4958
[18]	valid_0's multi_logloss: 1.4843
[19]	valid_0's multi_logloss: 1.47239
[20]	valid_0's multi_logloss: 1.4632
[21]	valid_0's multi_logloss: 1.45153
[22]	valid_0's multi_logloss: 1.44433
[23]	valid_0's multi_logloss: 1.43729
[24]	valid_0's multi_logloss: 1.43064
[25]	valid_0's multi_logloss: 1.42217
[2

### **CatboostClassifier** 

In [116]:
%%time

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from catboost import CatBoostClassifier

cbc_clas = CatBoostClassifier(loss_function='MultiClass', use_best_model=True, task_type="GPU")

# Best Params: {'random_strength': 0.5, 'max_ctr_complexity': 3, 'learning_rate': 0.1, 'l2_leaf_reg': 5, 'iterations': 500, 'depth': 10, 'border_count': 200, 'bagging_temperature': 0.03}

grid_cbc_params = {
    'depth':[9,10],
    'iterations':[100,500],
    'learning_rate':[0.01,0.1],
    'l2_leaf_reg':[1,5],
    'border_count':[200],
    'bagging_temperature':[0.03,0.09],
    'random_strength':[0.2,0.5],
    'max_ctr_complexity':[1,3]
}

Grid_cbc = RandomizedSearchCV(estimator=cbc_clas, param_distributions=grid_cbc_params, verbose=1, n_iter=200, scoring='f1_weighted', cv=5)
Grid_cbc.fit(X_train_scaled, y_train, eval_set=(X_cv_scaled, y_cv))

Fitting 5 folds for each of 128 candidates, totalling 640 fits


Custom logger is already specified. Specify more than one logger at same time is not thread safe.

0:	learn: 2.1755333	test: 2.1842934	best: 2.1842934 (0)	total: 259ms	remaining: 25.6s
1:	learn: 2.1547822	test: 2.1717272	best: 2.1717272 (1)	total: 482ms	remaining: 23.6s
2:	learn: 2.1349217	test: 2.1596594	best: 2.1596594 (2)	total: 725ms	remaining: 23.4s
3:	learn: 2.1158507	test: 2.1479192	best: 2.1479192 (3)	total: 967ms	remaining: 23.2s
4:	learn: 2.0974426	test: 2.1365092	best: 2.1365092 (4)	total: 1.19s	remaining: 22.5s
5:	learn: 2.0794352	test: 2.1252059	best: 2.1252059 (5)	total: 1.39s	remaining: 21.8s
6:	learn: 2.0617609	test: 2.1141610	best: 2.1141610 (6)	total: 1.6s	remaining: 21.2s
7:	learn: 2.0448539	test: 2.1033652	best: 2.1033652 (7)	total: 1.81s	remaining: 20.8s
8:	learn: 2.0283996	test: 2.0926779	best: 2.0926779 (8)	total: 2.02s	remaining: 20.4s
9:	learn: 2.0126169	test: 2.0824884	best: 2.0824884 (9)	total: 2.22s	remaining: 20s
10:	learn: 1.9971232	test: 2.0722832	best: 2.0722832 (10)	total: 2.42s	remaining: 19.6s
11:	learn: 1.9822635	test: 2.0626018	best: 2.0626018 (1

KeyboardInterrupt: 

## Revisando los mejores parámetros

In [78]:
print("Best GradientBoostingClassifier Params:", Grid_gbct.best_params_)

Best GradientBoostingClassifier Params: {'n_estimators': 200, 'max_features': 'sqrt', 'max_depth': 5, 'loss': 'deviance', 'learning_rate': 0.2}


In [115]:
print("Best LGBMClassifier Params:", Grid_lgbm.best_params_)

Best LGBMClassifier Params: {'objective': 'l', 'n_estimators': 600, 'max_depth': 7, 'learning_rate': 0.1}


In [72]:
print("Best XGBoostClassifier Params:", Grid_xgb.best_params_)
print("Best CatboostClassifier Params:", Grid_cbc.best_params_)

Best GradientBoostingClassifier Params: {'n_estimators': 100, 'max_features': 'sqrt', 'max_depth': 4, 'loss': 'deviance', 'learning_rate': 0.1}
Best XGBoostClassifier Params: {'n_estimators': 600, 'min_child_weight': 1, 'max_depth': 4, 'learning_rate': 0.1, 'gamma': 1}
Best LGBMClassifier Params: {'objective': 'i', 'n_estimators': 600, 'max_depth': 6, 'learning_rate': 0.05, 'boosting_type': 'g'}


AttributeError: 'RandomizedSearchCV' object has no attribute 'best_params_'

In [79]:
print("Best GradientBoostingClassifier Params:", Grid_gbct.best_score_)

Best GradientBoostingClassifier Params: 0.6661951907503455


In [77]:
print("Best XGBoostClassifier Params:", Grid_xgb.best_score_)

print("Best CatboostClassifier Params:", Grid_cbc.best_score_)

Best GradientBoostingClassifier Params: 0.6661951907503455


AttributeError: 'RandomizedSearchCV' object has no attribute 'best_score_'

In [122]:
print("Best LGBMClassifier Params:", Grid_lgbm.best_score_)

Best LGBMClassifier Params: 0.7134963299888067


### Creando las predicciones

In [82]:
GradientBoostingpredictions_test = Grid_gbct.predict(X_test_scaled)

In [None]:
XGBoostpredictions_test = Grid_cbc.predict(X_test_scaled)

Catboostpredictions_test = Grid_cbc.predict(X_test_scaled)

In [119]:
LGBMpredictions_test = Grid_lgbm.predict(X_test_scaled)

## Creando el archivo Submission

In [84]:

def archivoSubmision(predictions_tests):
    readId = pd.read_csv(ROOT_PATH+'raw\\test.csv')

    predictions_genetic_disorder_test = []
    predictions_disorder_subclass_test = []

    for myString in predictions_tests:
        genetic_disorder, disorder_subclass = myString.split('<->')
        predictions_genetic_disorder_test.append(genetic_disorder) 
        predictions_disorder_subclass_test.append(disorder_subclass)

    
    submission = pd.DataFrame({
            "Patient Id": readId["Patient Id"],
            "Genetic Disorder": predictions_genetic_disorder_test,
            "Disorder Subclass": predictions_disorder_subclass_test,
        })
    return submission



In [85]:
archivoSubmision(GradientBoostingpredictions_test).to_csv('GradientBoostingsubmission.csv', index=False)

In [None]:
archivoSubmision(GradientBoostingpredictions_test).to_csv('GradientBoostingsubmission.csv', index=False)
archivoSubmision(XGBoostpredictions_test).to_csv('XGBoostsubmission.csv', index=False)

archivoSubmision(Catboostpredictions_test).to_csv('Catboostsubmission.csv', index=False)


In [120]:
archivoSubmision(LGBMpredictions_test).to_csv('LGBMsubmission.csv', index=False)

## Guardando los modelos

In [87]:

with open('GradientBoostingmodel.model', "wb") as archivo_salida:
    pickle.dump(Grid_gbct.best_estimator_, archivo_salida)


In [None]:
    
with open('XGBoostmodel.model', "wb") as archivo_salida:
    pickle.dump(Grid_xgb.best_estimator_, archivo_salida)
    
    
with open('Catboostmodel.model', "wb") as archivo_salida:
    pickle.dump(Grid_cbc.best_estimator_, archivo_salida)

In [121]:

with open('LGBMmodel.model', "wb") as archivo_salida:
    pickle.dump(Grid_lgbm.best_estimator_, archivo_salida)