# Diplodatos Kaggle Competition

We present this peace of code to create the baseline for the competition, and as an example of how to deal with these kind of problems. The main goals are that you:

1. Learn
1. Try different models and see which one fits the best the given data
1. Get a higher score than the given one in the current baseline example
1. Try to get the highest score in the class :)

In [89]:
# Import the required packages
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [90]:
# load the given labels
breed = pd.read_csv('../data/breed_labels.csv')
color = pd.read_csv('../data/color_labels.csv')
state = pd.read_csv('../data/state_labels.csv')

Now we take a look at the labels, just to understand what these are

In [91]:
breed.head()

Unnamed: 0,BreedID,Type,BreedName
0,1,1,Affenpinscher
1,2,1,Afghan Hound
2,3,1,Airedale Terrier
3,4,1,Akbash
4,5,1,Akita


In [92]:
color.head()

Unnamed: 0,ColorID,ColorName
0,1,Black
1,2,Brown
2,3,Golden
3,4,Yellow
4,5,Cream


In [93]:
state

Unnamed: 0,StateID,StateName
0,41336,Johor
1,41325,Kedah
2,41367,Kelantan
3,41401,Kuala Lumpur
4,41415,Labuan
5,41324,Melaka
6,41332,Negeri Sembilan
7,41335,Pahang
8,41330,Perak
9,41380,Perlis


And now we are ready to deal with the *original* dataset...

In [94]:
original_df = pd.read_csv('../data/train.csv')

In [95]:
original_df.columns

Index(['Type', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'Description',
       'AdoptionSpeed', 'PID'],
      dtype='object')

In [96]:
original_df.describe()

Unnamed: 0,Type,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,Quantity,Fee,State,AdoptionSpeed,PID
count,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0
mean,1.454734,10.520412,265.469854,74.388868,1.779059,2.230675,3.236912,1.856738,1.860518,1.460971,1.72973,1.566528,1.912115,1.036666,1.584011,20.80996,41345.994613,2.5189,7477.025799
std,0.49797,18.374027,60.12149,123.43401,0.684763,1.743985,2.748595,2.974465,0.547535,0.593843,0.670791,0.701482,0.564041,0.198228,1.488348,78.397243,32.409109,1.176018,4310.921553
min,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,41324.0,0.0,0.0
25%,1.0,2.0,265.0,0.0,1.0,1.0,0.0,0.0,2.0,1.0,1.0,1.0,2.0,1.0,1.0,0.0,41326.0,2.0,3768.25
50%,1.0,3.0,266.0,0.0,2.0,2.0,2.0,0.0,2.0,1.0,2.0,1.0,2.0,1.0,1.0,0.0,41326.0,2.0,7473.5
75%,2.0,12.0,307.0,188.0,2.0,3.0,6.0,5.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,0.0,41401.0,4.0,11200.75
max,2.0,255.0,307.0,307.0,3.0,7.0,7.0,7.0,4.0,3.0,3.0,3.0,3.0,3.0,20.0,3000.0,41415.0,4.0,14992.0


Create a function to transform the datasets. This is done by means of a function so that the transformations are the same for the training and testing datasets... We replace the encodings just to make it easy to "visualize" the data

In [97]:
def transform_data(train_data_fname, test_data_fname):
    def transform_columns(df):
        df = df.drop(["Description"], axis=1)
        df.Type = df.Type.replace({1: 'Dog', 2: 'Cat'})
        df.Gender = df.Gender.replace({1:'Male', 2:'Female', 3:'Mixed'})
        df.MaturitySize = df.MaturitySize.replace({1:'S', 2:'M', 3:'L', 4:'XL', 0:'N/A'})
        df.FurLength = df.FurLength.replace({1:'S', 2:'M', 3:'L', 0:'N/A'})
        df.Vaccinated = df.Vaccinated.replace({1:'T', 2:'N', 3:'N/A'})
        df.Dewormed = df.Dewormed.replace({1:'T', 2:'F', 3:'N/A'})
        df.Sterilized = df.Sterilized.replace({1:'T', 2:'F', 3:'N/A'})
        df.Health = df.Health.replace({1:'Healthy', 2: 'MinorInjury', 3:'SeriousInjury', 0: 'N/A'})
        df.Color1 = df.Color1.replace(dict(list(zip(color.ColorID, color.ColorName)) + [(0, "N/A")]))
        df.Color2 = df.Color2.replace(dict(list(zip(color.ColorID, color.ColorName)) + [(0, "N/A")]))
        df.Color3 = df.Color3.replace(dict(list(zip(color.ColorID, color.ColorName)) + [(0, "N/A")]))
        df.Breed1 = df.Breed1.replace(dict(list(zip(breed.BreedID, breed.BreedName)) + [(0, "N/A")]))
        df.Breed2 = df.Breed2.replace(dict(list(zip(breed.BreedID, breed.BreedName)) + [(0, "N/A")]))
        return df
    
    df_train = pd.read_csv(train_data_fname)
    df_train = transform_columns(df_train)
    df_test = pd.read_csv(test_data_fname)
    df_test = transform_columns(df_test)
    
    df = pd.concat([df_train, df_test], sort=True)

    # set dummy variables for everything
    # except from Age, Quantity, Fee
    df = pd.get_dummies(df)
    # get train and test back
    n = len(df_train)
    df_train = df.iloc[:n]
    df_test = df.iloc[n:]
    
    y = df_train['AdoptionSpeed']
    X = df_train.drop('AdoptionSpeed', axis=1)
    yy = None
    XX = df_test.drop('AdoptionSpeed', axis=1)

    return X, y, XX, yy

Load the data...

In [98]:
X, y, XX, yy = transform_data("../data/train.csv", "../data/test.csv")

In [99]:
results = pd.DataFrame(columns=('clf', 'best_acc'))

In [100]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)

#### Importar Grid Search y las métricas

In [101]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

### Decision Tree

#### Importar modelo

In [102]:
from sklearn.tree import DecisionTreeClassifier

####  Los hiperparámetros por omisión del sklearn

In [103]:
dt_basal = DecisionTreeClassifier(random_state=42)
scores = cross_val_score(estimator=dt_basal, X=X_train.drop(["PID"], axis=1), y=y_train, cv=3)
print(scores)
print(scores.mean())
#print(dt_basal)
results = results.append({'clf': dt_basal, 'best_acc': scores.mean()}, ignore_index=True)

[0.30809717 0.30741191 0.31482982]
0.31011296512160474


### Probamos con GridSearch

#### Probamos un Decision Tree agregando algunos hiperparámetros a los anteriores. Los hiperparámetros que evaluamos, con Grid Search, son:
- criterio: gini y entropy
- min_samples_leaf [1...7]
- min_samples_split [2 ... 200]

In [105]:
tree_param = {
    'criterion': ('gini', 'entropy'), 
    'min_samples_leaf': (1, 2, 5, 7),
    'min_samples_split': (2, 3, 5, 10, 25, 50, 100, 200)
}
             
tree = DecisionTreeClassifier(random_state=42)
tree_clf = GridSearchCV(tree, tree_param, scoring='accuracy', cv=5, iid=False)
tree_clf.fit(X_train.drop(["PID"], axis=1), y_train)
best_tree_clf = tree_clf.best_estimator_
#score general de accuracy
print('Best Decision Tree accuracy: ', tree_clf.best_score_)
print(best_tree_clf)
results = results.append({'clf': best_tree_clf, 'best_acc': tree_clf.best_score_}, ignore_index=True)

print('The best classifier so far is: ')
print(results.loc[results['best_acc'].idxmax()]['clf'])

Best Decision Tree accuracy:  0.37234878199372645
DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=7, min_samples_split=200,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')
The best classifier so far is: 
DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=7, min_samples_split=200,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')


##### Los mejores hiperparámetros que selecciona grid search son:
- criterio : entropy
- min_samples_leaf : 7
- min_samples_split : 200

#### Luego probamos con más opciones para min_samples_leaf y min_samples_split, porque ambos daban los mejores hiperparámetros en el máximo del rango que probamos para ambos hiperparámetros (200 y 7, respectivamente). También agregamos class_weight para probar con "balanceada" además del None que es el valor por defecto. 

In [106]:
tree_param = {'criterion':('entropy',), 'min_samples_leaf':(7,10, 12, 15),
              'min_samples_split':(100,200, 300, 400), 'max_leaf_nodes':(None,),
             'class_weight':('balanced',None) }

tree = DecisionTreeClassifier(random_state=42)
tree_clf = GridSearchCV(tree, tree_param, scoring='accuracy', cv=5, iid=False)
tree_clf.fit(X_train.drop(["PID"], axis=1), y_train)
best_tree_clf = tree_clf.best_estimator_
#score general de accuracy
print('Best Decision Tree accuracy: ', tree_clf.best_score_)
print(best_tree_clf)
results = results.append({'clf': best_tree_clf, 'best_acc': tree_clf.best_score_}, ignore_index=True)

print('The best classifier so far is: ')
print(results.loc[results['best_acc'].idxmax()]['clf'])

Best Decision Tree accuracy:  0.374508938839338
DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=12, min_samples_split=200,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')
The best classifier so far is: 
DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=12, min_samples_split=200,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')


#### El min_samples_split óptimo sigue siendo 200. Para min_samples_leaf, el mejor hiperparámetro da 12 (en vez de 7 como en el caso anterior). Para class_weight, sigue optimizándose con igual peso todas las clases. Cambia mńimamente el accuracy respecto al anterior

### Random Forest

#### importar el modelo

In [107]:
from sklearn.ensemble import RandomForestClassifier

#### Parámetros por defecto RF

In [108]:
rf_basal = RandomForestClassifier(random_state=42)

scores = cross_val_score(estimator=rf_basal, X=X_train.drop(["PID"], axis=1), y=y_train, cv=3)
print(scores)
print(scores.mean())
#print(dt_basal)
results = results.append({'clf': rf_basal, 'best_acc': scores.mean()}, ignore_index=True)



[0.33319838 0.32766302 0.30794165]
0.3229343517311454




#### Random search CV para el RF

In [109]:
from sklearn.model_selection import RandomizedSearchCV


# nro de árboles
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 1000, num = 10)]
# nro de features para agrupar
max_features = ['auto', 'sqrt']
# max nro de niveles del árbol
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# min nro de muestras para separar el nodo
min_samples_split = [2, 10, 100, 400]
# min nro de muestras por hoja
min_samples_leaf = [1, 2, 7, 12, 15]
# método para seleccionar las muestras
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
clf = RandomForestClassifier(random_state=42)
rf_random = RandomizedSearchCV(estimator = clf, param_distributions = random_grid, 
                               n_iter = 100, cv = 3, verbose=2, random_state=42, 
                               n_jobs = -1, scoring='accuracy', iid=False)
#rf_clf = GridSearchCV(clf, rf_param, scoring='accuracy', cv=3, iid=False)
rf_random.fit(X_train.drop(["PID"], axis=1), y_train)
rf_random.best_params_
best_rf_random_clf = rf_random.best_estimator_
print('Best Decision Tree accuracy: ', rf_random.best_score_)
print(best_rf_random_clf)
results = results.append({'clf': best_rf_random_clf, 'best_acc': rf_random.best_score_}, ignore_index=True)

print('The best classifier so far is: ')
print(results.loc[results['best_acc'].idxmax()]['clf'])

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   25.0s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  5.3min finished


Best Decision Tree accuracy:  0.3763989932178637
RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=60, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=780, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)
The best classifier so far is: 
RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=60, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=780, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)


In [110]:
rf_rand_results = pd.DataFrame(rf_random.cv_results_)



In [111]:
rf_rand_results.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_n_estimators', 'param_min_samples_split',
       'param_min_samples_leaf', 'param_max_features', 'param_max_depth',
       'param_bootstrap', 'params', 'split0_test_score', 'split1_test_score',
       'split2_test_score', 'mean_test_score', 'std_test_score',
       'rank_test_score', 'split0_train_score', 'split1_train_score',
       'split2_train_score', 'mean_train_score', 'std_train_score'],
      dtype='object')

In [112]:
rf_resultados = rf_rand_results.sort_values(['mean_test_score'], ascending=False)[['params', 'mean_test_score']][:10]
for index, row in rf_resultados.iterrows():
    print(row['params'], row['mean_test_score'])


{'n_estimators': 780, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 60, 'bootstrap': False} 0.3763989932178637
{'n_estimators': 780, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 90, 'bootstrap': False} 0.3763989932178637
{'n_estimators': 670, 'min_samples_split': 100, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 90, 'bootstrap': False} 0.3759965407061358
{'n_estimators': 230, 'min_samples_split': 100, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 70, 'bootstrap': False} 0.37572592429761703
{'n_estimators': 1000, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 50, 'bootstrap': False} 0.37477841160724995
{'n_estimators': 340, 'min_samples_split': 100, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': None, 'bootstrap': False} 0.3742417722176749
{'n_estimators': 890, 'min_samples_split': 100, 'min_samples_leaf': 1, 'max_features': 'auto',

#### RF con grid search con los hiperparámetros que nos dieron. Pero salvo min_samples_leaf, no se ven otros valores que se mantengan dentro de un rango acotado (qué significa eso? ... que dan más o menos lo mismo, entonces ninguno optimiza significativamente?)

In [113]:
rf_param = {'n_estimators': (250, 500, 800), 
            'min_samples_leaf': (1,),
            'min_samples_split': (2, 50, 100), 
            'max_features': ('sqrt', 'auto'), 
            'max_depth': (50, 90, None),
            'bootstrap': (True, False)
            }

clf = RFT(random_state=42)
rf_clf = GridSearchCV(clf, rf_param, scoring='accuracy', cv=3, iid=False)
rf_clf.fit(X_train.drop(["PID"], axis=1), y_train)
best_rf_clf = rf_clf.best_estimator_

print('Best Decision Tree accuracy: ', rf_clf.best_score_)
print(best_rf_clf)

results = results.append({'clf': best_rf_clf, 'best_acc': rf_clf.best_score_}, ignore_index=True)

print('The best classifier so far is: ')
print(results.loc[results['best_acc'].idxmax()]['clf'])

Best Decision Tree accuracy:  0.37640189115709144
RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=90, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=100,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)
The best classifier so far is: 
RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=90, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=100,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)


##### Los mejores hiperparámetros que selecciona grid search para el random forest son:
- criterio : gini
- min_samples_leaf : 1
- min_samples_split : 100

Pero la mejora es prácticamente nula respecto al random search (0.376399 0.376402))

### Knn

In [114]:
from sklearn.neighbors import KNeighborsClassifier

In [115]:
knn_basal = KNeighborsClassifier()

scores = cross_val_score(estimator=knn_basal, X=X_train.drop(["PID"], axis=1), y=y_train, cv=3)
print(scores)
print(scores.mean())
#print(dt_basal)
results = results.append({'clf': knn_basal, 'best_acc': scores.mean()}, ignore_index=True)

[0.33481781 0.31065209 0.30267423]
0.3160480432585906


#### Hicimos un grid search sobre algunos de los hiperparámetros para el knn

In [116]:
Knn_param = {
    'n_neighbors': [3, 5, 10], 
    'weights': ['uniform', 'distance'], 
    'p': [1, 2] 
}

knn_clf = GridSearchCV(KNeighborsClassifier(), Knn_param, scoring='accuracy', cv=3, iid=False)
knn_clf.fit(X_train.drop(["PID"], axis=1), y_train)
best_knn_clf = knn_clf.best_estimator_
print('Best Knn accuracy: ', knn_clf.best_score_)
print(best_grid_clf)
results = results.append({'clf': best_knn_clf, 'best_acc': knn_clf.best_score_}, ignore_index=True)

print('The best classifier so far is: ')
print(results.loc[results['best_acc'].idxmax()]['clf'])

Best Knn accuracy:  0.341565706915526
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=10, p=2,
           weights='distance')
The best classifier so far is: 
The best classifier so far is: 
RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=90, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=100,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)


#### Mejora respecto a los parámetros por omisión pero está bastante por debajo de decision tree y árboles de decisión - no seguimos probando con otros hiperparámetros para knn.

### SGD

In [130]:
from sklearn.linear_model import SGDClassifier

In [131]:
params = {
    'alpha': (1,0.0001), 
    'average': (False, ), 
    'class_weight': (None, ), 
    'early_stopping': (False, ), 
    'eta0': (0.0, ), 
    'fit_intercept': (True, ), 
    'l1_ratio': (0.15, ), 
    'learning_rate': ('optimal', ),
    'loss': ('hinge', 'modified_huber'), 
    'max_iter': (1000, ), 
    'n_iter': (None, ), 
    'n_iter_no_change': (5, ), 
    'n_jobs': (None, ), 
    'penalty': ('l1', 'l2',),  
    'power_t': (0.5, ), 
    'random_state': (42, ), 
    'shuffle': (True, ), 
    'tol': (0.1, ), 
    'validation_fraction': (0.1, ),
    'warm_start': (False, ),
}

clf_class = SGDClassifier

sgd_clf = GridSearchCV(clf_class(), params, scoring='accuracy', cv=3, iid=False)
sgd_clf.fit(X_train.drop(["PID"], axis=1), y_train)
best_sgd_clf = sgd_clf.best_estimator_

print('Best sgd accuracy: ', sgd_clf.best_score_)
print(best_sgd_clf)
results = results.append({'clf': best_sgd_clf, 'best_acc': sgd_clf.best_score_}, ignore_index=True)

print('The best classifier so far is: ')
print(results.loc[results['best_acc'].idxmax()]['clf'])




Best sgd accuracy:  0.2729843880791654
SGDClassifier(alpha=1, average=False, class_weight=None, early_stopping=False,
       epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=1000, n_iter=None,
       n_iter_no_change=5, n_jobs=None, penalty='l1', power_t=0.5,
       random_state=42, shuffle=True, tol=0.1, validation_fraction=0.1,
       verbose=0, warm_start=False)
The best classifier so far is: 
RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=90, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=100,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)




### Evaluamos resultados

In [132]:
for index, val in results.iterrows():
    print (val['clf'],val['best_acc'])

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best') 0.31011296512160474
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=100,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best') 0.3519622095560508
DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=7, min_samples_split=200,
            m

#### Calculamos sobre el conjunto de entrenamiento completo con los modelos e hiperparámetros que seleccionamos, y usamos ese modelo en validación. Calcular el accuracy y comparar los modelos.

In [133]:
for index, val in results.iterrows():
    print (val['clf'],val['best_acc'])
    clf = val['clf']
    clf_t = clf.fit(X_train, y_train)
    y_train_pred = clf_t.predict(X_train)
    acc_t = accuracy_score(y_train, y_train_pred)
    #clf_v = clf.fit(X_valid, y_valid)
    y_valid_pred = clf_t.predict(X_valid)
    acc_v = accuracy_score(y_valid, y_valid_pred)
    print(acc_t, acc_v, val['best_acc'])
    

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best') 0.31011296512160474
1.0 0.2992125984251969 0.31011296512160474
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=100,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best') 0.3519622095560508
0.47144592952612396 0.33858267716535434 0.3519622095560508
DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decre



* Comparando la métrica sobre el train entero, y el de validación, con la que sale de hacer cross validation, se ve que los modelos overfitean cuando se ajustan al entrenamiento (sin cv), y los resultados de accuracy se parecen bastante entre validacion y cv. Es esperable, pero en algunos es muy notable.

----

* Tomando los resultados de validación los que mejor funcionan son:
    
    1. `Random forest con (bootstrap=False, class_weight=None, criterion='gini', max_depth=60, max_features='sqrt', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=2, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=780, n_jobs=None, oob_score=False, random_state=42, verbose=0, warm_start=False)`
    
    2. `RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',max_depth=90, max_features='sqrt', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=100, min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None, oob_score=False, random_state=42, verbose=0, warm_start=False)`
    
---

* Tomando los resultados de cv, los mejores son también esos dos modelos, pero están cerca las métricas de:

    1. `DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=7, min_samples_split=200, min_weight_fraction_leaf=0.0, presort=False, random_state=42, splitter='best')`

    2. `DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=12, min_samples_split=200, min_weight_fraction_leaf=0.0, presort=False, random_state=42, splitter='best')`

**And finally**, we predict the unknown label for the testing set



Best sgd accuracy:  0.2729843880791654
SGDClassifier(alpha=1, average=False, class_weight=None, early_stopping=False,
       epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=1000, n_iter=None,
       n_iter_no_change=5, n_jobs=None, penalty='l1', power_t=0.5,
       random_state=42, shuffle=True, tol=0.1, validation_fraction=0.1,
       verbose=0, warm_start=False)




In [None]:
yy = bb_clf.predict(XX.drop(["PID"], axis=1))
yy = yy.astype(np.int)
submission = pd.DataFrame(list(zip(XX.PID, yy)), columns=["PID", "AdoptionSpeed"])
submission.to_csv("../data/submission.csv", header=True, index=False)

The last thing we do is generating a file that should be *submitted* on kaggle

In [None]:
#clasificador = {clf:0}

yy = results.clf.iloc[0].predict(XX.drop(["PID"], axis=1))
yy = yy.astype(np.int)

In [None]:
submission = pd.DataFrame(list(zip(XX.PID, yy)), columns=["PID", "AdoptionSpeed"])

In [None]:
submission.to_csv("../data/submission.csv", header=True, index=False)