# Práctica 9

## Preparación del ambiente

In [92]:
import numpy as np
import pandas as pd
from six import StringIO 
import PIL.Image as img
from mnist import MNIST
import pydotplus
from sklearn.tree import _tree
from IPython.display import Image 
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier, export_graphviz



## Funciones relevantes

In [26]:
def classification_metrics(X, y, estimator):
    ls_scores_roc = cross_val_score(estimator=estimator, X=X, y=y, scoring="accuracy", n_jobs=-1, cv=4)
    print(f'accuracy media: {np.mean(ls_scores_roc)}, desviación estándar: {np.std(ls_scores_roc)}')

## 1 Importar dataset

In [3]:
df= pd.read_csv("digits.csv")

In [4]:
df.head()

Unnamed: 0,label,1x1,1x2,1x3,1x4,1x5,1x6,1x7,1x8,1x9,...,28x19,28x20,28x21,28x22,28x23,28x24,28x25,28x26,28x27,28x28
0,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 2 Separación de datos

In [5]:
X = df[[x for x in df.columns if x != "label"]]
y = df["label"]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7)

In [7]:
y_train

3522    1
5656    1
4072    1
3737    2
655     8
       ..
2399    6
4755    3
5101    2
2489    8
630     0
Name: label, Length: 4200, dtype: int64

## 3 Modelado 

### Red neuronal

In [8]:
mlp = MLPClassifier()

In [9]:
mlp.fit(X_train, y_train)

MLPClassifier()

In [10]:
mlp.score(X_train,y_train)

1.0

In [11]:
mlp.score(X_test,y_test)

0.8816666666666667

### Árbol de desición

In [12]:
tree = DecisionTreeClassifier()

In [13]:
pipe = Pipeline([("tree", tree)])

In [14]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('tree', DecisionTreeClassifier())])

In [15]:
pipe.score(X_train, y_train)

1.0

In [46]:
pipe.score(X_test, y_test)

0.7544444444444445

## 4 Cross Validation

### Red neuronal

In [21]:
classification_metrics(X_train,y_train,mlp)

accurancy media: 0.8592857142857143, desviación estándar: 0.0032559034121850674


### Árbol de decisión

In [22]:
classification_metrics(X_train,y_train,tree)

accurancy media: 0.7397619047619047, desviación estándar: 0.016097421660843218


## 5 Hiperparametrización 

### Red neuronal

In [23]:
param_grid = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}

In [27]:
search = RandomizedSearchCV(param_distributions=param_grid, cv=4, n_jobs=-1, scoring="accuracy", estimator=mlp, n_iter=10, verbose=5)

In [28]:
search.fit(X_train,y_train)

Fitting 4 folds for each of 10 candidates, totalling 40 fits
[CV 1/4] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50, 50, 50), learning_rate=adaptive, solver=adam;, score=0.866 total time=   5.2s
[CV 2/4] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50, 50, 50), learning_rate=adaptive, solver=adam;, score=0.866 total time=   5.9s
[CV 3/4] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50, 50, 50), learning_rate=adaptive, solver=adam;, score=0.863 total time=   4.4s
[CV 4/4] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50, 50, 50), learning_rate=adaptive, solver=adam;, score=0.849 total time=   3.8s
[CV 1/4] END activation=tanh, alpha=0.05, hidden_layer_sizes=(50, 50, 50), learning_rate=constant, solver=adam;, score=0.895 total time=   5.9s
[CV 2/4] END activation=tanh, alpha=0.05, hidden_layer_sizes=(50, 50, 50), learning_rate=constant, solver=adam;, score=0.874 total time=   6.2s
[CV 3/4] END activation=tanh, alpha=0.05, hidden_layer_sizes=(50, 5



[CV 1/4] END activation=relu, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=sgd;, score=0.815 total time=  21.2s




[CV 2/4] END activation=relu, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=sgd;, score=0.832 total time=  21.7s




[CV 3/4] END activation=relu, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=sgd;, score=0.782 total time=  21.2s




[CV 4/4] END activation=relu, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=sgd;, score=0.459 total time=  21.2s
[CV 1/4] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50, 50, 50), learning_rate=constant, solver=adam;, score=0.871 total time=   7.3s
[CV 2/4] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50, 50, 50), learning_rate=constant, solver=adam;, score=0.859 total time=   7.2s
[CV 3/4] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50, 50, 50), learning_rate=constant, solver=adam;, score=0.874 total time=   6.6s
[CV 4/4] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50, 50, 50), learning_rate=constant, solver=adam;, score=0.857 total time=   5.0s




[CV 1/4] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50, 50, 50), learning_rate=adaptive, solver=sgd;, score=0.870 total time=  18.3s




[CV 2/4] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50, 50, 50), learning_rate=adaptive, solver=sgd;, score=0.880 total time=  24.1s




[CV 3/4] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50, 50, 50), learning_rate=adaptive, solver=sgd;, score=0.890 total time=  18.5s




[CV 4/4] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50, 50, 50), learning_rate=adaptive, solver=sgd;, score=0.860 total time=  19.0s
[CV 1/4] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50, 100, 50), learning_rate=constant, solver=adam;, score=0.885 total time=   9.9s
[CV 2/4] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50, 100, 50), learning_rate=constant, solver=adam;, score=0.870 total time=  11.8s
[CV 3/4] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50, 100, 50), learning_rate=constant, solver=adam;, score=0.875 total time=  11.8s
[CV 4/4] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50, 100, 50), learning_rate=constant, solver=adam;, score=0.884 total time=   8.4s




[CV 1/4] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50, 100, 50), learning_rate=adaptive, solver=sgd;, score=0.887 total time=  23.8s




[CV 2/4] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50, 100, 50), learning_rate=adaptive, solver=sgd;, score=0.866 total time=  20.9s




[CV 3/4] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50, 100, 50), learning_rate=adaptive, solver=sgd;, score=0.876 total time=  20.4s




[CV 4/4] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50, 100, 50), learning_rate=adaptive, solver=sgd;, score=0.876 total time=  26.0s




[CV 1/4] END activation=tanh, alpha=0.05, hidden_layer_sizes=(50, 50, 50), learning_rate=adaptive, solver=sgd;, score=0.873 total time=  25.4s




[CV 2/4] END activation=tanh, alpha=0.05, hidden_layer_sizes=(50, 50, 50), learning_rate=adaptive, solver=sgd;, score=0.872 total time=  20.6s




[CV 3/4] END activation=tanh, alpha=0.05, hidden_layer_sizes=(50, 50, 50), learning_rate=adaptive, solver=sgd;, score=0.873 total time=  28.6s




[CV 4/4] END activation=tanh, alpha=0.05, hidden_layer_sizes=(50, 50, 50), learning_rate=adaptive, solver=sgd;, score=0.882 total time=  37.5s
[CV 1/4] END activation=tanh, alpha=0.05, hidden_layer_sizes=(50, 100, 50), learning_rate=constant, solver=adam;, score=0.878 total time=  12.6s
[CV 2/4] END activation=tanh, alpha=0.05, hidden_layer_sizes=(50, 100, 50), learning_rate=constant, solver=adam;, score=0.864 total time=  11.8s
[CV 3/4] END activation=tanh, alpha=0.05, hidden_layer_sizes=(50, 100, 50), learning_rate=constant, solver=adam;, score=0.888 total time=  15.4s
[CV 4/4] END activation=tanh, alpha=0.05, hidden_layer_sizes=(50, 100, 50), learning_rate=constant, solver=adam;, score=0.880 total time=  18.2s
[CV 1/4] END activation=tanh, alpha=0.05, hidden_layer_sizes=(50, 100, 50), learning_rate=adaptive, solver=adam;, score=0.878 total time=  10.0s
[CV 2/4] END activation=tanh, alpha=0.05, hidden_layer_sizes=(50, 100, 50), learning_rate=adaptive, solver=adam;, score=0.887 total 

RandomizedSearchCV(cv=4, estimator=MLPClassifier(), n_jobs=-1,
                   param_distributions={'activation': ['tanh', 'relu'],
                                        'alpha': [0.0001, 0.05],
                                        'hidden_layer_sizes': [(50, 50, 50),
                                                               (50, 100, 50),
                                                               (100,)],
                                        'learning_rate': ['constant',
                                                          'adaptive'],
                                        'solver': ['sgd', 'adam']},
                   scoring='accuracy', verbose=5)

In [29]:
search.best_estimator_

MLPClassifier(activation='tanh', hidden_layer_sizes=(50, 100, 50))

In [30]:
search.best_score_

0.8785714285714286

In [32]:
search.score(X_train,y_train)

0.9078571428571428

In [33]:
search.score(X_test,y_test)

0.8794444444444445

#### Perservación mejor modelo

In [34]:
pd.to_pickle(search.best_estimator_, "mlpDigits.pickle")

### Árbol de decisión

In [47]:
param_dist = {"tree__max_depth": range(1, 4),
              "tree__max_features": ["auto", "sqrt", "log2"],
              "tree__min_samples_leaf": [x/100 for x in range(5, 41, 1)] + [x for x in range(1, 15)], 
              "tree__criterion": ["gini", "entropy"],
              "tree__splitter": ["best", "random"],
              "tree__class_weight": ["balanced", None]}

In [38]:
search = GridSearchCV(param_grid=param_dist, cv=4, n_jobs=-1, scoring="accuracy", estimator=pipe, verbose=5,)

In [43]:
search.fit(X_train,y_train)

Fitting 4 folds for each of 3600 candidates, totalling 14400 fits
[CV 1/4] END tree__class_weight=balanced, tree__criterion=gini, tree__max_depth=1, tree__max_features=auto, tree__min_samples_leaf=0.05, tree__splitter=best;, score=0.181 total time=   0.1s
[CV 2/4] END tree__class_weight=balanced, tree__criterion=gini, tree__max_depth=1, tree__max_features=auto, tree__min_samples_leaf=0.05, tree__splitter=best;, score=0.197 total time=   0.1s
[CV 3/4] END tree__class_weight=balanced, tree__criterion=gini, tree__max_depth=1, tree__max_features=auto, tree__min_samples_leaf=0.05, tree__splitter=best;, score=0.212 total time=   0.1s
[CV 4/4] END tree__class_weight=balanced, tree__criterion=gini, tree__max_depth=1, tree__max_features=auto, tree__min_samples_leaf=0.05, tree__splitter=best;, score=0.205 total time=   0.1s
[CV 1/4] END tree__class_weight=balanced, tree__criterion=gini, tree__max_depth=1, tree__max_features=auto, tree__min_samples_leaf=0.05, tree__splitter=random;, score=0.177 t

GridSearchCV(cv=4,
             estimator=Pipeline(steps=[('tree', DecisionTreeClassifier())]),
             n_jobs=-1,
             param_grid={'tree__class_weight': ['balanced', None],
                         'tree__criterion': ['gini', 'entropy'],
                         'tree__max_depth': range(1, 4),
                         'tree__max_features': ['auto', 'sqrt', 'log2'],
                         'tree__min_samples_leaf': [0.05, 0.06, 0.07, 0.08,
                                                    0.09, 0.1, 0.11, 0.12, 0.13,
                                                    0.14, 0.15, 0.16, 0.17,
                                                    0.18, 0.19, 0.2, 0.21, 0.22,
                                                    0.23, 0.24, 0.25, 0.26,
                                                    0.27, 0.28, 0.29, 0.3, 0.31,
                                                    0.32, 0.33, 0.34, ...],
                         'tree__splitter': ['best', 'random']},
       

In [44]:
search.best_estimator_

Pipeline(steps=[('tree',
                 DecisionTreeClassifier(max_depth=3, max_features='sqrt',
                                        min_samples_leaf=11))])

In [45]:
search.best_score_

0.43547619047619046

In [60]:
search.score(X_test,y_test)

0.3661111111111111

In [54]:
param_distN  = {"tree__max_depth": range(4,5),
              "tree__max_features": ["auto", "sqrt", "log2"],
              "tree__min_samples_leaf": [x/100 for x in range(5, 41, 1)] + [x for x in range(1, 15)], 
              "tree__criterion": ["gini", "entropy"],
              "tree__splitter": ["best", "random"],
              "tree__class_weight": ["balanced", None]}

In [55]:
search2 = GridSearchCV(param_grid=param_distN, cv=4, n_jobs=-1, scoring="accuracy", estimator=pipe, verbose=5,)

In [57]:
search2.fit(X_train,y_train)

Fitting 4 folds for each of 1200 candidates, totalling 4800 fits
[CV 1/4] END tree__class_weight=balanced, tree__criterion=gini, tree__max_depth=4, tree__max_features=auto, tree__min_samples_leaf=0.05, tree__splitter=best;, score=0.457 total time=   0.1s
[CV 2/4] END tree__class_weight=balanced, tree__criterion=gini, tree__max_depth=4, tree__max_features=auto, tree__min_samples_leaf=0.05, tree__splitter=best;, score=0.453 total time=   0.1s
[CV 3/4] END tree__class_weight=balanced, tree__criterion=gini, tree__max_depth=4, tree__max_features=auto, tree__min_samples_leaf=0.05, tree__splitter=best;, score=0.485 total time=   0.1s
[CV 4/4] END tree__class_weight=balanced, tree__criterion=gini, tree__max_depth=4, tree__max_features=auto, tree__min_samples_leaf=0.05, tree__splitter=best;, score=0.428 total time=   0.1s
[CV 1/4] END tree__class_weight=balanced, tree__criterion=gini, tree__max_depth=4, tree__max_features=auto, tree__min_samples_leaf=0.05, tree__splitter=random;, score=0.410 to

GridSearchCV(cv=4,
             estimator=Pipeline(steps=[('tree', DecisionTreeClassifier())]),
             n_jobs=-1,
             param_grid={'tree__class_weight': ['balanced', None],
                         'tree__criterion': ['gini', 'entropy'],
                         'tree__max_depth': range(4, 5),
                         'tree__max_features': ['auto', 'sqrt', 'log2'],
                         'tree__min_samples_leaf': [0.05, 0.06, 0.07, 0.08,
                                                    0.09, 0.1, 0.11, 0.12, 0.13,
                                                    0.14, 0.15, 0.16, 0.17,
                                                    0.18, 0.19, 0.2, 0.21, 0.22,
                                                    0.23, 0.24, 0.25, 0.26,
                                                    0.27, 0.28, 0.29, 0.3, 0.31,
                                                    0.32, 0.33, 0.34, ...],
                         'tree__splitter': ['best', 'random']},
       

In [58]:
search2.best_estimator_

Pipeline(steps=[('tree',
                 DecisionTreeClassifier(criterion='entropy', max_depth=4,
                                        max_features='auto',
                                        min_samples_leaf=3))])

In [59]:
search2.best_score_

0.5245238095238095

In [61]:
search2.score(X_test,y_test)

0.5077777777777778

Aún añadiendo otro nivel de profundidad no se logra mejorar el modelo inicial.

## 6 Mejor modelo

Las redes neuronales se llegan a adaptamar mejor al problema esto es porque la forma de trabajar del árbol de decisión empieza a partir la imagen en sectores para predecir de mejor manera, pero muchos números tienen una estructura muy similar (1 y 7 son una linea larga, por ejemplo) y el segmentar se vuelve complicado.  
En cambio la red neuronal se va adaptando sus pesos para dar un mejor resultado

## 7 Conjunto prueba

In [294]:
def prueba(imagenes,mpl,pipe):
    prediccionMlp=[]
    prediccionPipe=[]
    for imagen in imagenes:
        im=img.open(imagen)
        im=im.resize((28,28))
        matriz= np.asarray(im)
        matriz= matriz.reshape(1,-1)
        ls_m=[]
        for j in range(784):
            if(matriz[0][j]==255):
                ls_m.append(0)
            else:
                ls_m.append(matriz[0][j])
        dic=dict(zip(X.columns, ls_m))
        x = pd.DataFrame(index = [x for x in range(1)],data=dic)
        prediccionMlp.append(mlp.predict(x))
        prediccionPipe.append(pipe.predict(x))
    return prediccionMlp, prediccionPipe

In [264]:
numeros=[]
for i in range(10):
    cadena = str(i)+".png"
    numeros.append(cadena)

In [295]:
prueba(numeros,mlp2,pipe)

([array([7]),
  array([7]),
  array([7]),
  array([5]),
  array([5]),
  array([5]),
  array([7]),
  array([5]),
  array([7]),
  array([7])],
 [array([7]),
  array([4]),
  array([4]),
  array([4]),
  array([4]),
  array([7]),
  array([4]),
  array([4]),
  array([4]),
  array([2])])

Como se lográ ver los resultados no son consistentes, puesto que al final no dejan de comparar los valores de cada pixel (los cuales varian bastante), entonces al estar en contacto con otro una escala de valores distintos (los blancos en su mayoria), se decanta por un valor, en cambio si contemplara es blanco o negro tendríamos mejor desempeño.

Aún volviendo los 255 a 0 sigue sin dar buenos resultados pero ya presentan algunas variaciones (en un inicio todos eran 2 por mpl y 7 por pipe)