# Redes neuronales

## Preparación de ambiente

### Carga de módulos

In [2]:
import numpy as np
import pandas as pd
from PIL import Image
from mnist import MNIST
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split, cross_val_score

### Funciones relevantes

In [3]:
def print_data(data):
    for row in data:
        print(''.join('{:3}'.format(value) for value in row))

In [4]:
def get_data(number):
    img = Image.open('sample%d_black_r.png'%(number)).convert('L')
    img_arr = np.array(img)
    WIDTH, HEIGHT = img.size
    data = list(img.getdata())
    data = [data[offset:offset+WIDTH] for offset in range(0, WIDTH*HEIGHT, WIDTH)]
    return data

In [5]:
def flatten_data(data):
    return[np.reshape(data, (28*28,))]

In [6]:
def classification_metrics(X, y, estimator):
    ls_scores_roc = cross_val_score(estimator=estimator, X=X, y=y, scoring="roc_auc", n_jobs=-1, cv=4)
    print(f"ROC media: {np.mean(ls_scores_roc):,.2f}, desviación estándar: {np.std(ls_scores_roc)}")

## Carga de datos

### Set de entrenamiento

In [7]:
mndata = MNIST('letters/')
mndata.select_emnist("letters")
images, labels = mndata.load_training()

FileNotFoundError: [Errno 2] No such file or directory: 'letters/emnist-letters-train-labels-idx1-ubyte.gz'

#### Convirtiendo a DataFrame

In [8]:
train = pd.DataFrame(data=images, columns=[f"{i+1}x{j+1}" for i in range(28) for j in range(28)])

In [9]:
train

Unnamed: 0,1x1,1x2,1x3,1x4,1x5,1x6,1x7,1x8,1x9,1x10,...,28x19,28x20,28x21,28x22,28x23,28x24,28x25,28x26,28x27,28x28
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124795,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
124796,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
124797,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
124798,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
train["letter"] = [chr(ord('@')+x) for x in labels]

ValueError: Length of values (20800) does not match length of index (124800)

In [11]:
train.to_csv("letters.csv")

In [12]:
images, labels = mndata.load_testing()

In [21]:
print_data(np.reshape(train.loc[60, [x for x in train.columns if x != "letter"]].values, (28, 28)))

  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  2  2  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0 75 75  8  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  4215243113  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  4217250127  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0 21233249125  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0 38249233 82  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  7 95252218 43  4  4  4  4  2  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0 20114145234254249222217217217215158  8  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  3109245250254254254254250250250254245 77  2  0  0  0  0
  0  0  0  0  0  0  0  0  0  3108172149236252189128 51 37 51191250 84  2  0  0  0  0
  0  0  0  0  0  0  0  0  0  0 18 24 88234233 84  6  0  0  417525

In [22]:
val = pd.DataFrame(data=images, columns=[f"{i+1}x{j+1}" for i in range(28) for j in range(28)])
val["letter"] = [chr(ord('@')+x) for x in labels]

### EDA

In [23]:
for i in range(0, 21600, 800):
    print_data(np.reshape(images[i], (28, 28)))
    print("\n\n")

  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  4 21 32 37 37 20  1  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  4 22110172203217214138 15  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0 20 45114145232252254254232 93  6  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  3 67122203222249254254250207 51  4  1  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  5 47175231254254253232209142111119 93 53 18  0  0  0  0  0  0  0  0  0  0
  0  0  0 33190247253254253219 98 54 77177243236165 81  2  0  0  

IndexError: list index out of range

In [24]:
train["letter"].value_counts(True)

L    0.038462
G    0.038462
S    0.038462
X    0.038462
Z    0.038462
U    0.038462
H    0.038462
C    0.038462
J    0.038462
A    0.038462
T    0.038462
I    0.038462
P    0.038462
B    0.038462
N    0.038462
Y    0.038462
K    0.038462
V    0.038462
O    0.038462
R    0.038462
M    0.038462
Q    0.038462
D    0.038462
F    0.038462
E    0.038462
W    0.038462
Name: letter, dtype: float64

### Separación de set

In [25]:
xo_train = train[train["letter"].isin(["O", "X"])]

In [26]:
xo_train.to_csv("xo_train.csv", index = False)

In [27]:
xo_val = val[val["letter"].isin(["O", "X"])]
xo_val.to_csv("xo_val.csv", index = False)

In [28]:
X = xo_train[[x for x in xo_train.columns if x != "letter"]]
y = xo_train["letter"]

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7)
X_val = xo_val[[x for x in xo_val.columns if x != "letter"]]
y_val = xo_val["letter"]

In [31]:
y_train

36397     X
123740    X
5956      O
54089     O
60883     O
         ..
120568    X
23535     O
31102     X
65496     O
40813     O
Name: letter, Length: 6720, dtype: object

## Modelado

### Cross validation

In [32]:
mlp = MLPClassifier(max_iter=100)

In [33]:
mlp.fit(X_train, y_train)

MLPClassifier(max_iter=100)

In [34]:
classification_metrics(X=X_train, y=y_train, estimator=mlp)

ROC media: 1.00, desviación estándar: 0.0008883956621907968


In [35]:
mlp.score(X_test, y_test)

0.9954861111111111

In [36]:
mlp.score(X_val, y_val)

0.993125

### Hyperparametrización

In [37]:
param_grid = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}

In [38]:
param_grid

{'hidden_layer_sizes': [(50, 50, 50), (50, 100, 50), (100,)],
 'activation': ['tanh', 'relu'],
 'solver': ['sgd', 'adam'],
 'alpha': [0.0001, 0.05],
 'learning_rate': ['constant', 'adaptive']}

In [39]:
search = RandomizedSearchCV(param_distributions=param_grid, cv=4, n_jobs=-1, scoring="roc_auc", estimator=mlp, n_iter=10, verbose=5)

In [40]:
search.fit(X=xo_train[[x for x in xo_train if x != "letter"]], y=xo_train["letter"])

Fitting 4 folds for each of 10 candidates, totalling 40 fits


RandomizedSearchCV(cv=4, estimator=MLPClassifier(max_iter=100), n_jobs=-1,
                   param_distributions={'activation': ['tanh', 'relu'],
                                        'alpha': [0.0001, 0.05],
                                        'hidden_layer_sizes': [(50, 50, 50),
                                                               (50, 100, 50),
                                                               (100,)],
                                        'learning_rate': ['constant',
                                                          'adaptive'],
                                        'solver': ['sgd', 'adam']},
                   scoring='roc_auc', verbose=5)

In [41]:
search.best_estimator_

MLPClassifier(hidden_layer_sizes=(50, 100, 50), learning_rate='adaptive',
              max_iter=100)

In [42]:
search.best_score_

0.9997560763888889

In [43]:
search.score(X_test, y_test)

1.0

In [44]:
search.score(X_val, y_val)

0.99920390625

### Preservación de modelo ganador

In [45]:
pd.to_pickle(search.best_estimator_, "/home/oscar/Documentos/Oscar/Diplomado/Generación 3/Módulo 2/tic_tac_toe/home/rnn_tictactoe.pickle")