In [1]:
import numpy as np
import pandas as pd
import random
from scipy.ndimage.interpolation import shift
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier

# One seed shared by every stochastic step below; both the NumPy and the
# standard-library global generators are seeded from it.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

In [2]:
# Fraction of the labelled rows held out by train_test_split (used as the
# validation split further down).
TEST_SIZE = 0.1

# Input CSVs.
TRAIN_PATH = "./digit-recognizer/train.csv"
TEST_PATH = "./digit-recognizer/test.csv"
SAMPLE_SUBMISSION_PATH = "./digit-recognizer/sample_submission.csv"

# Output file written by the last cell.
SUBMISSION_PATH = "submission.csv"

## Importando dados para treinamento supervisionado

In [3]:
# Parse the labelled training CSV into an integer matrix, skipping the header row.
mnist = np.genfromtxt(fname=TRAIN_PATH, delimiter=',', skip_header=1, dtype=int)
mnist.shape

(42000, 785)

In [4]:
# Column 0 carries the label; every remaining column is a feature.
y = mnist[:, 0]
X = mnist[:, 1:]

In [5]:
# Sanity check: one label per feature row.
X.shape, y.shape

((42000, 784), (42000,))

## Dividindo em conjunto de treino e validação 

In [6]:
# Hold out a TEST_SIZE fraction of the labelled rows for validation; the fixed
# seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=SEED,
)
X_train.shape, y_train.shape, X_val.shape, y_val.shape

((37800, 784), (37800,), (4200, 784), (4200,))

## Treinando baseline para comparação 

In [9]:
# Baseline for comparison: gradient boosting with library-default
# hyperparameters, fitted on the training split.
# random_state is pinned to SEED (as train_test_split already does) so the
# baseline does not depend on the global NumPy RNG state, i.e. on which cells
# happened to run before this one.
gbc_clf = GradientBoostingClassifier(random_state=SEED)
gbc_clf.fit(X_train, y_train)

GradientBoostingClassifier()

In [10]:
# Baseline predictions for the held-out validation rows.
y_gbc = gbc_clf.predict(X_val)

In [11]:
# Validation accuracy of the baseline; the number later models are compared against.
accuracy_score(y_val, y_gbc)

0.9416666666666667

In [None]:
# cross_val_score(gbc_clf, X_train, y_train, cv=3, scoring="accuracy")

## Busca pelos melhores hiperparâmetros para o classificador

### Classificador com C e gamma default (kernel polinomial de grau 4)

In [39]:
# Degree-4 polynomial-kernel SVC with every other hyperparameter left at its
# default, wrapped in a one-vs-one multiclass strategy.
svc = SVC(degree=4, kernel='poly')
svc_poly_ovo = OneVsOneClassifier(estimator=svc)

In [40]:
# Fit on the (un-augmented) training split.
svc_poly_ovo.fit(X_train, y_train)

OneVsOneClassifier(estimator=SVC(degree=4, kernel='poly'))

In [47]:
# Validation predictions from the SVC that uses default C and gamma.
y_default = svc_poly_ovo.predict(X_val)

In [48]:
# Validation accuracy of that SVC, on the same split used for the baseline.
accuracy_score(y_val, y_default)

0.9638095238095238

In [None]:
# cross_val_score(svc_poly_ovo, X_train, y_train, cv=3, scoring="accuracy")

In [11]:
# Candidate values for the wrapped SVC; the `estimator__` prefix addresses the
# estimator held inside OneVsOneClassifier.
# NOTE(review): the recorded results report the same mean accuracy for all four
# candidates, so this grid may not be discriminating between settings —
# confirm whether the features should be scaled or the grid widened.
params = [
    {
        'estimator__C': [1, 10],
        'estimator__gamma': [1, 10],
    }
]

# 5-fold cross-validated accuracy for every C/gamma combination.
grid_search = GridSearchCV(
    estimator=svc_poly_ovo,
    param_grid=params,
    scoring='accuracy',
    cv=5,
    return_train_score=True,
)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=OneVsOneClassifier(estimator=SVC(degree=4,
                                                        kernel='poly')),
             param_grid=[{'estimator__C': [1, 10],
                          'estimator__gamma': [1, 10]}],
             return_train_score=True, scoring='accuracy')

In [24]:
# Print the mean cross-validated accuracy next to each hyperparameter combination.
# The loop variable is deliberately not called `params`: at notebook top level
# that would overwrite the parameter grid of the same name that was passed to
# GridSearchCV, leaving a single candidate dict behind in its place.
cvres = grid_search.cv_results_
for mean_score, candidate in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, candidate)

0.9657671957671958 {'estimator__C': 1, 'estimator__gamma': 1}
0.9657671957671958 {'estimator__C': 1, 'estimator__gamma': 10}
0.9657671957671958 {'estimator__C': 10, 'estimator__gamma': 1}
0.9657671957671958 {'estimator__C': 10, 'estimator__gamma': 10}


In [41]:
# Hyperparameter combination ranked first by the grid search.
# NOTE(review): every candidate tied in the recorded run, so this is presumably
# just the first entry of the grid rather than a genuine winner — confirm.
best_params = grid_search.best_params_
best_params

{'estimator__C': 1, 'estimator__gamma': 1}

## Treinando e avaliando classificador com os melhores hiperparâmetros 

In [13]:
# Fresh one-vs-one degree-4 polynomial SVC configured with the C and gamma
# selected by the grid search (C=1, gamma=1 in the recorded run).
# This rebinds `svc` and `svc_poly_ovo`; the previously fitted model is replaced.
svc = SVC(
    kernel='poly',
    degree=4,
    C=best_params['estimator__C'],
    gamma=best_params['estimator__gamma'],
)
svc_poly_ovo = OneVsOneClassifier(estimator=svc)

### Data Augmentation 

In [7]:
def shift_image(image, dx, dy, shape=(28, 28)):
    """Translate a flattened image and return it flattened again.

    The flat pixel vector is viewed as a 2-D image of ``shape``, moved by
    ``dy`` along axis 0 (rows) and ``dx`` along axis 1 (columns), and then
    flattened back to one dimension. Positions that have no source pixel
    after the move are filled with 0.

    Parameters
    ----------
    image : array-like
        Flat pixel vector with ``shape[0] * shape[1]`` entries.
    dx : int or float
        Shift along the column axis.
    dy : int or float
        Shift along the row axis.
    shape : tuple of int, optional
        ``(rows, cols)`` used to view ``image`` in 2-D. Defaults to
        ``(28, 28)``, the size this notebook's images use.

    Returns
    -------
    numpy.ndarray
        1-D array with the same number of entries as ``image``.

    Raises
    ------
    ValueError
        If ``image`` cannot be reshaped to ``shape``.
    """
    # asarray lets plain sequences through and is a no-op for ndarrays.
    image_2d = np.asarray(image).reshape(shape)
    # scipy expects one shift per axis, in axis order: rows (dy) first, then columns (dx).
    shifted_image = shift(image_2d, [dy, dx], cval=0, mode="constant")
    return shifted_image.reshape(-1)

In [8]:
# Augment the training split only; the validation split is left untouched.
# Resulting layout: the original rows first, followed by one complete shifted
# copy of the training set per (dx, dy) offset, in the order listed.
shifted_copies = [
    np.array([shift_image(image, dx, dy) for image in X_train])
    for dx, dy in ((1, 0), (-1, 0), (0, 1), (0, -1))
]
X_train_augmented = np.concatenate([X_train, *shifted_copies])

# Every shifted copy keeps its source label, so the label vector simply repeats
# once for the originals and once per shifted copy.
y_train_augmented = np.tile(y_train, 1 + len(shifted_copies))
X_train_augmented.shape, y_train_augmented.shape

((189000, 784), (189000,))

### Treinando com conjunto de treino expandido 

In [19]:
# Refit the tuned classifier on the training split plus its shifted copies.
svc_poly_ovo.fit(X_train_augmented, y_train_augmented)

OneVsOneClassifier(estimator=SVC(C=1, degree=4, gamma=1, kernel='poly'))

In [22]:
# Predict on the validation split, which was not augmented.
y_exp_pred = svc_poly_ovo.predict(X_val)

In [23]:
# Validation accuracy after training on the augmented data.
accuracy_score(y_val, y_exp_pred)

0.98

In [None]:
# cross_val_score(svc_poly_ovo, X_train, y_train, cv=3, scoring="accuracy")

In [27]:
# Parse the unlabelled test CSV into an integer matrix, skipping the header row.
X_test = np.genfromtxt(fname=TEST_PATH, delimiter=',', skip_header=1, dtype=int)
X_test.shape

(28000, 784)

In [28]:
# Predicted label for every row of the test set, from the model trained on the augmented data.
results = svc_poly_ovo.predict(X_test)

In [29]:
# Use the sample submission as a template: its other columns are kept as they
# are and the Label column is overwritten with the predictions (row order is
# taken from the template).
sub = pd.read_csv(SAMPLE_SUBMISSION_PATH).assign(Label=results)
sub.to_csv(SUBMISSION_PATH, index=False)
sub.head()

Unnamed: 0,ImageId,Label
0,1,2
1,2,0
2,3,9
3,4,4
4,5,3
