In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict, train_test_split
from sklearn.ensemble import RandomForestClassifier

In [5]:
# Fetch the 'mnist_784' dataset (version 1) from OpenML through scikit-learn.
data = datasets.fetch_openml('mnist_784', version=1)

In [6]:
# Pull the 'data' (features) and 'target' (labels) entries out of the fetched dataset.
X = data['data']
y = data['target']

In [51]:
# Cast the labels to unsigned 8-bit integers.
# NOTE(review): presumably the OpenML target arrives as strings/categoricals — confirm.
y = y.astype(np.uint8)

In [52]:
# The first 60000 rows form the training split; every remaining row is the test split.
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]

In [10]:
import joblib
# Load a previously saved model from 'model_97.pkl' in the working directory.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load model files that come from a trusted source.
model = joblib.load('model_97.pkl')

### Data Augmentation

In [11]:
# `scipy.ndimage.interpolation` is a deprecated private namespace (SciPy >= 1.8);
# `shift` is part of the public `scipy.ndimage` API.
from scipy.ndimage import shift

### Algorithm

* convert the input data to a NumPy ndarray
* create an empty array for the augmented data
* create an empty array for the labels
* for each row: reshape it to a 2D (28x28) image and append its label to the labels array
    * apply the shift function
    * append the new (flattened) array to the augmented-data array
* concatenate the augmented data and labels with the original data, then shuffle

In [66]:
def augmented_data(X, y, image_shape=(28, 28)):
    '''Return a shuffled copy of (X, y) enlarged with one-pixel shifted images.

    Every row of X is treated as a flattened image of shape ``image_shape``.
    Four extra copies of each image are produced by shifting it one pixel
    up, down, left and right (pixels shifted in from outside are filled
    with 0), so the result holds five times as many rows as the input.

    Parameters
    ----------
    X : pandas.DataFrame or array-like, shape (n_samples, n_pixels)
        One flattened image per row.
    y : pandas.Series or array-like, shape (n_samples,)
        Labels, matched to the rows of X by position (not by index label).
    image_shape : tuple of int, default (28, 28)
        2D shape each row is reshaped to before shifting.

    Returns
    -------
    X_new : pandas.DataFrame, shape (5 * n_samples, n_pixels)
    y_new : pandas.Series, length 5 * n_samples
        Both carry a fresh 0..N-1 index and are reordered by the same
        fixed-seed permutation, so row i of X_new pairs with y_new[i].

    Raises
    ------
    ValueError
        If X is not 2D, its row width does not match ``image_shape``,
        or y does not hold exactly one label per row of X.
    '''
    # np.asarray accepts both DataFrames/Series and plain arrays, and makes
    # every later lookup positional (a Series with a non-default index would
    # otherwise be indexed by label).
    data_arr = np.asarray(X)
    labels = np.asarray(y)

    n_pixels = int(np.prod(image_shape))
    if data_arr.ndim != 2 or data_arr.shape[1] != n_pixels:
        raise ValueError(
            'X must be 2D with {} columns, got shape {}'.format(n_pixels, data_arr.shape)
        )
    n_samples = data_arr.shape[0]
    if labels.ndim != 1 or labels.shape[0] != n_samples:
        raise ValueError(
            'y must hold one label per row of X ({} rows), got shape {}'.format(
                n_samples, labels.shape
            )
        )

    # (row offset, column offset); positive values move content down / right.
    offsets = (
        (-1, 0),  # up
        (1, 0),   # down
        (0, -1),  # left
        (0, 1),   # right
    )
    n_copies = len(offsets) + 1

    # Preallocate the full result once: block 0 holds the originals, block k
    # holds every image shifted by offsets[k - 1]. Growing an array with
    # np.concatenate inside the loop would copy it on every iteration.
    aug_data = np.empty((n_copies * n_samples, n_pixels), dtype=data_arr.dtype)
    aug_data[:n_samples] = data_arr

    # loop through each instance
    for ix, x in enumerate(data_arr):
        image = x.reshape(image_shape)
        for k, offset in enumerate(offsets, start=1):
            aug_data[k * n_samples + ix] = shift(image, offset, cval=0).ravel()

        # verbose
        if ix % 500 == 0:
            print('{} instanes completed'.format(ix))

    # Labels repeat in the same block layout as aug_data.
    aug_labels = np.tile(labels, n_copies)

    # shuffle data: a single permutation keeps images and labels aligned
    perm = np.random.RandomState(42).permutation(n_copies * n_samples)
    X_new = pd.DataFrame(aug_data[perm])
    y_new = pd.Series(aug_labels[perm])

    return X_new, y_new
    

In [None]:
# Run augmented_data (defined above) on the training split only; the test split is left untouched.
X_new, y_new = augmented_data(X_train, y_train)

0 instanes completed
500 instanes completed
1000 instanes completed
1500 instanes completed
2000 instanes completed
2500 instanes completed
3000 instanes completed
3500 instanes completed
4000 instanes completed
4500 instanes completed
5000 instanes completed
5500 instanes completed
6000 instanes completed
6500 instanes completed
7000 instanes completed
7500 instanes completed
8000 instanes completed
8500 instanes completed
9000 instanes completed
9500 instanes completed
10000 instanes completed
10500 instanes completed
11000 instanes completed
11500 instanes completed
12000 instanes completed
12500 instanes completed
13000 instanes completed
13500 instanes completed
14000 instanes completed
14500 instanes completed
15000 instanes completed
15500 instanes completed
16000 instanes completed
16500 instanes completed
17000 instanes completed
17500 instanes completed
18000 instanes completed
18500 instanes completed
19000 instanes completed
19500 instanes completed
20000 instanes completed