
# quickdraw-doodle-recognition

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from fastai.imports import *
from fastai.transforms import *
from fastai.conv_learner import *
from fastai.model import *
from fastai.dataset import *
from fastai.sgdr import *
from fastai.plots import *
import dill as pickle

  from numpy.core.umath_tests import inner1d


In [3]:
PATH = '/home/borowis/s3'

In [None]:
ls {PATH}

In [4]:
bs = 256
BASE_SIZE = 256                  # image size
NUM_SAMPLES_PER_CLASS = 10       # set to -1 for all images
CHUNK_SIZE = 100                 # csv processing chunk

# csv data to images

In [5]:
def draw_cv2(raw_strokes, size=256, lw=2, time_color=True):
    colors = [(255, 0, 0) , (255, 255, 0),  (128, 255, 0),  (0, 255, 0), (0, 255, 128), (0, 255, 255), 
              (0, 128, 255), (0, 0, 255), (128, 0, 255), (255, 0, 255)]
    img = np.zeros((BASE_SIZE, BASE_SIZE, 3), np.uint8)
    for t, stroke in enumerate(raw_strokes):
        for i in range(len(stroke[0]) - 1):
            color = colors[min(t, len(colors)-1)]
            _ = cv2.line(img, (stroke[0][i], stroke[1][i]),
                         (stroke[0][i + 1], stroke[1][i + 1]), color, lw, lineType=cv2.LINE_4)
    img = img.astype(np.float32)/255
    if size != BASE_SIZE:
        return cv2.resize(img, (size, size))
    else:
        return img

In [30]:
class ImageIterator:
    def __init__(self, it):
        self.it = it

    def __iter__(self):
        return self

    def __next__(self):
        return draw_cv2(json.loads(self.it.__next__()))

def save_array(name, array):
    os.makedirs(f'{PATH}/data/{name}', exist_ok=True)
    barray = bcolz.carray(array, rootdir=f'{PATH}/data/{name}', mode='w')
    barray.flush()
    
def save_array_from_series(name, series): 
    os.makedirs(f'{PATH}/data/{name}', exist_ok=True)
    barray = bcolz.fromiter(ImageIterator(series.values.__iter__()), np.float32, len(series), 
                            rootdir=f'{PATH}/data/{name}', mode='w')
    barray.flush()
    
def load_array(name):
    return bcolz.open(rootdir=f'{PATH}/data/{name}')[:]

In [None]:
df = pd.read_csv(f'{PATH}/train/airplane.csv')

In [None]:
len(df); df.head()

In [None]:
img = draw_cv2(json.loads(df.iloc[0]['drawing']))
#plt.imshow(img)
#plt.savefig(f'{PATH}/train/images/{df.iloc[0]["key_id"]}.png')
cv2.imwrite(f'{PATH}/train/images/{df.iloc[0]["key_id"]}.webp', img)

In [None]:
#plt.imshow(plt.imread(f'{PATH}/train/images/{df.iloc[0]["key_id"]}.webp'))
plt.imshow(df.iloc[:5]['drawing'].apply(json.loads).apply(draw_cv2)[0])

In [None]:
categories = [fileName.replace('.csv', '') for fileName in os.listdir(f'{PATH}/train') if ('.csv' in fileName)]
categories2idx = {c:idx for idx, c in enumerate(categories)}
idx2category = {idx:c for idx, c in enumerate(categories)}

In [None]:
#pickle.dump(categories2idx, open(f'{PATH}/cat2idx.pkl', 'wb'))
#pickle.dump(idx2category, open(f'{PATH}/idx2cat.pkl', 'wb'))

categories2idx = pickle.load(open(f'{PATH}/cat2idx.pkl', 'rb'))
idx2category = pickle.load(open(f'{PATH}/idx2cat.pkl', 'rb'))

In [None]:
X_val = X_train = Y_val = Y_train = None
for category in categories:
    df = pd.read_csv(f'{PATH}/train/{category}.csv')
    idxs = np.random.permutation(len(df))[:NUM_SAMPLES_PER_CLASS]
    
    y = np.full(NUM_SAMPLES_PER_CLASS, categories2idx[category])
    imgs = np.stack(df.iloc[idxs]['drawing'].apply(json.loads).apply(draw_cv2).values)
    ((imgs_val, imgs_train), (y_val, y_train)) = split_by_idx(get_cv_idxs(NUM_SAMPLES_PER_CLASS, seed=None), imgs, y)
    
    if X_train is None:
        X_val, X_train = imgs_val, imgs_train
        Y_val, Y_train = y_val, y_train
    else:
        X_val, X_train = np.concatenate((X_val, imgs_val)), np.concatenate((X_train, imgs_train))
        Y_val, Y_train = np.concatenate((Y_val, y_val)), np.concatenate((Y_train, y_train))

In [None]:
#save_array('X_train', X_train)
#save_array('X_val', X_val)
#save_array('Y_train', Y_train)
#save_array('Y_val', Y_val)

X_train = load_array('X_train')
X_val = load_array('X_val')
Y_train = load_array('Y_train')
Y_val = load_array('Y_val')

In [6]:
df_test = pd.read_csv(f'{PATH}/test/test_simplified.csv')

In [31]:
X_test = save_array_from_series('X_test', df_test['drawing'])

ValueError: setting an array element with a sequence.

In [39]:
ImageIterator(df_test['drawing'].values.__iter__()).__next__().shape

(256, 256, 3)

In [42]:
np.fromiter(itertools.islice(ImageIterator(df_test['drawing'].values.__iter__()), 5), dtype=np.float32)

NameError: name 'float32' is not defined

# multi label

In [None]:
metrics = [accuracy]
f_model = resnet34

In [None]:
def get_data(sz):
    tfms = tfms_from_model(f_model, sz, transforms_basic, max_zoom = 1.1)
    return ImageClassifierData.from_arrays(PATH, (X_train, Y_train), (X_val, Y_val), bs=bs, tfms = tfms)

In [None]:
data = get_data(256)
x, y = next(iter(data.val_dl))

## size 64

In [None]:
data = get_data(64)

In [None]:
learn = ConvLearner.pretrained(f_model, data, metrics = metrics)

In [None]:
lrf = learn.lr_find()
learn.sched.plot()

In [None]:
lr = 0.2

In [None]:
learn.fit(lr, 3, cycle_len = 1, cycle_mult = 2)

In [None]:
lrs = np.array([lr / 9, lr / 3, lr])
learn.unfreeze()

In [None]:
learn.fit(lrs, 3, cycle_len = 1, cycle_mult = 2)

In [None]:
# learn.save('64')
learn.load('64')

In [None]:
learn.sched.plot_loss()

## size 128

In [None]:
learn.set_data(get_data(128))
learn.freeze()
learn.fit(lr, 3, cycle_len = 1, cycle_mult = 2)

In [None]:
learn.unfreeze()
learn.fit(lrs, 3, cycle_len = 1, cycle_mult = 2)

In [None]:
# learn.save('128')
learn.load('128')

In [None]:
learn.sched.plot_loss()

## size 256

In [None]:
learn.set_data(get_data(256))
learn.freeze()
learn.fit(lr, 3, cycle_len = 1, cycle_mult = 2)

In [None]:
learn.unfreeze()
learn.fit(lrs, 3, cycle_len = 1, cycle_mult = 2)

In [None]:
# learn.save('256')
learn.load('256')

In [None]:
learn.sched.plot_loss()

## end

In [None]:
log_preds, y = learn.TTA()

In [None]:
preds = np.mean(log_preds, axis = 0)
f2(preds, y)

## submission

In [None]:
val = learn.predict()

In [None]:
f2(val,data.val_y)

In [None]:
log_preds = learn.TTA(is_test=True)

In [None]:
th = opt_th(preds, y); th

In [None]:
preds = np.mean(log_preds[0], axis=0)
cls = np.array(data.classes)
res = np.array([" ".join(cls[(np.where(pp > th))]) for pp in preds])

In [None]:
fnames = [f[9:-4] for f in data.test_dl.dataset.fnames]

In [None]:
outp = pd.DataFrame({'image_name': fnames, 'tags': res})
outp.head()