In [4]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import json
import os
import datetime as dt
from tqdm import tqdm
import pandas as pd
import numpy as np
from PIL import Image, ImageDraw, ImageOps
import tensorflow as tf
import cv2

In [5]:
cwd = os.getcwd()
cwd

'C:\\Users\\ACER\\Documents\\My mini Projects\\ML\\Drawing'

In [6]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


# Shuffling Dataset

In [7]:
def extract_label_from_csv(filename: str) -> str:
    """
        This method will extract the label from the file name
        Ex:
            from `elephant.csv` -> 'elephant'
    """
    return filename.split('.')[0]

In [8]:
class Simplified():
    """
        Preprocess the csv files
    """
    
    def __init__(self, input_path='dataset'):
        self.input_path = input_path

    def list_all_categories(self):
        """ 
            Returns list of all categories available in the csvs, by considering csv file name
        """        
        files = os.listdir(os.path.join(cwd, self.input_path))
        return sorted([extract_label_from_csv(f) for f in files], key=str.lower)

    def read_training_csv(self, category, nrows=None, usecols=None, drawing_transform=False):
        """
            category: str, compulsury
                Category in other words csv file name needs to read
            nrows:int, optional
                Number of rows of file to read. Useful for reading pieces of large files.
            usecols: list-like or callable, optional
                Return a subset of the columns. If list-like, all elements must either be 
                positional (i.e. integer indices into the document columns) or strings that
                correspond to column names provided either by the user in names or inferred 
                from the document header row(s)
            drawing_transform: bool, optional
                Whether the returning dataframe got structured drawing cordinates or str
        """
        
        df = pd.read_csv(os.path.join(self.input_path, category + '.csv'),
                         nrows=nrows, parse_dates=['timestamp'], usecols=usecols)
        if drawing_transform:
            df['drawing'] = df['drawing'].apply(json.loads)
        return df

In [9]:
s = Simplified('dataset')
NCSVS = 100 # number of shuffled files
categories = s.list_all_categories()
print(len(categories))

5


In [10]:
for y, cat in tqdm(enumerate(categories)):
    df = s.read_training_csv(cat, nrows=30000)
    # add column y with category id comming from emumerate
    df['y'] = y
    # add cv column by calculating random number using key_id
    df['cv'] = (df['key_id'] // 10 ** 7) % NCSVS
    
    for k in range(NCSVS):
        # create csv name using 1-NCSVS
        filename = 'train_k{}.csv'.format(k)
        # keep records where cv == k
        chunk = df[df['cv'] == k]
        chunk = chunk.drop(['key_id'], axis=1)
        
        if y == 0:
            chunk.to_csv('shuffled_csv/' + filename, index=False)
        else:
            chunk.to_csv('shuffled_csv/' + filename, mode='a', header=False, index=False)

5it [00:04,  1.13it/s]


In [19]:
for k in tqdm(range(NCSVS)):
    filename = 'shuffled_csv/train_k{}.csv'.format(k)
    
    # if the file exists
    if os.path.exists(filename):
        df = pd.read_csv(filename)
        # make a column with randomly assigned numbers
        df['rnd'] = np.random.rand(len(df))
        # sort by rnd column
        df = df.sort_values(by='rnd').drop('rnd', axis=1)
        df.to_csv(filename + '.gz', compression='gzip', index=False)
        os.remove(filename)
print(df.shape)

100%|██████████| 100/100 [00:57<00:00,  1.75it/s]

(1519, 7)





# Preprocessing

In [20]:
num_of_classes = 163
size = 64
STEPS = 1000
batchsize = 512
epochs = 15

def draw_cv2(raw_strokes, size=256, lw=6):
    img = np.zeros((64, 64), np.uint8)
    for stroke in raw_strokes:
        for i in range(len(stroke[0]) - 1):
            _ = cv2.line(img, (stroke[0][i], stroke[1][i]), (stroke[0][i + 1], stroke[1][i + 1]), 255, lw)
    if size != 256:
        return cv2.resize(img, (size, size))
    else:
        return img

#ADD DATA AUGMENTATION TO BOOST
def image_generator(size, batchsize, ks, lw=6):
    cnt = 0
    
    while True:
        for k in np.random.permutation(ks):
            
            filename = os.path.join(cwd, 'shuffled_csv/train_k{}.csv.gz'.format(k))
            
            for df in pd.read_csv(filename, chunksize=batchsize):
                df['drawing'] = df['drawing'].apply(ast.literal_eval)
                
                x = np.zeros((len(df), size, size))
                
                for i, raw_strokes in enumerate(df.drawing.values):
                    x[i] = draw_cv2(raw_strokes, size=size, lw=lw)
                
                # normalize the image arr    
                x = x / 255.
                # reshapes the arr
                x = x.reshape((len(df), size, size, 1)).astype(np.float32)
                # encode categories
                y = tf.keras.utils.to_categorical(df.y, num_classes=num_of_classes)
                
                cnt += 1
                if cnt == batchsize :
                    cnt=0  #don't forget to set this number to 0
                    yield x, y
                

def df_to_image_array(df, size=size, lw=6):
#     df['drawing'] = df['drawing'].apply(ast.literal_eval)
    x = np.zeros((len(df), size, size))
    for i, raw_strokes in enumerate(df.drawing.values):
        x[i] = draw_cv2(raw_strokes)
    x = x / 255.
    x = x.reshape((len(df), size, size, 1)).astype(np.float32)
    return x

In [22]:
import ast

valid_df = pd.read_csv(os.path.join(cwd, 'shuffled_csv/train_k{}.csv.gz'.format(NCSVS - 1)), nrows=30000)
x_valid = df_to_image_array(valid_df, size)
y_valid = tf.keras.utils.to_categorical(valid_df.y, num_classes=num_of_classes)
print(x_valid.shape, y_valid.shape)
print('Validation array memory {:.2f} GB'.format(x_valid.nbytes / 1024.**3 ))

(1519, 64, 64, 1) (1519, 163)
Validation array memory 0.02 GB


In [23]:
train_datagen = image_generator(size=size, batchsize=batchsize, ks=range(NCSVS - 1))

In [24]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.metrics import top_k_categorical_accuracy
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping

In [25]:
model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3), padding='same', activation='relu', input_shape=(64, 64, 1)))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(64, kernel_size=(3, 3), padding='same', activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(680, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_of_classes, activation='softmax'))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 64, 64, 32)        320       
_________________________________________________________________
max_pooling2d (MaxPooling2D  (None, 32, 32, 32)        0         
)                                                                
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 32, 32, 64)        18496     
_________________________________________________________________
max_pooling2d_1 (MaxPooling  (None, 16, 16, 64)        0         
2D)                                                              
_________________________________________________________________
dropout (Dropout)            (None, 16, 16, 64)        0         
_________________________________________________________________
flatten (Flatten)            (None, 16384)             0

In [26]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [None]:
hist = model.fit(
    train_datagen, steps_per_epoch=STEPS, epochs=epochs, verbose=1,
    validation_data=(x_valid, y_valid),
)



Epoch 1/15
   7/1000 [..............................] - ETA: 23:19:49 - loss: 2.9047 - accuracy: 0.2689

## 