In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import random
from ast import literal_eval
from sklearn.utils import shuffle
from os import listdir
from os.path import isfile, join

# Directory that holds the training CSV files (one path per file ends up in train_files).
training_set = '../train_simplified/'

# listdir() returns entries in arbitrary order, so sort the paths: later cells
# slice this list (train_files[:15]) and must pick the same files on every run.
train_files = sorted(
    path
    for path in (join(training_set, entry) for entry in listdir(training_set))
    if isfile(path)  # keep regular files only, drop sub-directories
)

In [2]:
# function to check GPU availability
def get_available_gpus():
    """Return the names of the local devices that TensorFlow reports as GPUs.

    Returns:
        list: the ``name`` of every entry from ``device_lib.list_local_devices()``
        whose ``device_type`` equals ``'GPU'``; empty when there is none.
    """
    # Imported inside the function, so TensorFlow's device_lib is only
    # loaded when the check is actually performed.
    from tensorflow.python.client import device_lib

    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names

In [3]:
# Ask TensorFlow whether it can use a GPU; the recorded output below is False.
# NOTE(review): tf.test.is_gpu_available is deprecated in TensorFlow 2.x in favour of
# tf.config.list_physical_devices('GPU') — confirm the installed version before switching.
tf.test.is_gpu_available()

False

In [4]:
# List the names of the GPU devices TensorFlow reports; the recorded output below is an empty list.
get_available_gpus()

[]

In [5]:
def clean_drawing(inkarray, keep_stroke_end=False):
    """Convert a serialized drawing into normalized point-to-point deltas.

    The input is the string form of a list of strokes, each stroke being a
    pair ``[xs, ys]`` of equally long coordinate lists.  The points of all
    strokes are concatenated in order, each axis is min-max scaled to
    ``[0, 1]`` (an axis with zero extent is left unscaled), and every point is
    replaced by its difference to the previous point.

    Args:
        inkarray: String holding a Python literal such as
            ``"[[[x0, x1, ...], [y0, y1, ...]], ...]"``; parsed with
            ``ast.literal_eval``.
        keep_stroke_end: When True, append a third column that is 1.0 for a
            point that is the last point of its stroke and 0.0 otherwise.
            Defaults to False, which yields the two-column ``(dx, dy)`` output.

    Returns:
        np.ndarray of dtype float32 with ``total_points - 1`` rows and 2
        columns (3 with ``keep_stroke_end=True``).  A drawing without any
        points yields an array with zero rows instead of raising.
    """
    strokes = literal_eval(inkarray)
    stroke_lengths = [len(stroke[0]) for stroke in strokes]
    total_points = sum(stroke_lengths)
    n_columns = 3 if keep_stroke_end else 2

    # Guard: min/max below cannot reduce a zero-size array.
    if total_points == 0:
        return np.zeros((0, n_columns), dtype=np.float32)

    # Columns: x, y, stroke-end flag.
    np_ink = np.zeros((total_points, 3), dtype=np.float32)
    current_t = 0
    for stroke, length in zip(strokes, stroke_lengths):
        if length == 0:
            continue  # an empty stroke contributes no points
        stroke_stop = current_t + length
        np_ink[current_t:stroke_stop, 0] = stroke[0]
        np_ink[current_t:stroke_stop, 1] = stroke[1]
        np_ink[stroke_stop - 1, 2] = 1  # stroke_end
        current_t = stroke_stop

    # Preprocessing.
    # 1. Size normalization.
    lower = np.min(np_ink[:, 0:2], axis=0)
    upper = np.max(np_ink[:, 0:2], axis=0)
    scale = upper - lower
    scale[scale == 0] = 1  # avoid dividing by zero on a degenerate axis
    np_ink[:, 0:2] = (np_ink[:, 0:2] - lower) / scale

    # 2. Compute deltas (the first point has no predecessor and is dropped).
    deltas = np_ink[1:, 0:2] - np_ink[0:-1, 0:2]
    if keep_stroke_end:
        # Flags stay aligned with the point each delta leads to.
        return np.hstack([deltas, np_ink[1:, 2:3]])
    return deltas

def load_and_clean(files, n_rows=100):
    """Load a random sample of rows from each CSV file and clean the drawings.

    For every path in ``files`` up to ``n_rows`` data rows are chosen at
    random (the header line is always kept), the per-file samples are
    concatenated into a single DataFrame, and the ``drawing`` column is mapped
    through ``clean_drawing``.

    Sampling uses the module-level ``random`` generator, so call
    ``random.seed(...)`` beforehand to get a reproducible sample.

    Args:
        files: Iterable of CSV file paths; each file needs a header row and a
            ``drawing`` column.
        n_rows: Number of data rows to sample from each file.  A file with
            fewer data rows is loaded completely.

    Returns:
        pd.DataFrame with the sampled rows of all files.  The index is the one
        produced by ``pd.concat`` (per-file row numbers, so labels repeat
        across files); reset it if unique labels are needed.
    """
    dfs = []
    for counter, file in enumerate(files):
        print(counter)  # progress: position of the file currently being read
        # Count lines with a context manager so the handle is closed again.
        with open(file) as handle:
            total_lines = sum(1 for _ in handle)
        # Line 0 is the header, so only total_lines - 1 lines are data rows.
        n_data_rows = max(total_lines - 1, 0)
        # Clamp so short files (or n_rows <= 0) never request an invalid sample.
        n_skip = max(n_data_rows - max(n_rows, 0), 0)
        skip = sorted(random.sample(range(1, total_lines), n_skip))
        dfs.append(pd.read_csv(file, skiprows=skip))
    full_df = pd.concat(dfs)
    full_df['drawing'] = full_df['drawing'].map(clean_drawing)
    return full_df

In [7]:
# Sample rows from the first 15 training files, then replace the index
# with a fresh 0..N-1 range (the old index is dropped, not kept as a column).
df = load_and_clean(train_files[:15]).reset_index(drop=True)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14


In [8]:
# Inspect one cleaned drawing: the row labelled 3, 'drawing' column.
df.loc[3, 'drawing']

array([[-0.02479339,  0.50196075],
       [ 0.1404959 , -0.5490196 ],
       [-0.09090912,  0.02352941],
       [-0.21487603, -0.01960784],
       [-0.11570248, -0.02745098],
       [-0.10743802, -0.05098039],
       [-0.0661157 , -0.03921568],
       [-0.04958678, -0.05882353],
       [-0.00826446, -0.08235295],
       [ 0.04132231, -0.07058823],
       [ 0.03305785, -0.03529412],
       [ 0.12396694, -0.05490196],
       [ 0.09917356, -0.02352941],
       [ 0.12396693, -0.01176471],
       [ 0.22314051,  0.        ],
       [ 0.13223141,  0.01960784],
       [ 0.09090906,  0.02352941],
       [ 0.07438016,  0.04705882],
       [ 0.05785125,  0.06666667],
       [ 0.        ,  0.05882353],
       [-0.04132229,  0.04313727],
       [-0.09917361,  0.0745098 ],
       [-0.13223135,  0.05098039],
       [-0.09917355,  0.01176471],
       [-0.17355374, -0.01568627],
       [-0.14049587, -0.05098039],
       [-0.07438017, -0.09411766],
       [ 0.        , -0.07058823]], dtype=float32)