In [38]:
import pandas as pd
import numpy as np
import tensorflow as tf
import random
from ast import literal_eval
from sklearn.utils import shuffle
# Directory that is scanned (non-recursively) for training files.
training_set = '../train_simplified/'
from os import listdir
from os.path import isfile, join
# os.listdir returns entries in arbitrary order; sorting makes slices such as
# train_files[:5] pick the same files on every run and on every machine.
# Only regular files are kept (sub-directories are ignored).
train_files = sorted(
    join(training_set, f)
    for f in listdir(training_set)
    if isfile(join(training_set, f))
)

In [39]:
def get_available_gpus():
    """Return the names of the local devices whose ``device_type`` is ``'GPU'``.

    The device list comes from TensorFlow's
    ``device_lib.list_local_devices()``; an empty list means no GPU device
    was reported.
    """
    from tensorflow.python.client import device_lib

    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names

In [40]:
# Ask TensorFlow whether a GPU is usable; the bare expression makes the
# notebook display the returned value.
# NOTE(review): newer TensorFlow releases reportedly deprecate this call in
# favor of tf.config.list_physical_devices('GPU') -- confirm against the
# installed version before switching.
tf.test.is_gpu_available()

False

In [41]:
# Display the names of the GPU devices found by the helper defined above
# (an empty list means none were reported).
get_available_gpus()

[]

In [42]:
def clean_drawing(inkarray):
    """
    Turn one serialized drawing into size-normalized point-to-point deltas.

    Parameters
    ----------
    inkarray : str
        Python-literal text of a list of strokes. Each stroke is indexed as
        ``stroke[0]`` (x coordinates) and ``stroke[1]`` (y coordinates); both
        sequences must have the same length.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape ``(total_points - 1, 2)`` holding the
        ``(dx, dy)`` difference between consecutive points after x and y
        have each been rescaled to the range [0, 1]. A drawing with fewer
        than two points yields an empty ``(0, 2)`` array.

    Notes
    -----
    All strokes are concatenated before the deltas are taken, so the jump
    from the end of one stroke to the start of the next appears as an
    ordinary row; stroke boundaries are not encoded in the output.
    """
    strokes = literal_eval(inkarray)
    total_points = sum(len(stroke[0]) for stroke in strokes)
    if total_points == 0:
        # np.min / np.max raise on zero-size arrays; an empty drawing simply
        # has no deltas, matching what a single-point drawing produces.
        return np.zeros((0, 2), dtype=np.float32)

    # Flatten every stroke into one (total_points, 2) array of x/y pairs.
    points = np.zeros((total_points, 2), dtype=np.float32)
    offset = 0
    for stroke in strokes:
        n_points = len(stroke[0])
        for axis in (0, 1):
            points[offset:offset + n_points, axis] = stroke[axis]
        offset += n_points

    # 1. Size normalization: rescale each axis independently to [0, 1].
    lower = points.min(axis=0)
    upper = points.max(axis=0)
    scale = upper - lower
    scale[scale == 0] = 1  # flat axis: avoid dividing by zero
    points = (points - lower) / scale

    # 2. Compute deltas between consecutive points.
    return points[1:] - points[:-1]

def load_and_clean(files, n_rows=100):
    """
    Load a random sample of rows from each CSV file and clean the drawings.

    Parameters
    ----------
    files : iterable of str
        Paths of CSV files. Each file is expected to start with a header
        line and to contain a ``drawing`` column.
    n_rows : int, default 100
        Number of data rows to sample from each file. A file with fewer
        data rows than this is loaded in full.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-file samples (each file's own row index is
        kept, so index labels may repeat) with the ``drawing`` column
        replaced by the result of ``clean_drawing``.

    Notes
    -----
    Sampling uses the module-level ``random`` generator; call
    ``random.seed(...)`` beforehand for a reproducible selection. The index
    of each file is printed as a simple progress indicator.
    """
    dfs = []
    for counter, file in enumerate(files):
        print(counter)
        # Count physical lines; the context manager closes the handle.
        with open(file) as handle:
            total_lines = sum(1 for _ in handle)
        # Line 0 is the header, so data rows are lines 1 .. total_lines - 1.
        n_data_rows = max(total_lines - 1, 0)
        # Skip exactly enough data lines to keep n_rows of them; never ask
        # random.sample for a negative count when the file is small.
        n_skip = max(n_data_rows - n_rows, 0)
        skip = sorted(random.sample(range(1, total_lines), n_skip))
        dfs.append(pd.read_csv(file, skiprows=skip))
    full_df = pd.concat(dfs)
    full_df['drawing'] = full_df['drawing'].map(clean_drawing)
    return full_df

In [43]:
# Sample and clean the first five training files, then renumber the rows
# 0..N-1 so that index labels are unique across files.
df = load_and_clean(train_files[:5]).reset_index(drop=True)

0
1
2
3
4


In [47]:
# Inspect the cleaned drawing stored in the row labelled 3.
df.loc[3, 'drawing']

array([[ 0.03030303,  0.04705882],
       [ 0.05050504,  0.4       ],
       [ 0.06060606,  0.16862744],
       [ 0.07070708, -0.9843137 ],
       [-0.20202023, -0.01568628],
       [-0.07070705,  0.00784314],
       [-0.17171718,  0.05490196],
       [-0.15151516,  0.11372549],
       [-0.01010101,  0.12156864],
       [ 0.11111111,  0.04705882],
       [ 0.22222224,  0.03921568],
       [ 0.25252524,  0.01960784],
       [ 0.22222221, -0.00392157],
       [ 0.10101014, -0.01960784],
       [ 0.09090906, -0.04705882],
       [-0.01010102, -0.08627452],
       [-0.21212119, -0.13333333],
       [-0.24242425, -0.07843137],
       [-0.08080807, -0.00392157],
       [-0.2626263 ,  0.32156864],
       [ 0.57575756, -0.22745098],
       [ 0.12121212, -0.07058825]], dtype=float32)