In [1]:
import pandas as pd
import numpy as np
import random
from ast import literal_eval
from sklearn.utils import shuffle
training_set = '../train_simplified/'
from os import listdir
from os.path import isfile, join
train_files = [join(training_set, f) for f in listdir(training_set) if isfile(join(training_set, f))]

In [2]:
# function to check GPU availability
def get_available_gpus():
    from tensorflow.python.client import device_lib
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos if x.device_type == 'GPU']

In [3]:
get_available_gpus()

[]

In [6]:
def clean_drawing(inkarray):
    inkarray = literal_eval(inkarray)
    stroke_lengths = [len(stroke[0]) for stroke in inkarray]
    total_points = sum(stroke_lengths)
    np_ink = np.zeros((total_points, 3), dtype=np.float32)
    current_t = 0
    for stroke in inkarray:
        for i in [0, 1]:
            np_ink[current_t:(current_t + len(stroke[0])), i] = stroke[i]
        current_t += len(stroke[0])
        np_ink[current_t - 1, 2] = 1  # stroke_end
    # Preprocessing.
    # 1. Size normalization.
    lower = np.min(np_ink[:, 0:2], axis=0)
    upper = np.max(np_ink[:, 0:2], axis=0)
    scale = upper - lower
    scale[scale == 0] = 1
    np_ink[:, 0:2] = (np_ink[:, 0:2] - lower) / scale
    # 2. Compute deltas.
    np_ink = np_ink[1:, 0:2] - np_ink[0:-1, 0:2]
    return np_ink

def load_and_clean(files,n_rows=100):
    """
    loads into a single dataframe a 100 random rows from each of the csv files in the parameter list "files"
    then cleans the drawing column by mapping the above function clean_drawing 
    """
    dfs = []
    counter = 0
    for file in files:
        print(counter)
        total_rows = sum(1 for line in open(file))
        skip = sorted(random.sample(range(1,total_rows),total_rows-n_rows))
        df = pd.read_csv(file, skiprows=skip)
        dfs += [df]
        counter += 1
    full_df = pd.concat(dfs)
    full_df['drawing'] = full_df['drawing'].map(clean_drawing)
    return full_df

In [7]:
df = load_and_clean(train_files[:15])
df = df.reset_index()

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14


In [8]:
df.head()

Unnamed: 0,index,countrycode,drawing,key_id,recognized,timestamp,word
0,0,UA,"[[0.0, -0.61176467], [-0.21276596, -0.3882353]...",5909976444829696,True,2017-01-31 18:35:02.272830,lollipop
1,1,US,"[[0.11965811, -0.33725488], [-0.059829056, -0....",4580607361286144,True,2017-01-24 02:33:08.397180,lollipop
2,2,KR,"[[-0.30252102, 0.0], [-0.16806723, -0.03137257...",4780922752729088,True,2017-03-05 09:16:05.520940,lollipop
3,3,GB,"[[-0.045454532, -0.19607842], [0.0181818, -0.4...",5457494996418560,True,2017-01-29 18:08:17.162730,lollipop
4,4,AU,"[[-0.008474559, 0.6535433], [0.559322, -0.8110...",5904597233172480,True,2017-01-23 10:32:31.943170,lollipop


In [11]:
test_df = pd.read_csv('../test_simplified.csv')

In [15]:
!ls


Preprocess.ipynb  README.md
