In [1]:
import pandas as pd
import numpy as np
import pickle
import re
import random
import sys

### Processing sim images

In [2]:
# Build `sim`: one row per labelled bounding box in the simulator images.
#
# For every CSV row, column 0 is stored as the image filename and the four
# integers found in column 5 are read as x, y, box width and box height in
# pixels. Coordinates are normalised by the fixed image size (w x h) so that
# they fall in [0, 1]. The class id is the 1-based position of the CSV file in
# the list below (red=1, yellow=2, green=3).
#
# Rows are collected in a plain list and the DataFrame is built once at the
# end: enlarging a DataFrame with `sim.loc[i] = ...` re-allocates on every
# insertion and is quadratic in the number of rows.
sim_columns = ['filename', 'xmins', 'xmaxs', 'ymins', 'ymaxs',
               'classes', 'classes_text', 'height', 'width']
sim_rows = []

h = 600
w = 800
labels = [b'red', b'yellow', b'green']

for idx, csvFile in enumerate(['data/red_sim.csv', 'data/yellow_sim.csv', 'data/green_sim.csv']):
    df = pd.read_csv(csvFile)

    for j in range(df.shape[0]):
        # Exactly four integers are expected; anything else fails the unpacking.
        xmin, ymin, u, v = (float(n) for n in re.findall(r'\d+', df.iloc[j, 5]))
        sim_rows.append([df.iloc[j, 0],
                         xmin / w,
                         (xmin + u) / w,
                         ymin / h,
                         (ymin + v) / h,
                         int(idx + 1),
                         labels[idx],
                         int(h),
                         int(w)])

sim = pd.DataFrame(sim_rows, columns=sim_columns)

# Kept for compatibility with the original cell, which left the row count in `i`.
i = len(sim_rows)

sim.head()

Unnamed: 0,filename,xmins,xmaxs,ymins,ymaxs,classes,classes_text,height,width
0,sim_1507476460.png,0.15875,0.23125,0.476667,0.688333,1,b'red',600,800
1,sim_1507476460.png,0.4675,0.5375,0.483333,0.701667,1,b'red',600,800
2,sim_1507476460.png,0.7775,0.84875,0.491667,0.703333,1,b'red',600,800
3,sim_1507476461.png,0.1625,0.23,0.476667,0.688333,1,b'red',600,800
4,sim_1507476461.png,0.465,0.53875,0.478333,0.698333,1,b'red',600,800


In [3]:
# Sanity check: for every box, xmin must not exceed xmax and ymin must not
# exceed ymax (zero-width / zero-height boxes are tolerated).
x_ordered = sim['xmins'] <= sim['xmaxs']
y_ordered = sim['ymins'] <= sim['ymaxs']
assert (x_ordered & y_ordered).sum() == sim.shape[0]

In [4]:
# Number of labelled boxes per class in the simulator data.
sim.groupby('classes_text')[['filename']].count()

Unnamed: 0_level_0,filename
classes_text,Unnamed: 1_level_1
b'green',61
b'red',323
b'yellow',54


In [5]:
# Collapse `sim` (one row per box) into `sim_groups` (one row per image).
# Each per-box column becomes an array holding the values of all boxes of that
# image; 'height' and 'width' are reduced to a scalar when all boxes agree.
sim_groups = pd.DataFrame({'filename': sim.filename.unique()})
scalar_cols = ('height', 'width')

for col in sim.columns[1:]:
    stacked = sim.groupby('filename')[col].apply(np.hstack).to_frame().reset_index()
    if col in scalar_cols:
        # Keep the array only if the boxes disagree; otherwise take the single value.
        stacked[col] = stacked[col].apply(
            lambda vals: vals[0] if len(set(vals)) <= 1 else vals)
    sim_groups = pd.merge(sim_groups, stacked, on='filename')

# Image-level label: the shared class when every box has the same label,
# otherwise the full array of labels.
sim_groups['class_text'] = sim_groups['classes_text'].apply(
    lambda names: names[0] if len(set(names)) <= 1 else names)

sim_groups.head()

Unnamed: 0,filename,xmins,xmaxs,ymins,ymaxs,classes,classes_text,height,width,class_text
0,sim_1507476460.png,"[0.15875, 0.4675, 0.7775]","[0.23125, 0.5375, 0.84875]","[0.476666666667, 0.483333333333, 0.491666666667]","[0.688333333333, 0.701666666667, 0.703333333333]","[1, 1, 1]","[b'red', b'red', b'red']",600,800,b'red'
1,sim_1507476461.png,"[0.1625, 0.465, 0.7775]","[0.23, 0.53875, 0.85]","[0.476666666667, 0.478333333333, 0.491666666667]","[0.688333333333, 0.698333333333, 0.703333333333]","[1, 1, 1]","[b'red', b'red', b'red']",600,800,b'red'
2,sim_1507476462.png,"[0.155, 0.46375, 0.78]","[0.22875, 0.54, 0.85125]","[0.476666666667, 0.481666666667, 0.493333333333]","[0.688333333333, 0.701666666667, 0.705]","[1, 1, 1]","[b'red', b'red', b'red']",600,800,b'red'
3,sim_1507476463.png,"[0.15625, 0.46625, 0.7775]","[0.22625, 0.54, 0.85375]","[0.473333333333, 0.483333333333, 0.493333333333]","[0.685, 0.696666666667, 0.701666666667]","[1, 1, 1]","[b'red', b'red', b'red']",600,800,b'red'
4,sim_1507476464.png,"[0.155, 0.4675, 0.78]","[0.225, 0.54, 0.8575]","[0.466666666667, 0.473333333333, 0.481666666667]","[0.683333333333, 0.693333333333, 0.698333333333]","[1, 1, 1]","[b'red', b'red', b'red']",600,800,b'red'


In [6]:
# Number of simulator images per image-level class.
sim_groups.groupby('class_text')[['filename']].count()

Unnamed: 0_level_0,filename
class_text,Unnamed: 1_level_1
b'green',21
b'red',112
b'yellow',18


In [7]:
# No row may be lost when rows containing NaN are dropped, i.e. the grouped
# frame has no missing values.
n_complete = len(sim_groups.dropna())
assert n_complete == len(sim_groups)

### Processing site images

In [8]:
# Build `site`: one row per site image, using the best detector box as label.
#
# Each pickle holds a DataFrame of detector output for one light colour; the
# class id is the 1-based position of the file in the list below.
# Column 0 is the image id/path (rows whose id does not contain 'site' are
# skipped); columns 1, 2 and 3 are indexed with [0] and then used as boxes,
# scores and class ids respectively.
# NOTE(review): presumably TensorFlow object-detection output with a leading
# batch axis, and column 0 is the 'id' column — confirm against the producer.
#
# Changes compared with the original cell:
#   * an image with no detection of `target_class` is skipped instead of
#     crashing on `max()` of an empty selection;
#   * the box is taken at the index of the best `target_class` score, rather
#     than by matching the score value across all classes (which could pick a
#     box of another class with a tied score);
#   * rows are collected in a list and the DataFrame is built once.
site_columns = ['filename', 'xmins', 'xmaxs', 'ymins', 'ymaxs',
                'classes', 'classes_text', 'height', 'width']
site_rows = []

h = 1096
w = 1368
labels = [b'red', b'yellow', b'green']
target_class = 10   # detector class id to keep — presumably "traffic light"; TODO confirm
min_score = 0.9     # only keep boxes whose score is strictly above this

for idx, pickleFile in enumerate(['data/red.p', 'data/yellow.p', 'data/green.p']):

    # NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
    with open(pickleFile, 'rb') as f:
        df = pickle.load(f)

    for j in range(df.shape[0]):
        image_id = df.iloc[j, 0]
        if image_id.find('site') == -1:
            continue

        boxes = np.asarray(df.iloc[j, 1][0])
        scores = np.asarray(df.iloc[j, 2][0])
        classes = np.asarray(df.iloc[j, 3][0])

        candidates = np.flatnonzero(classes == target_class)
        if candidates.size == 0:
            # Nothing of the wanted class was detected in this image.
            continue

        best = candidates[np.argmax(scores[candidates])]
        best_box_prob = scores[best]
        if best_box_prob > min_score:
            box = boxes[best]
            # Box coordinates are stored in (ymin, xmin, ymax, xmax) order.
            ymin, xmin, ymax, xmax = box[0], box[1], box[2], box[3]
            site_rows.append([image_id.split('/')[1],
                              xmin,
                              xmax,
                              ymin,
                              ymax,
                              int(idx + 1),
                              labels[idx],
                              int(h),
                              int(w)])

site = pd.DataFrame(site_rows, columns=site_columns)

# Kept for compatibility with the original cell, which left the row count in `i`.
i = len(site_rows)

site.head()

Unnamed: 0,filename,xmins,xmaxs,ymins,ymaxs,classes,classes_text,height,width
0,site_1507478284.39343118.png,0.528541,0.552816,0.358023,0.448821,1,b'red',1096,1368
1,site_1507478206.566551923.png,0.54906,0.570155,0.358148,0.441864,1,b'red',1096,1368
2,site_1507478130.261214017.png,0.533648,0.559435,0.355488,0.451758,1,b'red',1096,1368
3,site_1507478208.300426959.png,0.496642,0.52988,0.330892,0.445462,1,b'red',1096,1368
4,site_1507478285.520903110.png,0.468447,0.506748,0.335749,0.466605,1,b'red',1096,1368


In [9]:
# Sanity check: for every box, xmin must not exceed xmax and ymin must not
# exceed ymax.
x_ordered = site['xmins'] <= site['xmaxs']
y_ordered = site['ymins'] <= site['ymaxs']
assert (x_ordered & y_ordered).sum() == site.shape[0]

In [10]:
# Number of labelled boxes per class in the site data.
site.groupby('classes_text')[['filename']].count()

Unnamed: 0_level_0,filename
classes_text,Unnamed: 1_level_1
b'green',94
b'red',67


In [11]:
# Collapse `site` (one row per box) into `site_groups` (one row per image).
# Each per-box column becomes an array holding the values of all boxes of that
# image; 'height' and 'width' are reduced to a scalar when all boxes agree.
site_groups = pd.DataFrame({'filename': site.filename.unique()})
scalar_cols = ('height', 'width')

for col in site.columns[1:]:
    stacked = site.groupby('filename')[col].apply(np.hstack).to_frame().reset_index()
    if col in scalar_cols:
        # Keep the array only if the boxes disagree; otherwise take the single value.
        stacked[col] = stacked[col].apply(
            lambda vals: vals[0] if len(set(vals)) <= 1 else vals)
    site_groups = pd.merge(site_groups, stacked, on='filename')

# Image-level label: the shared class when every box has the same label,
# otherwise the full array of labels.
site_groups['class_text'] = site_groups['classes_text'].apply(
    lambda names: names[0] if len(set(names)) <= 1 else names)

site_groups.head()

Unnamed: 0,filename,xmins,xmaxs,ymins,ymaxs,classes,classes_text,height,width,class_text
0,site_1507478284.39343118.png,[0.528540670872],[0.552815794945],[0.358022719622],[0.448820501566],[1],[b'red'],1096,1368,b'red'
1,site_1507478206.566551923.png,[0.54905962944],[0.570155441761],[0.358147650957],[0.441863626242],[1],[b'red'],1096,1368,b'red'
2,site_1507478130.261214017.png,[0.533647656441],[0.559434711933],[0.355487972498],[0.45175793767],[1],[b'red'],1096,1368,b'red'
3,site_1507478208.300426959.png,[0.496641904116],[0.529880404472],[0.330892294645],[0.445461839437],[1],[b'red'],1096,1368,b'red'
4,site_1507478285.520903110.png,[0.468447417021],[0.506748378277],[0.335749238729],[0.466605275869],[1],[b'red'],1096,1368,b'red'


In [12]:
# Number of site images per image-level class.
site_groups.groupby('class_text')[['filename']].count()

Unnamed: 0_level_0,filename
class_text,Unnamed: 1_level_1
b'green',94
b'red',67


In [13]:
# No row may be lost when rows containing NaN are dropped, i.e. the grouped
# frame has no missing values.
n_complete = len(site_groups.dropna())
assert n_complete == len(site_groups)

### Combine sim and site, split into train and test

In [14]:
# Draw the per-class subsets that go into the combined dataset.
#
# Changes compared with the original cell:
#   * the RNG is seeded *before* sampling, so Restart & Run All selects the
#     same images every time (previously the first seed was only set in the
#     following cell, after these draws had already happened);
#   * population sizes are taken from the data instead of being hard-coded
#     (112, 21, 18, 67, 94), so the cell no longer raises or silently
#     truncates when the input files change.
random.seed(42)

N_SIM_RED = 20      # sim red images are down-sampled to this many
N_SITE_GREEN = 67   # equals the recorded number of red site images; presumably
                    # chosen to balance the two site classes — TODO confirm


def _sample_class(groups, label, n=None):
    """Return randomly ordered rows of `groups` whose class_text equals `label`.

    Parameters
    ----------
    groups : DataFrame with a 'class_text' column (one row per image).
    label : class label to select, e.g. b'red'.
    n : number of rows to draw without replacement; None means all rows
        (i.e. a shuffle). Capped at the number of available rows.

    Returns
    -------
    DataFrame containing the drawn rows, indexed by their position within the
    selected class.
    """
    rows = groups[groups.class_text == label].reset_index(drop=True)
    available = len(rows)
    n = available if n is None else min(n, available)
    return rows.loc[random.sample(range(available), n)]


sim_red = _sample_class(sim_groups, b'red', N_SIM_RED)
sim_green = _sample_class(sim_groups, b'green')
sim_yellow = _sample_class(sim_groups, b'yellow')

site_red = _sample_class(site_groups, b'red')
site_green = _sample_class(site_groups, b'green', N_SITE_GREEN)

In [15]:
# Stack the sampled subsets into one frame with a fresh 0..n-1 index.
subsets = [sim_red, sim_green, sim_yellow, site_red, site_green]
combined = pd.concat(subsets).reset_index(drop=True)

# Seeded shuffle of the row labels, then a 70 % / 30 % cut into train / test.
random.seed(42)

n_rows = combined.shape[0]
mySplit = random.sample(range(n_rows), n_rows)
n_train = int(len(mySplit) * 0.7)

train = combined.loc[mySplit[:n_train]]
test = combined.loc[mySplit[n_train:]]

In [16]:
# Size and class balance of the training split.
# The original cell also contained a bare `train.head()` in the middle; only
# the last expression of a cell is displayed, so that call computed a value
# and threw it away. It is removed here.
print(train.shape)
train[['filename', 'class_text']].groupby('class_text').count()

(135, 10)


Unnamed: 0_level_0,filename
class_text,Unnamed: 1_level_1
b'green',64
b'red',57
b'yellow',14


In [17]:
# Size and class balance of the test split.
# The original cell also contained a bare `test.head()` in the middle; only
# the last expression of a cell is displayed, so that call computed a value
# and threw it away. It is removed here.
print(test.shape)
test[['filename', 'class_text']].groupby('class_text').count()

(58, 10)


Unnamed: 0_level_0,filename
class_text,Unnamed: 1_level_1
b'green',24
b'red',30
b'yellow',4


In [18]:
# Shapes of the two splits, one per line.
print(train.shape, test.shape, sep='\n')

(135, 10)
(58, 10)


In [19]:
# Persist both splits as pickles (train first, then test).
for split_path, split_frame in (('data/train.p', train), ('data/test.p', test)):
    with open(split_path, 'wb') as f:
        pickle.dump(split_frame, f)

### Add some extra images

In [20]:
import pandas as pd
import numpy as np
import pickle
import re
import random
import sys

In [21]:
# Build `extra`: one row per labelled bounding box in the extra images.
#
# For every CSV row, column 0 is stored as the image filename, the four
# integers found in column 5 are read as x, y, box width and box height in
# pixels (normalised by the fixed image size w x h), and the class label is
# the second alphabetic word found in column 6.
# NOTE(review): column 6 presumably looks like {"<key>":"<colour>"} — confirm.
#
# Changes compared with the original cell:
#   * the no-op `.strip("")` (stripping an empty character set) is removed;
#   * the unused `enumerate` index is dropped;
#   * rows are collected in a list and the DataFrame is built once instead of
#     enlarging it row by row with `.loc`.
extra_columns = ['filename', 'xmins', 'xmaxs', 'ymins', 'ymaxs',
                 'classes', 'classes_text', 'height', 'width']
extra_rows = []

h = 600
w = 800
labels = [b'red', b'yellow', b'green']

for csvFile in ['data/extra_png.csv']:
    df = pd.read_csv(csvFile)

    for j in range(df.shape[0]):
        # Exactly four integers are expected; anything else fails the unpacking.
        xmin, ymin, u, v = (float(n) for n in re.findall(r'\d+', df.iloc[j, 5]))
        # Runs of letters in column 6; index 1 is the label word.
        class_label = re.findall(r'[A-Za-z]+', df.iloc[j, 6])[1].encode()
        extra_rows.append([df.iloc[j, 0],
                           xmin / w,
                           (xmin + u) / w,
                           ymin / h,
                           (ymin + v) / h,
                           # Raises ValueError for a label outside `labels`.
                           int(labels.index(class_label) + 1),
                           class_label,
                           int(h),
                           int(w)])

extra = pd.DataFrame(extra_rows, columns=extra_columns)

# Kept for compatibility with the original cell, which left the row count in `i`.
i = len(extra_rows)

extra.head()

Unnamed: 0,filename,xmins,xmaxs,ymins,ymaxs,classes,classes_text,height,width
0,green1.png,0.235,0.27375,0.395,0.493333,3,b'green',600,800
1,green1.png,0.48625,0.5225,0.433333,0.531667,3,b'green',600,800
2,green1.png,0.92125,0.955,0.648333,0.728333,3,b'green',600,800
3,green2.png,0.505,0.58125,0.376667,0.68,3,b'green',600,800
4,green3.png,0.505,0.6975,0.128333,0.885,3,b'green',600,800


In [22]:
# Collapse `extra` (one row per box) into `extra_groups` (one row per image).
# Each per-box column becomes an array holding the values of all boxes of that
# image; 'height' and 'width' are reduced to a scalar when all boxes agree.
extra_groups = pd.DataFrame({'filename': extra.filename.unique()})
scalar_cols = ('height', 'width')

for col in extra.columns[1:]:
    stacked = extra.groupby('filename')[col].apply(np.hstack).to_frame().reset_index()
    if col in scalar_cols:
        # Keep the array only if the boxes disagree; otherwise take the single value.
        stacked[col] = stacked[col].apply(
            lambda vals: vals[0] if len(set(vals)) <= 1 else vals)
    extra_groups = pd.merge(extra_groups, stacked, on='filename')

# Image-level label: the shared class when every box has the same label,
# otherwise the full array of labels.
extra_groups['class_text'] = extra_groups['classes_text'].apply(
    lambda names: names[0] if len(set(names)) <= 1 else names)

extra_groups.head()

Unnamed: 0,filename,xmins,xmaxs,ymins,ymaxs,classes,classes_text,height,width,class_text
0,green1.png,"[0.235, 0.48625, 0.92125]","[0.27375, 0.5225, 0.955]","[0.395, 0.433333333333, 0.648333333333]","[0.493333333333, 0.531666666667, 0.728333333333]","[3, 3, 3]","[b'green', b'green', b'green']",600,800,b'green'
1,green2.png,[0.505],[0.58125],[0.376666666667],[0.68],[3],[b'green'],600,800,b'green'
2,green3.png,[0.505],[0.6975],[0.128333333333],[0.885],[3],[b'green'],600,800,b'green'
3,red1.png,[0.405],[0.525],[0.2],[0.62],[1],[b'red'],600,800,b'red'
4,red2.png,[0.58],[0.7925],[0.113333333333],[0.971666666667],[1],[b'red'],600,800,b'red'


In [23]:
# Reload the training split that was pickled to data/train.p.
with open('data/train.p', 'rb') as train_file:
    train = pickle.load(train_file)

In [24]:
# Append the extra images to the training split and give the result a fresh
# 0..n-1 index.
frames = [train, extra_groups]
train_extra = pd.concat(frames).reset_index(drop=True)

# Seeded shuffle of all rows, then renumber them.
random.seed(42)

n_rows = train_extra.shape[0]
mySplit = random.sample(range(n_rows), n_rows)

train_extra = train_extra.loc[mySplit].reset_index(drop=True)

train_extra.head()

Unnamed: 0,filename,xmins,xmaxs,ymins,ymaxs,classes,classes_text,height,width,class_text
0,site_1507478208.300426959.png,[0.496641904116],[0.529880404472],[0.330892294645],[0.445461839437],[1],[b'red'],1096,1368,b'red'
1,raw_1507415973.png,"[0.1575, 0.465, 0.7775]","[0.23, 0.5375, 0.84875]","[0.478333333333, 0.486666666667, 0.491666666667]","[0.685, 0.7, 0.705]","[2, 2, 2]","[b'yellow', b'yellow', b'yellow']",600,800,b'yellow'
2,site_1507478283.400125026.png,[0.541101157665],[0.564745664597],[0.355073392391],[0.445116072893],[1],[b'red'],1096,1368,b'red'
3,site_1507478153.971164941.png,[0.518493890762],[0.544734597206],[0.357018500566],[0.450143635273],[3],[b'green'],1096,1368,b'green'
4,raw_1507390377.png,"[0.1575, 0.465, 0.77375]","[0.23125, 0.53875, 0.85125]","[0.471666666667, 0.485, 0.493333333333]","[0.69, 0.698333333333, 0.705]","[2, 2, 2]","[b'yellow', b'yellow', b'yellow']",600,800,b'yellow'


In [25]:
# Shapes of the augmented and the original training sets, one per line.
print(train_extra.shape, train.shape, sep='\n')

(146, 10)
(135, 10)


In [26]:
# Persist the augmented training set as a pickle.
with open('data/train_extra.p', 'wb') as out_file:
    pickle.dump(train_extra, out_file)