In [1]:
import pandas as pd; import numpy as np; import pickle
import matplotlib.pyplot as plt; import re; import random

### Processing sim images

In [2]:
sim = pd.DataFrame(columns=['filename', 'xmins', 'xmaxs', 'ymins', 'ymaxs', 
                            'classes', 'classes_text', 'height', 'width'])

i = 0
h = 600
w = 800
labels = [b'red', b'yellow', b'green']

for idx, csvFile in enumerate(['red_sim.csv', 'yellow_sim.csv', 'green_sim.csv']):
    df = pd.read_csv(csvFile)

    for j in range(df.shape[0]):
        xmin, ymin, u, v = re.findall(r'\d+', df.iloc[j, 5])
        sim.loc[i] = [df.iloc[j, 0],
                     float(xmin)/w, 
                     (float(xmin)+float(u))/w, 
                     float(ymin)/h, 
                     (float(ymin)+float(v))/h, 
                     int(idx),
                     labels[idx],
                     int(h),
                     int(w)]        
        i = i + 1

sim.head()

Unnamed: 0,filename,xmins,xmaxs,ymins,ymaxs,classes,classes_text,height,width
0,sim_1507476460.png,0.15875,0.23125,0.476667,0.688333,0,b'red',600,800
1,sim_1507476460.png,0.4675,0.5375,0.483333,0.701667,0,b'red',600,800
2,sim_1507476460.png,0.7775,0.84875,0.491667,0.703333,0,b'red',600,800
3,sim_1507476461.png,0.1625,0.23,0.476667,0.688333,0,b'red',600,800
4,sim_1507476461.png,0.465,0.53875,0.478333,0.698333,0,b'red',600,800


In [3]:
assert sim.apply(lambda row: (row['xmins'] <= row['xmaxs']) & 
                 (row['ymins'] <= row['ymaxs']), axis = 1).sum() == sim.shape[0]

In [4]:
sim[['filename', 'classes_text']].groupby('classes_text').count()

Unnamed: 0_level_0,filename
classes_text,Unnamed: 1_level_1
b'green',61
b'red',323
b'yellow',54


### Processing site images

In [5]:
site = pd.DataFrame(columns=['filename', 'xmins', 'xmaxs', 'ymins', 'ymaxs', 
                            'classes', 'classes_text', 'height', 'width'])

i = 0
h = 1096
w = 1368
labels = [b'red', b'yellow', b'green']

for idx, pickleFile in enumerate(['red.p', 'yellow.p', 'green.p']):
    
    with open(pickleFile, 'rb') as f:
        df = pickle.load(f)
    
    if df.apply(lambda row: row['id'].find('site') > -1, axis = 1).sum() > 0: 
        
        for j in range(df.shape[0]):
            best_box_prob = max(df.iloc[j, 2][0][df.iloc[j, 3][0] == 10])
            if best_box_prob > 0.9: 
                box = df.iloc[j, 1][0][df.iloc[j, 2][0] == best_box_prob][0]
                ymin, xmin, ymax, xmax = box[0], box[1], box[2], box[3]
                site.loc[i] = [df.iloc[j, 0].split('/')[1],
                              xmin, 
                              xmax, 
                              ymin, 
                              ymax, 
                              int(idx),
                              labels[idx],
                              int(h),
                              int(w)]        
                i = i + 1
                
site.head()

Unnamed: 0,filename,xmins,xmaxs,ymins,ymaxs,classes,classes_text,height,width
0,site_1507478284.39343118.png,0.528541,0.552816,0.358023,0.448821,0,b'red',1096,1368
1,sim_1507476959.png,0.46128,0.505635,0.725603,0.84652,0,b'red',1096,1368
2,site_1507478206.566551923.png,0.54906,0.570155,0.358148,0.441864,0,b'red',1096,1368
3,sim_1507476571.png,0.085387,0.106426,0.888765,0.953315,0,b'red',1096,1368
4,site_1507478130.261214017.png,0.533648,0.559435,0.355488,0.451758,0,b'red',1096,1368


In [6]:
site[['filename', 'classes_text']].groupby('classes_text').count()

Unnamed: 0_level_0,filename
classes_text,Unnamed: 1_level_1
b'green',115
b'red',176


### Combine sim and site, split into train and test

In [7]:
sim_red = sim[sim.classes_text == b'red'].reset_index(drop=True).loc[random.sample(range(323), 80)]
sim_green = sim[sim.classes_text == b'green'].reset_index(drop=True).loc[random.sample(range(62), 61)]
sim_yellow = sim[sim.classes_text == b'yellow'].reset_index(drop=True).loc[random.sample(range(54), 54)]

site_red = site[site.classes_text == b'red'].reset_index(drop=True).loc[random.sample(range(176), 100)]
site_green = site[site.classes_text == b'green'].reset_index(drop=True).loc[random.sample(range(115), 115)]

In [8]:
combined = pd.concat([sim_red, sim_green, sim_yellow, site_red, site_green]).reset_index(drop=True)

mySplit = random.sample(range(combined.shape[0]), combined.shape[0])

train = combined.loc[mySplit[:int(len(mySplit)*0.7)]]
test = combined.loc[mySplit[int(len(mySplit)*0.7):]]

In [9]:
print(train.shape)
train.head()
train[['filename', 'classes_text']].groupby('classes_text').count()

(287, 9)


Unnamed: 0_level_0,filename
classes_text,Unnamed: 1_level_1
b'green',117
b'red',126
b'yellow',43


In [10]:
print(test.shape)
test.head()
test[['filename', 'classes_text']].groupby('classes_text').count()

(123, 9)


Unnamed: 0_level_0,filename
classes_text,Unnamed: 1_level_1
b'green',58
b'red',54
b'yellow',11


In [11]:
with open('train.p', 'wb') as f:
    pickle.dump(train, f)
with open('test.p', 'wb') as f:
    pickle.dump(test, f)

In [12]:
with open('train.p', 'rb') as f:
    train = pickle.load(f)

train.head()

Unnamed: 0,filename,xmins,xmaxs,ymins,ymaxs,classes,classes_text,height,width
314,site_1507478179.709956884.png,0.085815,0.143748,0.305435,0.491955,2,b'green',1096,1368
327,site_1507478077.860780954.png,0.469002,0.49236,0.353283,0.451254,2,b'green',1096,1368
378,site_1507478101.844935894.png,0.396249,0.431602,0.332312,0.457547,2,b'green',1096,1368
169,raw_1507390377.png,0.1575,0.23125,0.471667,0.69,1,b'yellow',600,800
362,site_1507478252.50518989.png,0.841526,0.859004,0.365529,0.436848,2,b'green',1096,1368


In [13]:
with open('test.p', 'rb') as f:
    test = pickle.load(f)
    
test.head()

Unnamed: 0,filename,xmins,xmaxs,ymins,ymaxs,classes,classes_text,height,width
85,sim_1507476955.png,0.04375,0.065,0.858333,0.931667,2,b'green',600,800
127,sim_1507476785.png,0.71875,0.73625,0.956667,0.996667,2,b'green',600,800
384,site_1507478056.602447032.png,0.305223,0.357118,0.318458,0.481637,2,b'green',1096,1368
21,sim_1507476502.png,0.405,0.45875,0.641667,0.795,0,b'red',600,800
101,sim_1507476952.png,0.21125,0.2325,0.896667,0.96,2,b'green',600,800
