In [18]:
%matplotlib inline
import numpy as np
from skimage import io
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
import os
import json

# Directory that receives the generated pair CSVs and holds the "metadata" JSON file.
OUTPUT_DIR = '../datasets/dataset1/'
# Root directory containing the 'train_data' and 'test_data' folders read by create_csv().
PREPROCCESSED_DATA_DIR = '../datasets/dataset1/'

In [19]:
def read_data_metadata(fn):
    """Parse the JSON file at ``fn`` and return its content.

    Args:
        fn: Path of the JSON file to read.

    Returns:
        The object produced by ``json.load`` for the file's content.
    """
    with open(fn, "r") as handle:
        return json.load(handle)

In [20]:
def get_pairs(bee_list, path, y):
    """Build one labelled record for every unordered pair of entries.

    Each entry of ``bee_list`` is joined onto ``path``; every combination of two
    distinct positions (earlier position first) yields one record.

    Args:
        bee_list: Indexable sequence of file names.
        path: Directory the names are joined onto.
        y: Label stored in every produced record.

    Returns:
        A list of dicts with keys ``'X1'``, ``'X2'`` and ``'y'``; empty when
        ``bee_list`` has fewer than two entries.
    """
    count = len(bee_list)
    return [
        {
            'X1': os.path.join(path, bee_list[first]),
            'X2': os.path.join(path, bee_list[second]),
            'y': y,
        }
        for first in range(count)
        for second in range(first + 1, count)
    ]
            

In [21]:
def load_frame_data(path):
    """Collect all pairs of files that share a frame directory, labelled ``1``.

    Every sub-directory of ``path`` is treated as one frame; each pair of files
    inside it becomes one row.

    Args:
        path: Directory whose sub-directories each hold the files of one frame.

    Returns:
        A ``pandas.DataFrame`` with columns ``X1``, ``X2`` and ``y`` and a fresh
        0..n-1 index. It is empty (but still has the three columns) when no
        pairs are found.
    """
    records = []
    # Sorted so the row order does not depend on the arbitrary order returned
    # by os.listdir.
    for fr in sorted(os.listdir(path)):
        fr_path = os.path.join(path, fr)
        # Skip stray files (e.g. .DS_Store) that would make os.listdir raise.
        if not os.path.isdir(fr_path):
            continue
        frame_bees = sorted(os.listdir(fr_path))
        records.extend(get_pairs(frame_bees, fr_path, 1))
    # Build the frame once: DataFrame.append inside a loop copied the whole
    # frame on every iteration and no longer exists in pandas >= 2.0.
    return pd.DataFrame(records, columns=['X1', 'X2', 'y'])
    

In [22]:
def load_track_data(path):
    """Collect all pairs of files that share a track directory, labelled ``0``.

    Every sub-directory of ``path`` is treated as one track; each pair of files
    inside it becomes one row. Tracks with fewer than two files yield no rows.

    Args:
        path: Directory whose sub-directories each hold the files of one track.

    Returns:
        A ``pandas.DataFrame`` with columns ``X1``, ``X2`` and ``y`` and a fresh
        0..n-1 index. It is empty (but still has the three columns) when no
        pairs are found.
    """
    records = []
    # Sorted so the row order does not depend on the arbitrary order returned
    # by os.listdir.
    for tr in sorted(os.listdir(path)):
        tr_path = os.path.join(path, tr)
        # Skip stray files (e.g. .DS_Store) that would make os.listdir raise.
        if not os.path.isdir(tr_path):
            continue
        track_bees = sorted(os.listdir(tr_path))
        if len(track_bees) < 2:
            # A single file cannot form a pair.
            continue
        records.extend(get_pairs(track_bees, tr_path, 0))
    # Build the frame once: DataFrame.append inside a loop copied the whole
    # frame on every iteration, no longer exists in pandas >= 2.0, and (without
    # ignore_index=True) left duplicate index labels in the result.
    return pd.DataFrame(records, columns=['X1', 'X2', 'y'])

In [23]:
def _load_split_pairs(split_path):
    """Return the frame pairs followed by the track pairs of one split directory."""
    frame_df = load_frame_data(os.path.join(split_path, 'frame'))
    track_df = load_track_data(os.path.join(split_path, 'track'))
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    return pd.concat([frame_df, track_df], ignore_index=True)


def create_csv(random_state=None):
    """Build the shuffled train and test pair tables.

    Despite its name, this function does not write any file; it only returns
    the DataFrames. The pairs are read from the ``train_data`` and
    ``test_data`` folders under ``PREPROCCESSED_DATA_DIR``, each of which is
    expected to contain a ``frame`` and a ``track`` sub-folder.

    Args:
        random_state: Forwarded to ``sklearn.utils.shuffle``. The default
            ``None`` keeps the original unseeded behaviour; pass an int to get
            the same row order on every run.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames with columns ``X1``,
        ``X2`` and ``y``. Rows are shuffled; the index keeps the pre-shuffle
        row numbers.
    """
    train_df = _load_split_pairs(os.path.join(PREPROCCESSED_DATA_DIR, 'train_data'))
    test_df = _load_split_pairs(os.path.join(PREPROCCESSED_DATA_DIR, 'test_data'))

    train_df = shuffle(train_df, random_state=random_state)
    test_df = shuffle(test_df, random_state=random_state)
    return train_df, test_df

In [24]:
# Build the shuffled train/test pair tables (nothing is written to disk here).
train_df, test_df = create_csv()

In [25]:
# Preview the first rows of the shuffled training pairs.
train_df.head()

Unnamed: 0,X1,X2,y
13510,../datasets/dataset1/train_data/frame/0582/bee...,../datasets/dataset1/train_data/frame/0582/bee...,1
39159,../datasets/dataset1/train_data/track/0153/bee...,../datasets/dataset1/train_data/track/0153/bee...,0
41041,../datasets/dataset1/train_data/track/0147/bee...,../datasets/dataset1/train_data/track/0147/bee...,0
30285,../datasets/dataset1/train_data/track/0038/bee...,../datasets/dataset1/train_data/track/0038/bee...,0
44191,../datasets/dataset1/train_data/track/0071/bee...,../datasets/dataset1/train_data/track/0071/bee...,0


In [26]:
# Number of training pairs labelled 1 and labelled 0, in that order.
train_df.y[train_df.y==1].size, train_df.y[train_df.y==0].size

(21460, 25447)

In [27]:
# Preview the first rows of the shuffled test pairs.
test_df.head()

Unnamed: 0,X1,X2,y
19339,../datasets/dataset1/test_data/track/0232/bee0...,../datasets/dataset1/test_data/track/0232/bee0...,0
17010,../datasets/dataset1/test_data/track/0261/bee0...,../datasets/dataset1/test_data/track/0261/bee0...,0
13273,../datasets/dataset1/test_data/track/0235/bee0...,../datasets/dataset1/test_data/track/0235/bee0...,0
6608,../datasets/dataset1/test_data/frame/0866/bee0...,../datasets/dataset1/test_data/frame/0866/bee0...,1
9932,../datasets/dataset1/test_data/frame/0803/bee0...,../datasets/dataset1/test_data/frame/0803/bee0...,1


In [28]:
# Number of test pairs labelled 1 and labelled 0, in that order.
test_df.y[test_df.y==1].size, test_df.y[test_df.y==0].size

(12737, 12042)

In [29]:
# Persist both pair tables under OUTPUT_DIR. to_csv is called with its defaults,
# so the DataFrame index is written as the first (unnamed) column.
train_df.to_csv(os.path.join(OUTPUT_DIR, "train_pairs.csv"))
test_df.to_csv(os.path.join(OUTPUT_DIR, "test_pairs.csv"))

In [30]:
# Load the existing dataset metadata so the CSV details can be added to it.
meta = read_data_metadata(os.path.join(OUTPUT_DIR, "metadata"))

In [31]:
# Record the CSV file names, their row counts and the per-label pair counts.
# Label 1 is reported under "not_same_class" and label 0 under "same_class".
meta["csv_metadata"] = dict(
    train_fn="train_pairs.csv",
    test_fn="test_pairs.csv",
    train_size=len(train_df),
    test_size=len(test_df),
    class_info=dict(
        train_not_same_class=len(train_df[train_df["y"] == 1]),
        train_same_class=len(train_df[train_df["y"] == 0]),
        test_not_same_class=len(test_df[test_df["y"] == 1]),
        test_same_class=len(test_df[test_df["y"] == 0]),
    ),
)

In [32]:
# Serialize before opening the file: open(..., "w") truncates it immediately,
# so a serialization error raised afterwards would wipe the existing metadata.
meta_json = json.dumps(meta, indent=2)
with open(os.path.join(OUTPUT_DIR, "metadata"), "w") as f:
    f.write(meta_json)