In [17]:
%matplotlib inline
import numpy as np
from skimage import io
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
import os
import json

import sys
sys.path.append("../")
from utils.metadata import *

# Directory that receives the generated pair CSVs and holds the "metadata" file
# that this notebook reads and saves.
OUTPUT_DIR = '../datasets/body_sept/'
# Root directory of the preprocessed data; create_csv() looks for 'train_data'
# and 'test_data' under it. Currently the same directory as OUTPUT_DIR.
PREPROCCESSED_DATA_DIR = '../datasets/body_sept/'

In [18]:
def get_pairs(bee_list, path, y):
    """Build every unordered pair of files from one directory, all labelled ``y``.

    Parameters
    ----------
    bee_list : sequence of str
        File names located under ``path``.
    path : str
        Directory joined in front of every file name.
    y : object
        Label stored under the ``'y'`` key of each record.

    Returns
    -------
    list of dict
        One ``{'X1': ..., 'X2': ..., 'y': y}`` record for each index pair
        ``i < j``, ordered by ``i`` and then by ``j``. Empty when
        ``bee_list`` has fewer than two entries.
    """
    # Join each name with the directory once, then pair every entry with
    # all entries that come after it.
    full_paths = [os.path.join(path, name) for name in bee_list]
    return [
        {'X1': first, 'X2': second, 'y': y}
        for pos, first in enumerate(full_paths)
        for second in full_paths[pos + 1:]
    ]

In [19]:
def load_frame_data(path):
    """Collect label-1 pairs from every second frame directory under ``path``.

    Each sub-directory of ``path`` is treated as one frame; all unordered
    pairs of the files it contains are emitted with ``y == 1``.

    Parameters
    ----------
    path : str
        Directory containing one sub-directory per frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``['X1', 'X2', 'y']`` with a fresh ``RangeIndex``; empty
        (but with the same columns) when no frame yields a pair.
    """
    # os.listdir returns entries in arbitrary order, so sort before taking
    # every second frame; otherwise the subsample differs between runs and
    # machines. Non-directory entries are dropped first so a stray file
    # neither crashes the loop nor shifts the subsampling.
    frame_names = sorted(
        name for name in os.listdir(path)
        if os.path.isdir(os.path.join(path, name))
    )[::2]

    # Accumulate plain records and build the frame once: DataFrame.append
    # copied the whole frame on every call and was removed in pandas 2.0.
    records = []
    for fr in frame_names:
        fr_path = os.path.join(path, fr)
        # Sorted so the X1/X2 order inside each pair is reproducible.
        frame_bees = sorted(os.listdir(fr_path))
        records.extend(get_pairs(frame_bees, fr_path, 1))
    return pd.DataFrame(records, columns=['X1', 'X2', 'y'])
    

In [20]:
def load_track_data(path):
    """Collect label-0 pairs from every track directory under ``path``.

    Each sub-directory of ``path`` is treated as one track; all unordered
    pairs of the files it contains are emitted with ``y == 0``.

    Parameters
    ----------
    path : str
        Directory containing one sub-directory per track.

    Returns
    -------
    pandas.DataFrame
        Columns ``['X1', 'X2', 'y']`` with a fresh ``RangeIndex``; empty
        (but with the same columns) when no track holds two or more files.
    """
    # Sort for a reproducible row order (os.listdir order is arbitrary) and
    # skip stray non-directory entries instead of crashing on them.
    track_names = sorted(
        name for name in os.listdir(path)
        if os.path.isdir(os.path.join(path, name))
    )

    # Build the frame once from accumulated records. The previous per-track
    # DataFrame.append (removed in pandas 2.0) was quadratic and, lacking
    # ignore_index, produced duplicate index labels.
    records = []
    for tr in track_names:
        tr_path = os.path.join(path, tr)
        track_bees = sorted(os.listdir(tr_path))
        # get_pairs yields nothing for fewer than two files, so tracks that
        # are too short contribute no rows.
        records.extend(get_pairs(track_bees, tr_path, 0))
    return pd.DataFrame(records, columns=['X1', 'X2', 'y'])

In [21]:
def _load_split_pairs(split_dir):
    """Return the frame pairs followed by the track pairs found under ``split_dir``."""
    frame_df = load_frame_data(os.path.join(split_dir, 'frame'))
    track_df = load_track_data(os.path.join(split_dir, 'track'))
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    return pd.concat([frame_df, track_df], ignore_index=True)


def create_csv(random_state=None):
    """Build the shuffled train and test pair tables.

    Reads the ``train_data`` and ``test_data`` directories under
    ``PREPROCCESSED_DATA_DIR``; each must contain ``frame`` and ``track``
    sub-directories. Despite the name, nothing is written to disk here.

    Parameters
    ----------
    random_state : int, RandomState instance or None, optional
        Forwarded to ``sklearn.utils.shuffle``. The default ``None`` keeps
        the previous behaviour (a different shuffle on every call); pass an
        int for a reproducible row order.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, test_df)``, each with columns ``['X1', 'X2', 'y']``.
        Rows are shuffled and keep their pre-shuffle index labels.
    """
    train_df = _load_split_pairs(os.path.join(PREPROCCESSED_DATA_DIR, 'train_data'))
    test_df = _load_split_pairs(os.path.join(PREPROCCESSED_DATA_DIR, 'test_data'))

    train_df = shuffle(train_df, random_state=random_state)
    test_df = shuffle(test_df, random_state=random_state)
    return train_df, test_df

In [22]:
# Build the shuffled train/test pair tables from the directories under
# PREPROCCESSED_DATA_DIR (nothing is written to disk by this call).
train_df, test_df = create_csv()

In [23]:
# Peek at the first rows of the shuffled training pairs.
train_df.head()

Unnamed: 0,X1,X2,y
494,../datasets/body_sept/train_data/frame/001356/...,../datasets/body_sept/train_data/frame/001356/...,1
199138,../datasets/body_sept/train_data/frame/007325/...,../datasets/body_sept/train_data/frame/007325/...,1
120221,../datasets/body_sept/train_data/frame/007673/...,../datasets/body_sept/train_data/frame/007673/...,1
349392,../datasets/body_sept/train_data/track/001719/...,../datasets/body_sept/train_data/track/001719/...,0
223821,../datasets/body_sept/train_data/frame/008181/...,../datasets/body_sept/train_data/frame/008181/...,1


In [24]:
# Number of training pairs labelled 1 and labelled 0, in that order.
len(train_df[train_df.y == 1]), len(train_df[train_df.y == 0])

(236090, 222307)

In [25]:
# Peek at the first rows of the shuffled test pairs.
test_df.head()

Unnamed: 0,X1,X2,y
114599,../datasets/body_sept/test_data/track/003403/B...,../datasets/body_sept/test_data/track/003403/B...,0
29521,../datasets/body_sept/test_data/frame/017443/B...,../datasets/body_sept/test_data/frame/017443/B...,1
40710,../datasets/body_sept/test_data/frame/017047/B...,../datasets/body_sept/test_data/frame/017047/B...,1
161357,../datasets/body_sept/test_data/track/003068/B...,../datasets/body_sept/test_data/track/003068/B...,0
162363,../datasets/body_sept/test_data/track/003506/B...,../datasets/body_sept/test_data/track/003506/B...,0


In [26]:
# Number of test pairs labelled 1 and labelled 0, in that order.
len(test_df[test_df.y == 1]), len(test_df[test_df.y == 0])

(96647, 67787)

In [27]:
# Write both pair tables into OUTPUT_DIR. The (shuffled) index is written as
# the first CSV column because to_csv is called with its default arguments.
train_csv_path = os.path.join(OUTPUT_DIR, "train_pairs.csv")
test_csv_path = os.path.join(OUTPUT_DIR, "test_pairs.csv")
train_df.to_csv(train_csv_path)
test_df.to_csv(test_csv_path)

In [28]:
# Metadata is presumably provided by the star import from utils.metadata -- confirm.
# Load the metadata already stored next to the dataset so it can be updated below.
metadata_path = os.path.join(OUTPUT_DIR, "metadata")
meta = Metadata()
meta.read_data_metadata(metadata_path)

In [29]:
# Show the metadata as loaded from disk, before the csv_metadata entry is updated.
meta.metadata

{'csv_metadata': {'class_info': {'test_not_same_class': 192190,
   'test_same_class': 67787,
   'train_not_same_class': 470929,
   'train_same_class': 222307},
  'test_fn': 'test_pairs.csv',
  'test_size': 259977,
  'train_fn': 'train_pairs.csv',
  'train_size': 693236},
 'dataset_meta': {'info': {'test_frames': 'Test data frames was created by taking the frames after the first 800',
   'test_track_count': 304,
   'test_tracks': 'Test data tracks was created by taking the tracks after the first 800 frames, but each track need to be more than 25 frames long.',
   'train_frames': 'Train data frames was created by taking the first 600 frames',
   'train_track_count': 0,
   'train_tracks': 'Train data tracks was created by taking the tracks in the first 600 frames, but each track need to be more than 25 frames long.'}}}

In [30]:
# Label convention used by the loaders: y == 1 marks pairs taken from the same
# frame ("not same"), y == 0 marks pairs taken from the same track ("same").
train_labels = train_df.y
test_labels = test_df.y

class_info = {
    "train_not_same_class" : train_labels[train_labels == 1].size,
    "train_same_class" : train_labels[train_labels == 0].size,
    "test_not_same_class" : test_labels[test_labels == 1].size,
    "test_same_class" : test_labels[test_labels == 0].size,
}

# File names and sizes of the CSVs written above, plus the per-class counts.
csv_metadata = {
    "train_fn" : "train_pairs.csv",
    "test_fn" : "test_pairs.csv",
    "train_size" : len(train_df),
    "test_size" : len(test_df),
    "class_info" : class_info,
}

meta.add_metadata("csv_metadata", csv_metadata)

In [31]:
# Save the metadata to the same path it was read from above.
meta.save(os.path.join(OUTPUT_DIR, "metadata"))

In [32]:
# Show the metadata again; csv_metadata now reflects the tables built in this run.
meta.metadata

{'csv_metadata': {'class_info': {'test_not_same_class': 96647,
   'test_same_class': 67787,
   'train_not_same_class': 236090,
   'train_same_class': 222307},
  'test_fn': 'test_pairs.csv',
  'test_size': 164434,
  'train_fn': 'train_pairs.csv',
  'train_size': 458397},
 'dataset_meta': {'info': {'test_frames': 'Test data frames was created by taking the frames after the first 800',
   'test_track_count': 304,
   'test_tracks': 'Test data tracks was created by taking the tracks after the first 800 frames, but each track need to be more than 25 frames long.',
   'train_frames': 'Train data frames was created by taking the first 600 frames',
   'train_track_count': 0,
   'train_tracks': 'Train data tracks was created by taking the tracks in the first 600 frames, but each track need to be more than 25 frames long.'}}}