In [1]:
%matplotlib inline
import numpy as np
from skimage import io
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
import os
import json

import sys
sys.path.append("../")
from utils.metadata import *

# Directory that receives the generated pair CSVs and the metadata file.
OUTPUT_DIR = '../datasets/dataset1/'
# Root of the preprocessed images; create_csv() reads train_data/ and test_data/ below it.
# NOTE(review): the name is misspelled ("PREPROCCESSED") but create_csv() references it as-is.
PREPROCCESSED_DATA_DIR = '../datasets/dataset1/'

In [2]:
def get_pairs(bee_list, path, y):
    """Build every unordered pair of entries in ``bee_list`` as labelled records.

    Parameters
    ----------
    bee_list : list of str
        File names located inside ``path``.
    path : str
        Directory joined in front of every file name.
    y : object
        Label stored unchanged on every generated pair.

    Returns
    -------
    list of dict
        One ``{'X1', 'X2', 'y'}`` dict per index pair ``(i, j)`` with ``i < j``,
        ordered by ``i`` and then ``j``. Empty when ``bee_list`` has fewer
        than two entries.
    """
    # Resolve each file name once instead of re-joining inside the pair loop.
    full_paths = [os.path.join(path, name) for name in bee_list]
    return [
        {'X1': first, 'X2': second, 'y': y}
        for pos, first in enumerate(full_paths)
        for second in full_paths[pos + 1:]
    ]
            

In [3]:
def load_frame_data(path):
    """Collect all within-frame image pairs found under ``path``.

    Every entry of ``path`` is treated as a frame directory; each pair of
    files inside one frame directory becomes a row labelled ``y = 1``.

    Parameters
    ----------
    path : str
        Directory containing one sub-directory per frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``X1``, ``X2``, ``y`` with a fresh ``0..n-1`` index. Empty
        (but with the three columns) when no pairs are found.
    """
    records = []
    # os.listdir() returns entries in arbitrary order; sort both levels so
    # repeated runs produce the same rows in the same order.
    for fr in sorted(os.listdir(path)):
        fr_path = os.path.join(path, fr)
        frame_bees = sorted(os.listdir(fr_path))
        records.extend(get_pairs(frame_bees, fr_path, 1))
    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # re-copied the whole frame on every loop iteration.
    return pd.DataFrame(records, columns=['X1', 'X2', 'y'])
    

In [4]:
def load_track_data(path):
    """Collect all within-track image pairs found under ``path``.

    Every entry of ``path`` is treated as a track directory; each pair of
    files inside one track directory becomes a row labelled ``y = 0``.
    Tracks holding fewer than two files yield no pairs and are skipped.

    Parameters
    ----------
    path : str
        Directory containing one sub-directory per track.

    Returns
    -------
    pandas.DataFrame
        Columns ``X1``, ``X2``, ``y`` with a fresh ``0..n-1`` index (the
        previous implementation left duplicate index labels because it
        appended without ``ignore_index``). Empty, but with the three
        columns, when no pairs are found.
    """
    records = []
    # os.listdir() returns entries in arbitrary order; sort both levels so
    # repeated runs produce the same rows in the same order.
    for tr in sorted(os.listdir(path)):
        tr_path = os.path.join(path, tr)
        track_bees = sorted(os.listdir(tr_path))
        if len(track_bees) < 2:
            continue
        records.extend(get_pairs(track_bees, tr_path, 0))
    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # re-copied the whole frame on every loop iteration.
    return pd.DataFrame(records, columns=['X1', 'X2', 'y'])

In [5]:
def _load_split(split_dir):
    """Load the frame and track pairs of one split directory into one frame."""
    frame_df = load_frame_data(os.path.join(split_dir, 'frame'))
    track_df = load_track_data(os.path.join(split_dir, 'track'))
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    return pd.concat([frame_df, track_df], ignore_index=True)


def create_csv(random_state=None):
    """Build the shuffled train and test pair tables.

    Reads ``train_data`` and ``test_data`` below ``PREPROCCESSED_DATA_DIR``;
    each must contain a ``frame`` and a ``track`` sub-directory. Despite the
    name, nothing is written to disk here: the caller saves the frames.

    Parameters
    ----------
    random_state : int, numpy.random.RandomState or None, optional
        Forwarded to ``sklearn.utils.shuffle``. The default ``None`` keeps the
        previous unseeded behaviour; pass an int for a reproducible order.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, test_df)``, each with columns ``X1``, ``X2``, ``y`` and
        rows in shuffled order (original row labels are kept as the index).
    """
    train_df = _load_split(os.path.join(PREPROCCESSED_DATA_DIR, 'train_data'))
    test_df = _load_split(os.path.join(PREPROCCESSED_DATA_DIR, 'test_data'))

    train_df = shuffle(train_df, random_state=random_state)
    test_df = shuffle(test_df, random_state=random_state)
    return train_df, test_df

In [6]:
# Build the shuffled train/test pair tables from the preprocessed image folders.
train_df, test_df = create_csv()

In [7]:
# Preview the first rows of the shuffled training pairs.
train_df.head()

Unnamed: 0,X1,X2,y
39643,../datasets/dataset1/train_data/track/0133/bee...,../datasets/dataset1/train_data/track/0133/bee...,0
24899,../datasets/dataset1/train_data/track/0055/bee...,../datasets/dataset1/train_data/track/0055/bee...,0
34874,../datasets/dataset1/train_data/track/0053/bee...,../datasets/dataset1/train_data/track/0053/bee...,0
30967,../datasets/dataset1/train_data/track/0006/bee...,../datasets/dataset1/train_data/track/0006/bee...,0
13982,../datasets/dataset1/train_data/frame/0025/bee...,../datasets/dataset1/train_data/frame/0025/bee...,1


In [8]:
# Training-set row counts as (y == 1, y == 0); int() keeps plain Python ints.
int((train_df.y == 1).sum()), int((train_df.y == 0).sum())

(21460, 25447)

In [9]:
# Preview the first rows of the shuffled test pairs.
test_df.head()

Unnamed: 0,X1,X2,y
16580,../datasets/dataset1/test_data/track/0290/bee0...,../datasets/dataset1/test_data/track/0290/bee0...,0
20779,../datasets/dataset1/test_data/track/0238/bee0...,../datasets/dataset1/test_data/track/0238/bee0...,0
20238,../datasets/dataset1/test_data/track/0294/bee0...,../datasets/dataset1/test_data/track/0294/bee0...,0
6680,../datasets/dataset1/test_data/frame/0866/bee0...,../datasets/dataset1/test_data/frame/0866/bee0...,1
9825,../datasets/dataset1/test_data/frame/0853/bee0...,../datasets/dataset1/test_data/frame/0853/bee0...,1


In [10]:
# Test-set row counts as (y == 1, y == 0); int() keeps plain Python ints.
int((test_df.y == 1).sum()), int((test_df.y == 0).sum())

(12737, 12042)

In [11]:
# Persist both pair tables into OUTPUT_DIR. to_csv() is called with its defaults, so the
# shuffled row index is written as an unnamed first column; readers may want index_col=0.
train_df.to_csv(os.path.join(OUTPUT_DIR, "train_pairs.csv"))
test_df.to_csv(os.path.join(OUTPUT_DIR, "test_pairs.csv"))

In [15]:
# Metadata comes from the star import of utils.metadata.
# Presumably this loads the metadata file already stored in OUTPUT_DIR so the CSV
# details can be added to it; confirm against utils.metadata.
meta = Metadata()
meta.read_data_metadata(os.path.join(OUTPUT_DIR, "metadata"))

In [17]:
# Show the metadata as loaded, before the CSV section is added.
meta.metadata

{'dataset_meta': {'info': {'test_frames': 'Test data frames was created by taking the frames after the first 800',
   'test_track_count': 0,
   'test_tracks': 'Test data tracks was created by taking the tracks after the first 800 frames, but each track need to be more than 25 frames long.',
   'train_frames': 'Train data frames was created by taking the first 600 frames',
   'train_track_count': 0,
   'train_tracks': 'Train data tracks was created by taking the tracks in the first 600 frames, but each track need to be more than 25 frames long.'}},
 'raw_data_meta': {'folder': 'raw_data/dataset1/',
  'format': 'bee{id}--{frame}.jpg',
  'frames_amount': 1000,
  'id_amount': 304,
  'images_amount': 9171,
  'images_dim': [230, 105, 3],
  'title': 'Dataset 1 Metadata file'}}

In [18]:
# Record the generated CSV file names, their row counts and the per-label counts
# under the "csv_metadata" key.
meta.add_metadata("csv_metadata", {
    "train_fn" : "train_pairs.csv",
    "test_fn" : "test_pairs.csv",
    "train_size" :  len(train_df),
    "test_size" : len(test_df),
    "class_info" : {
        # y == 1 rows come from load_frame_data (pairs within one frame) and are
        # counted as "not_same_class"; y == 0 rows come from load_track_data
        # (pairs within one track) and are counted as "same_class".
        "train_not_same_class" : train_df.y[train_df.y==1].size,
        "train_same_class" : train_df.y[train_df.y==0].size,
        "test_not_same_class" : test_df.y[test_df.y==1].size,
        "test_same_class" : test_df.y[test_df.y==0].size,
    }
})

In [19]:
# Save the metadata to the same path it was read from, now including the
# "csv_metadata" section added to it.
meta.save(os.path.join(OUTPUT_DIR, "metadata"))

In [20]:
# Show the metadata again to confirm the "csv_metadata" section is present.
meta.metadata

{'csv_metadata': {'class_info': {'test_not_same_class': 12737,
   'test_same_class': 12042,
   'train_not_same_class': 21460,
   'train_same_class': 25447},
  'test_fn': 'test_pairs.csv',
  'test_size': 24779,
  'train_fn': 'train_pairs.csv',
  'train_size': 46907},
 'dataset_meta': {'info': {'test_frames': 'Test data frames was created by taking the frames after the first 800',
   'test_track_count': 0,
   'test_tracks': 'Test data tracks was created by taking the tracks after the first 800 frames, but each track need to be more than 25 frames long.',
   'train_frames': 'Train data frames was created by taking the first 600 frames',
   'train_track_count': 0,
   'train_tracks': 'Train data tracks was created by taking the tracks in the first 600 frames, but each track need to be more than 25 frames long.'}},
 'raw_data_meta': {'folder': 'raw_data/dataset1/',
  'format': 'bee{id}--{frame}.jpg',
  'frames_amount': 1000,
  'id_amount': 304,
  'images_amount': 9171,
  'images_dim': [230, 