In [3]:
import matplotlib.pyplot as plt
from matplotlib import gridspec
import numpy as np
import torch
import os
import pandas as pd
import pickle
import seaborn as sns
import yaml
from daart.data import DataGenerator, compute_sequence_pad
from daart.eval import get_precision_recall, run_lengths
from daart.io import get_expt_dir
from daart.transforms import ZScore

from daart_utils.data import DataHandler
from daart_utils.models import compute_model_predictions, get_default_hparams
#from daart_utils.paths import data_path, results_path
from daart_utils.plotting import plot_heatmaps
import ssm
from ssm.util import random_rotation, find_permutation

In [4]:

# Directory containing the per-video label CSVs; the cells below read and
# write files named '<video>_labels.csv' inside it.
# NOTE(review): absolute, machine-specific location - adjust before running elsewhere.
path = '/home/bsb2144/daart_utils/data/mouse-oft-aligned/labels-hand'

In [5]:
# Load the label file of a single video and preview its first five rows.
example_csv = path + '/OFT_39_labels.csv'
data = pd.read_csv(example_csv, index_col=0)

display(data.head(5))

Unnamed: 0,Background,Supported,Unsupported,Grooming,Other
0,0,0,0,0,1
1,0,0,0,0,1
2,0,0,0,0,1
3,0,0,0,0,1
4,0,0,0,0,1


In [16]:
# Videos whose label files are rewritten by the loop below.
vids = [
    'OFT_5',
    'OFT_6',
    'OFT_11',
    'OFT_12',
    'OFT_14',
    'OFT_15',
    'OFT_16',
    'OFT_23',
    'OFT_24',
    'OFT_38',
    'OFT_39',
    'OFT_41',
    'OFT_43',
    'OFT_44',
    'OFT_49',
    'OFT_50',
    'OFT_51',
    'OFT_52',
    'OFT_54',
    'OFT_58',
]

# Migration of the label files: copy the 'Other' column into 'Background',
# drop 'Other', and overwrite the CSV in place.
# The guard makes the cell safe to re-run: a file that was already converted no
# longer has an 'Other' column, and without the check the second run raised
# KeyError on data['Other'].
for vid in vids:
    label_file = path + '/{}_labels.csv'.format(vid)
    data = pd.read_csv(label_file, index_col=0)
    if 'Other' not in data.columns:
        # already converted by a previous run; leave the file untouched
        continue
    data['Background'] = data['Other']
    data = data.drop('Other', axis=1)
    data.to_csv(label_file)
    
    

In [35]:
# Videos whose label files make up the training set.
train_vids = [
    'OFT_5',
    'OFT_6',
    'OFT_11',
    'OFT_12',
    'OFT_14',
    'OFT_15',
    'OFT_16',
    'OFT_23',
    'OFT_24',
    'OFT_38',
]

# Read every training video's label CSV; the resulting list is index-aligned
# with `train_vids`.
train_dfs = [
    pd.read_csv('{}/{}_labels.csv'.format(path, vid_name), index_col=0)
    for vid_name in train_vids
]

In [36]:
def get_labels_start_end_time(array, bg_class=[0], use=1, chunk_size=50):
    """Locate runs of the label `use` and shrink each run end by `chunk_size`.

    On a copy of `array`, every entry that differs from `use` is set to 0.
    The copy is then scanned for runs of consecutive equal values that are not
    in `bg_class`. Each run's exclusive end index is moved back by
    `chunk_size`, and runs whose remaining length is below 1 are discarded.

    Parameters
    ----------
    array : np.ndarray
        1-D array of per-row integer labels. It is not modified.
    bg_class : list
        Values treated as background; runs of these values are not reported.
        The list is only read, never mutated, so the shared default is safe.
    use : int
        Label value whose runs are extracted.
    chunk_size : int
        Amount subtracted from the end index of every run.

    Returns
    -------
    tuple
        `(labels, starts, ends, lens)`, four lists of equal length with one
        entry per surviving run: the run's label value, its first index, its
        end index after subtracting `chunk_size`, and `end - start`.
        An empty `array` yields four empty lists.

    Notes
    -----
    NOTE(review): a run of exactly `chunk_size` rows ends up with length 0 and
    is discarded, although one chunk of that size would fit inside it - confirm
    that this margin is intended before changing it.
    """
    # Nothing to scan; the original indexed element 0 here and raised IndexError.
    if len(array) == 0:
        return ([], [], [], [])

    # Work on a copy so the caller's array is left untouched.
    frame_wise_labels = array.copy()
    frame_wise_labels[array != use] = 0

    labels, starts, ends = [], [], []

    last_label = frame_wise_labels[0]
    if last_label not in bg_class:
        # the array opens inside a run
        labels.append(last_label)
        starts.append(0)

    for i, label in enumerate(frame_wise_labels):
        if label == last_label:
            continue
        if label not in bg_class:
            # a new run begins at i
            labels.append(label)
            starts.append(i)
        if last_label not in bg_class:
            # the previous run ended just before i (exclusive end)
            ends.append(i)
        last_label = label

    if last_label not in bg_class:
        # the array closes inside a run; its exclusive end is the array length
        ends.append(len(frame_wise_labels))

    # Pull every end back by chunk_size, then keep only runs that still have
    # a length of at least 1.
    ends = [e - chunk_size for e in ends]
    keep = [k for k, (s, e) in enumerate(zip(starts, ends)) if e - s >= 1]

    labels = [labels[k] for k in keep]
    starts = [starts[k] for k in keep]
    ends = [ends[k] for k in keep]
    lens = [e - s for s, e in zip(starts, ends)]
    return (labels, starts, ends, lens)

In [39]:
import random 

# Builds sparsely-labeled copies of the training label files: for every
# (goal, version) pair, chunks of `chunk_size` consecutive rows are sampled per
# class until roughly `goal` rows of that class are labeled, every other row is
# set to Background, and the result is written to a new CSV per video.

# number of consecutive rows labeled by each sampled chunk
chunk_size = 50
# output column order; a name's position in this list is its integer class id
names = ['Background','Supported','Unsupported','Grooming']

# main loop
#goals = [50, 100, 250, 500, 1000]
# target number of labeled rows per class
goals = [2000, 3000, 4000, 5000]
# version tags and the starting seed paired with each of them (zipped below)
vs = ['v1', 'v2', 'v3', 'v4', 'v5']
seeds = [1, 1001, 2001, 3001, 4001]

#v = 'v1'
#goal = 500

for goal in goals:
    print('GOAL: ', goal)
    for v, seed in zip(vs, seeds):
        print('VERSION: ', v)
        # rebuilt for every (goal, version) because it is zeroed in place while sampling
        labels_int = []
        for df in train_dfs:
            # NOTE(review): `temp` is never read afterwards
            temp = np.zeros(df.shape[0])
            
            # convert from 1-hot vector: column name holding the row maximum
            inds = list(df.idxmax(axis=1))
            
            # convert each column name to its integer position in `names`
            np_inds = np.array([names.index(i) for i in inds])

            labels_int.append(np_inds)

        # classes to sample; classes[c] corresponds to class id c+1 in `names`
        classes = ['Supported', 'Unsupported', 'Grooming']
        
        # keeps track of the number of labeled rows per class
        num_labeled = np.zeros(len(classes))

        random.seed(seed)

        # one all-zero (all Background) array per training video to receive the sampled labels
        new_labels = [np.zeros_like(a) for a in labels_int]


        # iterate through classes
        for c, cl in enumerate(classes):
            print('cl', cl)
            # counts attempts that found no valid start index; it is reset per
            # class but never after a success, so the class is abandoned once
            # more than 100 such attempts have accumulated (it may end below goal)
            tries = 0
            # keep adding labeled chunks until we reach goal
            while num_labeled[c] < goal:
                # the generator is re-seeded before every draw with a counter
                # that is incremented after each draw
                random.seed(seed)

                # choose a video and update seed
                vid = random.randint(0, len(train_vids)-1)
                seed += 1
                
                # remaining (not yet sampled) ground-truth class ids of that video
                arr = labels_int[vid]

                # find runs of class c+1; `ends` come back already reduced by chunk_size
                labels, starts, ends, lens = get_labels_start_end_time(np.array(arr), bg_class=[0], use=(c+1),chunk_size=chunk_size)

                # create list of all valid start inds
                start_inds = []
                for s,e in zip(starts, ends):
                    start_inds += list(range(s,e))

                # pick a start index
                random.seed(seed)
                if len(start_inds) < 1:
                    # this video has no usable run for the class; try another draw
                    tries += 1
                    if tries > 100:
                        break
                    continue
                si_ind = random.randint(0, len(start_inds)-1)
                seed += 1
                
                # chosen start index
                si = start_inds[si_ind]

                # update new labels array
                new_labels[vid][si:si+chunk_size] = (c+1)

                # zero the chosen rows in the source labels so they cannot be sampled again
                labels_int[vid][si:si+chunk_size] = 0

                # update num labeled
                num_labeled[c] += chunk_size
   
            print('num', num_labeled[c])

        # convert each sampled label array back to a one-hot DataFrame
        new_dfs = []
        for arr in new_labels:
            lab_np = np.zeros((len(arr), len(names)))
            # set the column given by the class id; id 0 lands in the 'Background' column
            for ind, l in enumerate(arr):
                lab_np[ind, l] = 1
            lab_df = pd.DataFrame(lab_np, dtype=int)
            new_dfs.append(lab_df)

        # save labels
        # file name: <video> + '_' + <goal> + '_' + <version> + '_labels.csv'
        # i.e. OFT_5_2000_v1_labels.csv
        for d, df in enumerate(new_dfs):
            lab_path = path + '/' + train_vids[d] + '_' + str(goal) + '_' + v + '_labels.csv'
            df.columns = names
            df.to_csv(lab_path)
        

GOAL:  2000
VERSION:  v1
cl Supported
num 2000.0
cl Unsupported
num 2000.0
cl Grooming
num 2000.0
VERSION:  v2
cl Supported
num 2000.0
cl Unsupported
num 2000.0
cl Grooming
num 1950.0
VERSION:  v3
cl Supported
num 2000.0
cl Unsupported
num 2000.0
cl Grooming
num 2000.0
VERSION:  v4
cl Supported
num 2000.0
cl Unsupported
num 2000.0
cl Grooming
num 1900.0
VERSION:  v5
cl Supported
num 2000.0
cl Unsupported
num 2000.0
cl Grooming
num 1950.0
GOAL:  3000
VERSION:  v1
cl Supported
num 3000.0
cl Unsupported
num 3000.0
cl Grooming
num 1900.0
VERSION:  v2
cl Supported
num 3000.0
cl Unsupported
num 3000.0
cl Grooming
num 2150.0
VERSION:  v3
cl Supported
num 3000.0
cl Unsupported
num 3000.0
cl Grooming
num 2000.0
VERSION:  v4
cl Supported
num 3000.0
cl Unsupported
num 3000.0
cl Grooming
num 1900.0
VERSION:  v5
cl Supported
num 3000.0
cl Unsupported
num 3000.0
cl Grooming
num 1950.0
GOAL:  4000
VERSION:  v1
cl Supported
num 4000.0
cl Unsupported
num 4000.0
cl Grooming
num 1950.0
VERSION:  v2
cl Su