In [1]:
import numpy as np
import pandas as pd
import re
import logging
import click
import os
import sklearn
import random 
import pickle 

from sklearn.model_selection import StratifiedGroupKFold
from tqdm import tqdm
from dotenv import find_dotenv, load_dotenv
from mouse_facial_expressions.paths import *

# Load the variables declared in the .env file located by find_dotenv()
# into the process environment before any of them are read.
load_dotenv(find_dotenv())

# Absolute path of the parent of the current working directory.
project_dir = Path('..').resolve()

# Folder that holds the task1 dataset pickle and the per-fold sample files.
task1_path = Path(os.environ['MFE_TASKS']).joinpath('task1')



In [2]:
task1_path = Path(os.environ['MFE_TASKS']) / 'task1'
df = pd.read_pickle(task1_path / 'dataset_df.pkl')

# One row per saved fold file, ordered by the fold number in its file name.
folds = list(task1_path.glob('fold*.pkl'))
fold_df = pd.DataFrame({'foldpath': folds})
# Take the first run of digits in the file name. The previous pattern
# '.*(\d+)' was greedy and kept only the last digit, so 'fold10.pkl' was
# read as fold 0.
fold_df['fold_index'] = fold_df.foldpath.apply(lambda x: int(re.search(r'\d+', x.parts[-1]).group()))
fold_df = fold_df.sort_values('fold_index')

In [3]:
df = pd.read_pickle(task1_path / 'dataset_df.pkl')

# Read the saved samples of fold 1; the pickle holds a dict with 'train' and
# 'test' entries.
with (task1_path / 'fold1.pkl').open('rb') as fp:
    fold_data = pickle.load(fp)

train, test = fold_data['train'], fold_data['test']

# Dataset rows referenced by the first training sample.
df.loc[train[0]['indices']]

Unnamed: 0,mouse,date_of_birth,treatment,injection_time,notes,image,video,recording,camera,year,...,hour,minutes,seconds,animal,start,end,discard,Notes,video_time,label
47655,f13,25 October 22,low,12:14,,f13_rec1_preinjection/frame03760.png,f13_rec1_preinjection,1,Basler_acA1920-40um,2023,...,11,59,29,f13,,,,,11:59,0.0
47646,f13,25 October 22,low,12:14,,f13_rec1_preinjection/frame05710.png,f13_rec1_preinjection,1,Basler_acA1920-40um,2023,...,11,59,29,f13,,,,,11:59,0.0
47598,f13,25 October 22,low,12:14,,f13_rec1_preinjection/frame04790.png,f13_rec1_preinjection,1,Basler_acA1920-40um,2023,...,11,59,29,f13,,,,,11:59,0.0
47606,f13,25 October 22,low,12:14,,f13_rec1_preinjection/frame02760.png,f13_rec1_preinjection,1,Basler_acA1920-40um,2023,...,11,59,29,f13,,,,,11:59,0.0
47618,f13,25 October 22,low,12:14,,f13_rec1_preinjection/frame02550.png,f13_rec1_preinjection,1,Basler_acA1920-40um,2023,...,11,59,29,f13,,,,,11:59,0.0


In [4]:

import pickle
import pandas as pd
import os
import re
import torch

from torch.utils.data import Dataset
from pathlib import Path
from mouse_facial_expressions.paths import get_extracted_frames_folder
from skimage.io import imread
import torchvision

class Task1FoldDataset(Dataset):
    """Dataset of framesets (several frames sharing one label) for one fold split.

    Every element of ``samples`` is a mapping with an ``'indices'`` entry (row
    labels into ``df``) and a ``'label'`` entry. ``df`` needs an ``image``
    column whose values are joined onto the extracted-frames folder to locate
    each frame on disk.
    """

    def __init__(self, samples, df, transform=None):
        """Store the samples and frame table.

        Args:
            samples: sequence of ``{'indices': ..., 'label': ...}`` mappings.
            df: frame table indexed by the values used in ``'indices'``.
            transform: optional callable applied to every loaded image. When
                omitted, images are only converted to tensors.
        """
        super().__init__()
        self.samples = samples
        self.df = df
        self.transform = transform
        self.frame_dir = Path(get_extracted_frames_folder())

    def __len__(self):
        """Return the number of framesets."""
        return len(self.samples)

    def _apply_transform(self, image):
        """Run ``image`` through the transform, or just tensorize it if none is set."""
        if self.transform is None:
            # Previously a missing transform raised TypeError ('NoneType' is not
            # callable). A tensor is still required so the frames can be stacked.
            # Assumes the loaded image is array-like (presumably a NumPy array
            # from imread) -- TODO confirm.
            return torch.as_tensor(image)
        return self.transform(image)

    def get_image(self, imagepath):
        """Load the frame at ``imagepath`` (relative to the frame folder) and transform it."""
        return self._apply_transform(imread(self.frame_dir / imagepath))

    def __getitem__(self, idx):
        """Return ``(images, label)`` for sample ``idx``.

        ``images`` stacks the transformed frames of the sample along a new
        leading dimension, in the order given by the sample's indices.
        """
        sample = self.samples[idx]
        frame_paths = self.df.loc[sample['indices']].image
        images = torch.stack([self.get_image(imagepath) for imagepath in frame_paths])
        return images, sample['label']

class Task1Folds:
    """Access the saved task1 folds as ``(train_dataset, test_dataset)`` pairs.

    Reads ``dataset_df.pkl`` and discovers the ``fold*.pkl`` files in
    ``$MFE_TASKS/task1``. Indexing with a fold number loads that fold's samples.
    """

    def __init__(self):
        """Build the transforms, load the frame table and index the fold files."""
        # Training frames get random augmentation; test frames are only
        # converted to tensors.
        self.train_transform = torchvision.transforms.Compose([
            torchvision.transforms.ToPILImage(),
            torchvision.transforms.TrivialAugmentWide(),
            torchvision.transforms.ToTensor()
        ])

        self.test_transform = torchvision.transforms.Compose([
            torchvision.transforms.ToTensor(),
        ])

        task1_path = Path(os.environ['MFE_TASKS']) / 'task1'
        self.df = pd.read_pickle(task1_path / 'dataset_df.pkl')

        folds = list(task1_path.glob('fold*.pkl'))
        fold_df = pd.DataFrame({'foldpath': folds})
        fold_df['fold_index'] = fold_df.foldpath.apply(self._fold_index)
        fold_df = fold_df.sort_values('fold_index')
        fold_df = fold_df.set_index('fold_index')
        self.fold_df = fold_df

    @staticmethod
    def _fold_index(foldpath):
        """Return the fold number found in a fold file name (first run of digits).

        The earlier pattern '.*(\\d+)' was greedy and captured only the last
        digit, so 'fold10.pkl' was indexed as fold 0.
        """
        return int(re.search(r'\d+', Path(foldpath).name).group())

    def __len__(self):
        """Return the number of fold files found."""
        return len(self.fold_df)

    def __getitem__(self, idx):
        """Return ``(train_dataset, test_dataset)`` for fold number ``idx``.

        ``idx`` is looked up by fold number (the index label parsed from the
        file name), not by position; an unknown fold number raises ``KeyError``.
        """
        row = self.fold_df.loc[idx]
        # Fold files are pickles; only load files produced by this project,
        # since unpickling untrusted data can execute arbitrary code.
        with open(row.foldpath, 'rb') as fp:
            fold_data = pickle.load(fp)

        train_dataset = Task1FoldDataset(
            samples=fold_data['train'], df=self.df, transform=self.train_transform)
        test_dataset = Task1FoldDataset(
            samples=fold_data['test'], df=self.df, transform=self.test_transform)
        return train_dataset, test_dataset

# Quick manual check of the classes defined above.
temp = Task1Folds()

# Datasets for fold number 0. This rebinds `train`/`test`, which an earlier
# cell set to the raw sample lists read from a fold pickle.
train, test = temp[0]
# First training item: an (images, label) tuple.
train[0]


(tensor([[[[0.1686, 0.1686, 0.1647,  ..., 0.2863, 0.2863, 0.2980],
           [0.1686, 0.1647, 0.1647,  ..., 0.2863, 0.2980, 0.3059],
           [0.1725, 0.1686, 0.1647,  ..., 0.2902, 0.3020, 0.3059],
           ...,
           [0.4627, 0.4627, 0.4627,  ..., 0.1412, 0.1373, 0.1255],
           [0.4627, 0.4627, 0.4627,  ..., 0.1412, 0.1373, 0.1373],
           [0.4627, 0.4627, 0.4627,  ..., 0.1451, 0.1412, 0.1451]],
 
          [[0.1686, 0.1686, 0.1647,  ..., 0.2863, 0.2863, 0.2980],
           [0.1686, 0.1647, 0.1647,  ..., 0.2863, 0.2980, 0.3059],
           [0.1725, 0.1686, 0.1647,  ..., 0.2902, 0.3020, 0.3059],
           ...,
           [0.4627, 0.4627, 0.4627,  ..., 0.1412, 0.1373, 0.1255],
           [0.4627, 0.4627, 0.4627,  ..., 0.1412, 0.1373, 0.1373],
           [0.4627, 0.4627, 0.4627,  ..., 0.1451, 0.1412, 0.1451]],
 
          [[0.1686, 0.1686, 0.1647,  ..., 0.2863, 0.2863, 0.2980],
           [0.1686, 0.1647, 0.1647,  ..., 0.2863, 0.2980, 0.3059],
           [0.1725, 0.16

In [5]:
# ls {Path(get_extracted_frames_folder()) / 'f10_rec0_acclimation/'}

In [6]:
processed_videos_folder = Path(get_processed_video_folder())
dlc_facial_labels_folder = Path(get_dlc_facial_labels_folder())
extracted_frames_folder = Path(get_extracted_frames_folder())

# The inputs must already exist; the frame output folder is created on demand.
assert processed_videos_folder.exists()
assert dlc_facial_labels_folder.exists()
extracted_frames_folder.mkdir(parents=True, exist_ok=True)

# Pair every DLC label file with the processed video it was derived from.
dlc_files = list(dlc_facial_labels_folder.glob("*.h5"))
data = []
for dlc_file in tqdm(dlc_files, leave=False):
    # The video name is everything before the first "DLC" marker. Splitting
    # only once keeps this working when "DLC" appears again later in the name.
    fname, marker, _ = dlc_file.parts[-1].partition("DLC")
    if not marker:
        raise ValueError(f"Expected 'DLC' in label file name: {dlc_file.parts[-1]}")
    video = processed_videos_folder / f"{fname}.mp4"
    data.append(dict(video=video, dlc_file=dlc_file, fname=fname))
# Explicit columns keep `files_df.fname` valid even when no label files exist.
files_df = pd.DataFrame(data, columns=['video', 'dlc_file', 'fname'])

# Number of distinct videos that have DLC labels.
len(files_df.fname.unique())

                                       

288

In [7]:
kfold_splits = 10
frameset_size = 5
train_size = 50000
test_size = 1000
seed = 13641

log_fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
logging.basicConfig(level=logging.INFO, format=log_fmt)

logger = logging.getLogger(__name__)

# Only the placeholders that appear in the template are passed to format().
task_description = """
Task1 classification task using sets of {frameset_size} frames

Classes:
- 0: All animals at preinjection
- 1: LPS High dose at 4 hours

Split over {kfold_splits} stratified kfolds grouped by mouse.
Training and test sets are sampled from the training and test frames.
""".format(
    frameset_size=frameset_size,
    kfold_splits=kfold_splits
)
logger.info(task_description)

logger.info('Seeding %i', seed)
np.random.seed(seed)
random.seed(seed)

logger.info('Loading treatment csv')
treatments_df = pd.read_csv(project_dir / 'data/raw/treatments_20230627.csv')
frames_dir = Path(get_extracted_frames_folder())

logger.info('Loading frame paths')
# Frames are stored as <frames_dir>/<video>/<frame>.png. The video folder name
# starts with the mouse id (m<N> or f<N>) and contains rec<N>.
frames_folder_df = pd.DataFrame(dict(image=list(frames_dir.glob('*/*.png'))))
frames_folder_df['video'] = frames_folder_df.image.apply(lambda x: x.parts[-2])
# Raw strings: '\d' in a plain string literal is an invalid escape sequence.
frames_folder_df['mouse'] = frames_folder_df.video.apply(lambda x: re.match(r'([mf]\d+)', x).group(1))
frames_folder_df['recording'] = frames_folder_df.video.apply(lambda x: int(re.match(r'.*rec(\d+)', x).group(1)))
# Keep only the "<video>/<frame>.png" tail so paths are relative to frames_dir.
frames_folder_df['image'] = frames_folder_df.image.apply(lambda x: Path(*x.parts[-2:]))

logger.info('Loading raw video details')
raw_videos_df = pd.read_csv(project_dir / 'data/raw/raw_videos_20230627.csv')
# Missing recording numbers become -1 so the column can be cast to int.
raw_videos_df['recording'] = raw_videos_df.recording.fillna(-1).astype(int)
raw_videos_df['video_time'] = raw_videos_df.apply(lambda x: f"{x.hour:02}:{x.minutes:02}", axis=1)
raw_videos_df['mouse'] = raw_videos_df.animal

logger.info('Merging frames')
combined_df = frames_folder_df.merge(treatments_df, how='left', on='mouse')
combined_df = combined_df.merge(raw_videos_df, how='left', on=['mouse', 'recording'])
# The merge above yields one row per matching raw-video entry, so a frame is
# repeated whenever several raw-video rows share its (mouse, recording).
# Presumably these are recordings listed more than once, e.g. re-recordings
# -- TODO confirm. Keep the first row per image.
combined_df = combined_df.drop_duplicates('image')

# Number of videos whose frame count differs from 200.
s = combined_df.groupby('video')['image'].count()
len(s[s != 200])


2023-07-13 00:45:55,274 - __main__ - INFO - 
Task1 classification task using sets of 5 frames

Classes:
- 0: All animals at preinjection
- 1: LPS High dose at 4 hours

Split over 10 stratified kfolds grouped by mouse.
Training and test sets are sampled from the training and test frames.

2023-07-13 00:45:55,275 - __main__ - INFO - Seeding 13641
2023-07-13 00:45:55,275 - __main__ - INFO - Loading treatment csv
2023-07-13 00:45:55,277 - __main__ - INFO - Loading frame paths
2023-07-13 00:45:55,795 - __main__ - INFO - Loading raw video details
2023-07-13 00:45:55,801 - __main__ - INFO - Merging frames


3

In [8]:
# Rows of one specific recording, to count how many distinct images it has.
most_frames = combined_df.loc[combined_df['video'] == 'f11_rec3_2h-postinjection']

most_frames['image'].unique().size


200

In [9]:
# Frame count per video, then only the videos that do not have exactly 200.
s = combined_df.groupby('video')['image'].count()
s[s.ne(200)]

video
f14_rec2_1h-postinjection     26
f16_rec2_1h-postinjection    164
f2_rec3_2h-postinjection     184
Name: image, dtype: int64

In [10]:
kfold_splits = 10
frameset_size = 5
train_size = 50000
test_size = 1000
seed = 13641

log_fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
logging.basicConfig(level=logging.INFO, format=log_fmt)

logger = logging.getLogger(__name__)

output_path = Path(os.environ['MFE_TASKS']) / 'task1'
# The README, dataset pickle and fold files are written directly into this
# folder, so make sure it exists before the first write.
output_path.mkdir(parents=True, exist_ok=True)

# NOTE(review): class 0 as assigned further down also covers saline animals at
# the 4h recording, which this description does not mention -- confirm wording.
task_description = """
Task1 classification task using sets of {frameset_size} frames

Classes:
- 0: All animals at preinjection
- 1: LPS High dose at 4 hours

Split over {kfold_splits} stratified kfolds grouped by mouse.
Training and test sets are sampled from the training and test frames.
""".format(
    frameset_size=frameset_size,
    kfold_splits=kfold_splits
)
logger.info(task_description)
with open(output_path / 'README.txt', 'w') as fp:
    fp.write(task_description)

logger.info('Seeding %i', seed)
np.random.seed(seed)
random.seed(seed)

logger.info('Loading treatment csv')
treatments_df = pd.read_csv(project_dir / 'data/raw/treatments_20230627.csv')
frames_dir = Path(get_extracted_frames_folder())

logger.info('Loading frame paths')
# Frames are stored as <frames_dir>/<video>/<frame>.png. The video folder name
# starts with the mouse id (m<N> or f<N>) and contains rec<N>.
frames_folder_df = pd.DataFrame(dict(image=list(frames_dir.glob('*/*.png'))))
frames_folder_df['video'] = frames_folder_df.image.apply(lambda x: x.parts[-2])
# Raw strings: '\d' in a plain string literal is an invalid escape sequence.
frames_folder_df['mouse'] = frames_folder_df.video.apply(lambda x: re.match(r'([mf]\d+)', x).group(1))
frames_folder_df['recording'] = frames_folder_df.video.apply(lambda x: int(re.match(r'.*rec(\d+)', x).group(1)))
# Keep only the "<video>/<frame>.png" tail so paths are relative to frames_dir.
frames_folder_df['image'] = frames_folder_df.image.apply(lambda x: Path(*x.parts[-2:]))

logger.info('Loading raw video details')
raw_videos_df = pd.read_csv(project_dir / 'data/raw/raw_videos_20230627.csv')
# Missing recording numbers become -1 so the column can be cast to int.
raw_videos_df['recording'] = raw_videos_df.recording.fillna(-1).astype(int)
raw_videos_df['video_time'] = raw_videos_df.apply(lambda x: f"{x.hour:02}:{x.minutes:02}", axis=1)
raw_videos_df['mouse'] = raw_videos_df.animal

logger.info('Merging frames')
combined_df = treatments_df.merge(frames_folder_df, how='left', on='mouse')
# NOTE(review): this merge emits one row per matching raw-video entry, so a
# frame can appear more than once when several raw-video rows share its
# (mouse, recording). Duplicates are kept here -- confirm this is intended.
combined_df = combined_df.merge(raw_videos_df, how='left', on=['mouse', 'recording'])

# The filter keeps 'high' and 'saline' treatments; the message used to say
# "high and low dose", which did not match the code.
logger.info('Filtering to include only high dose and saline animals')
combined_df = combined_df[combined_df.treatment.isin(['high', 'saline'])]
# Copy so the label column below is added to an independent frame rather than
# to a slice of the merged table.
combined_df = combined_df[combined_df.recording.isin([1, 4])].copy()

# Control mouse that showed sickness/pain behavior before experiment start
# combined_df = combined_df[combined_df.mouse != 'm18'] 

logger.info('Assigning labels (1: high dose at 4h, 0: everything else)')
# Label everything a 1
combined_df['label'] = np.ones(shape=combined_df.shape[0], dtype=int)

# Label control situations
combined_df.loc[combined_df.recording == 1, 'label'] = 0
combined_df.loc[combined_df.treatment == 'saline', 'label'] = 0

logger.info('Saving the dataset')
combined_df.to_pickle(output_path / 'dataset_df.pkl')


def _sample_framesets(frames_df, n_samples, frameset_size):
    """Draw ``n_samples`` framesets, each from one randomly chosen video.

    Args:
        frames_df: frame table with ``video`` and ``label`` columns.
        n_samples: number of framesets to draw.
        frameset_size: number of row labels per frameset. They are drawn with
            the ``np.random.choice`` defaults, i.e. with replacement, so a
            frameset may repeat a frame.

    Returns:
        A list of dicts with ``indices`` (array of row labels of ``frames_df``)
        and ``label`` (label of the first row of the chosen video).
    """
    videos = frames_df.video.unique()
    # Slice the table once per video up front; filtering inside the sampling
    # loop rescanned every row for each of the n_samples draws.
    per_video = {}
    for video in videos:
        video_df = frames_df[frames_df.video == video]
        per_video[video] = (video_df.index, video_df.iloc[0].label)

    samples = []
    for _ in tqdm(list(range(n_samples)), leave=False):
        # Draw order (random.choice, then np.random.choice) is unchanged, so
        # the seeded output matches the per-sample filtering version.
        frame_index, label = per_video[random.choice(videos)]
        indices = np.random.choice(frame_index, size=frameset_size)
        samples.append(dict(indices=indices, label=label))
    return samples


logger.info('Creating %i stratified kfold splits, grouped by mouse', kfold_splits)
cv = StratifiedGroupKFold(kfold_splits)
splits = list(cv.split(combined_df.index, groups=combined_df.mouse, y=combined_df.label))
for fold, split in enumerate(splits): 
    # The splitter returns positional indices into combined_df.
    train, test = split
    train_df = combined_df.iloc[train]
    test_df = combined_df.iloc[test]
    logger.info('\nSplit %i\n - train mice: %s\n - test mice: %s', 
                fold, 
                ', '.join(train_df.mouse.unique()),
                ', '.join(test_df.mouse.unique()))
    
    logger.info('Fetching train samples')
    train_samples = _sample_framesets(train_df, train_size, frameset_size)
        
    logger.info('Fetching test samples')
    test_samples = _sample_framesets(test_df, test_size, frameset_size)
    
    logger.info('Saving samples')
    data = dict(train=train_samples, test=test_samples)
    with open(output_path / f'fold{fold}.pkl', 'wb') as fp:
        pickle.dump(data, fp)
        

2023-07-13 00:45:55,992 - __main__ - INFO - 
Task1 classification task using sets of 5 frames

Classes:
- 0: All animals at preinjection
- 1: LPS High dose at 4 hours

Split over 10 stratified kfolds grouped by mouse.
Training and test sets are sampled from the training and test frames.

2023-07-13 00:45:55,993 - __main__ - INFO - Seeding 13641
2023-07-13 00:45:55,993 - __main__ - INFO - Loading treatment csv
2023-07-13 00:45:55,994 - __main__ - INFO - Loading frame paths
2023-07-13 00:45:56,375 - __main__ - INFO - Loading raw video details
2023-07-13 00:45:56,380 - __main__ - INFO - Merging frames
2023-07-13 00:45:56,403 - __main__ - INFO - Filtering to include only high and low dose animals
2023-07-13 00:45:56,461 - __main__ - INFO - Assigning labels (1: high dose at 4h, 0: everything else)
2023-07-13 00:45:56,463 - __main__ - INFO - Saving the dataset
2023-07-13 00:45:56,469 - __main__ - INFO - Creating 10 stratified kfold splits, grouped by mouse
2023-07-13 00:45:56,489 - __main__ 

KeyboardInterrupt: 

In [None]:
# Sampling configuration re-declared with 8 folds (the fold-generation cell
# above uses 10).
# NOTE(review): nothing visible in this cell consumes these values; presumably
# the rest of the cell was removed -- confirm whether it is still needed.
kfold_splits = 8
frameset_size = 5
train_size = 50000
test_size = 1000
seed = 13641




2023-07-12 10:42:12,498 - __main__ - INFO - 
Classification task using sets of 5 frames

Classes:
- 0: All animals at preinjection
- 1: LPS High dose at 4 hours

Split over 8 stratified kfolds grouped by mouse.
Training and test sets are sampled from the training and test frames.




In [None]:

# Rows of combined_df referenced by the first training frameset in `data`,
# the train/test sample dict built in the fold-generation loop.
combined_df.loc[data['train'][0]['indices']]

Unnamed: 0,mouse,date_of_birth,treatment,injection_time,notes,image,video,recording,camera,year,...,hour,minutes,seconds,animal,start,end,discard,Notes,video_time,label
3614,m4,11 January 2022,high,13:07,,m4_rec4_4h-postinjection/frame02190.png,m4_rec4_4h-postinjection,4,Basler_acA1920-40um,2022,...,17,16,48,m4,0:54,-1,1.0,early stop,17:16,1
3538,m4,11 January 2022,high,13:07,,m4_rec4_4h-postinjection/frame05160.png,m4_rec4_4h-postinjection,4,Basler_acA1920-40um,2022,...,17,16,48,m4,0:54,-1,1.0,early stop,17:16,1
3703,m4,11 January 2022,high,13:07,,m4_rec4_4h-postinjection/frame02550.png,m4_rec4_4h-postinjection,4,Basler_acA1920-40um,2022,...,17,22,35,m4,0,-1,,re-recorded,17:22,1


In [None]:
ls -lah

total 6.9M
drwxrwxr-x  2 andre andre 4.0K Jul 12 01:17 [0m[01;34m.[0m/
drwxrwxr-x 12 andre andre 4.0K Jul 11 15:10 [01;34m..[0m/
-rw-rw-r--  1 andre andre  78K Jun 27 14:24 01-video-processing.ipynb
-rw-rw-r--  1 andre andre 1.4M Jun 27 14:24 02-find-information-cards.ipynb
-rw-rw-r--  1 andre andre 1.3M Jun 29 14:52 03-generate-dlc-projects.ipynb
-rw-rw-r--  1 andre andre  68K Jul  1 00:48 04-apply-dlc-model-to-frames.ipynb
-rw-rw-r--  1 andre andre    0 Jul  3 00:23 05-dataset-validations.ipynb
-rw-rw-r--  1 andre andre  18K Jul 12 01:16 06-creating-task-splits.ipynb
-rw-rw-r--  1 andre andre  86K Jul 11 18:58 99-training-test1.ipynb
-rw-rw-r--  1 andre andre 4.0M Jul 12 01:17 fold0.pkl
-rw-rw-r--  1 andre andre    0 Jun 27 14:24 .gitkeep
