# MUW Neuroscience Seminar WS 23/24
## Group Work: CAL MS21 Dataset

## Preprocessing Goal

The originally nested data of the Mouse Social Behaviour Challenge is simplified and reduced to just the keypoints of the mouse body parts and the annotations.

In [1]:
import numpy as np
import pandas as pd

In [2]:
def load_task1_data(data_path):
  """
  Load the data for task 1 from the file at `data_path`.

  The file holds a pickled dictionary; its 'annotator-id_0' entry maps
  sequence ids to the per-sequence data.

  Returns the pair (dataset, vocabulary):
      dataset    -- the 'annotator-id_0' mapping.
      vocabulary -- tells you how to map behavior names to class ids.
                    It is read from the first sequence only, because it is
                    the same for all sequences in this dataset.
  """
  # NOTE: allow_pickle=True unpickles arbitrary Python objects, so only load
  # files that come from a trusted source.
  raw = np.load(data_path, allow_pickle=True).item()
  dataset = raw['annotator-id_0']
  # Any sequence carries the vocabulary; take the first one.
  first_sequence_id = list(dataset.keys())[0]
  vocabulary = dataset[first_sequence_id]['metadata']['vocab']
  return dataset, vocabulary


# These paths must match where the loading notebook created the files.
train_path = 'data/calms21_task1_train.npy'
test_path = 'data/calms21_task1_test.npy'

training_data, vocab = load_task1_data(train_path)
# The vocabulary returned for the test split is discarded.
test_data, _ = load_task1_data(test_path)

In [3]:
# Quick overview of what was loaded: a few sequence ids, the label vocabulary
# and the number of sequences in each split.
sample_keys = list(training_data)[:3]
print("Sample dataset keys: ", sample_keys)
print("Vocabulary: ", vocab)
print("Number of train Sequences: ", len(training_data))
print("Number of test Sequences: ", len(test_data))

Sample dataset keys:  ['task1/train/mouse001_task1_annotator1', 'task1/train/mouse002_task1_annotator1', 'task1/train/mouse003_task1_annotator1']
Vocabulary:  {'attack': 0, 'investigation': 1, 'mount': 2, 'other': 3}
Number of train Sequences:  70
Number of test Sequences:  19


In [4]:
# Take the first training sequence as a sample and inspect its structure.
sequence_names = list(training_data)
sample_sequence_key = sequence_names[0]
single_sequence = training_data[sample_sequence_key]

print("Name of our sample sequence: ", sample_sequence_key)
print("Sequence keys: ", single_sequence.keys())
print("Sequence metadata: ", single_sequence['metadata'])
# The number of annotations is reported as the number of frames.
print(f'Number of Frames in Sequence "{sample_sequence_key}": ', len(single_sequence['annotations']))
print(f'Keypoints data shape of Sequence "{sample_sequence_key}": ', single_sequence['keypoints'].shape)

Name of our sample sequence:  task1/train/mouse001_task1_annotator1
Sequence keys:  dict_keys(['keypoints', 'scores', 'annotations', 'metadata'])
Sequence metadata:  {'annotator-id': 0, 'vocab': {'attack': 0, 'investigation': 1, 'mount': 2, 'other': 3}}
Number of Frames in Sequence "task1/train/mouse001_task1_annotator1":  21364
Keypoints data shape of Sequence "task1/train/mouse001_task1_annotator1":  (21364, 2, 2, 7)


In [5]:
# flatten the nested dataset into one flat record (dict) per frame; the records are converted into a dataframe further below

def transform_dataset(dataset):
  """
  Flatten a nested dataset into a list with one dictionary per frame.

  `dataset` maps sequence names to entries that provide 'keypoints'
  (iterated frame by frame, each frame nested as mouse -> coordinate ->
  bodypart) and 'annotations' (indexed by frame number).

  Every returned dictionary contains, in this order:
      'sequence' -- the sequence name
      'frame'    -- the frame index within the sequence
      'id'       -- the sequence name with the frame index appended
      'label'    -- the annotation of that frame
  followed by one '<mouse>_<coordinate>_<bodypart>' entry per keypoint
  value, e.g. 'm1_x_nose'.

  The number of sequences and the total number of frames are printed.
  """
  # Name fragments used to build the keypoint column names.
  mouse_tags = ['m1', 'm2']
  axis_tags = ['x', 'y']
  part_tags = ['nose', 'left_ear', 'right_ear', 'neck', 'left_hip', 'right_hip', 'tail_base']

  sequence_names = list(dataset.keys())
  print('We have ', len(sequence_names), ' sequences')

  rows = []
  for seq_name in sequence_names:
    seq = dataset[seq_name]
    for frame_idx, frame in enumerate(seq['keypoints']):
      row = {
          'sequence': seq_name,
          'frame': frame_idx,
          'id': seq_name + str(frame_idx),
          'label': seq['annotations'][frame_idx],
      }
      # One column per mouse + coordinate + bodypart, in nesting order.
      row.update({
          f"{mouse_tags[m]}_{axis_tags[c]}_{part_tags[b]}": value
          for m, mouse in enumerate(frame)
          for c, axis_values in enumerate(mouse)
          for b, value in enumerate(axis_values)
      })
      rows.append(row)

  print('We have ', len(rows), ' frames in total in the dataset')
  return rows

In [6]:
# Flatten both splits into lists of per-frame records.
train_data = transform_dataset(training_data)

# `test_data` is rebound from the raw nested mapping to the flattened list.
# transform_dataset returns a list, so skip the call when the cell is re-run
# and `test_data` has already been flattened (it would otherwise fail on
# `.keys()`).
if not isinstance(test_data, list):
  test_data = transform_dataset(test_data)

We have  70  sequences
We have  507738  frames in total in the dataset
We have  19  sequences
We have  262107  frames in total in the dataset


In [7]:
# Build the training dataframe from the flattened per-frame records
# (one row per frame) and preview the first rows.
df_train = pd.DataFrame(data=train_data)
df_train.head(n=5)

Unnamed: 0,sequence,frame,id,label,m1_x_nose,m1_x_left_ear,m1_x_right_ear,m1_x_neck,m1_x_left_hip,m1_x_right_hip,...,m2_x_left_hip,m2_x_right_hip,m2_x_tail_base,m2_y_nose,m2_y_left_ear,m2_y_right_ear,m2_y_neck,m2_y_left_hip,m2_y_right_hip,m2_y_tail_base
0,task1/train/mouse001_task1_annotator1,0,task1/train/mouse001_task1_annotator10,3,831.659204,805.659204,775.659204,780.659204,711.659204,711.659204,...,796.915924,840.915924,766.915924,253.216902,195.216902,193.216902,179.216902,152.216902,102.216902,97.216902
1,task1/train/mouse001_task1_annotator1,1,task1/train/mouse001_task1_annotator11,1,833.050439,809.050439,778.050439,783.050439,723.050439,717.050439,...,799.907019,846.907019,766.907019,259.539977,204.539977,201.539977,188.539977,153.539977,105.539977,98.539977
2,task1/train/mouse001_task1_annotator1,2,task1/train/mouse001_task1_annotator12,1,838.718976,816.718976,776.718976,787.718976,730.718976,713.718976,...,800.195703,860.195703,777.195703,256.902935,208.902935,205.902935,193.902935,150.902935,112.902935,99.902935
3,task1/train/mouse001_task1_annotator1,3,task1/train/mouse001_task1_annotator13,1,826.757507,815.757507,774.757507,785.757507,743.757507,711.757507,...,794.788861,856.788861,786.788861,263.420539,206.420539,206.420539,193.420539,147.420539,113.420539,97.420539
4,task1/train/mouse001_task1_annotator1,4,task1/train/mouse001_task1_annotator14,1,822.045709,812.045709,768.045709,779.045709,749.045709,709.045709,...,789.578644,862.578644,793.578644,263.366469,202.366469,201.366469,190.366469,143.366469,120.366469,95.366469


In [13]:
# Build the test dataframe from the flattened per-frame records
# (one row per frame) and preview the first rows.
df_test = pd.DataFrame(data=test_data)
df_test.head(n=5)

Unnamed: 0,sequence,frame,id,label,m1_x_nose,m1_x_left_ear,m1_x_right_ear,m1_x_neck,m1_x_left_hip,m1_x_right_hip,...,m2_x_left_hip,m2_x_right_hip,m2_x_tail_base,m2_y_nose,m2_y_left_ear,m2_y_right_ear,m2_y_neck,m2_y_left_hip,m2_y_right_hip,m2_y_tail_base
0,task1/test/mouse071_task1_annotator1,0,task1/test/mouse071_task1_annotator10,3,720.980978,654.980978,698.980978,661.980978,550.980978,589.980978,...,781.674377,830.674377,759.674377,485.749376,460.749376,434.749376,437.749376,404.749376,348.749376,340.749376
1,task1/test/mouse071_task1_annotator1,1,task1/test/mouse071_task1_annotator11,1,733.408734,663.408734,705.408734,668.408734,554.408734,593.408734,...,780.197046,832.197046,761.197046,490.161792,468.161792,438.161792,442.161792,411.161792,355.161792,347.161792
2,task1/test/mouse071_task1_annotator1,2,task1/test/mouse071_task1_annotator12,1,742.879459,670.879459,708.879459,675.879459,559.879459,598.879459,...,781.179437,833.179437,765.179437,498.969705,471.969705,441.969705,444.969705,414.969705,359.969705,351.969705
3,task1/test/mouse071_task1_annotator1,3,task1/test/mouse071_task1_annotator13,1,751.682971,679.682971,714.682971,681.682971,568.682971,607.682971,...,783.078973,835.078973,767.078973,499.599931,470.599931,442.599931,442.599931,418.599931,362.599931,354.599931
4,task1/test/mouse071_task1_annotator1,4,task1/test/mouse071_task1_annotator14,1,754.39967,687.39967,722.39967,688.39967,577.39967,615.39967,...,781.261469,834.261469,769.261469,500.060898,466.060898,436.060898,436.060898,417.060898,363.060898,356.060898


In [14]:
# Number of training frames per class id - the classes are heavily
# imbalanced, so this is an imbalanced classification problem.
df_train.groupby('label')['label'].count()

label
0     14039
1    146615
2     28615
3    318469
Name: label, dtype: int64

In [15]:
# Number of test frames per class id.
df_test.groupby('label')['label'].count()

label
0     12630
1     61275
2     31848
3    156354
Name: label, dtype: int64

In [16]:
# Add a binary target column for attack detection on the training data:
# 1 where the frame is labelled as attack, 0 for every other behavior.
# The class id is looked up in the vocabulary instead of hard-coding 0.
df_train['attack'] = np.where(df_train['label'] == vocab['attack'], 1, 0)
df_train.groupby(['attack'])['attack'].count()

attack
0    493699
1     14039
Name: attack, dtype: int64

In [17]:
# Add the same binary attack target column to the test data:
# 1 where the frame is labelled as attack, 0 for every other behavior.
# The class id is looked up in the vocabulary instead of hard-coding 0.
df_test['attack'] = np.where(df_test['label'] == vocab['attack'], 1, 0)
df_test.groupby(['attack'])['attack'].count()

attack
0    249477
1     12630
Name: attack, dtype: int64

In [19]:
# Show the test frames that are flagged as attack.
df_test.loc[df_test['attack'] == 1]

Unnamed: 0,sequence,frame,id,label,m1_x_nose,m1_x_left_ear,m1_x_right_ear,m1_x_neck,m1_x_left_hip,m1_x_right_hip,...,m2_x_right_hip,m2_x_tail_base,m2_y_nose,m2_y_left_ear,m2_y_right_ear,m2_y_neck,m2_y_left_hip,m2_y_right_hip,m2_y_tail_base,attack
80540,task1/test/mouse075_task1_annotator1,585,task1/test/mouse075_task1_annotator1585,0,770.006976,829.006976,782.006976,811.006976,868.006976,793.006976,...,766.374866,814.374866,81.069857,78.069857,145.069857,113.069857,65.069857,153.069857,107.069857,1
80541,task1/test/mouse075_task1_annotator1,586,task1/test/mouse075_task1_annotator1586,0,750.598450,818.598450,767.598450,798.598450,864.598450,786.598450,...,773.866473,832.866473,129.526582,85.526582,147.526582,112.526582,70.526582,151.526582,114.526582,1
80542,task1/test/mouse075_task1_annotator1,587,task1/test/mouse075_task1_annotator1587,0,748.611139,825.611139,787.611139,813.611139,831.611139,751.611139,...,745.038623,783.038623,160.652746,140.652746,184.652746,152.652746,107.652746,165.652746,117.652746,1
80543,task1/test/mouse075_task1_annotator1,588,task1/test/mouse075_task1_annotator1588,0,751.521545,814.521545,746.521545,785.521545,790.521545,698.521545,...,789.357764,825.357764,147.123604,107.123604,175.123604,139.123604,98.123604,179.123604,119.123604,1
80544,task1/test/mouse075_task1_annotator1,589,task1/test/mouse075_task1_annotator1589,0,740.698437,778.698437,722.698437,753.698437,791.698437,707.698437,...,790.026691,841.026691,149.525123,105.525123,174.525123,140.525123,102.525123,190.525123,162.525123,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172557,task1/test/mouse079_task1_annotator1,16497,task1/test/mouse079_task1_annotator116497,0,124.899112,156.899112,190.899112,183.899112,245.899112,240.899112,...,90.212538,137.212538,290.271655,354.271655,331.271655,353.271655,350.271655,356.271655,350.271655,1
172558,task1/test/mouse079_task1_annotator1,16498,task1/test/mouse079_task1_annotator116498,0,121.091116,146.091116,188.091116,176.091116,239.091116,238.091116,...,91.118494,115.118494,296.645839,357.645839,330.645839,354.645839,349.645839,351.645839,281.645839,1
172559,task1/test/mouse079_task1_annotator1,16499,task1/test/mouse079_task1_annotator116499,0,135.111630,153.111630,204.111630,181.111630,225.111630,240.111630,...,88.455054,146.455054,306.659187,372.659187,329.659187,355.659187,367.659187,312.659187,370.659187,1
172560,task1/test/mouse079_task1_annotator1,16500,task1/test/mouse079_task1_annotator116500,0,149.067393,173.067393,185.067393,196.067393,252.067393,245.067393,...,128.639966,144.639966,314.699259,383.699259,329.699259,356.699259,313.699259,375.699259,371.699259,1


In [21]:
# Save both dataframes as CSV files in the working directory so they can be
# reused by other notebooks. The dataframe index is written as the first,
# unnamed column.
df_train.to_csv(path_or_buf='calms21_tab_data_train.csv')
df_test.to_csv(path_or_buf='calms21_tab_data_test.csv')

## Potential enhancements
- Findings from the causal-inference experiments, which showed a relation to the distance of the mouse, could be included.
- Distances between the different body-part keypoints could be included as additional features.
- **TODO:** collect further ideas.