In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline 
import seaborn as sns 

# 1. preprocessing
* generally explore the dataset, but more specifically...
    * check for null values 
    * identify unique behavior labels 
    * replace 7 original behavior labels with 3 (moving, sitting, lying) 
    * ensure there are no contradictions (i.e., simultaneously dynamic and static) within the 3 behavior columns 
    * drop superfluous columns and rows to reduce computational load 

In [3]:
# Read the raw recording table into `df`, then show the number of missing
# values in each column.
# NOTE(review): this is an absolute path on one machine -- anyone else running
# the notebook has to point it at their own copy of DogMoveData.csv.
raw_csv_path = '/Users/benjamingray/Code/chordata assessment/assessment/DogMoveData.csv'
df = pd.read_csv(raw_csv_path)

missing_per_column = df.isnull().sum()
missing_per_column  # no null values

DogID         0
TestNum       0
t_sec         0
ABack_x       0
ABack_y       0
ABack_z       0
ANeck_x       0
ANeck_y       0
ANeck_z       0
GBack_x       0
GBack_y       0
GBack_z       0
GNeck_x       0
GNeck_y       0
GNeck_z       0
Task          0
Behavior_1    0
Behavior_2    0
Behavior_3    0
PointEvent    0
dtype: int64

In [4]:
# List every distinct label that occurs anywhere in the three behavior columns
# (np.unique flattens the 2-D block of values and returns the sorted uniques).
behavior_cols = ['Behavior_1', 'Behavior_2', 'Behavior_3']
np.unique(df[behavior_cols].to_numpy())

array(['<undefined>', 'Bowing', 'Carrying object', 'Drinking', 'Eating',
       'Extra_Synchronization', 'Galloping', 'Jumping', 'Lying chest',
       'Pacing', 'Panting', 'Playing', 'Shaking', 'Sitting', 'Sniffing',
       'Standing', 'Synchronization', 'Trotting', 'Tugging', 'Walking'],
      dtype=object)

In [5]:
# Raw labels that are all collapsed into the single target label 'Moving'.
dynamic_behaviors = ['Walking', 'Shaking', 'Sniffing',
                    'Eating', 'Trotting', 'Pacing',
                    'Playing', 'Panting', 'Drinking', 'Galloping',
                    'Carrying object', 'Tugging', 'Jumping',
                    'Bowing']

# Static target labels as they are spelled AFTER relabelling ('Lying chest' -> 'Lying').
static_behaviors = ['Sitting', 'Lying'] # standing?
# Labels that do not describe a behavior to classify.
other_behaviors = ['Synchronization', 'Extra_Synchronization', '<undefined>']

# Single old-label -> new-label mapping: every dynamic behavior becomes
# 'Moving' and 'Lying chest' becomes 'Lying'.
behavior_relabel = {label: 'Moving' for label in dynamic_behaviors}
behavior_relabel['Lying chest'] = 'Lying'

for col in ['Behavior_1', 'Behavior_2', 'Behavior_3']:
    # Assign the result back instead of calling replace(..., inplace=True) on
    # df[col]: an in-place call on a selected column is chained assignment,
    # which warns in recent pandas and does not update df under Copy-on-Write.
    df[col] = df[col].replace(behavior_relabel)

In [6]:
# Search for contradicting rows, i.e. where Behavior_1 is dynamic and
# Behavior_2 is static (or vice versa).
#
# By the time this cell runs the dynamic labels have normally been collapsed
# into 'Moving' (and 'Lying chest' into 'Lying'), so testing against the raw
# label lists alone can never match anything. Accept both the raw and the
# relabelled spellings so the check is meaningful whichever state df is in.
dynamic_labels = ['Moving', *dynamic_behaviors]
static_labels = ['Lying chest', *static_behaviors]

first_is_dynamic = df['Behavior_1'].isin(dynamic_labels)
first_is_static = df['Behavior_1'].isin(static_labels)
second_is_dynamic = df['Behavior_2'].isin(dynamic_labels)
second_is_static = df['Behavior_2'].isin(static_labels)

contradiction = df[first_is_dynamic & second_is_static]
contradiction2 = df[first_is_static & second_is_dynamic]

print(len(contradiction), len(contradiction2))

# NOTE(review): re-run this cell and confirm both counts are 0 -- a count of 0
# means no example of that contradiction exists.
# Behavior_1 is taken as the single label from here on; discarding Behavior_2
# and Behavior_3 is only safe if no contradictions are reported above.

0 0


In [7]:
# Trim the frame down to the columns needed downstream and keep a single
# label column named 'Behavior'.
#
# Why each group is removed:
# - Collar (neck) sensor: Kumpulainen et al. (2021) report superior performance
#   of models trained with data from the back sensor, so only the back sensor
#   is kept.
# - Task: highly likely to be strongly correlated with the target (behavior) --
#   any trained dog instructed to sit will likely sit. The correlation cannot
#   easily be plotted because both variables are non-numeric, so a high
#   correlation is assumed and the column is dropped. In a non-controlled
#   setting (e.g. livestock monitored with IMUs) a 'task' feature would not be
#   available anyway, so it would not be a useful model input.
# - Behavior_2 / Behavior_3, PointEvent, TestNum: not used further.
secondary_label_cols = ['Behavior_2', 'Behavior_3']
collar_sensor_cols = ['ANeck_x', 'ANeck_y', 'ANeck_z', 'GNeck_x', 'GNeck_y', 'GNeck_z']
unused_meta_cols = ['PointEvent', 'TestNum', 'Task']

df = (
    df
    .drop(columns=secondary_label_cols + collar_sensor_cols + unused_meta_cols)
    .rename(columns={'Behavior_1': 'Behavior'})
)

In [8]:
# Remove rows whose label is not a behavior at all: Synchronization,
# Extra_Synchronization and <undefined>.
# 'Standing' rows are deliberately kept: df['Behavior'].unique() shows the
# label, and although it is not a classification target it is used later for
# the position offset feature calculation.
non_behavior_labels = ['Synchronization', 'Extra_Synchronization', '<undefined>']
is_behavior_row = ~df['Behavior'].isin(non_behavior_labels)
df = df[is_behavior_row]

In [9]:
# Write the slimmed-down frame to CSV (without the index column) so the next
# notebook, 2.transformation.ipynb, can import it.
# The dataset has been put on a diet: it went from 10m rows to 6m by dropping
# the collar sensor, non-target behaviour and other unnecessary features.
diet_csv_name = 'DietDogMoveData.csv'
df.to_csv(diet_csv_name, index=False)