In [1]:
#basic package
import tqdm
import os
import numpy as np
import glob
import pandas as pd
import sys
import time
import datetime as dt
import itertools
import re
import math
from collections import Counter
import operator
from scipy import stats
from numpy import inf
import networkx as nx
from dtaidistance import dtw
import random

#save and load dictionaries/lists
import pickle

#scaling
from sklearn.preprocessing import StandardScaler, MinMaxScaler

#PCA
from sklearn import decomposition

#clustering
from sklearn.cluster import KMeans #only numerical var
from sklearn.metrics import silhouette_samples, silhouette_score
import kmodes
from kmodes.kmodes import KModes #with categorical var as well

#plot
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D

In [2]:
# Make the parent directory importable so that the project-local modules imported below can be found.
PACKAGE_PARENT = '../'
# NOTE: '__file__' is a string literal here (not the __file__ variable), so expanduser() returns it
# unchanged and SCRIPT_DIR resolves to the (real path of the) current working directory.
SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser('__file__'))))
# Append <working directory>/.. (normalized) to the module search path.
sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT)))
from UTILS import kmeans_clustering, ZoneVariable, time_series_henColumn_tsRow, FB_daily, corr_from_dep2feature,\
corr_from_feature2feature, correlationGraph, ZoneVariable, DataRepresentation1, sampen, chi2_distance, is_day, correct_key
import config_origins as config

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


2


# Define parameters

In [3]:
# Reminder printed on every run: all values below are read from the config_origins module.
print('change the configuration file if not done yet!')
# Copy the run parameters from the configuration module into notebook-level names.
path_extracted_data = config.path_extracted_data  # directory the cleaned CSV is read from and the pickles are written to
id_run = config.id_run  # prefix of the input/output file names built below
dico_pen_tr = config.dico_pen_tr
li_binmn = config.li_binmn  # bin sizes iterated over below (used as a number of minutes in date_range)
penalty = config.penalty
dico_window = config.dico_window
birth_date = config.birth_date
# NOTE(review): the cells visible here call is_day() with config.dico_night_hour directly, not with this copy.
dico_night_hour = config.dico_night_hour

change the configuration file if not done yet!


  and should_run_async(code)


# Download variables

In [4]:
# Load the cleaned movement records of this run (semicolon-separated file, day-first dates).
cleaned_csv_path = os.path.join(path_extracted_data, id_run+'_CLEANEDDATA.csv')
df = pd.read_csv(cleaned_csv_path, sep=';', parse_dates=['Timestamp', 'date'], dayfirst=True)

# Derived time columns: the hour of the record, and its time of day with the seconds removed.
df['hour'] = df['Timestamp'].map(lambda ts: ts.hour)
df['time'] = df['Timestamp'].map(lambda ts: dt.datetime.time(ts - dt.timedelta(seconds=ts.second)))

# The 'duration' column is not used in this notebook.
df = df.drop(columns='duration')

print(df.shape)
df.head(3)

  and should_run_async(code)


(2059712, 17)


Unnamed: 0,HenID,PenID,system,Zone,model_prediction,Timestamp,date,next_record_date,previous_record_date,previous_duration,next_zone,previous_zone,previous_previous_zone,correction_is_consecutive_equal_initial_zone,is_WG_open,hour,time
0,hen_71,pen12,10 - 12,3_Zone,1,2020-09-29 09:07:00,2020-09-29,2020-09-29 09:08:26.000,,,3_Zone,,,False,False,9,09:07:00
1,hen_7,pen11,10 - 12,3_Zone,1,2020-09-29 09:08:12,2020-09-29,2020-09-29 09:12:16.000,,,3_Zone,,,False,False,9,09:08:00
2,hen_10,pen11,10 - 12,3_Zone,1,2020-09-29 09:19:19,2020-09-29,2020-09-29 09:20:27.000,,,3_Zone,,,False,False,9,09:19:00


# Compute MLPS vectors

### On daylight hours: list of zones per day and per animal, at one-second resolution

In [None]:
# Build, for every pen and every day, the list of zones of each hen during daylight, and pickle it.
# Result layout: dico_pen_level_h[pen][date] = {HenID: [zone, zone, ...]}
# Note: there will be more entries than needed, as the days with disturbances are not removed from
# the raw cleaned-movement dataframe.
dico_pen_level_h = {}
#one pen at a time, because of memory issues
for p, df_pen in tqdm.tqdm(df.groupby('PenID')):
        
    #update results
    dico_pen_level_h[p] = {}
    
    #compute time series
    df_ts = time_series_henColumn_tsRow(df_pen, config, col_ts='Zone', ts_with_all_hen_value=False, save=False, 
                                        hen_time_series=False)
    
    #restrict to the daylight ONLY
    df_ts['is_day'] = df_ts['Timestamp'].map(lambda x: is_day(x, config.dico_night_hour))
    df_ts = df_ts[df_ts['is_day']]

    #list of all hens present in this pen (one 'hen_*' column per hen)
    li_hen = [v for v in df_ts.columns if 'hen_' in v]

    ################# create one list per animal #################
    #groupby date to have a list of zones per day (rows) for the hens (columns)
    #assumes the time series already carries a 'date' column - TODO confirm against time_series_henColumn_tsRow
    df_ts = df_ts.groupby('date')[li_hen].agg(lambda x: list(x)).reset_index()
    #melt to have one row per (day, hen) to avoid looping to create the dictionary; the melted
    #column is named explicitly so the lookup below does not depend on the name of the columns index
    df_ts_ = pd.melt(df_ts, id_vars=['date'], value_vars=li_hen, var_name='HenID')
    #group on the scalar key (not a one-element list) so that d is the date itself, not a 1-tuple
    for d, df__ in df_ts_.groupby('date'):
        #update results: value column is the list of zones
        dico_pen_level_h[p][d] = dict(zip(df__['HenID'].tolist(), df__['value'].tolist()))   
    
#save dictionaries (the context manager makes sure the file is flushed and closed)
with open(os.path.join(path_extracted_data, 
                       id_run+'dico_pen_level_h_allzoneidseclevel_DAILYLEVEL.pkl'), 'wb') as f_out:
    pickle.dump(dico_pen_level_h, f_out, pickle.HIGHEST_PROTOCOL)

### On daylight hours, per time bin (time spent in each zone within each bin)

In [5]:
#for efficiency, compute the binned time series first
#note that there will be more entries than needed, as the days with disturbances are not removed
#from the raw cleaned-movement dataframe.
#filled by the loop below as [pen][bin size][zone][date] -> {hen: list with one value per bin}
dico_pen_bin_zone_level_h = {}
#filled by the loop below as [pen][bin size][date] -> {hen: list with one zone-share vector per bin}
dico_pen_bin_level_h = {}
def duration_normalized_perZone(x, zones=('1_Zone', '2_Zone', '3_Zone', '4_Zone', '5_Zone')):
    """Return, for each zone, the fraction of the entries of ``x`` equal to that zone.

    Parameters
    ----------
    x : sized iterable of zone labels (e.g. a list or a pandas Series)
    zones : labels to report, in output order; defaults to the five zones '1_Zone' ... '5_Zone'

    Returns
    -------
    list of float, one value per entry of ``zones``. The denominator is ``len(x)``, so entries of
    ``x`` that are not listed in ``zones`` still count in the total. An empty ``x`` yields a vector
    of zeros instead of raising ZeroDivisionError.
    """
    t = len(x)
    if t == 0:
        #nothing observed: no share can be attributed to any zone
        return [0.0 for _ in zones]
    c = Counter(x)
    return [c[z]/t for z in zones]
#small example
#li = ['1_Zone','3_Zone','3_Zone','4_Zone','5_Zone','5_Zone','1_Zone']
#duration_normalized_perZone(li)

# For every pen and every bin size of li_binmn, keep the daylight rows of the zone time series and
# store, per day and per hen:
#   dico_pen_bin_level_h[pen][bin size][date][hen]            -> one zone-share vector per bin of that day
#   dico_pen_bin_zone_level_h[pen][bin size][zone][date][hen] -> one duration value per bin of that day
for p, df_pen in tqdm.tqdm(df.groupby('PenID')):
    
    #update results
    dico_pen_bin_zone_level_h[p] = {}
    dico_pen_bin_level_h[p] = {}
    
    #compute time series
    df_ts = time_series_henColumn_tsRow(df_pen, config, col_ts='Zone', ts_with_all_hen_value=False, save=False, 
                                        hen_time_series=False)
    
    #the hen columns and the overall time span do not depend on the bin size: compute them once per pen
    li_hen = [v for v in df_ts.columns if 'hen_' in v]
    mi = df_ts['Timestamp'].min()
    ma = df_ts['Timestamp'].max()
    
    for nbr_binmn in tqdm.tqdm(li_binmn):
        
        #update results
        dico_pen_bin_zone_level_h[p][nbr_binmn] = {}
        dico_pen_bin_level_h[p][nbr_binmn] = {}
                
        #bin edges every nbr_binmn minutes, starting at the first timestamp of the pen
        #('min' is the documented pandas alias for a minutely frequency)
        new_timestamp = str(nbr_binmn)+'mn_timestamp'
        Daterange = pd.date_range(start = mi, end = ma, freq = str(nbr_binmn)+'min')    
        df_date = pd.DataFrame({new_timestamp:Daterange})
        #attach to each row the first bin edge at or after its timestamp
        #NOTE: rows later than the last edge get no bin and are therefore left out of the groupby below
        df_ts_ = pd.merge_asof(df_ts, df_date, left_on=['Timestamp'], right_on=[new_timestamp], direction='forward')
        
        #restrict to the day ONLY
        df_ts_['is_day'] = df_ts_['Timestamp'].map(lambda x: is_day(x, config.dico_night_hour))
        df_ts_ = df_ts_[df_ts_['is_day']]
        
        ################# overall mlp #################
        #one zone-share vector per bin and per hen
        df_sim = df_ts_.groupby(new_timestamp)[li_hen].agg(lambda x: duration_normalized_perZone(x)).reset_index()
        df_sim['date'] = df_sim[new_timestamp].map(lambda x: dt.datetime(x.year,x.month,x.day))
        #groupby date to have the list of bin values per day (rows) for the hens (columns)
        df_sim = df_sim.groupby('date')[li_hen].agg(lambda x: list(x)).reset_index()

        #melt to have one row per (day, hen) to avoid looping to create the dictionary;
        #the 'variable' column (named explicitly) holds the hen ids
        df_sim_ = pd.melt(df_sim, id_vars=['date'], value_vars=li_hen, var_name='variable')
        #group on the scalar key (not a one-element list) so that d is the date itself, not a 1-tuple
        for d, df__ in df_sim_.groupby('date'):
            #update results
            dico_pen_bin_level_h[p][nbr_binmn][d] = dict(zip(df__['variable'].tolist(), df__['value'].tolist()))   
            
            
        ################# zone-ts over each zone #################                
        for ZONE in df['Zone'].unique():
            
            #update results
            dico_pen_bin_zone_level_h[p][nbr_binmn][ZONE] = {}
                
            #number of rows of the bin spent in ZONE, divided by 60
            #(presumably one row per second, i.e. a duration in minutes - TODO confirm)
            df_zone_sim = df_ts_.groupby(new_timestamp)[li_hen].agg(lambda x: sum([i==ZONE for i in x])/60).reset_index()
            df_zone_sim['date'] = df_zone_sim[new_timestamp].map(lambda x: dt.datetime(x.year,x.month,x.day))

            #groupby date to have the list of bin values per day (rows) for the hens (columns)
            df_zone_sim = df_zone_sim.groupby('date')[li_hen].agg(lambda x: list(x)).reset_index()
            
            #melt to have one row per (day, hen) to avoid looping to create the dictionary;
            #the 'variable' column (named explicitly) holds the hen ids
            df_zone_sim_ = pd.melt(df_zone_sim, id_vars=['date'], value_vars=li_hen, var_name='variable')
            for d, df__ in df_zone_sim_.groupby('date'):
                #update results
                dico_pen_bin_zone_level_h[p][nbr_binmn][ZONE][d] = dict(zip(df__['variable'].tolist(), df__['value'].tolist()))
#save dictionaries (the context managers make sure the files are flushed and closed)
with open(os.path.join(path_extracted_data, 
                       id_run+'dico_pen_bin_zone_level_h_DAILYLEVEL.pkl'), 'wb') as f_out:
    pickle.dump(dico_pen_bin_zone_level_h, f_out, pickle.HIGHEST_PROTOCOL)
with open(os.path.join(path_extracted_data, 
                       id_run+'dico_pen_bin_level_h_DAILYLEVEL.pkl'), 'wb') as f_out:
    pickle.dump(dico_pen_bin_level_h, f_out, pickle.HIGHEST_PROTOCOL)

  and should_run_async(code)
  0%|                                                                                            | 0/8 [00:00<?, ?it/s]

in this time series there is 28 hens
The initial starting date in over all is: 2020-09-29 09:20:21, and the ending date will be: 2021-05-10 08:42:27
But note that birds may have different ending and starting date which should be taken into account when computing variables
and after ending the last day at midnight : 2020-09-29 09:20:21, and the ending date will be: 2021-05-10 23:59:59



  0%|                                                                                            | 0/5 [00:00<?, ?it/s][A

Total running time: 5.28 mn



 20%|████████████████▌                                                                  | 1/5 [12:15<48:59, 734.96s/it][A
 40%|█████████████████████████████████▏                                                 | 2/5 [26:24<38:28, 769.40s/it][A
 60%|█████████████████████████████████████████████████▊                                 | 3/5 [37:23<24:32, 736.13s/it][A
 80%|██████████████████████████████████████████████████████████████████▍                | 4/5 [47:08<11:30, 690.74s/it][A
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [55:50<00:00, 670.01s/it][A
 12%|█████████▊                                                                    | 1/8 [1:01:08<7:07:56, 3668.07s/it]

in this time series there is 28 hens
The initial starting date in over all is: 2020-09-29 09:08:12, and the ending date will be: 2021-05-10 08:42:22
But note that birds may have different ending and starting date which should be taken into account when computing variables
and after ending the last day at midnight : 2020-09-29 09:08:12, and the ending date will be: 2021-05-10 23:59:59
Total running time: 4.02 mn



  0%|                                                                                            | 0/5 [00:00<?, ?it/s][A
 20%|████████████████▌                                                                  | 1/5 [09:47<39:10, 587.51s/it][A
 40%|█████████████████████████████████▏                                                 | 2/5 [21:22<30:58, 619.66s/it][A
 60%|█████████████████████████████████████████████████▊                                 | 3/5 [31:22<20:27, 613.99s/it][A
 80%|██████████████████████████████████████████████████████████████████▍                | 4/5 [40:16<09:49, 589.96s/it][A
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [48:59<00:00, 587.93s/it][A
 25%|███████████████████▌                                                          | 2/8 [1:54:13<5:52:19, 3523.23s/it]

in this time series there is 31 hens
The initial starting date in over all is: 2020-09-29 09:07:00, and the ending date will be: 2021-05-10 08:42:20
But note that birds may have different ending and starting date which should be taken into account when computing variables
and after ending the last day at midnight : 2020-09-29 09:07:00, and the ending date will be: 2021-05-10 23:59:59
Total running time: 4.88 mn



  0%|                                                                                            | 0/5 [00:00<?, ?it/s][A
 20%|████████████████▌                                                                  | 1/5 [11:52<47:28, 712.24s/it][A
 40%|█████████████████████████████████▏                                                 | 2/5 [22:22<34:23, 687.76s/it][A
 60%|█████████████████████████████████████████████████▊                                 | 3/5 [32:02<21:50, 655.23s/it][A
 80%|██████████████████████████████████████████████████████████████████▍                | 4/5 [46:12<11:53, 713.79s/it][A
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [57:18<00:00, 687.70s/it][A
 38%|█████████████████████████████▎                                                | 3/8 [2:56:35<4:59:04, 3588.93s/it]

in this time series there is 28 hens
The initial starting date in over all is: 2020-09-29 09:32:45, and the ending date will be: 2021-05-10 08:26:44
But note that birds may have different ending and starting date which should be taken into account when computing variables
and after ending the last day at midnight : 2020-09-29 09:32:45, and the ending date will be: 2021-05-10 23:59:59
Total running time: 4.67 mn



  0%|                                                                                            | 0/5 [00:00<?, ?it/s][A
 20%|████████████████▌                                                                  | 1/5 [08:59<35:59, 539.87s/it][A
 40%|█████████████████████████████████▏                                                 | 2/5 [17:29<26:32, 530.90s/it][A
 60%|█████████████████████████████████████████████████▊                                 | 3/5 [26:30<17:47, 533.67s/it][A
 80%|██████████████████████████████████████████████████████████████████▍                | 4/5 [36:53<09:20, 560.45s/it][A
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [47:27<00:00, 569.44s/it][A
 50%|███████████████████████████████████████                                       | 4/8 [3:48:48<3:50:08, 3452.04s/it]

in this time series there is 27 hens
The initial starting date in over all is: 2020-09-29 09:33:43, and the ending date will be: 2021-05-10 08:26:01
But note that birds may have different ending and starting date which should be taken into account when computing variables
and after ending the last day at midnight : 2020-09-29 09:33:43, and the ending date will be: 2021-05-10 23:59:59
Total running time: 3.97 mn



  0%|                                                                                            | 0/5 [00:00<?, ?it/s][A
 20%|████████████████▌                                                                  | 1/5 [08:52<35:28, 532.14s/it][A
 40%|█████████████████████████████████▏                                                 | 2/5 [17:23<26:17, 525.97s/it][A
 60%|█████████████████████████████████████████████████▊                                 | 3/5 [27:13<18:10, 545.02s/it][A
 80%|██████████████████████████████████████████████████████████████████▍                | 4/5 [38:36<09:46, 586.45s/it][A
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [48:02<00:00, 576.41s/it][A
 62%|████████████████████████████████████████████████▊                             | 5/8 [4:40:52<2:47:41, 3353.81s/it]

in this time series there is 29 hens
The initial starting date in over all is: 2020-09-29 09:33:48, and the ending date will be: 2021-05-10 08:26:39
But note that birds may have different ending and starting date which should be taken into account when computing variables
and after ending the last day at midnight : 2020-09-29 09:33:48, and the ending date will be: 2021-05-10 23:59:59
Total running time: 3.84 mn



  0%|                                                                                            | 0/5 [00:00<?, ?it/s][A
 20%|████████████████▌                                                                  | 1/5 [08:54<35:38, 534.65s/it][A
 40%|█████████████████████████████████▏                                                 | 2/5 [19:35<28:19, 566.42s/it][A
 60%|█████████████████████████████████████████████████▊                                 | 3/5 [32:06<20:43, 621.75s/it][A
 80%|██████████████████████████████████████████████████████████████████▍                | 4/5 [42:08<10:15, 615.86s/it][A
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [50:49<00:00, 609.95s/it][A
 75%|██████████████████████████████████████████████████████████▌                   | 6/8 [5:35:37<1:51:05, 3332.95s/it]

in this time series there is 29 hens
The initial starting date in over all is: 2020-09-29 09:52:12, and the ending date will be: 2021-05-10 08:40:33
But note that birds may have different ending and starting date which should be taken into account when computing variables
and after ending the last day at midnight : 2020-09-29 09:52:12, and the ending date will be: 2021-05-10 23:59:59
Total running time: 5.12 mn



  0%|                                                                                            | 0/5 [00:00<?, ?it/s][A
 20%|████████████████▌                                                                  | 1/5 [10:54<43:37, 654.44s/it][A
 40%|█████████████████████████████████▏                                                 | 2/5 [21:36<32:32, 650.77s/it][A
 60%|█████████████████████████████████████████████████▊                                 | 3/5 [32:03<21:27, 643.67s/it][A
 80%|██████████████████████████████████████████████████████████████████▍                | 4/5 [40:40<10:05, 605.65s/it][A
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [49:04<00:00, 588.94s/it][A
 88%|██████████████████████████████████████████████████████████████████████          | 7/8 [6:30:00<55:12, 3312.11s/it]

in this time series there is 28 hens
The initial starting date in over all is: 2020-09-29 09:36:51, and the ending date will be: 2021-05-10 08:39:39
But note that birds may have different ending and starting date which should be taken into account when computing variables
and after ending the last day at midnight : 2020-09-29 09:36:51, and the ending date will be: 2021-05-10 23:59:59
Total running time: 3.96 mn



  0%|                                                                                            | 0/5 [00:00<?, ?it/s][A
 20%|████████████████▌                                                                  | 1/5 [08:34<34:19, 514.84s/it][A
 40%|█████████████████████████████████▏                                                 | 2/5 [16:57<25:33, 511.23s/it][A
 60%|█████████████████████████████████████████████████▊                                 | 3/5 [25:13<16:53, 506.59s/it][A
 80%|██████████████████████████████████████████████████████████████████▍                | 4/5 [33:23<08:21, 501.55s/it][A
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [41:30<00:00, 498.13s/it][A
100%|████████████████████████████████████████████████████████████████████████████████| 8/8 [7:15:31<00:00, 3266.40s/it]


### on [2h, 17h59] only

In [6]:
#for efficiency, compute the binned time series first
#note that there will be more entries than needed, as the days with disturbances are not removed
#from the raw cleaned-movement dataframe.
#both dictionaries are reset here, then filled by the loop below and saved under the '2h-17h59LEVEL' file names
#[pen][bin size][zone][date] -> {hen: list with one value per bin}
dico_pen_bin_zone_level_h = {}
#[pen][bin size][date] -> {hen: list with one zone-share vector per bin}
dico_pen_bin_level_h = {}
def duration_normalized_perZone(x, zones=('1_Zone', '2_Zone', '3_Zone', '4_Zone', '5_Zone')):
    """Return, for each zone, the fraction of the entries of ``x`` equal to that zone.

    Parameters
    ----------
    x : sized iterable of zone labels (e.g. a list or a pandas Series)
    zones : labels to report, in output order; defaults to the five zones '1_Zone' ... '5_Zone'

    Returns
    -------
    list of float, one value per entry of ``zones``. The denominator is ``len(x)``, so entries of
    ``x`` that are not listed in ``zones`` still count in the total. An empty ``x`` yields a vector
    of zeros instead of raising ZeroDivisionError.
    """
    t = len(x)
    if t == 0:
        #nothing observed: no share can be attributed to any zone
        return [0.0 for _ in zones]
    c = Counter(x)
    return [c[z]/t for z in zones]
#small example
#li = ['1_Zone','3_Zone','3_Zone','4_Zone','5_Zone','5_Zone','1_Zone']
#duration_normalized_perZone(li)

# For every pen and every bin size of li_binmn, keep the rows whose hour is in [2, 18) and store,
# per day and per hen:
#   dico_pen_bin_level_h[pen][bin size][date][hen]            -> one zone-share vector per bin of that day
#   dico_pen_bin_zone_level_h[pen][bin size][zone][date][hen] -> one duration value per bin of that day
for p, df_pen in tqdm.tqdm(df.groupby('PenID')):
    
    #update results
    dico_pen_bin_zone_level_h[p] = {}
    dico_pen_bin_level_h[p] = {}
    
    #compute time series
    df_ts = time_series_henColumn_tsRow(df_pen, config, col_ts='Zone', ts_with_all_hen_value=False, save=False, 
                                        hen_time_series=False)
    
    #the hen columns and the overall time span do not depend on the bin size: compute them once per pen
    li_hen = [v for v in df_ts.columns if 'hen_' in v]
    mi = df_ts['Timestamp'].min()
    ma = df_ts['Timestamp'].max()
    
    for nbr_binmn in tqdm.tqdm(li_binmn):
        
        #update results
        dico_pen_bin_zone_level_h[p][nbr_binmn] = {}
        dico_pen_bin_level_h[p][nbr_binmn] = {}
                
        #bin edges every nbr_binmn minutes, starting at the first timestamp of the pen
        #('min' is the documented pandas alias for a minutely frequency)
        new_timestamp = str(nbr_binmn)+'mn_timestamp'
        Daterange = pd.date_range(start = mi, end = ma, freq = str(nbr_binmn)+'min')    
        df_date = pd.DataFrame({new_timestamp:Daterange})
        #attach to each row the first bin edge at or after its timestamp
        #NOTE: rows later than the last edge get no bin and are therefore left out of the groupby below
        df_ts_ = pd.merge_asof(df_ts, df_date, left_on=['Timestamp'], right_on=[new_timestamp], direction='forward')
        
        #restrict to 2h-17h59 ONLY
        df_ts_['is_2h17'] = df_ts_['Timestamp'].map(lambda x: (x.hour>=2)&(x.hour<18))
        df_ts_ = df_ts_[df_ts_['is_2h17']]
        
        ################# overall mlp #################
        #one zone-share vector per bin and per hen
        df_sim = df_ts_.groupby(new_timestamp)[li_hen].agg(lambda x: duration_normalized_perZone(x)).reset_index()
        df_sim['date'] = df_sim[new_timestamp].map(lambda x: dt.datetime(x.year,x.month,x.day))
        #groupby date to have the list of bin values per day (rows) for the hens (columns)
        df_sim = df_sim.groupby('date')[li_hen].agg(lambda x: list(x)).reset_index()

        #melt to have one row per (day, hen) to avoid looping to create the dictionary;
        #the 'variable' column (named explicitly) holds the hen ids
        df_sim_ = pd.melt(df_sim, id_vars=['date'], value_vars=li_hen, var_name='variable')
        #group on the scalar key (not a one-element list) so that d is the date itself, not a 1-tuple
        for d, df__ in df_sim_.groupby('date'):
            #update results
            dico_pen_bin_level_h[p][nbr_binmn][d] = dict(zip(df__['variable'].tolist(), df__['value'].tolist()))   
            
            
        ################# zone-ts over each zone #################                
        for ZONE in df['Zone'].unique():
            
            #update results
            dico_pen_bin_zone_level_h[p][nbr_binmn][ZONE] = {}
                
            #number of rows of the bin spent in ZONE, divided by 60
            #(presumably one row per second, i.e. a duration in minutes - TODO confirm)
            df_zone_sim = df_ts_.groupby(new_timestamp)[li_hen].agg(lambda x: sum([i==ZONE for i in x])/60).reset_index()
            df_zone_sim['date'] = df_zone_sim[new_timestamp].map(lambda x: dt.datetime(x.year,x.month,x.day))

            #groupby date to have the list of bin values per day (rows) for the hens (columns)
            df_zone_sim = df_zone_sim.groupby('date')[li_hen].agg(lambda x: list(x)).reset_index()
            
            #melt to have one row per (day, hen) to avoid looping to create the dictionary;
            #the 'variable' column (named explicitly) holds the hen ids
            df_zone_sim_ = pd.melt(df_zone_sim, id_vars=['date'], value_vars=li_hen, var_name='variable')
            for d, df__ in df_zone_sim_.groupby('date'):
                #update results
                dico_pen_bin_zone_level_h[p][nbr_binmn][ZONE][d] = dict(zip(df__['variable'].tolist(), df__['value'].tolist()))
#save dictionaries (the context managers make sure the files are flushed and closed)
with open(os.path.join(path_extracted_data, 
                       id_run+'dico_pen_bin_zone_level_h_2h-17h59LEVEL.pkl'), 'wb') as f_out:
    pickle.dump(dico_pen_bin_zone_level_h, f_out, pickle.HIGHEST_PROTOCOL)
with open(os.path.join(path_extracted_data, 
                       id_run+'dico_pen_bin_level_h_2h-17h59LEVEL.pkl'), 'wb') as f_out:
    pickle.dump(dico_pen_bin_level_h, f_out, pickle.HIGHEST_PROTOCOL)

  and should_run_async(code)
  0%|                                                                                            | 0/8 [00:00<?, ?it/s]

in this time series there is 28 hens
The initial starting date in over all is: 2020-09-29 09:20:21, and the ending date will be: 2021-05-10 08:42:27
But note that birds may have different ending and starting date which should be taken into account when computing variables
and after ending the last day at midnight : 2020-09-29 09:20:21, and the ending date will be: 2021-05-10 23:59:59
Total running time: 3.61 mn



  0%|                                                                                            | 0/5 [00:00<?, ?it/s][A
 20%|████████████████▌                                                                  | 1/5 [07:20<29:20, 440.10s/it][A
 40%|█████████████████████████████████▏                                                 | 2/5 [14:24<21:46, 435.52s/it][A
 60%|█████████████████████████████████████████████████▊                                 | 3/5 [21:24<14:21, 430.74s/it][A
 80%|██████████████████████████████████████████████████████████████████▍                | 4/5 [28:21<07:06, 426.69s/it][A
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [35:13<00:00, 422.77s/it][A
 12%|██████████                                                                      | 1/8 [38:53<4:32:11, 2333.13s/it]

in this time series there is 28 hens
The initial starting date in over all is: 2020-09-29 09:08:12, and the ending date will be: 2021-05-10 08:42:22
But note that birds may have different ending and starting date which should be taken into account when computing variables
and after ending the last day at midnight : 2020-09-29 09:08:12, and the ending date will be: 2021-05-10 23:59:59
Total running time: 3.69 mn



  0%|                                                                                            | 0/5 [00:00<?, ?it/s][A
 20%|████████████████▌                                                                  | 1/5 [07:19<29:16, 439.06s/it][A
 40%|█████████████████████████████████▏                                                 | 2/5 [14:26<21:46, 435.42s/it][A
 60%|█████████████████████████████████████████████████▊                                 | 3/5 [21:26<14:22, 431.02s/it][A
 80%|██████████████████████████████████████████████████████████████████▍                | 4/5 [28:24<07:07, 427.13s/it][A
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [35:18<00:00, 423.63s/it][A
 25%|███████████████████▌                                                          | 2/8 [1:17:54<3:53:33, 2335.50s/it]

in this time series there is 31 hens
The initial starting date in over all is: 2020-09-29 09:07:00, and the ending date will be: 2021-05-10 08:42:20
But note that birds may have different ending and starting date which should be taken into account when computing variables
and after ending the last day at midnight : 2020-09-29 09:07:00, and the ending date will be: 2021-05-10 23:59:59
Total running time: 3.79 mn



  0%|                                                                                            | 0/5 [00:00<?, ?it/s][A
 20%|████████████████▌                                                                  | 1/5 [08:03<32:14, 483.62s/it][A
 40%|█████████████████████████████████▏                                                 | 2/5 [15:50<23:55, 478.56s/it][A
 60%|█████████████████████████████████████████████████▊                                 | 3/5 [23:32<15:47, 473.54s/it][A
 80%|██████████████████████████████████████████████████████████████████▍                | 4/5 [31:07<07:48, 468.19s/it][A
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [38:40<00:00, 464.17s/it][A
 38%|█████████████████████████████▎                                                | 3/8 [2:00:24<3:19:59, 2399.93s/it]

in this time series there is 28 hens
The initial starting date in over all is: 2020-09-29 09:32:45, and the ending date will be: 2021-05-10 08:26:44
But note that birds may have different ending and starting date which should be taken into account when computing variables
and after ending the last day at midnight : 2020-09-29 09:32:45, and the ending date will be: 2021-05-10 23:59:59
Total running time: 3.77 mn



  0%|                                                                                            | 0/5 [00:00<?, ?it/s][A
 20%|████████████████▌                                                                  | 1/5 [07:25<29:43, 445.75s/it][A
 40%|█████████████████████████████████▏                                                 | 2/5 [14:35<22:02, 440.81s/it][A
 60%|█████████████████████████████████████████████████▊                                 | 3/5 [21:38<14:31, 435.61s/it][A
 80%|██████████████████████████████████████████████████████████████████▍                | 4/5 [28:37<07:10, 430.54s/it][A
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [35:32<00:00, 426.44s/it][A
 50%|███████████████████████████████████████                                       | 4/8 [2:39:44<2:39:12, 2388.09s/it]

in this time series there is 27 hens
The initial starting date in over all is: 2020-09-29 09:33:43, and the ending date will be: 2021-05-10 08:26:01
But note that birds may have different ending and starting date which should be taken into account when computing variables
and after ending the last day at midnight : 2020-09-29 09:33:43, and the ending date will be: 2021-05-10 23:59:59
Total running time: 3.85 mn



  0%|                                                                                            | 0/5 [00:00<?, ?it/s][A
 20%|████████████████▌                                                                  | 1/5 [07:06<28:24, 426.01s/it][A
 40%|█████████████████████████████████▏                                                 | 2/5 [14:05<21:12, 424.18s/it][A
 60%|█████████████████████████████████████████████████▊                                 | 3/5 [20:55<13:59, 419.68s/it][A
 80%|██████████████████████████████████████████████████████████████████▍                | 4/5 [27:42<06:55, 415.94s/it][A
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [34:32<00:00, 414.40s/it][A
 62%|████████████████████████████████████████████████▊                             | 5/8 [3:18:09<1:58:09, 2363.17s/it]

in this time series there is 29 hens
The initial starting date in over all is: 2020-09-29 09:33:48, and the ending date will be: 2021-05-10 08:26:39
But note that birds may have different ending and starting date which should be taken into account when computing variables
and after ending the last day at midnight : 2020-09-29 09:33:48, and the ending date will be: 2021-05-10 23:59:59
Total running time: 4.05 mn



  0%|                                                                                            | 0/5 [00:00<?, ?it/s][A
 20%|████████████████▌                                                                  | 1/5 [07:50<31:21, 470.48s/it][A
 40%|█████████████████████████████████▏                                                 | 2/5 [15:29<23:21, 467.15s/it][A
 60%|█████████████████████████████████████████████████▊                                 | 3/5 [23:16<15:33, 466.95s/it][A
 80%|██████████████████████████████████████████████████████████████████▍                | 4/5 [30:48<07:42, 462.50s/it][A
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [38:15<00:00, 459.17s/it][A
 75%|██████████████████████████████████████████████████████████▌                   | 6/8 [4:00:30<1:20:32, 2416.38s/it]

in this time series there is 29 hens
The initial starting date in over all is: 2020-09-29 09:52:12, and the ending date will be: 2021-05-10 08:40:33
But note that birds may have different ending and starting date which should be taken into account when computing variables
and after ending the last day at midnight : 2020-09-29 09:52:12, and the ending date will be: 2021-05-10 23:59:59



  0%|                                                                                            | 0/5 [00:00<?, ?it/s][A

Total running time: 3.80 mn



 20%|████████████████▌                                                                  | 1/5 [08:00<32:03, 480.82s/it][A
 40%|█████████████████████████████████▏                                                 | 2/5 [15:42<23:44, 474.95s/it][A
 60%|█████████████████████████████████████████████████▊                                 | 3/5 [23:18<15:38, 469.50s/it][A
 80%|██████████████████████████████████████████████████████████████████▍                | 4/5 [30:55<07:45, 465.70s/it][A
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [38:36<00:00, 463.35s/it][A
 88%|██████████████████████████████████████████████████████████████████████          | 7/8 [4:42:58<40:55, 2455.80s/it]

in this time series there is 28 hens
The initial starting date in over all is: 2020-09-29 09:36:51, and the ending date will be: 2021-05-10 08:39:39
But note that birds may have different ending and starting date which should be taken into account when computing variables
and after ending the last day at midnight : 2020-09-29 09:36:51, and the ending date will be: 2021-05-10 23:59:59
Total running time: 3.99 mn



  0%|                                                                                            | 0/5 [00:00<?, ?it/s][A
 20%|████████████████▌                                                                  | 1/5 [07:50<31:23, 470.81s/it][A
 40%|█████████████████████████████████▏                                                 | 2/5 [15:43<23:34, 471.47s/it][A
 60%|█████████████████████████████████████████████████▊                                 | 3/5 [23:42<15:47, 473.64s/it][A
 80%|██████████████████████████████████████████████████████████████████▍                | 4/5 [31:32<07:52, 472.60s/it][A
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [39:45<00:00, 477.11s/it][A
100%|████████████████████████████████████████████████████████████████████████████████| 8/8 [5:26:47<00:00, 2450.97s/it]


### On the entire 24h period for generality

#For efficiency, the binned time series are computed first.
#Note: this produces more entries than needed, because the disturbance days are not removed from the
#raw cleaned-movements dataframe.
#pen -> bin size (mn) -> zone -> date -> {hen column: list of per-bin values}
dico_pen_bin_zone_level_h = dict()
#pen -> bin size (mn) -> date -> {hen column: list of per-bin zone shares}
dico_pen_bin_level_h = dict()
def duration_normalized_perZone(x):
    """Return the share of the entries of x that fall in each of the five zones.

    x: sized iterable of zone labels ('1_Zone', ..., '5_Zone').
    Returns a list of five fractions, ordered from '1_Zone' to '5_Zone'. Any other label
    still counts in the denominator but has no entry of its own.
    Raises ZeroDivisionError when x is empty.
    """
    occurrences = Counter(x)
    total = len(x)
    #a Counter returns 0 for labels that never occur, so absent zones give a share of 0
    return [occurrences[str(zone_nbr) + '_Zone'] / total for zone_nbr in range(1, 6)]
#small example
#li = ['1_Zone','3_Zone','3_Zone','4_Zone','5_Zone','5_Zone','1_Zone']
#duration_normalized_perZone(li)

#For each pen and each bin size, build two nested dictionaries:
# - dico_pen_bin_level_h[pen][bin size][date] = {hen column: list (one item per bin of that day) of the 5 zone shares}
# - dico_pen_bin_zone_level_h[pen][bin size][zone][date] = {hen column: list (one item per bin of that day) of the
#   number of rows spent in that zone divided by 60}

#zones are taken from the whole dataset (not only from the current pen), so every pen gets an entry per zone;
#computed once as it depends neither on the pen nor on the bin size
li_zone = df['Zone'].unique()

for p, df_pen in tqdm.tqdm(df.groupby('PenID')):
    
    #update results
    dico_pen_bin_zone_level_h[p] = {}
    dico_pen_bin_level_h[p] = {}
    
    #compute time series
    df_ts = time_series_henColumn_tsRow(df_pen, config, col_ts='Zone', ts_with_all_hen_value=False, save=False, 
                                        hen_time_series=False)
    
    #these only depend on the pen time series, not on the bin size: compute them once per pen
    mi = df_ts['Timestamp'].min()
    ma = df_ts['Timestamp'].max()
    li_hen = [v for v in df_ts.columns if 'hen_' in v]
    
    for nbr_binmn in tqdm.tqdm(li_binmn):
        
        #update results
        dico_pen_bin_zone_level_h[p][nbr_binmn] = {}
        dico_pen_bin_level_h[p][nbr_binmn] = {}
                
        #reduce to the interval we want: bin edges every nbr_binmn minutes between the first and last timestamp
        #NOTE(review): the end is NOT extended to the end of the last day here; rows located after the last bin
        #edge get no edge in the merge below and are therefore dropped by the groupby - confirm this is intended
        new_timestamp = str(nbr_binmn)+'mn_timestamp'
        #'min' is the canonical minute alias (uppercase 'MIN' is deprecated in recent pandas versions)
        Daterange = pd.date_range(start = mi, end = ma, freq = str(nbr_binmn)+'min')    
        #date_range already returns datetimes, no further conversion is needed
        df_date = pd.DataFrame({new_timestamp:Daterange})
        #associate to each row the first bin edge that is >= its timestamp
        df_ts_ = pd.merge_asof(df_ts, df_date, left_on=['Timestamp'], right_on=[new_timestamp], direction='forward')
        
        ################# overall mlp #################
        #per bin and per hen: share of the rows spent in each of the five zones
        df_sim = df_ts_.groupby(new_timestamp)[li_hen].agg(lambda x: duration_normalized_perZone(x)).reset_index()
        df_sim['date'] = df_sim[new_timestamp].map(lambda x: dt.datetime(x.year,x.month,x.day))
        #groupby date to have a list of zones per day (rows) for the hens (columns)
        df_sim = df_sim.groupby('date')[li_hen].agg(lambda x: list(x)).reset_index()

        #melt to have one row per (day, hens) to avoid looping to create the dictionary
        #variable column has the henIDs
        df_sim_ = pd.melt(df_sim, id_vars=['date'], value_vars=li_hen)
        #group by the column name (not by a one-element list): pandas>=2.0 would otherwise return 1-tuples as keys,
        #which would silently change the keys of the resulting dictionary
        for d, df__ in df_sim_.groupby('date'):
            #update results
            dico_pen_bin_level_h[p][nbr_binmn][d] = dict(zip(df__['variable'].tolist(), df__['value'].tolist()))   
            
            
        ################# zone-ts over each zone #################                
        for ZONE in li_zone:
            
            #update results
            dico_pen_bin_zone_level_h[p][nbr_binmn][ZONE] = {}
                
            #per bin and per hen: number of rows in ZONE divided by 60
            #presumably df_ts has one row per second, so that this is a number of minutes - TODO confirm
            df_zone_sim = df_ts_.groupby(new_timestamp)[li_hen].agg(lambda x: sum([i==ZONE for i in x])/60).reset_index()
            df_zone_sim['date'] = df_zone_sim[new_timestamp].map(lambda x: dt.datetime(x.year,x.month,x.day))

            #groupby date to have a list of zones per day (rows) for the hens (columns)
            df_zone_sim = df_zone_sim.groupby('date')[li_hen].agg(lambda x: list(x)).reset_index()
            
            #melt to have one row per (day, hens) to avoid looping to create the dictionary
            #variable column has the henIDs
            df_zone_sim_ = pd.melt(df_zone_sim, id_vars=['date'], value_vars=li_hen)
            #same remark as above: scalar grouper so that the keys are dates and not 1-tuples
            for d, df__ in df_zone_sim_.groupby('date'):
                #update results
                dico_pen_bin_zone_level_h[p][nbr_binmn][ZONE][d] = dict(zip(df__['variable'].tolist(), df__['value'].tolist()))
#save dictionaries
#the 'with' blocks guarantee that each file is flushed and closed once its dump is done (the previous version
#never closed the handles returned by open())
with open(os.path.join(path_extracted_data, id_run+'dico_pen_bin_zone_level_h.pkl'), 'wb') as pkl_file:
    pickle.dump(dico_pen_bin_zone_level_h, pkl_file, pickle.HIGHEST_PROTOCOL)
with open(os.path.join(path_extracted_data, id_run+'dico_pen_bin_level_h.pkl'), 'wb') as pkl_file:
    pickle.dump(dico_pen_bin_level_h, pkl_file, pickle.HIGHEST_PROTOCOL)

#load the two dictionaries back from disk
#the 'with' blocks close the files right after reading (the previous version left the handles open)
#NOTE: pickle.load can execute arbitrary code, only load files that were produced by this notebook
with open(os.path.join(path_extracted_data, id_run+'dico_pen_bin_zone_level_h.pkl'), 'rb') as pkl_file:
    dico_pen_bin_zone_level_h = pickle.load(pkl_file)
with open(os.path.join(path_extracted_data, id_run+'dico_pen_bin_level_h.pkl'), 'rb') as pkl_file:
    dico_pen_bin_level_h = pickle.load(pkl_file)