In [1]:
#basic package
import tqdm
import os
import numpy as np
import glob
import pandas as pd
import sys
import time
import datetime as dt
import itertools
import re
import math
from collections import Counter
import operator
from scipy.stats import entropy

#plot
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Make the parent folder importable so the project modules (UTILS, config_exp2) can be found.
PACKAGE_PARENT = '../'
# '__file__' is deliberately passed as a literal string: the joined path is <cwd>/__file__,
# so its dirname resolves to the (real path of the) current working directory.
_anchor_path = os.path.join(os.getcwd(), os.path.expanduser('__file__'))
SCRIPT_DIR = os.path.dirname(os.path.realpath(_anchor_path))
_package_root = os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))
sys.path.append(_package_root)
from UTILS import preprocessing_Origins,is_day,time_series_henColumn_tsRow, is_WG_open
import config_exp2 as config

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


2


# Define parameters

In [3]:
# Pull the run parameters out of the configuration module into notebook-level names.
print('change the configuration file if not done yet!')
path_extracted_data = config.path_extracted_data
path_initial_data = config.path_initial_data
id_run = config.id_run
dico_matching = config.dico_matching
li_tracking_date = config.li_tracking_date
print(id_run)
path_extracted_data_visual = os.path.join(path_extracted_data,'visual')
#create the output directory if it does not exist yet; exist_ok avoids the check-then-create race
#and makes the cell safe to re-run
os.makedirs(path_extracted_data_visual, exist_ok=True)

change the configuration file if not done yet!
EXP2_


# Open records

In [4]:
# Load the cleaned records written by the cleaning step: ';'-separated, day-first dates.
path_cleaned_records = os.path.join(path_extracted_data, id_run+'_CLEANEDDATA.csv')
df = pd.read_csv(path_cleaned_records,
                 sep=';',
                 dayfirst=True,
                 parse_dates=['Timestamp', 'date'])
print(df.shape)
df.head(3)

(2099229, 23)


Unnamed: 0.1,Unnamed: 0,Timestamp,HenID,Zone,PenID,ts_order,TagID,signalstrength,system,time,...,zone3,zone4,next_record_date,duration,next_zone,previous_zone,is_dur_smaller_60sec,correction_is_consecutive_equal_initial_zone,is_WG_open,is_day
0,132,2021-10-08 00:24:41,hen_1,3_Zone,pen10,348506,tag_5,14,10 - 12,00:24:41,...,6,0,2021-10-08 10:59:30,38089.0,2_Zone,,False,False,False,False
1,1274,2021-10-08 10:59:30,hen_1,2_Zone,pen10,348747,tag_5,5,10 - 12,10:59:30,...,0,0,2021-10-09 10:33:27,84837.0,2_Zone,3_Zone,False,False,False,True
2,6916,2021-10-09 10:33:27,hen_1,3_Zone,pen10,350158,tag_5,15,10 - 12,10:33:27,...,4,6,2021-10-09 10:59:56,1589.0,3_Zone,2_Zone,False,False,False,True


In [5]:
# HenID -> PenID lookup; if a hen appears with several pens, the last record wins.
dico_henid_penid = {hen_id: pen_id for hen_id, pen_id in zip(df['HenID'], df['PenID'])}


# Compute variables

In [10]:
def HenDailyVariable_OriginsExp2(df, config, name_='', timestamp_name='Timestamp', save=True, has_cons_equal_zone=True): 
    
    ''' 
    Compute one row of daily movement variables per (HenID, day) from cleaned tracking records.

    Note: works with time series that contain nan (typically at the beginning).

    Input:
    df: cleaned records, one row per zone record. The code reads the columns HenID, TagID, PenID, Zone and
        `timestamp_name`; a 'duration' column is dropped if present. The frame passed by the caller is not modified.
    config: configuration module. Attributes read here: path_extracted_data, id_run, dico_night_hour,
        li_tracking_date, birth_date. It is also forwarded to the UTILS helpers, which may read more.
    name_: label inserted in the name of the saved csv file
    timestamp_name: name of the timestamp column
    save: if True, the columns 'zone_list' and 'date_2keep' are dropped and the result is written to
        <path_extracted_data>/<id_run>_daily__<name_>_variables.csv (';' separated)
    has_cons_equal_zone: if the initial data has some consecutive equal zones for the same hen (that are not
        necessarily at the same time). Currently not used by the code.

    Output:
    Dataframe with one row per (HenID, level) and the computed variables, restricted to the days that pass the
    device-update, full-day-coverage and li_tracking_date filters.

    Side effects: reads <id_run>_df_alldeviceinfo.csv from path_extracted_data; calls sys.exit() when the time
    series produced by time_series_henColumn_tsRow is not equally spaced.'''
    
    #start recording the time it lasts
    START_TIME = time.perf_counter()
    
    #remove milliseconds now that we cleaned the data (i.e. the records with less than 1 second duration)
    #sort by timestamp; sort_values returns a new frame so the caller's dataframe is left untouched
    df = df.sort_values([timestamp_name])
    #use up to the second level only
    df[timestamp_name] = pd.to_datetime(df[timestamp_name]).dt.floor('s')
    #when several records of a hen share the same second, keep only the last one
    df = df.drop_duplicates(subset=['HenID',timestamp_name], keep='last')
    
    #create dico of henid->tagid/penid as we will lose that info when converting into time series to compute var
    dico_henid_tagid = dict(zip(df['HenID'],df['TagID']))
    dico_henid_penid = dict(zip(df['HenID'],df['PenID']))
    
    #remove duration if existing in the dataframe to avoid error
    if 'duration' in df.columns:
        df = df.drop('duration', axis=1)
    print('----------------- Create time serie')
    df_ts = time_series_henColumn_tsRow(df, config, col_ts='Zone', ts_with_all_hen_value=False, save=False, hen_time_series=False)
    
    #compute nbr_sec here (set of differences between consecutive timestamps, which must contain a single value)
    li_ts = df_ts[timestamp_name].tolist()
    li_diff_ts = list(set(map(operator.sub, li_ts[1:], li_ts[:-1])))
    if len(li_diff_ts)!=1:
        print('ERROR: your timestamp columns have different one to one difference: ', li_diff_ts)
        sys.exit()
    nbr_sec = li_diff_ts[0].total_seconds()
    print('your time series has %d seconds between two timestamps'%nbr_sec)    
    
    ############ initialise parameters from config file (only the ones used in this function)
    path_extracted_data = config.path_extracted_data
    id_run = config.id_run
    dico_night_hour = config.dico_night_hour
    li_tracking_date = config.li_tracking_date
 
    ############ small verifications
    #verify columns name of df_ts and select the column we need
    li_hen = [i for i in list(df_ts) if i.startswith('hen_')]
    #verify that the timestamp has same difference than the computed nbr_sec
    df_ts = df_ts.sort_values(timestamp_name)
    if (df_ts[timestamp_name].iloc[1]-df_ts[timestamp_name].iloc[0]).seconds!=nbr_sec:
        print('ERROR: your timestamp difference does not equal your nbr_sec parameter')
        sys.exit()

    def _night_level(ts):
        #label of the night a timestamp belongs to: '<date of the evening>_<day-of-month of the next morning>'
        #name_level (UTILS) decides whether ts belongs to the evening part or the morning part of the night
        if name_level(ts, dico_night_hour):
            return str(ts)[0:-9]+'_'+str(ts+dt.timedelta(days=1))[8:10]
        return str(ts-dt.timedelta(days=1))[0:-9]+'_'+str(ts)[8:10]
        
    ############ separate day and night
    df_ts['is_day'] = df_ts[timestamp_name].map(lambda x: is_day(x, dico_night_hour))
    #night
    df_ts_night = df_ts[~df_ts['is_day']].copy()
    df_ts_night['night_level'] = df_ts_night[timestamp_name].map(_night_level)
    #days
    #note that midnight is: 0, and its date should be as 1,2 (day-1, day)
    df_ts = df_ts[df_ts['is_day']].copy()
    df_ts['level'] = df_ts['date'].copy()
    
    ########################################################
    #### night info 
    ########################################################    
    #has to be done at the beginning to free memory space
    print('----------------- main night zone and nbr of transitions over night....')
    df_ts_night = pd.melt(df_ts_night.filter([timestamp_name,'night_level']+li_hen), id_vars=[timestamp_name,'night_level'],
                          value_vars=li_hen)
    df_ts_night = df_ts_night.rename(columns={'variable':'HenID','value':'Zone'})
    df_ts_night = df_ts_night[~df_ts_night['Zone'].isnull()].groupby(['HenID','night_level']).agg(
                            night_Max_duration_zones=pd.NamedAgg(column='Zone', aggfunc=lambda x: max_duration_zones(x)),
                            night_duration_Z5=pd.NamedAgg(column='Zone', aggfunc=lambda x: duration_Z5(x)),
                            night_Total_number_transition=pd.NamedAgg(column='Zone', 
                                                                      aggfunc=lambda x: nbr_transition(list((x))))).reset_index()
    df_ts_night['is_mvt_night'] = df_ts_night['night_Total_number_transition'].map(lambda x: int(x>0))
        
    df_ts_night = df_ts_night.fillna(0) #add 0 to all the ones that had no transition over night
    df_ts_night['level'] = df_ts_night['night_level'].map(lambda x: dt.datetime.strptime(x.split('_')[0], '%Y-%m-%d'))
    
    ########################################################    
    ############ one row per unique hen-timestamp 
    ########################################################    
    df = df_ts.filter([timestamp_name,'level']+li_hen).copy()  
    df = pd.melt(df, id_vars=[timestamp_name,'level'], value_vars=li_hen)
    df = df.rename(columns={'variable':'HenID','value':'Zone'})
    #we define the duration of each row to be the nbr_sec, its better than computing with the next timestamp as if we removed some days
    #due to health-assessment, then it will induce wrong durations! also more efficient that way. BUT its an assumption, that the rows must
    #be equally spaced and nbr_sec is the duration in between each timestamp
    df['duration_sec'] = nbr_sec
    #list of not nan Zones
    li_Zone = list(df[~df['Zone'].isnull()]['Zone'].unique())
    li_zone_dur_col = ['duration_'+z for z in li_Zone]

    ########################################################
    print('----------------- total & percentage duration per Zone in seconds....')
    #one row per day, hen, existingzone
    df_ = df.groupby(['HenID','level','Zone'])['duration_sec'].sum().reset_index()
    #one row per day and hen, each column accounts for a zone_duration
    df_daily = df_.pivot_table(values='duration_sec', index=['HenID', 'level'], columns='Zone')
    df_daily = df_daily.rename(columns={x:'duration_'+x for x in li_Zone})
    #lets verify with total duration (nan, i.e. zone never visited that day, counts as 0)
    df_daily['verification_daily_total_duration'] = df_daily[li_zone_dur_col].sum(axis=1)
    df_daily['verification_daily_total_nbr_hour'] = df_daily['verification_daily_total_duration']/60/60

    df_daily = df_daily.reset_index()
    #replace np.nan duration by 0
    df_daily = df_daily.replace(np.nan,0)
    #print('The number of hours per \"level\" period is of:')
    #display(df_daily.groupby(['verification_daily_total_nbr_hour'])['level','HenID'].agg(lambda x: list(x)).reset_index())

    #create an ordered list of the normalized duration per zone for chi2distance later (hen will first be sorted by entropy, and 
    #hence we will do this at the end)
    li_zone_dur = [c for c in df_daily.columns if c.startswith('duration_')] #keep same order
    #one list of plain python floats per row (no str()/eval() round trip needed)
    df_daily['dur_values'] = pd.Series(df_daily[li_zone_dur].to_numpy(dtype=float).tolist(), index=df_daily.index)

    def _normalize(li_dur):
        #share of each zone in the total duration; all 0 when the total is 0
        total = float(np.sum(li_dur))
        return [i/total if total!=0 else 0 for i in li_dur]
    df_daily['dur_values_normalized'] = df_daily['dur_values'].map(_normalize)

    #add percentage in each zone also in zone 1
    for c in li_zone_dur_col:
        df_daily['perc_'+c] = df_daily.apply(lambda x: round(x[c]/x['verification_daily_total_duration']*100), axis=1)
    #(Total time outside)/(total time wg is open)
    df_daily['time_wg_open_sec'] = df_daily['level'].map(lambda x: WG_open_time(config,x)) #duration of WG opening
    df_daily['duration_sincedaystarted_beforeWGopened_sec'] = df_daily['level'].map(lambda x: WG_open_dayduration_untilopen(config,x))
    #we dont have WG opened in the first period
    if 'duration_1_Zone' in df_daily.columns:
        df_daily['perc_1_Zone_while_WG_open'] = df_daily['duration_1_Zone']/df_daily['time_wg_open_sec']*100

    ######################################################## 
    print('----------------- first timestamp in each zone per day & latency var....')
    #why: will be useful to produce other variables, to verify the code and to use it for some zones
    df_ = df.groupby(['HenID', 'level','Zone'])[timestamp_name].min().reset_index()
    #agg function = 'first' as the default function is the mean. Here by construction df_ has unique such values
    df__ = df_.pivot_table(values=timestamp_name, index=['HenID', 'level'], columns='Zone', aggfunc='first')
    df__ = df__.rename(columns={x:'FirstTimestamp_'+x for x in li_Zone})
    df__ = df__.reset_index()
    df_daily = pd.merge(df_daily, df__, how='outer', on=['HenID','level'])
    
    #latency in WG
    #how long was WG open for today
    if 'FirstTimestamp_1_Zone' in df_daily.columns:
        df_daily['nbr_h_WGopen'] = df_daily['level'].map(lambda x: nbrh_WG_open(x, config))
        #seconds since midnight of the first record in zone 1 (nan when the hen never went there that day)
        df_daily['FirstTimestamp_1_Zone_sec'] = df_daily['FirstTimestamp_1_Zone'].map(lambda x: (x.hour*60*60+x.minute*60+x.second))
        df_daily['latency_1_Zone_h'] = (df_daily['FirstTimestamp_1_Zone_sec']-
                                        df_daily['duration_sincedaystarted_beforeWGopened_sec'])/60/60

        #when hen never went outside then put the total amount of time wg was open for. later on nan will be put when wg was close
        df_daily['latency_1_Zone_h'] = np.where(df_daily['FirstTimestamp_1_Zone_sec'].isnull(), df_daily['nbr_h_WGopen'],
                                                df_daily['latency_1_Zone_h'])

    ########################################################
    print('----------------- number of Zone (excluding nan)....')
    df_ = df[~df['Zone'].isnull()].groupby(['HenID','level'])['Zone'].nunique().reset_index()
    df_ = df_.rename(columns={'Zone':'Total_number_zone'})
    df_daily = pd.merge(df_daily, df_, how='outer', on=['HenID','level'])
    
    ########################################################        
    #compute some variables 
    ########################################################
    #based on a list of zones over a day, where each zone count for the same nbr_sec second
    #e.g.[einstreu,eintreu,rampe,rampe.....]
    #excluding empty zones, because it influences for example the entropy computation (if full of nan, then might be more predictable)    
    print('----------------- compute some variables based on a list of zones over a day....')
    #note that entropy gets as input a list of proba distribution (or count of each element)                    
    #TODO once we know the schedule: add food_related_behavior(li_Z=list(x), config=config, fake_sec=...) with
    #fake_sec=0 (food_related_behavior), fake_sec=20*60 (food_related_behavior_rp) and fake_sec=-20*60 (food_related_behavior_rm)
    df_ = df[~df['Zone'].isnull()].groupby(['HenID','level']).agg(
           list_of_durations=pd.NamedAgg(column='Zone', aggfunc=lambda x: list_of_durations(x, nbr_sec)),
           zone_list=pd.NamedAgg(column='Zone', aggfunc=lambda x: tuple(x)),
           list_of_zones=pd.NamedAgg(column='Zone', aggfunc=lambda x: list_of_zones(list(x))), 
           list_ZALL=pd.NamedAgg(column='Zone', aggfunc=lambda x: list_ALL(list(x))),
           mid_cum_Z4_sec=pd.NamedAgg(column='Zone', aggfunc=lambda x: mid_cum_Z4_sec(list(x))),
           Total_number_transition=pd.NamedAgg(column='Zone', aggfunc=lambda x: nbr_transition(list(x))),
           nbr_stays=pd.NamedAgg(column='Zone', aggfunc=lambda x: nbr_bouts_per_zone(list(x))),
           distribution_entropy=pd.NamedAgg(column='Zone', aggfunc=lambda x: entropy(list(Counter(list(x)).values()),base=2)), 
           vertical_travel_distance=pd.NamedAgg(column='Zone', aggfunc=lambda x: vertical_travel_distance(list(x)))
           ).reset_index()
    #distribution entropy note: order does not matter, number of 0min in a zone does not matter. compute here to then be able to
    #verify with dur_values, instead of computing directly on dur_values
    
    df_daily = pd.merge(df_daily, df_, how='outer', on=['HenID','level'])

    #total number of stays (nbr_stays is used as a dict zone -> number of stays)
    df_daily['nbr_stays_total'] = df_daily['nbr_stays'].map(lambda x: np.nansum(list(x.values())))
    #retrieve info per zone
    for z in li_Zone:
        df_daily['nbr_stays_'+z] = df_daily['nbr_stays'].map(lambda x: x.get(z,0))
    
    ########################################################
    #### WG
    ########################################################    
    print('------------ add WG info')
    #put np.nan to all WG (zone 1) variables on the days where nbr_h_WGopen is 0
    li_wg_col = [c for c in df_daily.columns if '1_Zone' in c]
    if li_wg_col:
        wg_closed = df_daily['nbr_h_WGopen']==0
        for c in li_wg_col:
            df_daily.loc[wg_closed, c] = np.nan
        
    ########################################################
    ######## add tagid,henid and night info
    ########################################################
    df_daily['TagID'] = df_daily['HenID'].map(dico_henid_tagid)
    df_daily['PenID'] = df_daily['HenID'].map(dico_henid_penid)
    df_ts_night['level'] = df_ts_night['level'].map(lambda x: dt.datetime(x.year,x.month,x.day)) #necessary for the merging
    #ERROR here can happen if initialstartsdate has two value for example!!
    #merge on HenID, level and all other shared columns, otherwise we will have duplicated columns
    li_var_hen = [i for i in df_ts_night.columns if i in df_daily.columns]
    df_daily = pd.merge(df_daily, df_ts_night, on=li_var_hen, how='outer')
    
    ########################################################
    ##################### remove dates #####################
    ########################################################    

    ####################################################################################################################################
    ######## remove dates of tags when they were not giving device update regularly 
    print('-------------- Lets remove dates of tags when they were not giving deviceupdate correctly')
    #verified date: correct
    df_wt = pd.read_csv(os.path.join(path_extracted_data, id_run+'_df_alldeviceinfo.csv'), parse_dates=['date'], dayfirst=True, sep=';') 
    #build the set of valid (tag, date) pairs once instead of re-filtering df_wt for every row of df_daily
    df_wt_ok = df_wt[(df_wt['LFCounter_nbr_equal0']<10)&(df_wt['bigest_gap']<=15*60)].dropna(subset=['sender'])
    valid_tag_date = set(zip(df_wt_ok['sender'], df_wt_ok['date']))
    df_daily['date_2keep'] = [tag_date in valid_tag_date for tag_date in zip(df_daily['TagID'], df_daily['level'])]
    x0 = df_daily.shape[0]
    #explicit copy: new columns are added to the filtered frame below
    df_daily = df_daily[df_daily['date_2keep']].copy()
    print_color((('By removing the unwanted days we passed from %d to %d timestamp (losing '%(x0,
                df_daily.shape[0]),'black'), (x0-df_daily.shape[0],'red'),(' timestamp)','black')))  

    ####################################################################################################################################
    ######## more generally remove all (dates,tagid) with not all seconds tracked (e.g. first day an animal is tracked)
    print('-------------- Lets remove all (dates,tagid) with not all seconds tracked (e.g. first day an animal is tracked), and no night variable')
    df_daily['nbr_h_per_day'] = df_daily['level'].map(lambda x: dico_night_hour[correct_key(x,dico_night_hour)]['nbr_hour'])
    df_daily['correct_amount_of_hour'] = df_daily['verification_daily_total_nbr_hour']==df_daily['nbr_h_per_day']
    x0 = df_daily.shape[0]
    #as we wont really use the night, lets just remove all the rows where the amount of hours is incorrect, although it might
    #remove some correct nights (but otherwise we should keep nights in a separate table)
    df_daily = df_daily[df_daily['correct_amount_of_hour']]
    print_color((('By removing the unwanted days we passed from %d to %d timestamp (losing '%(x0,
                df_daily.shape[0]),'black'), (x0-df_daily.shape[0],'red'),(' timestamp)','black')))

    ######## remove all above the last official tracked day
    print('-------------- Lets remove all above the last official tracked day')
    x0 = df_daily.shape[0]
    df_daily = df_daily[df_daily['level'].isin(li_tracking_date)].copy()
    print_color((('By removing the unwanted days we passed from %d to %d timestamp (losing '%(x0,
                df_daily.shape[0]),'black'), (x0-df_daily.shape[0],'red'),(' timestamp)','black')))    

    ########################################################
    #add DOA (days of age), WOA (weeks of age)
    ########################################################    
    df_daily['DOA'] = df_daily['level'].map(lambda x: (x-config.birth_date).days) 
    df_daily['WOA'] = df_daily['DOA'].map(lambda x: math.ceil(x/7))

    #save
    if save:
        print('save')
        df_daily = df_daily.drop(columns=['zone_list','date_2keep']) #verification_daily_total_duration
        #note the double underscore after 'daily': the file name is '<id_run>_daily__<name_>_variables.csv'
        df_daily.to_csv(os.path.join(path_extracted_data, id_run+'_daily_'+'_'+str(name_)+'_variables.csv'), 
                        sep=';', index=False)

    END_TIME = time.perf_counter()
    print ("Total running time: %.2f mn" %((END_TIME-START_TIME)/60))

    return df_daily


In [11]:
#from li_tracking_date: one inner list of tracked days (midnight datetimes) per tracking session
li_alldate = [
    #session 1: October 2021
    [dt.datetime(2021, 10, day) for day in (9, 10, 11, 16, 17, 18)],
    #session 2: 23-29 November 2021, then 1 December 2021
    [dt.datetime(2021, 11, day) for day in range(23, 30)] + [dt.datetime(2021, 12, 1)],
    #session 3: 1-16 February 2022 without the 8th
    [dt.datetime(2022, 2, day) for day in range(1, 17) if day != 8],
    #session 4: 5-20 April 2022 without the 12th
    [dt.datetime(2022, 4, day) for day in range(5, 21) if day != 12],
    #session 5: 28 June - 4 July 2022
    [dt.datetime(2022, 6, day) for day in range(28, 31)] + [dt.datetime(2022, 7, day) for day in range(1, 5)],
]

In [12]:
from UTILS import preprocessing_Origins,is_day,time_series_henColumn_tsRow, is_WG_open, name_level, max_duration_zones, \
  duration_Z5, nbr_transition, WG_open_dayduration_untilopen, food_related_behavior, nbr_bouts_per_zone, \
vertical_travel_distance, nestboxes_related_behavior, list_of_zones, list_ALL, mid_cum_Z4_sec, WG_open_time, nbrh_WG_open,\
list_of_durations,print_color,correct_key

In [13]:
#Build the time series and compute the daily variables one session at a time: time series only make sense at
#session level, and so do variables such as a running entropy over the whole session evaluated at the last
#timestamp of each level.
#Once there is too much data, do this per pen (with name_=pen)!
START_TIME = time.perf_counter()
one_day = dt.timedelta(days=1)
for li_consecutivedates in tqdm.tqdm(li_alldate):
    first_date = min(li_consecutivedates)
    last_date = max(li_consecutivedates)
    #one extra day on each side: min date-1 to have the last movement; these dates are removed later anyway
    is_in_session = (df['date']<=(last_date+one_day)) & (df['date']>=(first_date-one_day))
    df_ = df[is_in_session].copy()
    print(df_.shape)
    df_daily = HenDailyVariable_OriginsExp2(df_, config,
                                            name_=str(first_date).split(' ')[0],
                                            timestamp_name='Timestamp',
                                            has_cons_equal_zone=False,
                                            save=True)
    print(df_daily.shape)
    display(df_daily.head(3))
    print(df_daily['level'].min())
END_TIME = time.perf_counter()
print ("Total running time: %.2f mn" %((END_TIME-START_TIME)/60))  

  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

(21734, 23)
----------------- Create time serie
in this time series there is 162 hens
The initial starting date in over all is: 2021-10-08 00:00:19, and the ending date will be: 2021-10-19 18:44:25
But note that birds may have different ending and starting date which should be taken into account when computing variables
and after ending the last day at midnight : 2021-10-08 00:00:19, and the ending date will be: 2021-10-19 23:59:59
Total running time: 0.29 mn
your time series has 1 seconds between two timestamps
----------------- main night zone and nbr of transitions over night....
----------------- total & percentage duration per Zone in seconds....
----------------- first timestamp in each zone per day & latency var....
----------------- number of Zone (excluding nan)....
----------------- compute some variables based on a list of zones over a day....
------------ add WG info
-------------- Lets remove dates of tags when they were not giving deviceupdate correctly


-------------- Lets remove all (dates,tagid) with not all seconds tracked (e.g. first day an animal is tracked), and no night variable


-------------- Lets remove all above the last official tracked day


save
Total running time: 7.31 mn
(807, 46)


Unnamed: 0,HenID,level,duration_2_Zone,duration_3_Zone,duration_4_Zone,duration_5_Zone,verification_daily_total_duration,verification_daily_total_nbr_hour,dur_values,dur_values_normalized,...,PenID,night_level,night_Max_duration_zones,night_duration_Z5,night_Total_number_transition,is_mvt_night,nbr_h_per_day,correct_amount_of_hour,DOA,WOA
1,hen_1,2021-10-09,28415.0,3985.0,0.0,0.0,32400.0,9.0,"[28415.0, 3985.0, 0.0, 0.0]","[0.8770061728395062, 0.12299382716049383, 0.0,...",...,pen10,2021-10-09_10,2_Zone,0,0,0,9,True,122,18
2,hen_1,2021-10-10,21881.0,9182.0,1337.0,0.0,32400.0,9.0,"[21881.0, 9182.0, 1337.0, 0.0]","[0.6753395061728396, 0.28339506172839507, 0.04...",...,pen10,2021-10-10_11,2_Zone,0,0,0,9,True,123,18
8,hen_1,2021-10-16,27548.0,4852.0,0.0,0.0,32400.0,9.0,"[27548.0, 4852.0, 0.0, 0.0]","[0.850246913580247, 0.14975308641975307, 0.0, ...",...,pen10,2021-10-16_17,2_Zone,0,0,0,9,True,129,19


 20%|████████████████▌                                                                  | 1/5 [07:19<29:17, 439.44s/it]

2021-10-09 00:00:00
(98176, 23)
----------------- Create time serie
in this time series there is 169 hens
The initial starting date in over all is: 2021-11-22 00:01:32, and the ending date will be: 2021-12-02 23:13:39
But note that birds may have different ending and starting date which should be taken into account when computing variables
and after ending the last day at midnight : 2021-11-22 00:01:32, and the ending date will be: 2021-12-02 23:59:59
Total running time: 0.28 mn
your time series has 1 seconds between two timestamps
----------------- main night zone and nbr of transitions over night....
----------------- total & percentage duration per Zone in seconds....
----------------- first timestamp in each zone per day & latency var....
----------------- number of Zone (excluding nan)....
----------------- compute some variables based on a list of zones over a day....
------------ add WG info
-------------- Lets remove dates of tags when they were not giving deviceupdate correctl

-------------- Lets remove all (dates,tagid) with not all seconds tracked (e.g. first day an animal is tracked), and no night variable


-------------- Lets remove all above the last official tracked day


save
Total running time: 9.75 mn
(1314, 54)


Unnamed: 0,HenID,level,duration_1_Zone,duration_2_Zone,duration_3_Zone,duration_4_Zone,duration_5_Zone,verification_daily_total_duration,verification_daily_total_nbr_hour,dur_values,...,PenID,night_level,night_Max_duration_zones,night_duration_Z5,night_Total_number_transition,is_mvt_night,nbr_h_per_day,correct_amount_of_hour,DOA,WOA
1,hen_1,2021-11-23,0.0,32158.0,12115.0,3348.0,979.0,48600.0,13.5,"[0.0, 32158.0, 12115.0, 3348.0, 979.0]",...,pen10,2021-11-23_24,3_Zone,0,0,0,13.5,True,167,24
2,hen_1,2021-11-24,0.0,24636.0,10313.0,3227.0,10424.0,48600.0,13.5,"[0.0, 24636.0, 10313.0, 3227.0, 10424.0]",...,pen10,2021-11-24_25,3_Zone,0,0,0,13.5,True,168,24
3,hen_1,2021-11-25,0.0,28681.0,13217.0,3396.0,3306.0,48600.0,13.5,"[0.0, 28681.0, 13217.0, 3396.0, 3306.0]",...,pen10,2021-11-25_26,3_Zone,0,0,0,13.5,True,169,25


 40%|█████████████████████████████████▏                                                 | 2/5 [17:05<26:16, 525.58s/it]

2021-11-23 00:00:00
(157452, 23)
----------------- Create time serie
in this time series there is 168 hens
The initial starting date in over all is: 2022-01-31 00:09:33, and the ending date will be: 2022-02-17 22:42:29
But note that birds may have different ending and starting date which should be taken into account when computing variables
and after ending the last day at midnight : 2022-01-31 00:09:33, and the ending date will be: 2022-02-17 23:59:59
Total running time: 0.49 mn
your time series has 1 seconds between two timestamps
----------------- main night zone and nbr of transitions over night....
----------------- total & percentage duration per Zone in seconds....
----------------- first timestamp in each zone per day & latency var....
----------------- number of Zone (excluding nan)....
----------------- compute some variables based on a list of zones over a day....
------------ add WG info
-------------- Lets remove dates of tags when they were not giving deviceupdate correct

-------------- Lets remove all (dates,tagid) with not all seconds tracked (e.g. first day an animal is tracked), and no night variable


-------------- Lets remove all above the last official tracked day


save
Total running time: 15.86 mn
(1923, 54)


Unnamed: 0,HenID,level,duration_1_Zone,duration_2_Zone,duration_3_Zone,duration_4_Zone,duration_5_Zone,verification_daily_total_duration,verification_daily_total_nbr_hour,dur_values,...,PenID,night_level,night_Max_duration_zones,night_duration_Z5,night_Total_number_transition,is_mvt_night,nbr_h_per_day,correct_amount_of_hour,DOA,WOA
1,hen_1,2022-02-01,0.0,36884.0,10409.0,2224.0,883.0,50400.0,14.0,"[0.0, 36884.0, 10409.0, 2224.0, 883.0]",...,pen10,2022-02-01_02,5_Zone,36000,0,0,14,True,237,34
2,hen_1,2022-02-02,0.0,32959.0,8728.0,1207.0,7506.0,50400.0,14.0,"[0.0, 32959.0, 8728.0, 1207.0, 7506.0]",...,pen10,2022-02-02_03,5_Zone,36000,0,0,14,True,238,34
3,hen_1,2022-02-03,0.0,32208.0,8221.0,1383.0,8588.0,50400.0,14.0,"[0.0, 32208.0, 8221.0, 1383.0, 8588.0]",...,pen10,2022-02-03_04,5_Zone,36000,0,0,14,True,239,35


 60%|█████████████████████████████████████████████████▊                                 | 3/5 [32:59<24:02, 721.37s/it]

2022-02-01 00:00:00
(153343, 23)
----------------- Create time serie
in this time series there is 167 hens
The initial starting date in over all is: 2022-04-04 00:01:59, and the ending date will be: 2022-04-21 23:46:42
But note that birds may have different ending and starting date which should be taken into account when computing variables
and after ending the last day at midnight : 2022-04-04 00:01:59, and the ending date will be: 2022-04-21 23:59:59
Total running time: 0.51 mn
your time series has 1 seconds between two timestamps
----------------- main night zone and nbr of transitions over night....
----------------- total & percentage duration per Zone in seconds....
----------------- first timestamp in each zone per day & latency var....
----------------- number of Zone (excluding nan)....
----------------- compute some variables based on a list of zones over a day....
------------ add WG info
-------------- Lets remove dates of tags when they were not giving deviceupdate correct

-------------- Lets remove all (dates,tagid) with not all seconds tracked (e.g. first day an animal is tracked), and no night variable


-------------- Lets remove all above the last official tracked day


save
Total running time: 16.98 mn
(2452, 54)


Unnamed: 0,HenID,level,duration_1_Zone,duration_2_Zone,duration_3_Zone,duration_4_Zone,duration_5_Zone,verification_daily_total_duration,verification_daily_total_nbr_hour,dur_values,...,PenID,night_level,night_Max_duration_zones,night_duration_Z5,night_Total_number_transition,is_mvt_night,nbr_h_per_day,correct_amount_of_hour,DOA,WOA
1,hen_1,2022-04-05,349.0,30288.0,11477.0,1971.0,6315.0,50400.0,14.0,"[349.0, 30288.0, 11477.0, 1971.0, 6315.0]",...,pen10,2022-04-05_06,3_Zone,0,0,0,14,True,300,43
2,hen_1,2022-04-06,820.0,31421.0,9485.0,2146.0,6528.0,50400.0,14.0,"[820.0, 31421.0, 9485.0, 2146.0, 6528.0]",...,pen10,2022-04-06_07,5_Zone,36000,0,0,14,True,301,43
3,hen_1,2022-04-07,1969.0,29593.0,10367.0,2893.0,5578.0,50400.0,14.0,"[1969.0, 29593.0, 10367.0, 2893.0, 5578.0]",...,pen10,2022-04-07_08,4_Zone,0,0,0,14,True,302,44


 80%|██████████████████████████████████████████████████████████████████▍                | 4/5 [50:01<13:59, 839.81s/it]

2022-04-05 00:00:00
(62568, 23)
----------------- Create time serie
in this time series there is 158 hens
The initial starting date in over all is: 2022-06-27 00:14:19, and the ending date will be: 2022-07-04 23:57:14
But note that birds may have different ending and starting date which should be taken into account when computing variables
and after ending the last day at midnight : 2022-06-27 00:14:19, and the ending date will be: 2022-07-04 23:59:59
Total running time: 0.22 mn
your time series has 1 seconds between two timestamps
----------------- main night zone and nbr of transitions over night....
----------------- total & percentage duration per Zone in seconds....
----------------- first timestamp in each zone per day & latency var....
----------------- number of Zone (excluding nan)....
----------------- compute some variables based on a list of zones over a day....
------------ add WG info
-------------- Lets remove dates of tags when they were not giving deviceupdate correctl

-------------- Lets remove all (dates,tagid) with not all seconds tracked (e.g. first day an animal is tracked), and no night variable


-------------- Lets remove all above the last official tracked day


save
Total running time: 8.03 mn
(1101, 54)


Unnamed: 0,HenID,level,duration_1_Zone,duration_2_Zone,duration_3_Zone,duration_4_Zone,duration_5_Zone,verification_daily_total_duration,verification_daily_total_nbr_hour,dur_values,...,PenID,night_level,night_Max_duration_zones,night_duration_Z5,night_Total_number_transition,is_mvt_night,nbr_h_per_day,correct_amount_of_hour,DOA,WOA
1,hen_1,2022-06-28,5651.0,26576.0,10915.0,3196.0,4062.0,50400.0,14.0,"[5651.0, 26576.0, 10915.0, 3196.0, 4062.0]",...,pen10,2022-06-28_29,5_Zone,36000,0,0,14,True,384,55
2,hen_1,2022-06-29,3673.0,28811.0,9034.0,3102.0,5780.0,50400.0,14.0,"[3673.0, 28811.0, 9034.0, 3102.0, 5780.0]",...,pen10,2022-06-29_30,5_Zone,35616,1,1,14,True,385,55
3,hen_1,2022-06-30,4520.0,27780.0,7641.0,2049.0,8410.0,50400.0,14.0,"[4520.0, 27780.0, 7641.0, 2049.0, 8410.0]",...,pen10,2022-06-30_01,2_Zone,0,0,0,14,True,386,56


100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [58:04<00:00, 696.82s/it]

2022-06-28 00:00:00
Total running time: 58.07 mn





In [24]:
# Concatenate all per-batch daily-variable CSVs into one table and save it to disk.
# sorted(): glob.glob returns paths in arbitrary (OS-dependent) order; sorting makes the row
# order - and the column order produced by pd.concat - of the saved file reproducible.
li_paths_var = sorted(glob.glob(os.path.join(path_extracted_data, id_run+'_daily__20*_variables.csv')))
# one file is expected per element of li_alldate (defined in an earlier cell)
if len(li_paths_var)!=len(li_alldate):
    # raise instead of print + sys.exit(): inside a notebook sys.exit() surfaces as a bare
    # SystemExit with no explanation; an exception carries the diagnostic with it
    raise RuntimeError('ERROR: not the correct number of files (found %d, expected %d)'%(
        len(li_paths_var), len(li_alldate)))
li_df = []
for path_var in tqdm.tqdm(li_paths_var):
    #for being more reproducible, we open the file that was saved from cleaning
    # NOTE(review): 'FirstTimestamp_1_Zone' is not listed in parse_dates - presumably because
    # it is absent from some files (the *_1_Zone columns are NaN for the earliest rows after
    # the concat) - confirm
    df_ = pd.read_csv(path_var, sep=';',parse_dates=['level', 'FirstTimestamp_2_Zone', 
                                                     'FirstTimestamp_3_Zone','FirstTimestamp_4_Zone',
                                                     'FirstTimestamp_5_Zone']) 
    print(df_.shape)
    li_df.append(df_)
# columns missing from a file are filled with NaN; each file keeps its own row labels, so the
# in-memory index repeats across files (the index is not written to the CSV below)
df_daily = pd.concat(li_df)
df_daily.to_csv(os.path.join(path_extracted_data, id_run+'_daily_ALL_variables.csv'), sep=';', index=False)
print(df_daily.shape)
df_daily.head(3)

 20%|████████████████▊                                                                   | 1/5 [00:44<02:59, 44.84s/it]

(807, 46)


 40%|█████████████████████████████████▌                                                  | 2/5 [02:21<03:45, 75.15s/it]

(1314, 54)


 60%|█████████████████████████████████████████████████▊                                 | 3/5 [04:37<03:25, 102.91s/it]

(1923, 54)


 80%|██████████████████████████████████████████████████████████████████▍                | 4/5 [07:27<02:09, 129.47s/it]

(2452, 54)


100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [08:37<00:00, 103.41s/it]

(1101, 54)





(7597, 54)


Unnamed: 0,HenID,level,duration_2_Zone,duration_3_Zone,duration_4_Zone,duration_5_Zone,verification_daily_total_duration,verification_daily_total_nbr_hour,dur_values,dur_values_normalized,...,DOA,WOA,duration_1_Zone,perc_duration_1_Zone,perc_1_Zone_while_WG_open,FirstTimestamp_1_Zone,nbr_h_WGopen,FirstTimestamp_1_Zone_sec,latency_1_Zone_h,nbr_stays_1_Zone
0,hen_1,2021-10-09,28415.0,3985.0,0.0,0.0,32400.0,9.0,"[28415.0, 3985.0, 0.0, 0.0]","[0.8770061728395062, 0.12299382716049383, 0.0,...",...,122,18,,,,,,,,
1,hen_1,2021-10-10,21881.0,9182.0,1337.0,0.0,32400.0,9.0,"[21881.0, 9182.0, 1337.0, 0.0]","[0.6753395061728396, 0.28339506172839507, 0.04...",...,123,18,,,,,,,,
2,hen_1,2021-10-16,27548.0,4852.0,0.0,0.0,32400.0,9.0,"[27548.0, 4852.0, 0.0, 0.0]","[0.850246913580247, 0.14975308641975307, 0.0, ...",...,129,19,,,,,,,,


In [25]:
# Show every column name of the concatenated daily table as a plain Python list
display(df_daily.columns.tolist())

['HenID',
 'level',
 'duration_2_Zone',
 'duration_3_Zone',
 'duration_4_Zone',
 'duration_5_Zone',
 'verification_daily_total_duration',
 'verification_daily_total_nbr_hour',
 'dur_values',
 'dur_values_normalized',
 'perc_duration_3_Zone',
 'perc_duration_2_Zone',
 'perc_duration_4_Zone',
 'perc_duration_5_Zone',
 'time_wg_open_sec',
 'duration_sincedaystarted_beforeWGopened_sec',
 'FirstTimestamp_2_Zone',
 'FirstTimestamp_3_Zone',
 'FirstTimestamp_4_Zone',
 'FirstTimestamp_5_Zone',
 'Total_number_zone',
 'list_of_durations',
 'list_of_zones',
 'list_ZALL',
 'mid_cum_Z4_sec',
 'Total_number_transition',
 'nbr_stays',
 'distribution_entropy',
 'vertical_travel_distance',
 'nestboxes_related_behavior',
 'nbr_stays_total',
 'nbr_stays_3_Zone',
 'nbr_stays_2_Zone',
 'nbr_stays_4_Zone',
 'nbr_stays_5_Zone',
 'TagID',
 'PenID',
 'night_level',
 'night_Max_duration_zones',
 'night_duration_Z5',
 'night_Total_number_transition',
 'is_mvt_night',
 'nbr_h_per_day',
 'correct_amount_of_hour',
 

In [26]:
# Sanity check: (rows, columns) of the concatenated table, then its first three rows
print((len(df_daily.index), len(df_daily.columns)))
df_daily.iloc[:3]

(7597, 54)


Unnamed: 0,HenID,level,duration_2_Zone,duration_3_Zone,duration_4_Zone,duration_5_Zone,verification_daily_total_duration,verification_daily_total_nbr_hour,dur_values,dur_values_normalized,...,DOA,WOA,duration_1_Zone,perc_duration_1_Zone,perc_1_Zone_while_WG_open,FirstTimestamp_1_Zone,nbr_h_WGopen,FirstTimestamp_1_Zone_sec,latency_1_Zone_h,nbr_stays_1_Zone
0,hen_1,2021-10-09,28415.0,3985.0,0.0,0.0,32400.0,9.0,"[28415.0, 3985.0, 0.0, 0.0]","[0.8770061728395062, 0.12299382716049383, 0.0,...",...,122,18,,,,,,,,
1,hen_1,2021-10-10,21881.0,9182.0,1337.0,0.0,32400.0,9.0,"[21881.0, 9182.0, 1337.0, 0.0]","[0.6753395061728396, 0.28339506172839507, 0.04...",...,123,18,,,,,,,,
2,hen_1,2021-10-16,27548.0,4852.0,0.0,0.0,32400.0,9.0,"[27548.0, 4852.0, 0.0, 0.0]","[0.850246913580247, 0.14975308641975307, 0.0, ...",...,129,19,,,,,,,,


# last minute addition

#daily var
# Reload the concatenated daily variables from disk and drop one specific date.
# The file is written by DataFrame.to_csv, so its dates are ISO formatted (YYYY-MM-DD ...).
# dayfirst=True was removed: it contradicts that format (recent pandas versions warn about
# it) and the per-batch files feeding this table are read without it.
df_daily = pd.read_csv(os.path.join(path_extracted_data, id_run+'_daily_ALL_variables.csv'), sep=';',
                 parse_dates=['level', 'FirstTimestamp_1_Zone', 'FirstTimestamp_2_Zone', 'FirstTimestamp_3_Zone',
                             'FirstTimestamp_4_Zone','FirstTimestamp_5_Zone'])
# NOTE(review): the records displayed in this notebook start in October 2021, so excluding
# 2021-07-25 may remove nothing here - confirm this date is meant for this experiment
df_daily = df_daily[df_daily['level']!=dt.datetime(2021,7,25)]
print(df_daily.shape)
display(df_daily.head(3))
# NOTE: the disabled export below uses sep=',' while this file is read with sep=';' everywhere
# else; align the separator before re-enabling it
#df_daily.to_csv(os.path.join(path_extracted_data, id_run+'_daily_ALL_variables.csv'), sep=',', index=False)