In [None]:
def HenVariable(df, config, path_FocalBird, ts_name, name_='', timestamp_name='Timestamp'):
    
    '''Compute per-hen variables aggregated over a "level" period and save them as a csv file.

    A "level" is either a date (day time series) or a night label built from two consecutive
    dates (night time series, e.g. 17h-3h spans two days). Works with time series that contain
    nan zones (typically at the beginning): nan zones are ignored by every computed variable.

    Input:
    df: records passed as-is to time_series_henColumn_tsRow(), which must return one row per
        timestamp with a timestamp_name column and one zone column per hen (column name starting
        with hen_). For ts_name='time_serie_day' the returned frame must also have a 'date' column.
    config: parameter object; the attributes read here are path_extracted_data, li_date2remove,
        id_run, date_max, dico_night_hour and dico_date2remove_pens
    path_FocalBird: path of the excel file describing the focal birds. Columns used: HenID,
        StartDate, EndDate, ShouldBeExcluded, PenID, CLASS and '29-09 weight'
    ts_name: 'time_serie_night' or 'time_serie_day', selects which timestamps are kept and how
        the level is defined
    name_: free text inserted in the name of the saved csv file
    timestamp_name: name of the timestamp column

    Output:
    dataframe with one row per (HenID, level) holding the computed variables. The same dataframe
    is saved (sep=';') in config.path_extracted_data as <id_run>_<ts_name>_<name_>_variables.csv

    Note: on invalid input an error message is printed and sys.exit() is called.'''
    
    #start recording the time it last
    #(time.clock() was removed in python 3.8, perf_counter() is its documented replacement)
    START_TIME = time.perf_counter()
    
    print('----------------- create time serie for simplicity....')
    df_ts = time_series_henColumn_tsRow(df, config, col_ts='Zone', ts_with_all_hen_value=False)

    #compute nbr_sec here: the difference between two consecutive timestamps must always be the same
    li_ts = df_ts[timestamp_name].tolist()
    li_diff_ts = list({later-earlier for earlier, later in zip(li_ts[:-1], li_ts[1:])})
    if len(li_diff_ts)!=1:
        print('ERROR: your timestamp columns have different one to one difference: ', li_diff_ts)
        sys.exit()
    nbr_sec = li_diff_ts[0].total_seconds()
    print('your time series has %d seconds between two timestamps'%nbr_sec)    
    
    ############ initialise parameters from config file
    path_extracted_data = config.path_extracted_data
    li_date2remove = config.li_date2remove
    id_run = config.id_run
    date_max = config.date_max
    dico_night_hour = config.dico_night_hour
    dico_date2remove_pens = config.dico_date2remove_pens
    
    ############ add correct 'level' variable (i.e. consecutive time slot for night time series)
    df_ts['is_day'] = df_ts[timestamp_name].map(lambda x: is_day(x, dico_night_hour))
    #note that midnight is: 0, and its date should be as 1,2 (day-1, day)
    if ts_name == 'time_serie_night':
        def night_level(ts):
            #label of the form '<YYYY-MM-DD>_<DD>': date on which the night starts, then day of month
            #on which it ends. name_level() presumably tells if ts lies before midnight - TODO confirm
            if name_level(ts, dico_night_hour):
                return str(ts)[0:-9]+'_'+str(ts+dt.timedelta(days=1))[8:10]
            return str(ts-dt.timedelta(days=1))[0:-9]+'_'+str(ts)[8:10]
        df_ts = df_ts[~df_ts['is_day']].copy()
        df_ts['level'] = df_ts[timestamp_name].map(night_level)
    elif ts_name == 'time_serie_day':
        df_ts = df_ts[df_ts['is_day']].copy()
        df_ts['level'] = df_ts['date'].copy()
    else:
        print('ERROR: ts_name parameter must either be time_serie_night or time_serie_day')
        sys.exit()
        
    ############ verifications
    #verify columns name of df_ts and select the column we need
    li_hen = [c for c in df_ts.columns if c.startswith('hen_')]
    if not all([c in df_ts.columns for c in [timestamp_name,'level']]):
        print('ERROR: your df_ts must have timestamp and level column name')
        sys.exit()
    df_wide = df_ts.filter([timestamp_name,'level']+li_hen).copy()
    #verify that the timestamp has same difference than the nbr_sec computed above
    df_wide = df_wide.sort_values(timestamp_name)
    if df_wide.shape[0]<2:
        #without this guard the check below fails with an unexplained IndexError
        print('ERROR: your time series has less than two timestamps for ts_name=%s'%ts_name)
        sys.exit()
    #total_seconds() and not .seconds: the latter is only the seconds component of the difference
    #(always below one day) while nbr_sec is a total number of seconds
    if (df_wide[timestamp_name].iloc[1]-df_wide[timestamp_name].iloc[0]).total_seconds()!=nbr_sec:
        print('ERROR: your timestamp difference does not equal your nbr_sec parameter')
        sys.exit()

    ############ one row per unique hen-timestamp 
    #explicit var_name/value_name: the default name 'variable' is not used by melt when the columns
    #index has a name. Rows of a given hen stay in timestamp order, as df_wide is sorted
    df_long = pd.melt(df_wide, id_vars=[timestamp_name,'level'], value_vars=li_hen, var_name='HenID', value_name='Zone')
    #we define the duration of each row to be the nbr_sec, its better than computing with the next timestamp as if we removed some days
    #due to health-assessment, then it will induce wrong durations! also more efficient that way. BUT its an assumption, that the row must
    #be equally spaced and nbr_sec is the duration in between each timestamp
    df_long['duration_sec'] = nbr_sec
    #list of not nan Zones (in order of first appearance)
    li_Zone = list(df_long['Zone'].dropna().unique())
    #rows with a known zone, used by the variables that must ignore nan zones
    df_known = df_long[df_long['Zone'].notnull()]

    ########################################################
    print('----------------- total duration per Zone in seconds!!....')
    #one row per day, hen, existingzone (groupby drops the nan zones)
    df_ = df_long.groupby(['HenID','level','Zone'])['duration_sec'].sum().reset_index()
    #one row per day and hen, each columns account for a zone_duration
    df_daily = df_.pivot_table(values='duration_sec', index=['HenID', 'level'], columns='Zone')
    df_daily.rename(columns={z:'duration_'+z for z in li_Zone}, inplace=True)
    #lets verify with total duration (sum skips the nan, i.e. zones never visited on that level)
    df_daily['verification_daily_total_duration'] = df_daily[['duration_'+z for z in li_Zone]].sum(axis=1)
    df_daily = df_daily.reset_index()
    #replace np.nan duration by 0
    df_daily = df_daily.replace(np.nan, 0)
    df_daily['verification_daily_total_nbr_hour'] = df_daily['verification_daily_total_duration']/60/60
    print('The number of hours per \"level\" period is of:')
    #columns are selected with a list: selecting with a tuple is not supported by recent pandas
    display(df_daily.groupby(['verification_daily_total_nbr_hour'])[['level','HenID']].agg(list).reset_index())

    #create an ordered list of the normalized duration per zone for chi2distance later (hen will first be sorted by entropy, and 
    #hence we will do this at the end)
    li_zone_dur = [c for c in df_daily.columns if c.startswith('duration_')] #keep same order
    #list of plain floats per row, built directly (no str()/eval() round trip)
    df_daily['dur_values'] = pd.Series(df_daily[li_zone_dur].to_numpy().tolist(), index=df_daily.index)
    def normalized(li_dur):
        #share of each zone in the total duration, all 0 when the total is 0
        total = float(np.sum(li_dur))
        return [d/total if total!=0 else 0 for d in li_dur]
    df_daily['dur_values_normalized'] = df_daily['dur_values'].map(normalized)
    
    ########################################################
    print('----------------- first time stamp in each zone per day....')
    df_ = df_long.groupby(['HenID', 'level','Zone'])[timestamp_name].min().reset_index()
    #agg function = 'first' as the default function is the mean, which does not apply to timestamps. Here by construction 
    #df_ has a unique value per (HenID, level, Zone)
    df__ = df_.pivot_table(values=timestamp_name, index=['HenID', 'level'], columns='Zone', aggfunc='first')
    df__.rename(columns={z:'FirstTimestamp_'+z for z in li_Zone}, inplace=True)
    df__ = df__.reset_index()
    df_daily = pd.merge(df_daily, df__, how='outer', on=['HenID','level'])

    ########################################################
    print('----------------- number of Zone (excluding nan)....')
    df_ = df_known.groupby(['HenID','level'])['Zone'].nunique().reset_index()
    df_.rename(columns={'Zone':'Total_number_zone'}, inplace=True)
    df_daily = pd.merge(df_daily, df_, how='outer', on=['HenID','level'])
    
    
    ########################################################        
    #compute some variables based on a list of zones over a day, where each zone count for the same nbr_sec second
    #e.g.[einstreu,eintreu,rampe,rampe.....]
    #excluding empty zones, because it influences for exemple the entropy computation (if full of nan, then might be more predictable)    
    print('----------------- compute some variables based on a list of zones over a day....')
                        
    df_ = df_known.groupby(['HenID','level']).agg(
           list_of_durations=pd.NamedAgg(column='Zone', aggfunc=lambda x: list_of_durations(x, nbr_sec)),
           zone_list=pd.NamedAgg(column='Zone', aggfunc=lambda x: tuple(x)),
           Max_duration_zones=pd.NamedAgg(column='Zone', aggfunc=lambda x: max_duration_zones(x)),
           dico_duration_stats=pd.NamedAgg(column='Zone', aggfunc=lambda x: dico_duration_stats(x, nbr_sec)),
           dico_zone_sortedduration=pd.NamedAgg(column='Zone', aggfunc=lambda x: dico_zone_sortedduration(x, nbr_sec)),
           Total_number_transition=pd.NamedAgg(column='Zone', aggfunc=lambda x: nbr_transition(list(x))),
           nbr_bouts=pd.NamedAgg(column='Zone', aggfunc=lambda x: nbr_bouts_per_zone(list(x)))).reset_index()

    df_daily = pd.merge(df_daily, df_, how='outer', on=['HenID','level'])
    #nbr_bouts is used as a dictionary zone->number of bouts: one column per zone, 0 when the zone is absent
    for z in li_Zone:
        df_daily['nbr_bouts_'+z] = df_daily['nbr_bouts'].map(lambda x, z=z: x.get(z,0))
    df_daily = df_daily.drop(['nbr_bouts'], axis=1)
    
    #add info from stats of the duration list (from a dictionary column into x (=len(dico)) columns)
    df_daily = pd.concat([df_daily.drop(['dico_duration_stats'], axis=1), df_daily['dico_duration_stats'].apply(pd.Series)], axis=1)
        
        
    ################################################################################################################
    ############################# add basics hens info, remove unwanted dates and save #############################
    ################################################################################################################
            
    #add basics hens info
    #download info on henID association to (TagID,date) 
    df_FB = pd.read_excel(path_FocalBird, parse_dates=['StartDate','EndDate'])
    df_FB['HenID'] = df_FB['HenID'].map(lambda x: 'hen_'+str(x))
    #copy() so that the assignment below modifies df_FB itself and not a temporary slice
    df_FB = df_FB[df_FB['ShouldBeExcluded']!='yes'].copy()
    #a missing EndDate means still tracked: use the day after date_max
    df_FB['EndDate'] = df_FB['EndDate'].fillna(date_max+dt.timedelta(days=1))
    
    #Note: the HenID was already match according to the correct dates. 
    #Assumption:Each henID is linked to a unique PenID!
    #drop_duplicates: a hen described by several rows (several tracking periods) with the same info would 
    #otherwise multiply its rows in df_daily
    df_hen_info = df_FB[['HenID','PenID','CLASS','29-09 weight']].drop_duplicates()
    df_daily = pd.merge(df_daily, df_hen_info, on=['HenID'], how='left')

    #remove dates with health care
    print('-------------- Lets remove unwanted dates that impacted ALL PENS')
    if len(li_date2remove)!=0:
        df_daily['date_toberemoved'] = df_daily['level'].map(lambda x: x in li_date2remove)
        x0 = df_daily.shape[0]
        df_daily = df_daily[~df_daily['date_toberemoved']].copy()
        print_color((('By removing the unwanted days we passed from %d to %d timestamp (losing '%(x0,
                    df_daily.shape[0]),'black'), (x0-df_daily.shape[0],'red'),(' timestamp)','black')))    

    #remove dates linked to specific system
    print('-------------- Lets remove unwanted dates that impacted FEW PENS')
    if len(dico_date2remove_pens)!=0:
        def pen_to_remove(row):
            #hens without pen info (not in the focal bird file) can not be linked to a pen
            if pd.isnull(row['PenID']):
                return False
            #levels without any entry in the dictionary have no pen to remove
            return int(row['PenID']) in dico_date2remove_pens.get(row['level'], ())
        df_daily['date_2remove_penper'] = df_daily.apply(pen_to_remove, axis=1)
        x0 = df_daily.shape[0]
        df_daily = df_daily[~df_daily['date_2remove_penper']].copy()
        print_color((('By removing the unwanted days we passed from %d to %d timestamp (losing '%(x0,
                    df_daily.shape[0]),'black'), (x0-df_daily.shape[0],'red'),(' timestamp)','black')))   
        
    #remove dates linked to specific hens
    print('-------------- Lets remove dates that impacted FEW HENS')
    #create a dictionary with henID as keys and the set of tracking-active days: the days strictly between
    #StartDate and EndDate
    dico_hen_activedate = defaultdict(set)
    for hen, start, end in zip(df_FB['HenID'], df_FB['StartDate'], df_FB['EndDate']):
        li_dates = pd.date_range(start=start+dt.timedelta(days=1), end=end-dt.timedelta(days=1), freq='D')
        dico_hen_activedate[hen].update(d.date() for d in li_dates)
    #NOTE(review): this conversion needs datetime-like levels. The string levels built above for 
    #ts_name='time_serie_night' would raise a TypeError here - confirm which date a night level should map to
    df_daily['level'] = df_daily['level'].map(lambda x: dt.datetime.date(x))
    df_daily['date_2remove_penhen'] = df_daily.apply(lambda x: x['level'] not in dico_hen_activedate[x['HenID']], axis=1)
    x0 = df_daily.shape[0]
    df_daily = df_daily[~df_daily['date_2remove_penhen']]
    print_color((('By removing the unwanted days we passed from %d to %d timestamp (losing '%(x0,
                df_daily.shape[0]),'black'), (x0-df_daily.shape[0],'red'),(' timestamp)','black')))      

    #save
    df_daily = df_daily.drop(['verification_daily_total_nbr_hour','zone_list'], axis=1) #verification_daily_total_duration is kept
    df_daily.to_csv(os.path.join(path_extracted_data, id_run+'_'+ts_name+'_'+name_+'_variables.csv'), sep=';', index=False)

    END_TIME = time.perf_counter()
    print ("Total running time: %.2f mn" %((END_TIME-START_TIME)/60))

    
    return(df_daily)