## a) Extract good features for association discovery from the bird data.

In [442]:
import pandas as pd
import numpy as np


In [443]:
bird_data = pd.read_csv('data/birds2024ext.csv', delimiter=';')
features = pd.DataFrame()

bird_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 26 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   species       50 non-null     object 
 1   group         50 non-null     object 
 2   length        50 non-null     object 
 3   wspan         50 non-null     object 
 4   weight        50 non-null     object 
 5   AR            50 non-null     float64
 6   wload         50 non-null     float64
 7   back          50 non-null     object 
 8   belly         50 non-null     object 
 9   ftype         50 non-null     object 
 10  sim           50 non-null     object 
 11  billcol       50 non-null     object 
 12  legcol        50 non-null     object 
 13  arrives       50 non-null     object 
 14  leaves        50 non-null     object 
 15  eggs          50 non-null     object 
 16  incub         50 non-null     object 
 17  ccare         50 non-null     object 
 18  biotope       50 non-null     ob

### Numerical features
- Only the extremes are of interest, e.g. very few or very many eggs.
- Add `robust` attribute for high BMI
- Add `short-winged` attribute for low WSI

In [444]:
def range_to_mid(r: str) -> float:
    '''
    Return the midpoint of a numerical range given as text.

    Parameters
    ----------
    r : str
        Either a single number (``'7'``) or a ``'low-high'`` range
        (``'10-20'``). Whitespace around the numbers is ignored and
        decimal values are accepted.

    Returns
    -------
    float
        The number itself for a single value, otherwise the arithmetic
        mean of the two bounds. The result is always a ``float`` so it
        matches the annotation.

    Raises
    ------
    ValueError
        If a part is not numeric, or the text contains more than one ``'-'``.
    '''
    # float() tolerates surrounding whitespace, so no explicit strip is needed.
    bounds = [float(part) for part in r.split('-')]
    if len(bounds) == 1:
        return bounds[0]
    lower_bound, upper_bound = bounds
    return (lower_bound + upper_bound) / 2


def min_max_scaling(d: np.ndarray) -> np.ndarray:
    '''
    Linearly rescale ``d`` so its minimum maps to 0 and its maximum to 1.

    Each value is shifted by the minimum and divided by the span
    (maximum minus minimum) of ``d``.
    '''
    lowest = d.min()
    value_span = d.max() - lowest
    shifted = d - lowest
    return shifted / value_span

def l2_dist(a, b) -> np.ndarray:
    '''
    Euclidean (L2) distance between ``a`` and ``b``.

    Computed as the square root of the sum of squared element-wise
    differences. ``np.sum`` is called without an axis, so all elements
    are folded into a single value.
    '''
    delta = a - b
    squared_total = np.sum(delta ** 2)
    return np.sqrt(squared_total)


In [445]:
num_range_cols = ['length', 'wspan', 'weight', 'eggs']

# Map numerical ranges to their mid points and rename the columns to `mid-<col>`.
# Only columns that still carry their raw name are converted, so re-running
# this cell after the rename is a no-op instead of a KeyError.
pending_cols = [col for col in num_range_cols if col in bird_data.columns]
if pending_cols:
    bird_data[pending_cols] = bird_data[pending_cols].map(range_to_mid)
    bird_data = bird_data.rename(columns={col: f'mid-{col}' for col in pending_cols})

# Derived ratios: weight over squared length, and wing span over length.
bird_data['BMI'] = bird_data['mid-weight'] / (bird_data['mid-length'] ** 2)
bird_data['WSI'] = bird_data['mid-wspan'] / bird_data['mid-length']

bird_data.head(5)

Unnamed: 0,species,group,mid-length,mid-wspan,mid-weight,AR,wload,back,belly,ftype,...,biotope,diet,diver,long-billed,webbed-feet,long-legs,wading-bird,plunge-dives,BMI,WSI
0,naurulokki,laridae,36.0,92.5,275.0,8.13,0.31,light grey,white,B,...,"lakes,sea-bays","fish,invertebrates,garbage",No,No,Yes,No,No,Yes,0.212191,2.569444
1,harmaalokki,laridae,60.0,135.5,1050.0,8.24,0.64,bluish grey,white,B,...,"lakes,sea-coast,marshland","fish,garbage,chicks,grain",No,No,Yes,No,No,Yes,0.291667,2.258333
2,isolokki,laridae,65.5,148.0,1400.0,8.24,0.66,bluish grey,white,B,...,"sea-coast, harbours","fish,eggs,chicks,garbage,carrion",No,No,Yes,No,No,Yes,0.326321,2.259542
3,kalatiira,sternidae,39.0,75.0,122.5,9.14,0.24,grey,white,B,...,"lakes,archipelago",fish,No,No,Yes,No,No,Yes,0.080539,1.923077
4,lapintiira,sternidae,35.0,71.5,110.0,8.97,0.2,grey,white,B,...,"archipelago,lakes,marshland",fish,No,No,Yes,No,No,Yes,0.089796,2.042857


### BUG in Bio Group
I found that the data set contains a group named `gruifores` (line 28 of the data file).
It should be `gruiformes`, as given in the taxonomy (birdsextdescription.txt).

In [446]:
# set up a biological group taxonomy
class BioGroup():
    def __init__(self, idx: str, name: str, level: int, family_name: str | None):
        self.idx = idx
        self.name = name
        self.level = level
        self.family_name = family_name


def merge_bio_group(group_name: str, bio_tree: dict[str, BioGroup]) -> str:
    '''
    Return the label to use for ``group_name`` in the feature table.

    A group stored at level 2 is replaced by its ``family_name``; a group
    at any other level keeps its own ``name``.

    Raises ``KeyError`` when ``group_name`` is not a key of ``bio_tree``.
    '''
    node = bio_tree[group_name]
    return node.family_name if node.level == 2 else node.name


# Taxonomy keyed by (lower-cased) group name.
bio_tree: dict[str, BioGroup] = {}
# Index -> group lookup, so a parent is found directly instead of
# rescanning every group read so far for each line.
groups_by_idx: dict[str, BioGroup] = {}

with open('data/grouptaxonomy.txt', 'r') as group_fobj:
    for line in group_fobj:
        fields = [field.strip() for field in line.lower().split(';')[:2]]
        # A blank line or one without a ';' separator cannot be unpacked
        # into (index, name); skip it rather than crash.
        if len(fields) < 2:
            continue
        group_idx, group_name = fields

        # The level is the number of '.' separators in the index.
        group_level = group_idx.count('.')
        group_family = None

        # The parent's index is this index without its last component.
        # A parent that has not been read yet leaves the family as None.
        if group_level > 0:
            family_indx = group_idx.rsplit('.', 1)[0]
            parent = groups_by_idx.get(family_indx)
            if parent is not None:
                group_family = parent.name

        group = BioGroup(group_idx, group_name, group_level, group_family)
        bio_tree[group_name] = group
        # setdefault keeps the first group seen for an index, matching a
        # first-match scan in insertion order.
        groups_by_idx.setdefault(group_idx, group)

# Add group family to 3rd level bio category.
# The data set spells one group `gruifores`, while the taxonomy calls it
# `gruiformes`; correct it here so the lookup does not depend on a hand-edited
# data file. Names are lower-cased because the taxonomy keys are lower-cased.
group_names = (
    bird_data['group']
    .str.strip()
    .str.lower()
    .replace({'gruifores': 'gruiformes'})
)
features['group'] = group_names.apply(lambda g: merge_bio_group(g, bio_tree))

# Multi-valued fields are comma separated and sometimes have a space after the
# comma (e.g. "sea-coast, harbours"); strip every item so ' harbours' and
# 'harbours' are the same item, and drop empty fragments.
features['diet'] = bird_data['diet'].apply(
    lambda x: [item.strip() for item in x.split(',') if item.strip()]
)
features['habitat'] = bird_data['biotope'].apply(
    lambda x: [item.strip() for item in x.split(',') if item.strip()]
)

features.head(15)

Unnamed: 0,group,diet,habitat
0,lari,"[fish, invertebrates, garbage]","[lakes, sea-bays]"
1,lari,"[fish, garbage, chicks, grain]","[lakes, sea-coast, marshland]"
2,lari,"[fish, eggs, chicks, garbage, carrion]","[sea-coast, harbours]"
3,lari,[fish],"[lakes, archipelago]"
4,lari,[fish],"[archipelago, lakes, marshland]"
5,charadrii,"[invertebrates, seeds]","[marshland, wetlands]"
6,charadrii,[invertebrates],"[marshland, coastal-meadows]"
7,charadrii,"[invertebrates, worms, larvae, snails]",[forests]
8,charadrii,"[invertebrates, plants]","[forests, ponds]"
9,charadrii,[invertebrates],"[marshland, wetlands]"


### Binary features (Yes-No)

In [447]:
# Turn the Yes/No fields into True/False so they are easy to compare later.
yes_no_features = ['diver','long-billed','webbed-feet','long-legs','wading-bird','plunge-dives']

for feat in yes_no_features:
    features[feat] = bird_data[feat] == 'Yes'

# `sim` is not kept as one binary feature: it becomes two complementary
# attributes, `dif-gender` and `sim-gender`.
is_same_gender = bird_data['sim'] == 'Yes'
features['dif-gender'] = ~is_same_gender
features['sim-gender'] = is_same_gender

features.head(15)


Unnamed: 0,group,diet,habitat,diver,long-billed,webbed-feet,long-legs,wading-bird,plunge-dives,dif-gender,sim-gender
0,lari,"[fish, invertebrates, garbage]","[lakes, sea-bays]",False,False,True,False,False,True,False,True
1,lari,"[fish, garbage, chicks, grain]","[lakes, sea-coast, marshland]",False,False,True,False,False,True,False,True
2,lari,"[fish, eggs, chicks, garbage, carrion]","[sea-coast, harbours]",False,False,True,False,False,True,False,True
3,lari,[fish],"[lakes, archipelago]",False,False,True,False,False,True,False,True
4,lari,[fish],"[archipelago, lakes, marshland]",False,False,True,False,False,True,False,True
5,charadrii,"[invertebrates, seeds]","[marshland, wetlands]",False,False,False,True,True,False,True,False
6,charadrii,[invertebrates],"[marshland, coastal-meadows]",False,True,False,False,True,False,False,True
7,charadrii,"[invertebrates, worms, larvae, snails]",[forests],False,True,False,False,True,False,False,True
8,charadrii,"[invertebrates, plants]","[forests, ponds]",False,True,False,True,True,False,False,True
9,charadrii,[invertebrates],"[marshland, wetlands]",False,True,False,True,True,False,False,True


### Multi-valued Categorical features
Creating one attribute for each value

In [448]:
# One boolean column per distinct value, named `<feature>_<value>`, in the
# order the values first appear in the data.
multi_value_features = ['ftype', 'incub', 'ccare']
for feat in multi_value_features:
    source_col = bird_data[feat]
    feat_values = source_col.unique()
    for attribute in feat_values:
        features[f'{feat}_{attribute}'] = source_col == attribute

features.head(15)


Unnamed: 0,group,diet,habitat,diver,long-billed,webbed-feet,long-legs,wading-bird,plunge-dives,dif-gender,sim-gender,ftype_B,ftype_C,ftype_A,incub_both,incub_F,incub_M,ccare_both,ccare_F,ccare_M
0,lari,"[fish, invertebrates, garbage]","[lakes, sea-bays]",False,False,True,False,False,True,False,True,True,False,False,True,False,False,True,False,False
1,lari,"[fish, garbage, chicks, grain]","[lakes, sea-coast, marshland]",False,False,True,False,False,True,False,True,True,False,False,True,False,False,True,False,False
2,lari,"[fish, eggs, chicks, garbage, carrion]","[sea-coast, harbours]",False,False,True,False,False,True,False,True,True,False,False,True,False,False,True,False,False
3,lari,[fish],"[lakes, archipelago]",False,False,True,False,False,True,False,True,True,False,False,True,False,False,True,False,False
4,lari,[fish],"[archipelago, lakes, marshland]",False,False,True,False,False,True,False,True,True,False,False,True,False,False,True,False,False
5,charadrii,"[invertebrates, seeds]","[marshland, wetlands]",False,False,False,True,True,False,True,False,False,True,False,False,True,False,False,True,False
6,charadrii,[invertebrates],"[marshland, coastal-meadows]",False,True,False,False,True,False,False,True,False,True,False,False,True,False,False,True,False
7,charadrii,"[invertebrates, worms, larvae, snails]",[forests],False,True,False,False,True,False,False,True,False,True,False,False,True,False,False,True,False
8,charadrii,"[invertebrates, plants]","[forests, ponds]",False,True,False,True,True,False,False,True,False,True,False,True,False,False,False,False,True
9,charadrii,[invertebrates],"[marshland, wetlands]",False,True,False,True,True,False,False,True,False,True,False,True,False,False,False,False,True


### Informative features from the spring and autumn migration times
- Start early: `arrives` < April
- End late: `leaves` > September

In [449]:
# Month-number thresholds used for the migration features.
EARLY_MONTH, LATE_MONTH = 4, 9

# Lower-case English month name -> calendar month number (1-12).
month_dict = {
    name: number
    for number, name in enumerate(
        (
            'january', 'february', 'march', 'april',
            'may', 'june', 'july', 'august',
            'september', 'october', 'november', 'december',
        ),
        start=1,
    )
}


def month_from_str(text: str, early_first = True) -> int:
    '''
    Translate a month name, or a `first-last` month range, into a month number.

    The text is split on '-' and lower-cased before being looked up in
    `month_dict`. A single month is returned as is. For a two-month range,
    `early_first` selects which end is returned: the first month when it is
    true (the default), the second month otherwise.
    '''
    parts = [part.lower() for part in text.split('-')]
    if len(parts) > 1:
        first_month, last_month = parts
        chosen = first_month if early_first else last_month
    else:
        chosen = parts[0]
    return month_dict[chosen]


# Earliest month of the arrival window and latest month of the leaving window.
first_arrival_month = bird_data['arrives'].apply(month_from_str, early_first=True)
last_leave_month = bird_data['leaves'].apply(month_from_str, early_first=False)

features['early-arrival'] = first_arrival_month < EARLY_MONTH
features['late-leave'] = last_leave_month > LATE_MONTH

features[['early-arrival', 'late-leave']].head(5)


Unnamed: 0,early-arrival,late-leave
0,True,False
1,True,True
2,False,False
3,False,False
4,False,False


### Numerical features, only the extremes are interesting

In [450]:
# `robust` marks birds above the mean BMI, `short-winged` those below the mean WSI.
mean_BMI = bird_data['BMI'].mean()
mean_WSI = bird_data['WSI'].mean()

features['robust'] = bird_data['BMI'] > mean_BMI
features['short-winged'] = bird_data['WSI'] < mean_WSI

# Egg-count cut-offs: the 20 % and 80 % quantiles of the clutch-size midpoints.
egg_counts = bird_data['mid-eggs']
too_few, too_many = egg_counts.quantile([0.20, 0.80])
print(too_few, too_many, sep='\n')
features['too-few-eggs'] = egg_counts < too_few
features['too-many-eggs'] = egg_counts > too_many

features.head(15)

2.5
8.0


Unnamed: 0,group,diet,habitat,diver,long-billed,webbed-feet,long-legs,wading-bird,plunge-dives,dif-gender,...,incub_M,ccare_both,ccare_F,ccare_M,early-arrival,late-leave,robust,short-winged,too-few-eggs,too-many-eggs
0,lari,"[fish, invertebrates, garbage]","[lakes, sea-bays]",False,False,True,False,False,True,False,...,False,True,False,False,True,False,False,False,True,False
1,lari,"[fish, garbage, chicks, grain]","[lakes, sea-coast, marshland]",False,False,True,False,False,True,False,...,False,True,False,False,True,True,True,False,True,False
2,lari,"[fish, eggs, chicks, garbage, carrion]","[sea-coast, harbours]",False,False,True,False,False,True,False,...,False,True,False,False,False,False,True,False,False,False
3,lari,[fish],"[lakes, archipelago]",False,False,True,False,False,True,False,...,False,True,False,False,False,False,False,False,True,False
4,lari,[fish],"[archipelago, lakes, marshland]",False,False,True,False,False,True,False,...,False,True,False,False,False,False,False,False,True,False
5,charadrii,"[invertebrates, seeds]","[marshland, wetlands]",False,False,False,True,True,False,True,...,False,False,True,False,False,False,False,True,False,False
6,charadrii,[invertebrates],"[marshland, coastal-meadows]",False,True,False,False,True,False,False,...,False,False,True,False,False,True,False,True,False,False
7,charadrii,"[invertebrates, worms, larvae, snails]",[forests],False,True,False,False,True,False,False,...,False,False,True,False,True,True,False,True,False,False
8,charadrii,"[invertebrates, plants]","[forests, ponds]",False,True,False,True,True,False,False,...,False,False,False,True,True,False,False,True,False,False
9,charadrii,[invertebrates],"[marshland, wetlands]",False,True,False,True,True,False,False,...,False,False,False,True,False,False,False,True,False,False


In [451]:
# Column overview of the finished feature table. The previous `features.head(15)`
# in this cell was dropped: it was not the cell's last expression, so its
# result was never displayed.
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 26 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   group          50 non-null     object
 1   diet           50 non-null     object
 2   habitat        50 non-null     object
 3   diver          50 non-null     bool  
 4   long-billed    50 non-null     bool  
 5   webbed-feet    50 non-null     bool  
 6   long-legs      50 non-null     bool  
 7   wading-bird    50 non-null     bool  
 8   plunge-dives   50 non-null     bool  
 9   dif-gender     50 non-null     bool  
 10  sim-gender     50 non-null     bool  
 11  ftype_B        50 non-null     bool  
 12  ftype_C        50 non-null     bool  
 13  ftype_A        50 non-null     bool  
 14  incub_both     50 non-null     bool  
 15  incub_F        50 non-null     bool  
 16  incub_M        50 non-null     bool  
 17  ccare_both     50 non-null     bool  
 18  ccare_F        50 non-null     b

In [452]:
import sys

# Convert feature table to transaction rules
def convert_to_trans(data_row: pd.Series) -> str:
    '''
    Turn one row of the feature table into a space-separated transaction.

    Each value contributes items according to its type:
      * list   -- one item per element, prefixed with `eat_` for the `diet`
                  column and `live_` for the `habitat` column; lists in any
                  other column are ignored. Elements are stripped of
                  surrounding whitespace and empty ones are dropped, because
                  an embedded space would split one item into two tokens.
      * bool   -- the column name is emitted when the value is true. NumPy
                  booleans are accepted as well as Python `bool`.
      * str    -- the value itself is emitted.
    Values of any other type are ignored.

    Returns the items joined by single spaces, in column order.
    '''
    list_prefixes = {'diet': 'eat', 'habitat': 'live'}
    items = []
    for col, val in data_row.items():
        if isinstance(val, list):
            prefix = list_prefixes.get(col)
            if prefix is None:
                continue
            for element in val:
                token = element.strip()
                if token:
                    items.append(f'{prefix}_{token}')
        elif isinstance(val, (bool, np.bool_)):
            if val:
                items.append(col)
        elif isinstance(val, str):
            items.append(val)

    return ' '.join(items)

transactions = features.apply(convert_to_trans, axis=1)

# Take the output folder from the first command-line argument when one is given.
# Inside a Jupyter kernel sys.argv carries the kernel launcher's own options
# (typically '-f <connection file>'), so a missing or option-like argument
# falls back to the current directory instead of building a bogus path.
cli_folder = sys.argv[1] if len(sys.argv) > 1 else ''
file_folder = cli_folder if cli_folder and not cli_folder.startswith('-') else '.'
file_path = f'{file_folder}/transbird.csv'

# One transaction per line, without header or index column.
transactions.to_csv(path_or_buf=file_path, header=False, index=False)