In [398]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Numerical features engineering

### Data Summary

In [399]:
# Load the semicolon-separated bird dataset into a DataFrame
data = pd.read_csv('birds2024ext.csv', sep=';')

# Column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 26 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   species       50 non-null     object 
 1   group         50 non-null     object 
 2   length        50 non-null     object 
 3   wspan         50 non-null     object 
 4   weight        50 non-null     object 
 5   AR            50 non-null     float64
 6   wload         50 non-null     float64
 7   back          50 non-null     object 
 8   belly         50 non-null     object 
 9   ftype         50 non-null     object 
 10  sim           50 non-null     object 
 11  billcol       50 non-null     object 
 12  legcol        50 non-null     object 
 13  arrives       50 non-null     object 
 14  leaves        50 non-null     object 
 15  eggs          50 non-null     object 
 16  incub         50 non-null     object 
 17  ccare         50 non-null     object 
 18  biotope       50 non-null     ob

In [420]:
# Column dtypes: in the recorded output only AR and wload are float64; all other columns are object
print(data.dtypes)

species          object
group            object
length           object
wspan            object
weight           object
AR              float64
wload           float64
back             object
belly            object
ftype            object
sim              object
billcol          object
legcol           object
arrives          object
leaves           object
eggs             object
incub            object
ccare            object
biotope          object
diet             object
diver            object
long-billed      object
webbed-feet      object
long-legs        object
wading-bird      object
plunge-dives     object
dtype: object


In [400]:
# Transposed preview: one row per column, one column per bird, first four birds only
data.iloc[:4].T

Unnamed: 0,0,1,2,3
species,naurulokki,harmaalokki,isolokki,kalatiira
group,laridae,laridae,laridae,sternidae
length,34-38,55-65,63-68,36-42
wspan,86-99,123-148,138-158,70-80
weight,200-350,800-1300,1000-1800,100-145
AR,8.13,8.24,8.24,9.14
wload,0.31,0.64,0.66,0.24
back,light grey,bluish grey,bluish grey,grey
belly,white,white,white,white
ftype,B,B,B,B


## Iteration-1 features

Start with the bird's group, biotope (habitat), and diet as the first features.

In [401]:
def extract_features(df, feature_list):
    """
    Return an independent copy of the selected columns.

    Parameters:
    - df (pd.DataFrame): Source DataFrame.
    - feature_list (list): Column names to keep, in the desired order.

    Returns:
    - pd.DataFrame: Copy of `df` restricted to `feature_list`; editing it
      does not affect `df`.
    """
    return df[feature_list].copy()

In [402]:
# Iteration-1 feature columns: group, biotope and diet
feature_list = ['group', 'biotope', 'diet']
# extract_features returns a copy, so later cleaning does not touch `data`
feature_subset = extract_features(data, feature_list=feature_list)
# Preview the first five rows of the selected features
feature_subset.head(5)


Unnamed: 0,group,biotope,diet
0,laridae,"lakes,sea-bays","fish,invertebrates,garbage"
1,laridae,"lakes,sea-coast,marshland","fish,garbage,chicks,grain"
2,laridae,"sea-coast, harbours","fish,eggs,chicks,garbage,carrion"
3,sternidae,"lakes,archipelago",fish
4,sternidae,"archipelago,lakes,marshland",fish


In [403]:
# Check every selected feature column for NaNs
for column in feature_subset.columns:
    print(f"NaNs in '{column}':", feature_subset[column].isna().sum())

# Check every selected feature column for empty strings
for column in feature_subset.columns:
    print(f"Empty strings in '{column}':", (feature_subset[column] == '').sum())


NaNs in 'group': 0
NaNs in 'biotope': 0
NaNs in 'diet': 0
Empty strings in 'group': 0
Empty strings in 'biotope': 0
Empty strings in 'diet': 0


In [404]:
# Function to clean categorical columns
def clean_column(df, column, delimiter=','):
    """
    Normalize the text of one categorical column in place.

    Each value is trimmed, lower-cased, has runs of whitespace collapsed to a
    single space, and has the space directly before or after `delimiter`
    removed.

    Parameters:
    - df (pd.DataFrame): DataFrame holding the column; it is modified in place.
    - column (str): Name of the string column to clean.
    - delimiter (str): Item separator used inside the cells.

    Returns:
    - pd.DataFrame: The same `df` object that was passed in.
    """
    cleaned = (
        df[column]
        .str.strip()
        .str.lower()
        # Raw-string regex: any run of whitespace becomes one space
        .str.replace(r'\s+', ' ', regex=True)
        # After collapsing, at most one space can sit on either side of a delimiter
        .str.replace(f' {delimiter}', delimiter, regex=False)
        .str.replace(f'{delimiter} ', delimiter, regex=False)
    )
    df.loc[:, column] = cleaned
    return df


def count_unique_items(df, columns):
    """
    Count the distinct items per column when cells hold comma-separated lists.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the data.
    - columns (list): Names of the columns to analyze.

    Returns:
    - dict: Maps each column name to the number of distinct items found in it.
      Null cells are skipped; items are whitespace-stripped before comparison.
    """
    return {
        column: len({
            piece.strip()
            for cell in df[column].dropna()
            for piece in cell.split(',')
        })
        for column in columns
    }

In [405]:
# Clean the multi-valued columns, then count their distinct items.
# The first column ('group') is skipped here.
cols = feature_subset.columns.tolist()[1:]
for column in cols:
    feature_subset = clean_column(feature_subset, column)

unique_counts = count_unique_items(feature_subset, cols)

# Named `total_unique` rather than `sum` so the builtin sum() is not shadowed
# in the kernel namespace for later cells.
total_unique = 0

# Print unique counts
print("\nNumber of Unique Values in Each Column:")
for column, count in unique_counts.items():
    total_unique += count
    print(f"Column '{column}': {count} unique value(s)")
print(f"total count: {total_unique} for columns {cols}")



Number of Unique Values in Each Column:
Column 'biotope': 21 unique value(s)
Column 'diet': 28 unique value(s)
total count: 49 for columns ['biotope', 'diet']


### One-Hot Encoding

The Apriori algorithm needs the data in a one-hot encoded format with binary values for each feature, so the three features are now subdivided into binary features.



Go through the groups; if a group is not yet in the array, add it.
Then loop through that array and add each entry as a column.
Then loop through the data and set that column to 1 if the bird's group matches it.
Then delete the original group column.


pd.concat() in pandas is used to concatenate two or more DataFrames or Series along a particular axis (either rows or columns). You can specify how you want to join the data (e.g., along rows or columns) and whether to keep all data or just the common parts.

In [406]:
def one_hot_encode_column(df: pd.DataFrame, column: str, unique: bool) -> pd.DataFrame:
    """
    Replace a categorical column with boolean one-hot indicator columns.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the data (not modified).
    - column (str): Name of the column to encode.
    - unique (bool): True if every cell holds a single category; False if
      cells hold comma-separated lists of categories.

    Returns:
    - pd.DataFrame: New DataFrame with `column` dropped and one boolean column
      named '<column>_<category>' appended per category.

    Raises:
    - ValueError: If `column` is not present in `df`.
    """
    if column not in df.columns:
        raise ValueError(f"Column '{column}' does not exist in the DataFrame.")

    if unique:
        group_dummies = pd.get_dummies(df[column], prefix=column)
    else:
        # Strip whitespace around every item BEFORE splitting. Stripping the
        # generated column names afterwards would turn e.g. 'fish' and ' fish'
        # into two columns with the same name.
        normalized = df[column].str.replace(r'\s*,\s*', ',', regex=True).str.strip()
        group_dummies = normalized.str.get_dummies(sep=',').add_prefix(f'{column}_')

    # Cast in both branches so the result is boolean regardless of what dtype
    # the dummy helpers return.
    group_dummies = group_dummies.astype(bool)

    encoded_features = pd.concat([df.drop(columns=[column]), group_dummies], axis=1)
    return encoded_features


In [407]:


# Show every row when displaying frames; a later cell resets this option
pd.set_option('display.max_rows', None)  # This will show all rows
# Work on a copy so feature_subset keeps the cleaned, un-encoded columns
encoded_features = feature_subset.copy()

# 'group' is encoded with unique=True (one value per cell)
encoded_features = one_hot_encode_column(encoded_features, 'group', unique=True)

# 'biotope' and 'diet' hold comma-separated lists, so they use the multi-value branch
for col in ['biotope', 'diet']:
    encoded_features = one_hot_encode_column(encoded_features, col, unique=False)



# Shape of the transposed first row, i.e. (number of encoded columns, 1)
print("encoded_features.head(1).T.shape", " : " , encoded_features.head(1).T.shape)

encoded_features.head(1).T.shape  :  (63, 1)


### Since association rules are not monotonic, we could find interesting rules by looking at the family of the birds. 

In [408]:
# Collect the one-hot columns created from 'group'.
# NOTE: despite its name, species_columns holds the 'group_*' columns, not species.
species_columns = [col for col in encoded_features.columns if col.startswith('group_')]



# Maps a 'group_*' column name to a broader family label.
# Only some groups are listed; map_general_group returns None for the rest.
# NOTE(review): 'group_haematopodidae' does not appear in the column list printed
# below, so that key looks unused with this dataset - confirm.
taxonomy_mapping = {
    'group_laridae': '1.1 Lari',
    'group_sternidae': '1.1 Lari',
    'group_scolopacidae': '1.2 Charadrii',
    'group_charadriidae': '1.2 Charadrii',
    'group_haematopodidae': '1.2 Charadrii',
    'group_dabbling ducks': '2.1 Anatinae',
    'group_diving ducks': '2.1 Anatinae'
}

# Compare the number of mapped groups with the number of group columns
print("len(taxonomy_mapping)", " " , len(taxonomy_mapping))
print(species_columns)
print(len(species_columns))

len(taxonomy_mapping)   7
['group_accipitridae', 'group_anserinae', 'group_ardeidae', 'group_charadriidae', 'group_dabbling ducks', 'group_diving ducks', 'group_gaviidae', 'group_gruifores', 'group_laridae', 'group_phalacrocoracidae', 'group_podicipedidae', 'group_rallidae', 'group_scolopacidae', 'group_sternidae']
14


In [409]:
# Row-wise helper: translate the active one-hot group column into its family label
def map_general_group(row):
    """
    Return the family label for the first truthy group column of `row`.

    Reads the notebook-level `species_columns` (columns to scan, in order) and
    `taxonomy_mapping` (column name -> family label). Returns None when no
    group column is truthy or when the active column has no mapping entry.
    """
    active_column = next((col for col in species_columns if row[col]), None)
    if active_column is None:
        return None
    return taxonomy_mapping.get(active_column)

### Adding the general family column

In [410]:

# Derive each bird's 'family' label from its one-hot group columns, then
# one-hot encode it. Any family_* columns left from a previous run of this
# cell are dropped first so that re-running does not create duplicate columns.
stale_family_columns = [col for col in encoded_features.columns if col.startswith('family_')]
encoded_features = encoded_features.drop(columns=stale_family_columns)
encoded_features['family'] = encoded_features.apply(map_general_group, axis=1)
encoded_features = one_hot_encode_column(encoded_features, 'family', unique=True)

# Last 12 feature rows (transposed) for the first three birds
encoded_features.T.iloc[-12:, :3]

Unnamed: 0,0,1,2
diet_rodents,False,False,False
diet_seeds,False,False,False
diet_shellfish,False,False,False
diet_small-rodents,False,False,False
diet_snails,False,False,False
diet_snakes,False,False,False
diet_squirrels,False,False,False
diet_vertebrae,False,False,False
diet_worms,False,False,False
family_1.1 Lari,True,True,True


## Apriori Algorithm

### Support

The support of an itemset $X$ is defined as:

$$
\text{Support}(X) = \frac{\text{Number of transactions containing } X}{\text{Total number of transactions}}
$$

### Confidence

The confidence of an association rule $X \rightarrow Y$ is defined as:
$$
\text{Confidence}(X \rightarrow Y) = \frac{\text{Support}(X \cup Y)}{\text{Support}(X)} = P(Y \mid X)
$$

### Lift

The lift of an association rule $X \rightarrow Y$ is defined as:
$$
\text{Lift}(X \rightarrow Y) = \frac{\text{Confidence}(X \rightarrow Y)}{\text{Support}(Y)} = \frac{P(Y \mid X)}{P(Y)}
$$

Alternatively, it can be expressed as:
$$
\text{Lift}(X \rightarrow Y) = \frac{\text{Support}(X \cup Y)}{\text{Support}(X) \times \text{Support}(Y)} = \frac{P(Y \mid X)}{P(Y)}
$$


In [411]:
from mlxtend.frequent_patterns import apriori, association_rules


In [412]:
# Mine itemsets with support of at least 0.05 (5%) and rank them by support
frequent_itemsets = (
    apriori(encoded_features, min_support=0.05, use_colnames=True)
    .sort_values(by='support', ascending=False)
)
print(len(frequent_itemsets))
frequent_itemsets.head(10)


197


Unnamed: 0,support,itemsets
31,0.36,(diet_invertebrates)
12,0.34,(biotope_lakes)
26,0.32,(diet_fish)
30,0.26,(diet_insects)
32,0.26,(diet_plants)
40,0.24,(family_1.2 Charadrii)
80,0.2,"(diet_fish, biotope_lakes)"
35,0.2,(diet_small-rodents)
18,0.18,(biotope_sea-bays)
123,0.18,"(diet_invertebrates, family_1.2 Charadrii)"


In [413]:
# Build rules with confidence of at least 0.7 (70%) and rank them by confidence
rules = (
    association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)
    .sort_values(by='confidence', ascending=False)
)

rules.head(20)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
172,"(family_2.1 Anatinae, group_dabbling ducks, bi...",(diet_plants),0.06,0.26,0.06,1.0,3.846154,0.0444,inf,0.787234
321,(diet_snakes),(diet_frogs),0.06,0.18,0.06,1.0,5.555556,0.0492,inf,0.87234
150,"(biotope_marshland, family_1.2 Charadrii)",(diet_invertebrates),0.06,0.36,0.06,1.0,2.777778,0.0384,inf,0.680851
320,(diet_garbage),(family_1.1 Lari),0.06,0.1,0.06,1.0,10.0,0.054,inf,0.957447
318,(diet_seeds),(diet_invertebrates),0.06,0.36,0.06,1.0,2.777778,0.0384,inf,0.680851
154,"(family_2.1 Anatinae, diet_invertebrates, biot...",(group_dabbling ducks),0.06,0.1,0.06,1.0,10.0,0.054,inf,0.957447
155,"(family_2.1 Anatinae, diet_invertebrates, grou...",(diet_plants),0.06,0.26,0.06,1.0,3.846154,0.0444,inf,0.787234
157,"(diet_invertebrates, group_dabbling ducks, die...",(family_2.1 Anatinae),0.06,0.14,0.06,1.0,7.142857,0.0516,inf,0.914894
316,(group_ardeidae),(diet_small-rodents),0.06,0.2,0.06,1.0,5.0,0.048,inf,0.851064
315,(diet_snails),(family_1.2 Charadrii),0.06,0.24,0.06,1.0,4.166667,0.0456,inf,0.808511


In [414]:
# Keep only the rules with lift greater than 10
interesting_rules = rules[rules['lift'] > 10]

# Strongest lift first
interesting_rules = interesting_rules.sort_values(by='lift', ascending=False)

interesting_rules.head(20)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
146,(diet_garbage),"(group_laridae, diet_fish)",0.06,0.06,0.06,1.0,16.666667,0.0564,inf,1.0
145,(group_laridae),"(diet_fish, diet_garbage)",0.06,0.06,0.06,1.0,16.666667,0.0564,inf,1.0
213,"(diet_fish, diet_garbage, family_1.1 Lari)",(group_laridae),0.06,0.06,0.06,1.0,16.666667,0.0564,inf,1.0
212,"(group_laridae, diet_fish, family_1.1 Lari)",(diet_garbage),0.06,0.06,0.06,1.0,16.666667,0.0564,inf,1.0
215,"(group_laridae, diet_fish)","(diet_garbage, family_1.1 Lari)",0.06,0.06,0.06,1.0,16.666667,0.0564,inf,1.0
216,"(group_laridae, family_1.1 Lari)","(diet_fish, diet_garbage)",0.06,0.06,0.06,1.0,16.666667,0.0564,inf,1.0
217,"(diet_fish, diet_garbage)","(group_laridae, family_1.1 Lari)",0.06,0.06,0.06,1.0,16.666667,0.0564,inf,1.0
218,"(diet_garbage, family_1.1 Lari)","(group_laridae, diet_fish)",0.06,0.06,0.06,1.0,16.666667,0.0564,inf,1.0
220,(diet_garbage),"(group_laridae, diet_fish, family_1.1 Lari)",0.06,0.06,0.06,1.0,16.666667,0.0564,inf,1.0
219,(group_laridae),"(diet_fish, diet_garbage, family_1.1 Lari)",0.06,0.06,0.06,1.0,16.666667,0.0564,inf,1.0


## Introduction of more features


In [415]:
# Restore the default row limit (an earlier cell set display.max_rows to None)
pd.reset_option('display.max_rows')

# Last 15 feature rows (transposed) for the first four birds
encoded_features.T.iloc[-15:, :4]

Unnamed: 0,0,1,2,3
diet_lizards,False,False,False,False
diet_molluscs,False,False,False,False
diet_plants,False,False,False,False
diet_rodents,False,False,False,False
diet_seeds,False,False,False,False
diet_shellfish,False,False,False,False
diet_small-rodents,False,False,False,False
diet_snails,False,False,False,False
diet_snakes,False,False,False,False
diet_squirrels,False,False,False,False


In [422]:
def boolean_encode_column(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """
    Convert a 'Yes'/'No' column to True/False in place.

    Parameters:
    - df (pd.DataFrame): DataFrame holding the column; it is modified in place.
    - column (str): Name of the column to convert.

    Returns:
    - pd.DataFrame: The same `df` object. Values other than exactly 'Yes' or
      'No' are not in the lookup and come out as NaN.
    """
    yes_no_lookup = {'Yes': True, 'No': False}
    converted = df[column].map(yes_no_lookup)
    df[column] = converted
    return df

def add_new_colummns(data: pd.DataFrame, df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
    """
    Copy Yes/No columns from `data` into `df` and convert them to booleans.

    Parameters:
    - data (pd.DataFrame): Source DataFrame the columns are read from.
    - df (pd.DataFrame): Target DataFrame; it is modified in place.
    - columns (list[str]): Names of the columns to copy and convert.

    Returns:
    - pd.DataFrame: The same `df` object, with the new boolean columns added.

    Raises:
    - ValueError: If any requested column is missing from `data`. All names
      are checked before `df` is touched, so a failure leaves `df` unchanged.
    """
    missing = [col for col in columns if col not in data.columns]
    if missing:
        raise ValueError (f"Column '{missing[0]}' does not exist in the DataFrame.")

    for col in columns:
        df[col] = data[col]
        # Converts the freshly copied column in place
        boolean_encode_column(df, col)

    return df



In [423]:
# Columns from the raw data that boolean_encode_column converts from 'Yes'/'No'
binary_encoded_columns = ['sim', 'diver', 'long-billed', 'webbed-feet', 'long-legs', 'wading-bird', 'plunge-dives']

# Adds the columns to encoded_features in place; the return value is not needed
add_new_colummns(data, encoded_features, binary_encoded_columns)

# Transposed view: every feature row for the birds at positions 10-29
encoded_features.T.iloc[:, 10:30]

Unnamed: 0,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
group_accipitridae,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
group_anserinae,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False
group_ardeidae,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
group_charadriidae,False,False,False,True,True,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False
group_dabbling ducks,False,False,False,False,False,False,False,True,True,True,True,True,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
diet_worms,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False
family_1.1 Lari,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
family_1.2 Charadrii,True,True,True,True,True,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False
family_2.1 Anatinae,False,False,False,False,False,False,False,True,True,True,True,True,True,True,False,False,False,False,False,False
