In [89]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Numerical features engineering

### Data Summary

In [90]:
# Load the bird dataset; the file uses ';' as its field separator.
data = pd.read_csv('birds2024ext.csv', delimiter=';')

# Summarise column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 26 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   species       50 non-null     object 
 1   group         50 non-null     object 
 2   length        50 non-null     object 
 3   wspan         50 non-null     object 
 4   weight        50 non-null     object 
 5   AR            50 non-null     float64
 6   wload         50 non-null     float64
 7   back          50 non-null     object 
 8   belly         50 non-null     object 
 9   ftype         50 non-null     object 
 10  sim           50 non-null     object 
 11  billcol       50 non-null     object 
 12  legcol        50 non-null     object 
 13  arrives       50 non-null     object 
 14  leaves        50 non-null     object 
 15  eggs          50 non-null     object 
 16  incub         50 non-null     object 
 17  ccare         50 non-null     object 
 18  biotope       50 non-null     ob

In [91]:
# Transposed preview: features become rows, the first nine records become columns.
data.T.iloc[:, :9]

Unnamed: 0,0,1,2,3,4,5,6,7,8
species,naurulokki,harmaalokki,isolokki,kalatiira,lapintiira,suokukko,taivaanvuohi,lehtokurppa,metsäviklo
group,laridae,laridae,laridae,sternidae,sternidae,scolopacidae,scolopacidae,scolopacidae,scolopacidae
length,34-38,55-65,63-68,36-42,33-37,25-26,25-27,34-36,21-24
wspan,86-99,123-148,138-158,70-80,66-77,46-49,39-45,55-65,39-44
weight,200-350,800-1300,1000-1800,100-145,90-130,90-130,90-110,280-330,75-85
AR,8.13,8.24,8.24,9.14,8.97,6.73,5.91,5.68,7.2
wload,0.31,0.64,0.66,0.24,0.2,0.36,0.4,0.54,0.29
back,light grey,bluish grey,bluish grey,grey,grey,dappled brown,dappled brown,dappled brown,brown
belly,white,white,white,white,white,white,white,dappled beige,white
ftype,B,B,B,B,B,C,C,C,C


## Iteration-1 features

We start with three categorical features: the habitat (`biotope`), the taxonomic `group`, and the `diet`.

In [92]:
def extract_features(df, feature_list):
    """
    Return an independent copy of the requested columns.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the data.
    - feature_list (list): Column names to keep, in the desired order.

    Returns:
    - pd.DataFrame: A copy of `df` restricted to `feature_list`; modifying it
      does not affect `df`.
    """
    return df[feature_list].copy()

In [93]:
# Iteration 1 works on three categorical columns only.
feature_list = ['group', 'biotope', 'diet']
# extract_features returns a copy, so `data` itself is left untouched.
feature_subset = extract_features(data, feature_list=feature_list)
feature_subset.head(5)


Unnamed: 0,group,biotope,diet
0,laridae,"lakes,sea-bays","fish,invertebrates,garbage"
1,laridae,"lakes,sea-coast,marshland","fish,garbage,chicks,grain"
2,laridae,"sea-coast, harbours","fish,eggs,chicks,garbage,carrion"
3,sternidae,"lakes,archipelago",fish
4,sternidae,"archipelago,lakes,marshland",fish


In [94]:
# Report missing values per column
for name in ('group', 'biotope', 'diet'):
    print(f"NaNs in '{name}':", feature_subset[name].isna().sum())

# Report cells that hold an empty string
for name in ('group', 'biotope', 'diet'):
    print(f"Empty strings in '{name}':", (feature_subset[name] == '').sum())


NaNs in 'group': 0
NaNs in 'biotope': 0
NaNs in 'diet': 0
Empty strings in 'group': 0
Empty strings in 'biotope': 0
Empty strings in 'diet': 0


In [95]:
# Function to clean categorical columns
def clean_column(df, column, delimiter=','):
    """
    Normalise the text of one categorical column, in place.

    The values are trimmed, lower-cased, have every run of whitespace
    collapsed to a single space, and finally lose the single space that may
    sit directly before or after `delimiter`.

    Parameters:
    - df (pd.DataFrame): The DataFrame to modify.
    - column (str): Name of the string column to clean.
    - delimiter (str): Separator between items inside a cell.

    Returns:
    - pd.DataFrame: The same `df` object, with `column` overwritten.
    """
    space_before = f' {delimiter}'
    space_after = f'{delimiter} '

    normalised = (
        df[column]
        .str.strip()
        .str.lower()
        # raw string: collapse any whitespace run into one space
        .str.replace(r'\s+', ' ', regex=True)
        .str.replace(space_before, delimiter, regex=False)
        .str.replace(space_after, delimiter, regex=False)
    )
    df.loc[:, column] = normalised

    return df


def count_unique_items(df, columns, delimiter=','):
    """
    Counts the number of unique items in each specified column of the DataFrame,
    considering that some cell values contain delimiter-separated items.

    Items are compared after stripping surrounding whitespace. Empty items
    (from an empty cell, a trailing delimiter or a doubled delimiter) are
    ignored so they are not counted as a value of their own.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the data.
    - columns (list): List of column names to analyze.
    - delimiter (str): Separator between items inside a cell. Defaults to ','.

    Returns:
    - dict: A dictionary with column names as keys and unique counts as values.
    """
    unique_counts = {}
    for column in columns:
        unique_items = set()
        # dropna() skips missing cells, which have no .split()
        for cell in df[column].dropna():
            tokens = (token.strip() for token in cell.split(delimiter))
            unique_items.update(token for token in tokens if token)
        unique_counts[column] = len(unique_items)
    return unique_counts

In [96]:
# Calculate unique counts

# Every selected column after the first one ('group')
cols = feature_subset.columns.tolist()[1:]
# Apply cleaning to each categorical column
for column in cols:
    feature_subset = clean_column(feature_subset, column)

unique_counts = count_unique_items(feature_subset, cols)

# Print unique counts
print("\nNumber of Unique Values in Each Column:")
for column, count in unique_counts.items():
    print(f"Column '{column}': {count} unique value(s)")

# Total via the builtin sum(); a variable named `sum` would shadow the builtin
# for every cell executed afterwards in the same kernel.
total_count = sum(unique_counts.values())
print(f"total count: {total_count} for columns {cols}")



Number of Unique Values in Each Column:
Column 'biotope': 21 unique value(s)
Column 'diet': 28 unique value(s)
total count: 49 for columns ['biotope', 'diet']


### One-Hot Encoding

The Apriori algorithm needs the data in a one-hot encoded format, with a binary value for each feature. The three features are therefore split into binary columns, one per distinct value.



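Plan for encoding `group`:
1. Go through the groups and collect each value that has not been seen yet.
2. Loop through the collected values and add one column per value.
3. Loop through the data and set a column to 1 for every row whose group matches it.
4. Delete the original `group` column.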


pd.concat() in pandas is used to concatenate two or more DataFrames or Series along a particular axis (either rows or columns). You can specify how you want to join the data (e.g., along rows or columns) and whether to keep all data or just the common parts.

In [97]:
def one_hot_encode_column(df: pd.DataFrame, column: str, unique: bool) -> pd.DataFrame:
    """
    Encodes the selected column as binary (boolean) columns.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the data.
    - column (str): Column to process.
    - unique (bool): True if every cell holds a single value; False if cells
      hold comma-separated lists of values.

    Returns:
    - pd.DataFrame: DataFrame with the original column dropped and new one-hot
      encoded columns added, named '<column>_<value>'.

    Raises:
    - ValueError: If `column` is not present in `df`.
    """
    if column not in df.columns:
        raise ValueError(f"Column '{column}' does not exist in the DataFrame.")

    if unique:
        group_dummies = pd.get_dummies(df[column], prefix=column)
    else:
        # Trim whitespace around each separator BEFORE splitting. Stripping only
        # the resulting column names would let 'a, b' and 'b' create two
        # separate columns that end up with the same name.
        normalised = df[column].str.strip().str.replace(r'\s*,\s*', ',', regex=True)
        group_dummies = normalised.str.get_dummies(sep=',').add_prefix(f'{column}_')

    # Both branches yield booleans regardless of the dummy dtype pandas picks.
    group_dummies = group_dummies.astype(bool)

    encoded_features = pd.concat([df.drop(columns=[column]), group_dummies], axis=1)
    return encoded_features


In [98]:


# Lift the display row limit so long (transposed) frames are shown in full.
pd.set_option('display.max_rows', None)  # This will show all rows
# Work on a copy so feature_subset keeps its un-encoded form.
encoded_features = feature_subset.copy()

# 'group' is encoded as a single-valued column.
encoded_features = one_hot_encode_column(encoded_features, 'group', unique=True)

# 'biotope' and 'diet' are encoded as comma-separated lists: one column per list item.
for col in ['biotope', 'diet']:
    encoded_features = one_hot_encode_column(encoded_features, col, unique=False)



# Shape of one transposed row: (number of encoded columns, 1).
print("encoded_features.head(1).T.shape", " : " , encoded_features.head(1).T.shape)

encoded_features.head(1).T.shape  :  (63, 1)


### Since association rules are not monotonic, we could find interesting rules by looking at the family of the birds. 

In [99]:
# Assuming 'encoded_features' is the DataFrame that contains the one-hot encoded columns.
# Despite its name, this list holds the one-hot 'group_*' columns, not per-species columns.
species_columns = [col for col in encoded_features.columns if col.startswith('group_')]



# Maps a one-hot group column name to a broader taxonomic label.
# NOTE(review): only seven keys are listed; any group column without an entry
# has no label to map to — confirm the omissions are intentional.
# NOTE(review): 'group_haematopodidae' is not among the 14 group columns this
# cell prints — confirm whether the key is still needed.
taxonomy_mapping = {
    'group_laridae': '1.1 Lari',
    'group_sternidae': '1.1 Lari',
    'group_scolopacidae': '1.2 Charadrii',
    'group_charadriidae': '1.2 Charadrii',
    'group_haematopodidae': '1.2 Charadrii',
    'group_dabbling ducks': '2.1 Anatinae',
    'group_diving ducks': '2.1 Anatinae'
}

# Sanity check: compare the number of mapped keys with the available group columns.
print("len(taxonomy_mapping)", " " , len(taxonomy_mapping))
print(species_columns)
print(len(species_columns))

len(taxonomy_mapping)   7
['group_accipitridae', 'group_anserinae', 'group_ardeidae', 'group_charadriidae', 'group_dabbling ducks', 'group_diving ducks', 'group_gaviidae', 'group_gruifores', 'group_laridae', 'group_phalacrocoracidae', 'group_podicipedidae', 'group_rallidae', 'group_scolopacidae', 'group_sternidae']
14


In [100]:
# Row-wise helper: translate a row's active group column into its general group
def map_general_group(row):
    """
    Return the general taxonomic label for one encoded row.

    The first column of `species_columns` whose value in `row` is truthy is
    looked up in `taxonomy_mapping`. The result is None when no group column
    is set or when the matching column has no mapping entry.
    """
    active_column = next((name for name in species_columns if row[name]), None)
    if active_column is None:
        return None
    return taxonomy_mapping.get(active_column)

### Adding the general family column

In [101]:

# Apply the mapping to create the new column 'family', then one-hot encode it
# like the other categorical features.
encoded_features['family'] = encoded_features.apply(map_general_group, axis=1)
encoded_features = one_hot_encode_column(encoded_features, 'family', unique=True)

# Restore the default row limit that an earlier cell lifted.
pd.reset_option('display.max_rows')

# Preview the last 12 features for the first three birds. This must be the
# cell's final expression: placed mid-cell, its result is silently discarded.
encoded_features.T.iloc[-12:, :3]

# encoded_features.T.iloc[-15:, :4]

## Apriori Algorithm

### Support

The support of an itemset $X$ is defined as:

$$
\text{Support}(X) = \frac{\text{Number of transactions containing } X}{\text{Total number of transactions}}
$$

### Confidence

The confidence of an association rule $X \rightarrow Y$ is defined as:
$$
\text{Confidence}(X \rightarrow Y) = \frac{\text{Support}(X \cup Y)}{\text{Support}(X)} = P(Y \mid X)
$$

### Lift

The lift of an association rule $X \rightarrow Y$ is defined as:
$$
\text{Lift}(X \rightarrow Y) = \frac{\text{Confidence}(X \rightarrow Y)}{\text{Support}(Y)} = \frac{P(Y \mid X)}{P(Y)}
$$

Alternatively, it can be expressed as:
$$
\text{Lift}(X \rightarrow Y) = \frac{\text{Support}(X \cup Y)}{\text{Support}(X) \times \text{Support}(Y)} = \frac{P(Y \mid X)}{P(Y)}
$$


In [102]:
from mlxtend.frequent_patterns import apriori, association_rules


In [103]:
# Define minimum support (e.g., 0.05 for 5%)
# use_colnames=True reports itemsets by column name rather than column index.
frequent_itemsets = apriori(encoded_features, min_support=0.05, use_colnames=True)

# Sort by support, most frequent itemsets first
frequent_itemsets = frequent_itemsets.sort_values(by='support', ascending=False)
# Number of frequent itemsets found, followed by the ten most frequent.
print(len(frequent_itemsets))
frequent_itemsets.head(10)


197


Unnamed: 0,support,itemsets
31,0.36,(diet_invertebrates)
12,0.34,(biotope_lakes)
26,0.32,(diet_fish)
30,0.26,(diet_insects)
32,0.26,(diet_plants)
40,0.24,(family_1.2 Charadrii)
80,0.2,"(diet_fish, biotope_lakes)"
35,0.2,(diet_small-rodents)
18,0.18,(biotope_sea-bays)
123,0.18,"(family_1.2 Charadrii, diet_invertebrates)"


In [104]:
# Define minimum confidence (e.g., 0.7 for 70%)
# Rules are derived from the frequent itemsets computed in the previous cell.
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)

# Sort rules by confidence, most confident first
rules = rules.sort_values(by='confidence', ascending=False)

# rules.head(20)


In [105]:
# Example: keep only rules with lift greater than 10
interesting_rules = rules[rules['lift'] > 10]

# Strongest associations (highest lift) first
interesting_rules = interesting_rules.sort_values(by='lift', ascending=False)

# interesting_rules.head(20)


## Iteration-2
## Introduction of more features


In [106]:
def boolean_encode_column(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """
    Turns a yes/no column into booleans, in place.

    'Yes' and 'No' are matched case-insensitively and ignoring surrounding
    whitespace. Values that are already booleans are kept as they are, so
    encoding an already-encoded column does not wipe it. Any other value
    becomes NaN.

    Parameters:
    - df (pd.DataFrame): The DataFrame to modify.
    - column (str): Name of the yes/no column.

    Returns:
    - pd.DataFrame: The same `df` object, with `column` overwritten.
    """
    yes_no = {'yes': True, 'no': False}

    def to_bool(value):
        # Already encoded (e.g. the function was applied twice): keep the value.
        if isinstance(value, (bool, np.bool_)):
            return bool(value)
        if isinstance(value, str):
            return yes_no.get(value.strip().lower(), np.nan)
        return np.nan

    df[column] = df[column].map(to_bool)
    return df

def add_new_columns(data: pd.DataFrame, df: pd.DataFrame, columns: list[str], booleanize: bool = True) -> pd.DataFrame:
    """
    Copies columns from `data` into `df`, optionally converting them to booleans.

    All column names are checked before anything is copied, so a bad name
    cannot leave `df` partially updated.

    Parameters:
    - data (pd.DataFrame): Source DataFrame the columns are read from.
    - df (pd.DataFrame): Target DataFrame; modified in place.
    - columns (list[str]): Names of the columns to copy.
    - booleanize (bool): If True, each copied column is passed through
      boolean_encode_column after copying.

    Returns:
    - pd.DataFrame: The same `df` object, with the new columns added.

    Raises:
    - ValueError: If any requested column is missing from `data`.
    """
    # Validate up front so failure happens before `df` is touched.
    for col in columns:
        if col not in data.columns:
            raise ValueError (f"Column '{col}' does not exist in the DataFrame.")

    for col in columns:
        df[col] = data[col]
        if booleanize:
            boolean_encode_column(df, col)

    return df

In [107]:

# 'non_sim' is the complement of 'sim' (Yes <-> No), added to the raw frame.
data['non_sim'] = data['sim'].map({'Yes': 'No', 'No': 'Yes'})

# Trait columns to copy into the encoded frame; they are booleanized on copy.
binary_encoded_columns = ['sim','non_sim' , 'diver', 'long-billed', 'webbed-feet', 'long-legs', 'wading-bird', 'plunge-dives']


# encoded_features is modified in place; the return value is not needed here.
add_new_columns(data, encoded_features, binary_encoded_columns)

# Transposed preview: features as rows, records 10-29 as columns.
encoded_features.T.iloc[:, 10:30]

Unnamed: 0,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
group_accipitridae,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
group_anserinae,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False
group_ardeidae,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
group_charadriidae,False,False,False,True,True,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False
group_dabbling ducks,False,False,False,False,False,False,False,True,True,True,True,True,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
long-billed,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False
webbed-feet,False,False,False,False,False,False,False,True,True,True,True,True,True,True,True,True,False,False,False,False
long-legs,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,True,True
wading-bird,True,True,True,True,True,True,True,False,False,False,False,False,False,False,False,False,True,True,True,True


In [108]:
def filter_rules(rules_df: pd.DataFrame, min_lift: float = 1.0, min_support: float = 0.0, 
                min_confidence: float = 0.0, max_rules: int = 100) -> pd.DataFrame:
    """
    Select the strongest association rules that clear all three thresholds.

    Parameters:
    - rules_df (pd.DataFrame): Rules with 'antecedents', 'consequents',
      'lift', 'support' and 'confidence' columns.
    - min_lift (float): Rules with a lower lift are dropped.
    - min_support (float): Rules with a lower support are dropped.
    - min_confidence (float): Rules with a lower confidence are dropped.
    - max_rules (int): Upper bound on the number of rules returned.

    Returns:
    - pd.DataFrame: At most `max_rules` rules, ordered by lift, then support,
      then confidence (all descending), with one row per
      (antecedents, consequents) pair.
    """
    passes_lift = rules_df['lift'].ge(min_lift)
    passes_support = rules_df['support'].ge(min_support)
    passes_confidence = rules_df['confidence'].ge(min_confidence)

    ranked = (
        rules_df[passes_lift & passes_support & passes_confidence]
        # best rules first: lift, then support, then confidence
        .sort_values(by=['lift', 'support', 'confidence'], ascending=[False, False, False])
        # after sorting, the first occurrence of a rule is its best-ranked row
        .drop_duplicates(subset=['antecedents', 'consequents'])
    )

    return ranked.head(max_rules)

In [109]:

# Define minimum support (e.g., 0.05 for 5%)
# Same mining step as iteration 1, now run on the frame extended with the trait columns.
frequent_itemsets = apriori(encoded_features, min_support=0.05, use_colnames=True)

# Sort by support, most frequent itemsets first
frequent_itemsets = frequent_itemsets.sort_values(by='support', ascending=False)
# Number of frequent itemsets found, followed by the ten most frequent.
print(len(frequent_itemsets))
frequent_itemsets.head(10)



1012


Unnamed: 0,support,itemsets
42,0.8,(sim)
46,0.44,(webbed-feet)
48,0.42,(wading-bird)
259,0.4,"(wading-bird, sim)"
31,0.36,(diet_invertebrates)
12,0.34,(biotope_lakes)
26,0.32,(diet_fish)
257,0.3,"(webbed-feet, sim)"
124,0.3,"(webbed-feet, biotope_lakes)"
188,0.3,"(diet_fish, sim)"


In [110]:

# Generate rules using support (threshold 0.07) as the selection metric instead of confidence.
# support_only=False: the other metrics (confidence, lift, ...) are computed as well.
rules = association_rules(frequent_itemsets, metric="support", min_threshold=0.07, support_only=False)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(wading-bird),(sim),0.42,0.80,0.40,0.952381,1.190476,0.0640,4.200000,0.275862
1,(sim),(wading-bird),0.80,0.42,0.40,0.500000,1.190476,0.0640,1.160000,0.800000
2,(webbed-feet),(sim),0.44,0.80,0.30,0.681818,0.852273,-0.0520,0.628571,-0.236364
3,(sim),(webbed-feet),0.80,0.44,0.30,0.375000,0.852273,-0.0520,0.896000,-0.464286
4,(webbed-feet),(biotope_lakes),0.44,0.34,0.30,0.681818,2.005348,0.1504,2.074286,0.895238
...,...,...,...,...,...,...,...,...,...,...
3729,(diet_plants),"(diet_invertebrates, webbed-feet, non_sim, gro...",0.26,0.08,0.08,0.307692,3.846154,0.0592,1.328889,1.000000
3730,(group_dabbling ducks),"(diet_invertebrates, diet_plants, non_sim, web...",0.10,0.08,0.08,0.800000,10.000000,0.0720,4.600000,1.000000
3731,(webbed-feet),"(diet_invertebrates, diet_plants, non_sim, gro...",0.44,0.08,0.08,0.181818,2.272727,0.0448,1.124444,1.000000
3732,(biotope_marshland),(wading-bird),0.16,0.42,0.08,0.500000,1.190476,0.0128,1.160000,0.190476


In [111]:

# Keep rules with support >= 0.2 (filter_rules' default min_lift of 1.0 also applies)
# and show the ten best, ranked by lift.
filtered_rules = filter_rules(rules, min_support=0.2, max_rules=10)
filtered_rules
# print(len(filtered_rules))

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
43,(diet_fish),"(sim, biotope_lakes)",0.32,0.24,0.2,0.625,2.604167,0.1232,2.026667,0.905882
42,"(sim, biotope_lakes)",(diet_fish),0.24,0.32,0.2,0.833333,2.604167,0.1232,4.08,0.810526
12,(family_1.2 Charadrii),(wading-bird),0.24,0.42,0.24,1.0,2.380952,0.1392,inf,0.763158
13,(wading-bird),(family_1.2 Charadrii),0.42,0.24,0.24,0.571429,2.380952,0.1392,1.773333,1.0
29,"(family_1.2 Charadrii, sim)",(wading-bird),0.22,0.42,0.22,1.0,2.380952,0.1276,inf,0.74359
32,(wading-bird),"(family_1.2 Charadrii, sim)",0.42,0.22,0.22,0.52381,2.380952,0.1276,1.638,1.0
24,"(webbed-feet, sim)",(diet_fish),0.3,0.32,0.22,0.733333,2.291667,0.124,2.55,0.805195
25,(diet_fish),"(webbed-feet, sim)",0.32,0.3,0.22,0.6875,2.291667,0.124,2.24,0.828877
31,(family_1.2 Charadrii),"(wading-bird, sim)",0.24,0.4,0.22,0.916667,2.291667,0.124,7.2,0.741627
30,"(wading-bird, sim)",(family_1.2 Charadrii),0.4,0.24,0.22,0.55,2.291667,0.124,1.688889,0.939394


## Iteration-3 Numerical features

In [112]:
def range_to_mid(r: str) -> float:
    '''
    Return the mid value of a numerical range.

    Parameters:
    - r (str): A range written as 'low-high', e.g. '34-38' or '0.5-1.5'.
      Both bounds may be integers or decimals.

    Returns:
    - float: The arithmetic mean of the two bounds.

    Raises:
    - TypeError: If `r` is not a string or contains no '-'.
    - ValueError: If `r` does not split into exactly two numeric bounds.
    '''
    if not isinstance(r, str) or '-' not in r:
        raise TypeError(f"Expected a 'low-high' range string, got {r!r}")
    # float() rather than int() so decimal bounds are accepted as well
    lower_bound, upper_bound = (float(bound) for bound in r.split('-'))
    return (lower_bound + upper_bound) / 2


def min_max_scaling(d: np.ndarray) -> np.ndarray:
    """
    Rescale values linearly so the minimum maps to 0 and the maximum to 1.

    Parameters:
    - d (np.ndarray): Numeric values to rescale.

    Returns:
    - np.ndarray: Values of the same shape in the range [0, 1]. When all
      values are equal there is no spread to scale by, so zeros are returned
      instead of dividing by zero.
    """
    min_val = d.min()
    max_val = d.max()
    value_range = max_val - min_val
    if value_range == 0:
        # Constant input: (d - min) is all zeros; multiplying by 0.0 keeps the
        # container type and yields a float result like the regular path.
        return (d - min_val) * 0.0
    return (d - min_val) / value_range

In [113]:
# Columns stored as 'low-high' range strings (see the data preview).
num_range_cols = ['length', 'wspan', 'weight']
# Columns that are already plain floats.
other_num = ['AR', 'wload']

# Copy the columns as-is: booleanize=False skips the Yes/No conversion.
add_new_columns(data, encoded_features, num_range_cols, booleanize= False)
add_new_columns(data, encoded_features, other_num, booleanize= False)

# Transposed preview of the features from position 70 onwards, first two records.
encoded_features.T.iloc[70:, :2]

Unnamed: 0,0,1
webbed-feet,True,True
long-legs,False,False
wading-bird,False,False
plunge-dives,True,True
length,34-38,55-65
wspan,86-99,123-148
weight,200-350,800-1300
AR,8.13,8.24
wload,0.31,0.64


In [114]:

# map numerical ranges to their mid points
# DataFrame.map applies range_to_mid to every cell of the selected columns.
encoded_features[num_range_cols] = encoded_features[num_range_cols].map(range_to_mid)
# Rename in place to 'mid_<col>'.
# NOTE(review): once renamed, the original column names no longer exist in
# encoded_features, so re-running this cell on its own would fail to find them.
encoded_features.rename(columns={col: f'mid_{col}' for col in num_range_cols}, inplace=True)

encoded_features.T.iloc[70:, :2]

Unnamed: 0,0,1
webbed-feet,True,True
long-legs,False,False
wading-bird,False,False
plunge-dives,True,True
mid_length,36.0,60.0
mid_wspan,92.5,135.5
mid_weight,275.0,1050.0
AR,8.13,8.24
wload,0.31,0.64
