In [185]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Numerical features engineering

### Data Summary

In [186]:
data = pd.read_csv('birds2024ext.csv', delimiter=';')

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 26 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   species       50 non-null     object 
 1   group         50 non-null     object 
 2   length        50 non-null     object 
 3   wspan         50 non-null     object 
 4   weight        50 non-null     object 
 5   AR            50 non-null     float64
 6   wload         50 non-null     float64
 7   back          50 non-null     object 
 8   belly         50 non-null     object 
 9   ftype         50 non-null     object 
 10  sim           50 non-null     object 
 11  billcol       50 non-null     object 
 12  legcol        50 non-null     object 
 13  arrives       50 non-null     object 
 14  leaves        50 non-null     object 
 15  eggs          50 non-null     object 
 16  incub         50 non-null     object 
 17  ccare         50 non-null     object 
 18  biotope       50 non-null     ob

In [187]:
data.T.iloc[:, :4]

Unnamed: 0,0,1,2,3
species,naurulokki,harmaalokki,isolokki,kalatiira
group,laridae,laridae,laridae,sternidae
length,34-38,55-65,63-68,36-42
wspan,86-99,123-148,138-158,70-80
weight,200-350,800-1300,1000-1800,100-145
AR,8.13,8.24,8.24,9.14
wload,0.31,0.64,0.66,0.24
back,light grey,bluish grey,bluish grey,grey
belly,white,white,white,white
ftype,B,B,B,B


## Iteration-1 features

start with the habitat, group, diet as the first features.

In [188]:
def extract_features(df, feature_list):
    df_return = df[feature_list].copy()
    return df_return

In [189]:
# More descriptive naming
feature_list = ['species', 'group', 'biotope', 'diet']
feature_subset = extract_features(data, feature_list=feature_list)
feature_subset.head(5)


Unnamed: 0,species,group,biotope,diet
0,naurulokki,laridae,"lakes,sea-bays","fish,invertebrates,garbage"
1,harmaalokki,laridae,"lakes,sea-coast,marshland","fish,garbage,chicks,grain"
2,isolokki,laridae,"sea-coast, harbours","fish,eggs,chicks,garbage,carrion"
3,kalatiira,sternidae,"lakes,archipelago",fish
4,lapintiira,sternidae,"archipelago,lakes,marshland",fish


In [190]:
# Check for NaNs
print("NaNs in 'group':", feature_subset['group'].isna().sum())
print("NaNs in 'biotope':", feature_subset['biotope'].isna().sum())
print("NaNs in 'diet':", feature_subset['diet'].isna().sum())

# Check for empty strings
print("Empty strings in 'group':", (feature_subset['group'] == '').sum())
print("Empty strings in 'biotope':", (feature_subset['biotope'] == '').sum())
print("Empty strings in 'diet':", (feature_subset['diet'] == '').sum())


NaNs in 'group': 0
NaNs in 'biotope': 0
NaNs in 'diet': 0
Empty strings in 'group': 0
Empty strings in 'biotope': 0
Empty strings in 'diet': 0


In [191]:
# Function to clean categorical columns
def clean_column(df, column, delimiter=','):
    """
    Cleans a categorical column by:
    - Stripping whitespace
    - Converting to lowercase
    - Standardizing delimiters
    """
    # Remove leading/trailing whitespace and convert to lowercase
    df.loc[:, column] = df[column].str.strip().str.lower()
    
    # Replace multiple spaces with single space using a raw string
    df.loc[:, column] = df[column].str.replace(r'\s+', ' ', regex=True)
    
    # Ensure consistent delimiter spacing
    df.loc[:, column] = df[column].str.replace(f' {delimiter}', delimiter, regex=False)
    df.loc[:, column] = df[column].str.replace(f'{delimiter} ', delimiter, regex=False)
    
    return df


def count_unique_items(df, columns):
    """
    Counts the number of unique items in each specified column of the DataFrame,
    considering that some cell values contain comma-separated items.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the data.
    - columns (list): List of column names to analyze.

    Returns:
    - dict: A dictionary with column names as keys and unique counts as values.
    """
    unique_counts = {}
    for column in columns:
        unique_items = set()
        for items in df[column].dropna():
            # Split the string by commas and strip whitespace
            split_items = [item.strip() for item in items.split(',')]
            unique_items.update(split_items)
        unique_counts[column] = len(unique_items)
    return unique_counts

In [192]:
# Calculate unique counts

cols = feature_subset.columns.tolist()[1:]
# Apply cleaning to each categorical column
for column in cols:
    feature_subset = clean_column(feature_subset, column)

unique_counts = count_unique_items(feature_subset, cols)
sum = 0



# Print unique counts
print("\nNumber of Unique Values in Each Column:")
for column, count in unique_counts.items():
    sum += count
    print(f"Column '{column}': {count} unique value(s)")
print(f"total count: {sum} for columns {cols}")



Number of Unique Values in Each Column:
Column 'group': 14 unique value(s)
Column 'biotope': 21 unique value(s)
Column 'diet': 28 unique value(s)
total count: 63 for columns ['group', 'biotope', 'diet']


### One-Hot Encoding

The Apriori algorithm needs the data to be in a 1 hot encoded format with binary values for each feature. thus the 3 features are now subdivided into binary features



go through groups, if not in array, add it. 
then loop through that arr, and add that to columns. 
then loop through data and add 1 to that col if group is that 
then delete group column


pd.concat() in pandas is used to concatenate two or more DataFrames or Series along a particular axis (either rows or columns). You can specify how you want to join the data (e.g., along rows or columns) and whether to keep all data or just the common parts.

In [197]:
def one_hot_encode_column(df: pd.DataFrame, column: str, unique: bool) -> pd.DataFrame:
    """ 
    encodes the selected column as a binary valued columns

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the data.
    - column: column to process
    - unique (bool): if the column has unique values

    Returns:
    - pd.DataFrame: DataFrame with the original column dropped and new one-hot encoded columns added.
    """
    if column not in df.columns:
        raise ValueError(f"Column '{column}' does not exist in the DataFrame.")

    if unique:
        group_dummies = pd.get_dummies(df[column], prefix=column)
    else: 
        group_dummies = df[column].str.get_dummies(sep=',').rename(columns=lambda x: f'{column}_{x.strip()}')
    

    encoded_features = pd.concat([df.drop(columns=[column]), group_dummies], axis=1)
    return encoded_features


SyntaxError: invalid syntax (2073053548.py, line 12)

In [196]:


pd.set_option('display.max_rows', None)  # This will show all rows


for col in ['biotope', 'diet']:
    encoded_features = one_hot_encode_column(feature_subset, col, unique=False)

encoded_features['group'] = one_hot_encode_column(feature_subset, 'group', unique=True)


print(encoded_features.head(1).T)
print("encoded_features.head(1).T.shape", " : " , encoded_features.head(1).T.shape)

ValueError: Columns must be same length as key

In [169]:

pd.set_option('display.max_rows', None)  # This will show all rows

# One-Hot Encode the 'group' Column
group_dummies = pd.get_dummies(feature_subset['group'], prefix='group')

# One-Hot Encode the 'biotope' Column
biotope_expanded = feature_subset['biotope'].str.get_dummies(sep=',').rename(columns=lambda x: f'biotope_{x.strip()}')

# One-Hot Encode the 'diet' Column
diet_expanded = feature_subset['diet'].str.get_dummies(sep=',').rename(columns=lambda x: f'diet_{x.strip()}')

# Concatenate all dummy DataFrames with the original subset, but drop the original 'group', 'biotope', 'diet' columns
encoded_features = pd.concat([feature_subset.drop(columns=['species', 'group', 'biotope', 'diet']), group_dummies, biotope_expanded, diet_expanded], axis=1)
# Display the Encoded Features
print("len(biotope_expanded.columns)" , " ", len(biotope_expanded.columns))
print(encoded_features.head(1).T)
print("encoded_features.head(1).T.shape", " : " , encoded_features.head(1).T.shape)


len(biotope_expanded.columns)   21
                                 0
group_accipitridae           False
group_anserinae              False
group_ardeidae               False
group_charadriidae           False
group_dabbling ducks         False
group_diving ducks           False
group_gaviidae               False
group_gruifores              False
group_laridae                 True
group_phalacrocoracidae      False
group_podicipedidae          False
group_rallidae               False
group_scolopacidae           False
group_sternidae              False
biotope_archipelago              0
biotope_coastal-meadows          0
biotope_fells                    0
biotope_fields                   0
biotope_forest-edges             0
biotope_forests                  0
biotope_harbours                 0
biotope_islets                   0
biotope_lakes                    1
biotope_marshland                0
biotope_meadows                  0
biotope_nutrient-rich-lakes      0
biotope_pastures    

In [170]:
# Check for binary values
print((encoded_features.iloc[:, 1:] <= 1).all().all())  # Should return True

# Check for missing values
# print(encoded_features.isnull().sum())

True


### Since association rules are not monotonic, we could find interesting rules by looking at the family of the birds. 

In [171]:


# Assuming 'encoded_features' is the DataFrame that contains the one-hot encoded columns
species_columns = [col for col in encoded_features.columns if col.startswith('group_')]

# Create a mapping dictionary
taxonomy_mapping = {
    'group_laridae': 'Charadriiformes (Lari)',
    'group_sternidae': 'Charadriiformes (Lari)',
    'group_scolopacidae': 'Charadriiformes (Charadrii)',
    'group_charadriidae': 'Charadriiformes (Charadrii)',
    'group_haematopodidae': 'Charadriiformes (Charadrii)',
    'group_anatinae': 'Anatidae',
    'group_anserinae': 'Anatidae',
    'group_gruidae': 'Gruiformes',
    'group_rallidae': 'Gruiformes',
    'group_phalacrocoracidae': 'Phalacrocoracidae',
    'group_podicipedidae': 'Podicipedidae',
    'group_gaviidae': 'Gaviidae',
    'group_ardeidae': 'Ardeidae',
    'group_accipitridae': 'Accipitriformes'
}


print(species_columns)
print(len(species_columns))

['group_accipitridae', 'group_anserinae', 'group_ardeidae', 'group_charadriidae', 'group_dabbling ducks', 'group_diving ducks', 'group_gaviidae', 'group_gruifores', 'group_laridae', 'group_phalacrocoracidae', 'group_podicipedidae', 'group_rallidae', 'group_scolopacidae', 'group_sternidae']
14


In [172]:

# Function to map the general group based on the true column
def map_general_group(row):
    for col in species_columns:
        if row[col]:  # Check if the group is True
            return taxonomy_mapping.get(col, None)  # Map to general group
    return None

# Apply the mapping to create the new column 'general_group'
encoded_features['general_family'] = encoded_features.apply(map_general_group, axis=1)

# View the DataFrame with the new column




In [173]:
encoded_features.T.shape

# View the DataFrame with the new column
encoded_features.T.iloc[encoded_features.shape[1] - 5: encoded_features.shape[1]+ 1, :3]

# View the last few columns, including the new 'general_group' column
# encoded_features.head(2).T.iloc[:, [-3, -1]]

Unnamed: 0,0,1,2
diet_snakes,0,0,0
diet_squirrels,0,0,0
diet_vertebrae,0,0,0
diet_worms,0,0,0
general_family,Charadriiformes (Lari),Charadriiformes (Lari),Charadriiformes (Lari)


In [174]:
group_dummies_family = pd.get_dummies(encoded_features['general_family'], prefix='family')

encoded_features = pd.concat([encoded_features.drop(columns=['general_family']),  group_dummies_family], axis=1)

In [175]:

encoded_features.T.iloc[encoded_features.shape[1] - 12: encoded_features.shape[1]+ 1, :3]

Unnamed: 0,0,1,2
diet_squirrels,0,0,0
diet_vertebrae,0,0,0
diet_worms,0,0,0
family_Accipitriformes,False,False,False
family_Anatidae,False,False,False
family_Ardeidae,False,False,False
family_Charadriiformes (Charadrii),False,False,False
family_Charadriiformes (Lari),True,True,True
family_Gaviidae,False,False,False
family_Gruiformes,False,False,False


In [176]:
# Check data types of encoded_features
print("Data Types of Encoded Features:")
print(encoded_features.dtypes)


# Convert all columns to boolean
encoded_features = encoded_features.astype(bool)
# Verify the conversion
print("\nData Types After Conversion:")
print(encoded_features.dtypes)



Data Types of Encoded Features:
group_accipitridae                     bool
group_anserinae                        bool
group_ardeidae                         bool
group_charadriidae                     bool
group_dabbling ducks                   bool
group_diving ducks                     bool
group_gaviidae                         bool
group_gruifores                        bool
group_laridae                          bool
group_phalacrocoracidae                bool
group_podicipedidae                    bool
group_rallidae                         bool
group_scolopacidae                     bool
group_sternidae                        bool
biotope_archipelago                   int64
biotope_coastal-meadows               int64
biotope_fells                         int64
biotope_fields                        int64
biotope_forest-edges                  int64
biotope_forests                       int64
biotope_harbours                      int64
biotope_islets                        int64


## Apriori Algorithm

### Support

The support of an itemset \( X \) is defined as:

$$
\text{Support}(X) = \frac{\text{Number of transactions containing } X}{\text{Total number of transactions}}
$$

### Confidence

The confidence of an association rule \( X \rightarrow Y \) is defined as:
$$
\text{Confidence}(X \rightarrow Y) = \frac{\text{Support}(X \cup Y)}{\text{Support}(X)} = P(Y \mid X)
$$

### Lift

The lift of an association rule \( X \rightarrow Y \) is defined as:
$$
\text{Lift}(X \rightarrow Y) = \frac{\text{Confidence}(X \rightarrow Y)}{\text{Support}(Y)} = \frac{P(Y \mid X)}{P(Y)}
$$

Alternatively, it can be expressed as:
$$
\text{Lift}(X \rightarrow Y) = \frac{\text{Support}(X \cup Y)}{\text{Support}(X) \times \text{Support}(Y)} = \frac{P(Y \mid X)}{P(Y)}
$$


In [177]:
from mlxtend.frequent_patterns import apriori, association_rules


In [178]:
# Assuming 'encoded_features' has 'species' as the first column
basket = encoded_features.drop('species', axis=1)
# Define minimum support (e.g., 0.05 for 5%)
frequent_itemsets = apriori(basket, min_support=0.05, use_colnames=True)

# Sort by support
frequent_itemsets = frequent_itemsets.sort_values(by='support', ascending=False)
print(len(frequent_itemsets))
frequent_itemsets.head(10)


KeyError: "['species'] not found in axis"

In [130]:
# Define minimum confidence (e.g., 0.7 for 70%)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)

# Sort rules by confidence
rules = rules.sort_values(by='confidence', ascending=False)

rules.head(20)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
405,"(family_Accipitriformes, diet_insects)","(diet_frogs, diet_small-rodents, group_accipit...",0.06,0.08,0.06,1.0,12.5,0.0552,inf,0.978723
1,(family_Accipitriformes),(group_accipitridae),0.16,0.16,0.16,1.0,6.25,0.1344,inf,1.0
356,"(diet_frogs, biotope_forest-edges)",(group_accipitridae),0.06,0.16,0.06,1.0,6.25,0.0504,inf,0.893617
357,"(biotope_forest-edges, group_accipitridae)",(diet_frogs),0.06,0.18,0.06,1.0,5.555556,0.0492,inf,0.87234
358,(biotope_forest-edges),"(diet_frogs, group_accipitridae)",0.06,0.1,0.06,1.0,10.0,0.054,inf,0.957447
207,"(diet_invertebrates, diet_plants, biotope_shores)",(group_dabbling ducks),0.06,0.1,0.06,1.0,10.0,0.054,inf,0.957447
206,"(diet_invertebrates, group_dabbling ducks, bio...",(diet_plants),0.06,0.26,0.06,1.0,3.846154,0.0444,inf,0.787234
359,"(family_Accipitriformes, biotope_forest-edges)",(group_accipitridae),0.06,0.16,0.06,1.0,6.25,0.0504,inf,0.893617
360,"(biotope_forest-edges, group_accipitridae)",(family_Accipitriformes),0.06,0.16,0.06,1.0,6.25,0.0504,inf,0.893617
361,(biotope_forest-edges),"(family_Accipitriformes, group_accipitridae)",0.06,0.16,0.06,1.0,6.25,0.0504,inf,0.893617


In [131]:
# Example: Rules with lift greater than 1.2
interesting_rules = rules[rules['lift'] > 10]

interesting_rules = interesting_rules.sort_values(by='lift', ascending=False)

interesting_rules.head(20)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
338,(family_Ardeidae),"(group_ardeidae, diet_frogs)",0.06,0.06,0.06,1.0,16.666667,0.0564,inf,1.0
333,(family_Ardeidae),"(group_ardeidae, diet_small-rodents)",0.06,0.06,0.06,1.0,16.666667,0.0564,inf,1.0
330,"(group_ardeidae, diet_small-rodents)",(family_Ardeidae),0.06,0.06,0.06,1.0,16.666667,0.0564,inf,1.0
331,"(family_Ardeidae, diet_small-rodents)",(group_ardeidae),0.06,0.06,0.06,1.0,16.666667,0.0564,inf,1.0
332,(group_ardeidae),"(family_Ardeidae, diet_small-rodents)",0.06,0.06,0.06,1.0,16.666667,0.0564,inf,1.0
337,(group_ardeidae),"(diet_frogs, family_Ardeidae)",0.06,0.06,0.06,1.0,16.666667,0.0564,inf,1.0
129,(diet_garbage),"(group_laridae, family_Charadriiformes (Lari))",0.06,0.06,0.06,1.0,16.666667,0.0564,inf,1.0
342,(group_laridae),(diet_garbage),0.06,0.06,0.06,1.0,16.666667,0.0564,inf,1.0
343,(diet_garbage),(group_laridae),0.06,0.06,0.06,1.0,16.666667,0.0564,inf,1.0
336,"(diet_frogs, family_Ardeidae)",(group_ardeidae),0.06,0.06,0.06,1.0,16.666667,0.0564,inf,1.0


In [136]:

data.T.iloc[:, :4]

Unnamed: 0,0,1,2,3
species,naurulokki,harmaalokki,isolokki,kalatiira
group,laridae,laridae,laridae,sternidae
length,34-38,55-65,63-68,36-42
wspan,86-99,123-148,138-158,70-80
weight,200-350,800-1300,1000-1800,100-145
AR,8.13,8.24,8.24,9.14
wload,0.31,0.64,0.66,0.24
back,light grey,bluish grey,bluish grey,grey
belly,white,white,white,white
ftype,B,B,B,B


In [179]:

encoded_features.T.iloc[:, :4]

Unnamed: 0,0,1,2,3
group_accipitridae,False,False,False,False
group_anserinae,False,False,False,False
group_ardeidae,False,False,False,False
group_charadriidae,False,False,False,False
group_dabbling ducks,False,False,False,False
group_diving ducks,False,False,False,False
group_gaviidae,False,False,False,False
group_gruifores,False,False,False,False
group_laridae,True,True,True,False
group_phalacrocoracidae,False,False,False,False
