In [591]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Numerical features engineering

### Data Summary

In [592]:
# Load the semicolon-separated bird dataset, then print its column summary.
data = pd.read_csv('birds2024ext.csv', sep=';')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 26 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   species       50 non-null     object 
 1   group         50 non-null     object 
 2   length        50 non-null     object 
 3   wspan         50 non-null     object 
 4   weight        50 non-null     object 
 5   AR            50 non-null     float64
 6   wload         50 non-null     float64
 7   back          50 non-null     object 
 8   belly         50 non-null     object 
 9   ftype         50 non-null     object 
 10  sim           50 non-null     object 
 11  billcol       50 non-null     object 
 12  legcol        50 non-null     object 
 13  arrives       50 non-null     object 
 14  leaves        50 non-null     object 
 15  eggs          50 non-null     object 
 16  incub         50 non-null     object 
 17  ccare         50 non-null     object 
 18  biotope       50 non-null     ob

In [593]:
# Transposed preview: one output column per bird, limited to the first 20 rows of `data`.
data.T.iloc[:, :20]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
species,naurulokki,harmaalokki,isolokki,kalatiira,lapintiira,suokukko,taivaanvuohi,lehtokurppa,metsäviklo,liro,karikukko,suosirri,pikkusirri,pikkutylli,tylli,keräkurmitsa,töyhtöhyyppä,haapana,tavi,sinisorsa
group,laridae,laridae,laridae,sternidae,sternidae,scolopacidae,scolopacidae,scolopacidae,scolopacidae,scolopacidae,scolopacidae,scolopacidae,scolopacidae,charadriidae,charadriidae,charadriidae,charadriidae,dabbling ducks,dabbling ducks,dabbling ducks
length,34-38,55-65,63-68,36-42,33-37,25-26,25-27,34-36,21-24,21-23,21-24,17-21,14-16,15-16,18-19,20-24,30-34,41-51,34-38,50-60
wspan,86-99,123-148,138-158,70-80,66-77,46-49,39-45,55-65,39-44,35-39,43-49,32-36,27-30,32-35,35-41,57-64,67-72,71-80,53-59,79-87
weight,200-350,800-1300,1000-1800,100-145,90-130,90-130,90-110,280-330,75-85,50-70,90-130,35-62,20-30,35-45,50-65,85-140,170-230,600-800,250-450,900-1300
AR,8.13,8.24,8.24,9.14,8.97,6.73,5.91,5.68,7.2,7.53,7.21,6.95,6.63,6.93,7.43,7.43,5.45,7.7,6.51,6.19
wload,0.31,0.64,0.66,0.24,0.2,0.36,0.4,0.54,0.29,0.23,0.45,0.31,0.18,0.24,0.32,0.45,0.34,1.09,0.75,1.05
back,light grey,bluish grey,bluish grey,grey,grey,dappled brown,dappled brown,dappled brown,brown,dappled brown,black-brown,dappled brown,dappled brown,greyish brown,greyish brown,dappled brown,black,dappled brown,dappled brown,dappled brown
belly,white,white,white,white,white,white,white,dappled beige,white,white,white,black-white,white,white,white,black,white,white,beige,beige
ftype,B,B,B,B,B,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C


## Iteration-1 features

We start with the habitat (`biotope`), `group`, and `diet` columns as the first features.

In [594]:
def extract_features(df, feature_list):
    """
    Return an independent copy of the requested columns.

    Parameters:
    - df (pd.DataFrame): The DataFrame to select from.
    - feature_list (list): Column names to keep, in the given order.

    Returns:
    - pd.DataFrame: A copy holding only the columns named in `feature_list`.
    """
    return df[feature_list].copy()

In [595]:
# Iteration-1 feature columns: bird group, habitat (biotope) and diet.
feature_list = ['group', 'biotope', 'diet']
# extract_features returns a copy, so later cleaning does not modify `data`.
feature_subset = extract_features(data, feature_list=feature_list)
feature_subset.head(5)


Unnamed: 0,group,biotope,diet
0,laridae,"lakes,sea-bays","fish,invertebrates,garbage"
1,laridae,"lakes,sea-coast,marshland","fish,garbage,chicks,grain"
2,laridae,"sea-coast, harbours","fish,eggs,chicks,garbage,carrion"
3,sternidae,"lakes,archipelago",fish
4,sternidae,"archipelago,lakes,marshland",fish


In [596]:
# Report missing values first, then empty strings, for every selected feature column.
for check_label, count_matches in (
    ("NaNs", lambda values: values.isna().sum()),
    ("Empty strings", lambda values: (values == '').sum()),
):
    for feature_name in ('group', 'biotope', 'diet'):
        print(f"{check_label} in '{feature_name}':", count_matches(feature_subset[feature_name]))


NaNs in 'group': 0
NaNs in 'biotope': 0
NaNs in 'diet': 0
Empty strings in 'group': 0
Empty strings in 'biotope': 0
Empty strings in 'diet': 0


In [597]:
def clean_column(df, column, delimiter=','):
    """
    Normalise the text of one categorical column and write it back into `df`.

    Steps, applied in this order:
    - trim leading/trailing whitespace and lowercase every value
    - collapse each run of whitespace into a single space
    - remove a space that sits directly before or after `delimiter`

    Returns the same DataFrame object that was passed in.
    """
    normalised = (
        df[column]
        .str.strip()
        .str.lower()
        .str.replace(r'\s+', ' ', regex=True)
        .str.replace(f' {delimiter}', delimiter, regex=False)
        .str.replace(f'{delimiter} ', delimiter, regex=False)
    )
    df.loc[:, column] = normalised
    return df


def count_unique_items(df, columns, delimiter=','):
    """
    Count the distinct items found in each listed column, where one cell may
    hold several items joined by `delimiter`.

    Empty items (produced by a trailing or doubled delimiter) are ignored, which
    matches how `Series.str.get_dummies` treats them, so the counts agree with
    the number of one-hot columns generated later.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the data.
    - columns (list): List of column names to analyze.
    - delimiter (str): Separator between items inside a cell. Defaults to ','.

    Returns:
    - dict: A dictionary with column names as keys and unique counts as values.
    """
    unique_counts = {}
    for column in columns:
        # Missing cells are skipped; items are compared after trimming whitespace.
        unique_items = {
            token
            for cell in df[column].dropna()
            for token in (part.strip() for part in cell.split(delimiter))
            if token
        }
        unique_counts[column] = len(unique_items)
    return unique_counts

In [598]:
# Multi-valued categorical columns: every selected feature except the first ('group').
cols = feature_subset.columns.tolist()[1:]

# Normalise case, whitespace and delimiter spacing before counting.
for column in cols:
    feature_subset = clean_column(feature_subset, column)

unique_counts = count_unique_items(feature_subset, cols)

# Report the per-column counts and their total. The total gets its own name so
# that the built-in `sum` is not shadowed for the rest of the notebook.
print("\nNumber of Unique Values in Each Column:")
for column, count in unique_counts.items():
    print(f"Column '{column}': {count} unique value(s)")
total_unique = sum(unique_counts.values())
print(f"total count: {total_unique} for columns {cols}")



Number of Unique Values in Each Column:
Column 'biotope': 21 unique value(s)
Column 'diet': 28 unique value(s)
total count: 49 for columns ['biotope', 'diet']


### One-Hot Encoding

The Apriori algorithm requires one-hot encoded input, i.e. every feature must be a binary (True/False) column. The three features are therefore expanded into one binary column per distinct value.



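Approach for the `group` column:

1. Go through the groups and collect each distinct group name in an array.
2. Loop through that array and add one new column per group name.
3. Loop through the data and set a column to 1 when the bird's group matches that column.
4. Delete the original `group` column.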


pd.concat() in pandas is used to concatenate two or more DataFrames or Series along a particular axis (either rows or columns). You can specify how you want to join the data (e.g., along rows or columns) and whether to keep all data or just the common parts.

In [599]:
def one_hot_encode_column(df: pd.DataFrame, column: str, unique: bool) -> pd.DataFrame:
    """
    Replace `column` with boolean indicator (one-hot) columns.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the data. It is not modified.
    - column (str): Name of the column to encode.
    - unique (bool): True when every cell holds a single category; False when a
      cell holds a comma-separated list of categories.

    Returns:
    - pd.DataFrame: A new DataFrame with `column` dropped and one boolean column
      named '<column>_<category>' appended per category.

    Raises:
    - ValueError: If `column` is not present in `df`.
    """
    if column not in df.columns:
        raise ValueError(f"Column '{column}' does not exist in the DataFrame.")

    if unique:
        indicator_columns = pd.get_dummies(df[column], prefix=column)
    else:
        # Trim whitespace around the items BEFORE encoding. Stripping the column
        # names afterwards would turn 'a, b' and 'b' into two separate columns
        # that both end up being called '<column>_b'.
        items = df[column].str.strip().str.replace(r'\s*,\s*', ',', regex=True)
        indicator_columns = items.str.get_dummies(sep=',').add_prefix(f'{column}_')

    # Both branches must yield booleans: str.get_dummies returns integers, and so
    # does pd.get_dummies on pandas versions before 2.0.
    indicator_columns = indicator_columns.astype(bool)

    return pd.concat([df.drop(columns=[column]), indicator_columns], axis=1)


In [600]:


# Lift the row-display limit so that long transposed frames are shown in full.
pd.set_option('display.max_rows', None)

# 'group' holds a single category per bird; 'biotope' and 'diet' hold
# comma-separated lists, so they go through the multi-valued branch.
encoded_features = one_hot_encode_column(feature_subset.copy(), 'group', unique=True)
for col in ('biotope', 'diet'):
    encoded_features = one_hot_encode_column(encoded_features, col, unique=False)

print("encoded_features.head(1).T.shape", " : ", encoded_features.head(1).T.shape)

encoded_features.head(1).T.shape  :  (63, 1)


### Since association rules are not monotonic, we could find interesting rules by looking at the family of the birds. 

In [601]:
# Names of the one-hot columns that were generated from 'group'.
# Despite the variable name, these are bird-group columns, not species.
species_columns = [col for col in encoded_features.columns if col.startswith('group_')]



# Maps a one-hot group column to a broader family label.
# NOTE(review): only 7 keys are defined while the printed list below holds 14
# group columns; a bird whose group has no entry here gets None from
# map_general_group. 'group_haematopodidae' is mapped but does not appear among
# the encoded columns. Confirm both are intended.
taxonomy_mapping = {
    'group_laridae': '1.1 Lari',
    'group_sternidae': '1.1 Lari',
    'group_scolopacidae': '1.2 Charadrii',
    'group_charadriidae': '1.2 Charadrii',
    'group_haematopodidae': '1.2 Charadrii',
    'group_dabbling ducks': '2.1 Anatinae',
    'group_diving ducks': '2.1 Anatinae'
}

# Sanity output: number of mapped groups versus the group columns actually present.
print("len(taxonomy_mapping)", " " , len(taxonomy_mapping))
print(species_columns)
print(len(species_columns))

len(taxonomy_mapping)   7
['group_accipitridae', 'group_anserinae', 'group_ardeidae', 'group_charadriidae', 'group_dabbling ducks', 'group_diving ducks', 'group_gaviidae', 'group_gruifores', 'group_laridae', 'group_phalacrocoracidae', 'group_podicipedidae', 'group_rallidae', 'group_scolopacidae', 'group_sternidae']
14


In [602]:
def map_general_group(row):
    """
    Return the family label for the first active one-hot group column of `row`.

    Reads the module-level `species_columns` and `taxonomy_mapping`. The result
    is None when no group column is truthy, or when the active column has no
    entry in the mapping.
    """
    active_column = next((name for name in species_columns if row[name]), None)
    if active_column is None:
        return None
    return taxonomy_mapping.get(active_column)

### Adding the general family column

In [603]:

# Derive each bird's family from its one-hot group columns, then one-hot encode it.
# Birds whose group has no taxonomy_mapping entry get None and therefore end up
# with False in every family_* column.
encoded_features['family'] = encoded_features.apply(map_general_group, axis=1)
encoded_features = one_hot_encode_column(encoded_features, 'family', unique=True)

# Restore the default row limit that was lifted before the encoding step.
pd.reset_option('display.max_rows')

# Preview the last 12 feature columns for the first three birds. This has to be
# the final expression of the cell, otherwise the notebook does not render it.
encoded_features.T.iloc[-12:, :3]

### Iteration-1 Apriori
### Apriori Algorithm

### Support

The support of an itemset $X$ is defined as:

$$
\text{Support}(X) = \frac{\text{Number of transactions containing } X}{\text{Total number of transactions}}
$$

### Confidence

The confidence of an association rule $X \rightarrow Y$ is defined as:
$$
\text{Confidence}(X \rightarrow Y) = \frac{\text{Support}(X \cup Y)}{\text{Support}(X)} = P(Y \mid X)
$$

### Lift

The lift of an association rule $X \rightarrow Y$ is defined as:
$$
\text{Lift}(X \rightarrow Y) = \frac{\text{Confidence}(X \rightarrow Y)}{\text{Support}(Y)} = \frac{P(Y \mid X)}{P(Y)}
$$

Alternatively, it can be expressed as:
$$
\text{Lift}(X \rightarrow Y) = \frac{\text{Support}(X \cup Y)}{\text{Support}(X) \times \text{Support}(Y)} = \frac{P(Y \mid X)}{P(Y)}
$$


In [604]:
from mlxtend.frequent_patterns import apriori, association_rules


In [605]:

def filter_rules(rules_df: pd.DataFrame, min_lift: float = 1.0, min_support: float = 0.0,
                 min_confidence: float = 0.0, max_rules: int = 100, special_sorting: bool = True) -> pd.DataFrame:
    """
    Keep the rules that meet all thresholds, rank them and return the best ones.

    Parameters:
    - rules_df (pd.DataFrame): Association rules with at least the columns
      'antecedents', 'consequents', 'lift', 'support' and 'confidence'
      (plus 'zhangs_metric' when `special_sorting` is True). Not modified.
    - min_lift (float): Minimum lift threshold (inclusive).
    - min_support (float): Minimum support threshold (inclusive).
    - min_confidence (float): Minimum confidence threshold (inclusive).
    - max_rules (int): Maximum number of rules to return.
    - special_sorting (bool): When True, a 'combined_product' column
      (lift * 1/4 * confidence * support * zhangs_metric) is added and used as
      the descending sort key. When False, rules are sorted by lift, then
      support, then confidence, all descending.

    Returns:
    - pd.DataFrame: The filtered rules in ranked order, without repeated
      (antecedents, consequents) pairs, cut to `max_rules` rows.
    """
    meets_thresholds = (
        rules_df['lift'].ge(min_lift)
        & rules_df['support'].ge(min_support)
        & rules_df['confidence'].ge(min_confidence)
    )
    kept = rules_df.loc[meets_thresholds].copy()

    if special_sorting:
        kept['combined_product'] = (
            kept['lift'] * (1/4) * kept['confidence'] * kept['support'] * kept['zhangs_metric']
        )
        ranking_keys = ['combined_product']
    else:
        ranking_keys = ['lift', 'support', 'confidence']

    ranked = kept.sort_values(by=ranking_keys, ascending=[False] * len(ranking_keys))

    # Of several rows with the same antecedents and consequents, the best-ranked one survives.
    return ranked.drop_duplicates(subset=['antecedents', 'consequents']).head(max_rules)


def output_rules_apriori(df: pd.DataFrame, min_lift: float = 1.0, min_support: float = 0.07,
                         min_confidence: float = 0.0, max_rules: int = 15,
                         mining_confidence: float = 0.7) -> pd.DataFrame:
    """
    Mine frequent itemsets with Apriori, derive association rules and return the
    best-ranked ones.

    Parameters:
    - df (pd.DataFrame): One-hot encoded data passed to `apriori`.
    - min_lift (float): Minimum lift a rule needs to be kept by `filter_rules`.
    - min_support (float): Minimum support, used both for mining the itemsets and
      for filtering the rules.
    - min_confidence (float): Minimum confidence applied by `filter_rules`.
      Values below `mining_confidence` have no additional effect, because rules
      under that level are never generated.
    - max_rules (int): Maximum number of rules to return.
    - mining_confidence (float): Confidence threshold handed to
      `association_rules` when the rules are generated. Defaults to 0.7, the
      value that used to be hard-coded here.

    Returns:
    - pd.DataFrame: The filtered rules, ranked by `filter_rules` with its default
      sorting.
    """
    frequent_itemsets = apriori(df, min_support=min_support, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=mining_confidence)
    filtered_rules = filter_rules(rules, min_lift=min_lift, min_support=min_support,
                                  min_confidence=min_confidence, max_rules=max_rules)

    return filtered_rules

In [606]:
# Mine every itemset with a support of at least 0.05 (5%), most frequent first.
frequent_itemsets = (
    apriori(encoded_features, min_support=0.05, use_colnames=True)
    .sort_values(by='support', ascending=False)
)
print(len(frequent_itemsets))
frequent_itemsets.head(10)


197


Unnamed: 0,support,itemsets
31,0.36,(diet_invertebrates)
12,0.34,(biotope_lakes)
26,0.32,(diet_fish)
30,0.26,(diet_insects)
32,0.26,(diet_plants)
40,0.24,(family_1.2 Charadrii)
80,0.2,"(diet_fish, biotope_lakes)"
35,0.2,(diet_small-rodents)
18,0.18,(biotope_sea-bays)
123,0.18,"(diet_invertebrates, family_1.2 Charadrii)"


In [607]:
# Best-ranked rules for the iteration-1 features (group, biotope, diet, family),
# using the default thresholds of output_rules_apriori.
output_rules_apriori(df = encoded_features)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric,combined_product
58,f\nr\no\nz\ne\nn\ns\ne\nt\n(\n{\n'\ng\nr\no\nu\np\n_\ns\nc\no\nl\no\np\na\nc\ni\nd\na\ne\n'\n}\n),diet_invertebrates\nfamily_1.2 Charadrii,0.16,0.18,0.16,1.0,5.555556,0.1312,inf,0.97619,0.216931
55,f\nr\no\nz\ne\nn\ns\ne\nt\n(\n{\n'\nd\ni\ne\nt\n_\ni\nn\nv\ne\nr\nt\ne\nb\nr\na\nt\ne\ns\n'\n\n\n \n'\nf\na\nm\ni\nl\ny\n_\n1\n.\n2\n \nC\nh\na\nr\na\nd\nr\ni\ni\n'\n}\n),group_scolopacidae,0.18,0.16,0.16,0.888889,5.555556,0.1312,7.56,1.0,0.197531
95,f\nr\no\nz\ne\nn\ns\ne\nt\n(\n{\n'\nd\ni\ne\nt\n_\np\nl\na\nn\nt\ns\n'\n\n\n \n'\nd\ni\ne\nt\n_\ni\nn\nv\ne\nr\nt\ne\nb\nr\na\nt\ne\ns\n'\n\n\n \n'\nf\na\nm\ni\nl\ny\n_\n2\n.\n1\n \nA\nn\na\nt\ni\nn\na\ne\n'\n}\n),group_dabbling ducks,0.08,0.1,0.08,1.0,10.0,0.072,inf,0.978261,0.195652
42,f\nr\no\nz\ne\nn\ns\ne\nt\n(\n{\n'\nd\ni\ne\nt\n_\ni\nn\nv\ne\nr\nt\ne\nb\nr\na\nt\ne\ns\n'\n\n\n \n'\nf\na\nm\ni\nl\ny\n_\n2\n.\n1\n \nA\nn\na\nt\ni\nn\na\ne\n'\n}\n),group_dabbling ducks,0.08,0.1,0.08,1.0,10.0,0.072,inf,0.978261,0.195652
35,f\nr\no\nz\ne\nn\ns\ne\nt\n(\n{\n'\nb\ni\no\nt\no\np\ne\n_\ns\nh\no\nr\ne\ns\n'\n\n\n \n'\nf\na\nm\ni\nl\ny\n_\n2\n.\n1\n \nA\nn\na\nt\ni\nn\na\ne\n'\n}\n),group_dabbling ducks,0.08,0.1,0.08,1.0,10.0,0.072,inf,0.978261,0.195652
31,f\nr\no\nz\ne\nn\ns\ne\nt\n(\n{\n'\nd\ni\ne\nt\n_\np\nl\na\nn\nt\ns\n'\n\n\n \n'\nb\ni\no\nt\no\np\ne\n_\ns\nh\no\nr\ne\ns\n'\n}\n),group_dabbling ducks,0.08,0.1,0.08,1.0,10.0,0.072,inf,0.978261,0.195652
85,f\nr\no\nz\ne\nn\ns\ne\nt\n(\n{\n'\nd\ni\ne\nt\n_\np\nl\na\nn\nt\ns\n'\n\n\n \n'\nb\ni\no\nt\no\np\ne\n_\ns\nh\no\nr\ne\ns\n'\n\n\n \n'\nf\na\nm\ni\nl\ny\n_\n2\n.\n1\n \nA\nn\na\nt\ni\nn\na\ne\n'\n}\n),group_dabbling ducks,0.08,0.1,0.08,1.0,10.0,0.072,inf,0.978261,0.195652
89,f\nr\no\nz\ne\nn\ns\ne\nt\n(\n{\n'\nd\ni\ne\nt\n_\np\nl\na\nn\nt\ns\n'\n\n\n \n'\nb\ni\no\nt\no\np\ne\n_\ns\nh\no\nr\ne\ns\n'\n}\n),family_2.1 Anatinae\ngroup_dabbling ducks,0.08,0.1,0.08,1.0,10.0,0.072,inf,0.978261,0.195652
100,f\nr\no\nz\ne\nn\ns\ne\nt\n(\n{\n'\nd\ni\ne\nt\n_\ni\nn\nv\ne\nr\nt\ne\nb\nr\na\nt\ne\ns\n'\n\n\n \n'\nf\na\nm\ni\nl\ny\n_\n2\n.\n1\n \nA\nn\na\nt\ni\nn\na\ne\n'\n}\n),diet_plants\ngroup_dabbling ducks,0.08,0.1,0.08,1.0,10.0,0.072,inf,0.978261,0.195652
91,f\nr\no\nz\ne\nn\ns\ne\nt\n(\n{\n'\nb\ni\no\nt\no\np\ne\n_\ns\nh\no\nr\ne\ns\n'\n\n\n \n'\nf\na\nm\ni\nl\ny\n_\n2\n.\n1\n \nA\nn\na\nt\ni\nn\na\ne\n'\n}\n),diet_plants\ngroup_dabbling ducks,0.08,0.1,0.08,1.0,10.0,0.072,inf,0.978261,0.195652


## Iteration-2
## Introduction of more features


In [608]:
_YES_NO_TO_BOOL = {'Yes': True, 'No': False}


def _yes_no_to_bool(value):
    """Map 'Yes'/'No' to True/False, keep booleans unchanged, anything else becomes NaN."""
    if isinstance(value, (bool, np.bool_)):
        return bool(value)
    if isinstance(value, str):
        return _YES_NO_TO_BOOL.get(value, np.nan)
    return np.nan


def boolean_encode_column(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """
    Convert a 'Yes'/'No' column of `df` to booleans, in place.

    'Yes' becomes True and 'No' becomes False. Values that are already boolean
    are kept, so calling the function again on an encoded column (for example
    when a notebook cell is re-run) does not wipe it. Any other value, including
    a missing one, becomes NaN.

    Parameters:
    - df (pd.DataFrame): The DataFrame to modify.
    - column (str): Name of the column to convert.

    Returns:
    - pd.DataFrame: The same DataFrame object that was passed in.
    """
    df[column] = df[column].map(_yes_no_to_bool)
    return df

def add_new_columns(data: pd.DataFrame, df: pd.DataFrame, columns: list[str], booleanize: bool = True) -> pd.DataFrame:
    """
    Copy the listed columns from `data` into `df` (in place), optionally
    converting each of them from 'Yes'/'No' to booleans.

    All column names are checked before anything is copied, so a missing column
    leaves `df` untouched instead of partially updated.

    Parameters:
    - data (pd.DataFrame): Source DataFrame holding the raw columns.
    - df (pd.DataFrame): Target DataFrame; it is modified and also returned.
    - columns (list[str]): Names of the columns to copy.
    - booleanize (bool): When True, each copied column is passed through
      `boolean_encode_column`.

    Returns:
    - pd.DataFrame: The same `df` object that was passed in.

    Raises:
    - ValueError: If any of `columns` is not present in `data`.
    """
    missing = [col for col in columns if col not in data.columns]
    if missing:
        raise ValueError(f"Column '{missing[0]}' does not exist in the DataFrame.")

    for col in columns:
        df[col] = data[col]
        if booleanize:
            boolean_encode_column(df, col)

    return df

In [609]:

# 'non_sim' is the Yes/No complement of 'sim' -- presumably so that the "No"
# case can also show up as a True-valued item in the rules; confirm intent.
# NOTE: this adds a column to the raw `data` frame.
data['non_sim'] = data['sim'].map({'Yes': 'No', 'No': 'Yes'})

# Yes/No columns that are copied into the feature table and converted to booleans.
binary_encoded_columns = ['sim','non_sim' , 'diver', 'long-billed', 'webbed-feet', 'long-legs', 'wading-bird', 'plunge-dives']


# Modifies `encoded_features` in place; the return value is not needed here.
add_new_columns(data, encoded_features, binary_encoded_columns)

# Transposed preview of birds 10-29.
encoded_features.T.iloc[:, 10:30]

Unnamed: 0,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
group_accipitridae,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
group_anserinae,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False
group_ardeidae,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
group_charadriidae,False,False,False,True,True,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False
group_dabbling ducks,False,False,False,False,False,False,False,True,True,True,True,True,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
long-billed,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False
webbed-feet,False,False,False,False,False,False,False,True,True,True,True,True,True,True,True,True,False,False,False,False
long-legs,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,True,True
wading-bird,True,True,True,True,True,True,True,False,False,False,False,False,False,False,False,False,True,True,True,True


### Iteration-2 Apriori

In [610]:

# Best-ranked rules after adding the boolean Yes/No features, default thresholds.
output_rules_apriori(df = encoded_features)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric,combined_product
1700,f\nr\no\nz\ne\nn\ns\ne\nt\n(\n{\n'\nw\na\nd\ni\nn\ng\n-\nb\ni\nr\nd\n'\n\n\n \n'\nd\ni\ne\nt\n_\ns\nm\na\nl\nl\n-\nr\no\nd\ne\nn\nt\ns\n'\n\n\n \n'\ns\ni\nm\n'\n}\n),diet_frogs\nlong-billed,0.08,0.08,0.08,1.0,12.5,0.0736,inf,1.0,0.25
1110,f\nr\no\nz\ne\nn\ns\ne\nt\n(\n{\n'\nf\na\nm\ni\nl\ny\n_\n2\n.\n1\n \nA\nn\na\nt\ni\nn\na\ne\n'\n}\n),diet_plants\nnon_sim\nwebbed-feet,0.14,0.14,0.14,1.0,7.142857,0.1204,inf,1.0,0.25
1052,f\nr\no\nz\ne\nn\ns\ne\nt\n(\n{\n'\nd\ni\ne\nt\n_\ns\nm\na\nl\nl\n-\nr\no\nd\ne\nn\nt\ns\n'\n\n\n \n'\nl\no\nn\ng\n-\nb\ni\nl\nl\ne\nd\n'\n}\n),wading-bird\ndiet_frogs,0.08,0.08,0.08,1.0,12.5,0.0736,inf,1.0,0.25
1053,f\nr\no\nz\ne\nn\ns\ne\nt\n(\n{\n'\nd\ni\ne\nt\n_\nf\nr\no\ng\ns\n'\n\n\n \n'\nl\no\nn\ng\n-\nb\ni\nl\nl\ne\nd\n'\n}\n),wading-bird\ndiet_small-rodents,0.08,0.08,0.08,1.0,12.5,0.0736,inf,1.0,0.25
1099,f\nr\no\nz\ne\nn\ns\ne\nt\n(\n{\n'\nd\ni\ne\nt\n_\np\nl\na\nn\nt\ns\n'\n\n\n \n'\nn\no\nn\n_\ns\ni\nm\n'\n\n\n \n'\nw\ne\nb\nb\ne\nd\n-\nf\ne\ne\nt\n'\n}\n),family_2.1 Anatinae,0.14,0.14,0.14,1.0,7.142857,0.1204,inf,1.0,0.25
1103,f\nr\no\nz\ne\nn\ns\ne\nt\n(\n{\n'\nd\ni\ne\nt\n_\np\nl\na\nn\nt\ns\n'\n\n\n \n'\nn\no\nn\n_\ns\ni\nm\n'\n}\n),webbed-feet\nfamily_2.1 Anatinae,0.14,0.14,0.14,1.0,7.142857,0.1204,inf,1.0,0.25
1105,f\nr\no\nz\ne\nn\ns\ne\nt\n(\n{\n'\nd\ni\ne\nt\n_\np\nl\na\nn\nt\ns\n'\n\n\n \n'\nf\na\nm\ni\nl\ny\n_\n2\n.\n1\n \nA\nn\na\nt\ni\nn\na\ne\n'\n}\n),non_sim\nwebbed-feet,0.14,0.14,0.14,1.0,7.142857,0.1204,inf,1.0,0.25
1106,f\nr\no\nz\ne\nn\ns\ne\nt\n(\n{\n'\nn\no\nn\n_\ns\ni\nm\n'\n\n\n \n'\nw\ne\nb\nb\ne\nd\n-\nf\ne\ne\nt\n'\n}\n),diet_plants\nfamily_2.1 Anatinae,0.14,0.14,0.14,1.0,7.142857,0.1204,inf,1.0,0.25
454,f\nr\no\nz\ne\nn\ns\ne\nt\n(\n{\n'\nf\na\nm\ni\nl\ny\n_\n2\n.\n1\n \nA\nn\na\nt\ni\nn\na\ne\n'\n}\n),non_sim\nwebbed-feet,0.14,0.14,0.14,1.0,7.142857,0.1204,inf,1.0,0.25
1108,f\nr\no\nz\ne\nn\ns\ne\nt\n(\n{\n'\nw\ne\nb\nb\ne\nd\n-\nf\ne\ne\nt\n'\n\n\n \n'\nf\na\nm\ni\nl\ny\n_\n2\n.\n1\n \nA\nn\na\nt\ni\nn\na\ne\n'\n}\n),diet_plants\nnon_sim,0.14,0.14,0.14,1.0,7.142857,0.1204,inf,1.0,0.25


## Iteration-3 Numerical features

In [611]:
def range_to_mid(r: str) -> float:
    '''
    Return the midpoint of a 'low-high' range string, or the value itself when
    a single number is given.

    Accepts strings such as '34-38', '1.5-2.5' or '7', with optional surrounding
    whitespace, as well as values that are already numeric (int/float, including
    NumPy scalars), which are returned as float.

    Raises:
    - TypeError: If `r` is neither a number nor a string that holds a number or
      a range.
    - ValueError: If a string contains '-' but does not split into exactly two
      numeric bounds.
    '''
    error_message = "Input should be a numerical range or a single value as a string."

    # Already numeric (e.g. a column pandas parsed as numbers): nothing to split.
    # bool is excluded because it is a subclass of int but not a measurement.
    if isinstance(r, (int, float, np.integer, np.floating)) and not isinstance(r, bool):
        return float(r)
    if not isinstance(r, str):
        raise TypeError(error_message)

    text = r.strip()
    if '-' in text:
        lower_bound, upper_bound = map(float, text.split('-'))
        return (lower_bound + upper_bound) / 2
    try:
        return float(text)
    except ValueError:
        raise TypeError(error_message) from None

    

def min_max_scaling(d: np.ndarray) -> np.ndarray:
    """
    Rescale the values of `d` linearly so that the minimum maps to 0 and the
    maximum maps to 1.

    When all values are equal there is no range to scale by; the result is then
    all zeros instead of the NaNs a division by zero would produce.

    Parameters:
    - d (np.ndarray): Numeric values to rescale.

    Returns:
    - np.ndarray: The rescaled values, same shape as `d`.
    """
    min_val = d.min()
    value_range = d.max() - min_val
    shifted = d - min_val
    if value_range == 0:
        # Constant input: every shifted value is already 0.
        return shifted.astype(float)
    return shifted / value_range

In [612]:
# Columns stored as 'low-high' range strings, and columns that are already numeric.
num_range_cols = ['length', 'wspan', 'weight', 'eggs']
other_num = ['AR', 'wload']

# Copy the raw values across unchanged (no Yes/No conversion), range columns first.
add_new_columns(data, encoded_features, [*num_range_cols, *other_num], booleanize=False)

encoded_features.T.iloc[70:, :2]

Unnamed: 0,0,1
webbed-feet,True,True
long-legs,False,False
wading-bird,False,False
plunge-dives,True,True
length,34-38,55-65
wspan,86-99,123-148
weight,200-350,800-1300
eggs,1-3,1-3
AR,8.13,8.24
wload,0.31,0.64


In [613]:

# Replace every 'low-high' range string by its midpoint, column by column,
# then mark the converted columns with a 'mid_' prefix.
encoded_features[num_range_cols] = encoded_features[num_range_cols].apply(
    lambda range_values: range_values.map(range_to_mid)
)
encoded_features.rename(
    columns=lambda name: f'mid_{name}' if name in num_range_cols else name,
    inplace=True,
)

# Derived ratios: BMI = weight / length^2 and WSI = wingspan / length.
encoded_features['BMI'] = encoded_features['mid_weight'].div(encoded_features['mid_length'].pow(2))
encoded_features['WSI'] = encoded_features['mid_wspan'].div(encoded_features['mid_length'])

encoded_features.T.iloc[70:, :2]

Unnamed: 0,0,1
webbed-feet,True,True
long-legs,False,False
wading-bird,False,False
plunge-dives,True,True
mid_length,36.0,60.0
mid_wspan,92.5,135.5
mid_weight,275.0,1050.0
mid_eggs,2.0,2.0
AR,8.13,8.24
wload,0.31,0.64


In [614]:
def encode_top80_numericals_as1HE_column(df: pd.DataFrame, column: str, quantile: float = 0.80) -> pd.DataFrame:
    """
    Replace a numerical column by a boolean flag marking its highest values.

    The flag is True for rows whose value is at or above the given quantile of
    the column. With the default of 0.80 that is the 80th percentile, i.e.
    roughly the top 20% of the values. The flag column is named
    'top<percent>_<column>', which gives 'top80_<column>' by default.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the data. It is not modified.
    - column (str): Name of the numerical column to encode.
    - quantile (float): Quantile in [0, 1] used as the threshold.

    Returns:
    - pd.DataFrame: A new DataFrame without `column` and with the flag column
      appended.

    Raises:
    - ValueError: If `column` is not present in `df`.
    """
    if column not in df.columns:
        raise ValueError(f"Column '{column}' does not exist in the DataFrame.")

    limit = df[column].quantile(quantile)
    new_col_name = f'top{round(quantile * 100)}_{column}'

    # Build the result from a dropped copy so the caller's frame is not
    # modified as a side effect.
    return df.drop(columns=[column]).assign(**{new_col_name: df[column] >= limit})


In [615]:

# Replace each of these numeric columns by a boolean flag that is True for
# values at or above that column's 80th percentile.
# NOTE(review): the cell is not re-runnable; a second run raises ValueError
# because the source columns have already been dropped.
numerical_compound_columns = ['AR', 'wload', 'WSI', 'BMI', 'mid_eggs']
for column in numerical_compound_columns:
    encoded_features  = encode_top80_numericals_as1HE_column(encoded_features, column=column)



In [616]:

# Transposed preview of the feature rows from position 70 onwards, first two birds.
encoded_features.T.iloc[70:, :2]

Unnamed: 0,0,1
webbed-feet,True,True
long-legs,False,False
wading-bird,False,False
plunge-dives,True,True
mid_length,36.0,60.0
mid_wspan,92.5,135.5
mid_weight,275.0,1050.0
top80_AR,True,True
top80_wload,False,False
top80_WSI,True,True


In [617]:

# Midpoint columns that are still numeric. 'mid_eggs' is left out because it
# was already replaced by its 'top80_mid_eggs' flag.
mid_columns = [f'mid_{col}' for col in num_range_cols if col != 'eggs']
mid_columns

['mid_length', 'mid_wspan', 'mid_weight']

In [618]:

# Drop the remaining numeric midpoint columns -- presumably so that only boolean
# indicator columns are passed to Apriori.
# NOTE(review): modifies the frame in place; re-running this cell raises
# KeyError because the columns are already gone.
encoded_features.drop(columns=mid_columns, inplace=True)

# Transposed preview of the feature rows from position 70 onwards, first 15 birds.
encoded_features.T.iloc[70:, :15]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
webbed-feet,True,True,True,True,True,False,False,False,False,False,False,False,False,False,False
long-legs,False,False,False,False,False,True,False,False,True,True,False,False,False,False,False
wading-bird,False,False,False,False,False,True,True,True,True,True,True,True,True,True,True
plunge-dives,True,True,True,True,True,False,False,False,False,False,False,False,False,False,False
top80_AR,True,True,True,True,True,False,False,False,False,False,False,False,False,False,False
top80_wload,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
top80_WSI,True,True,True,False,False,False,False,False,False,False,False,False,False,False,False
top80_BMI,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
top80_mid_eggs,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


### Iteration-3 Apriori


In [619]:
# Show complete cell contents and every column when rendering the rules table.
# NOTE(review): no matching pd.reset_option is visible in this part of the
# notebook, so these display settings stay active afterwards.
pd.set_option('display.max_colwidth', None)  # This sets no limit on column width
pd.set_option('display.max_columns', None)   # Show all columns
# Best-ranked rules after adding the numeric 'top80_*' flags, default thresholds.
output_rules_apriori(df = encoded_features)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric,combined_product
1843,f\nr\no\nz\ne\nn\ns\ne\nt\n(\n{\n'\nd\ni\ne\nt\n_\np\nl\na\nn\nt\ns\n'\n\n\n \n'\nn\no\nn\n_\ns\ni\nm\n'\n}\n),webbed-feet\nfamily_2.1 Anatinae,0.14,0.14,0.14,1.0,7.142857,0.1204,inf,1.0,0.25
3542,f\nr\no\nz\ne\nn\ns\ne\nt\n(\n{\n'\nw\na\nd\ni\nn\ng\n-\nb\ni\nr\nd\n'\n\n\n \n'\nd\ni\ne\nt\n_\ns\nm\na\nl\nl\n-\nr\no\nd\ne\nn\nt\ns\n'\n}\n),sim\ndiet_frogs\nlong-billed,0.08,0.08,0.08,1.0,12.5,0.0736,inf,1.0,0.25
3543,f\nr\no\nz\ne\nn\ns\ne\nt\n(\n{\n'\nw\na\nd\ni\nn\ng\n-\nb\ni\nr\nd\n'\n\n\n \n'\nd\ni\ne\nt\n_\nf\nr\no\ng\ns\n'\n}\n),diet_small-rodents\nsim\nlong-billed,0.08,0.08,0.08,1.0,12.5,0.0736,inf,1.0,0.25
3544,f\nr\no\nz\ne\nn\ns\ne\nt\n(\n{\n'\nd\ni\ne\nt\n_\ns\nm\na\nl\nl\n-\nr\no\nd\ne\nn\nt\ns\n'\n\n\n \n'\nl\no\nn\ng\n-\nb\ni\nl\nl\ne\nd\n'\n}\n),wading-bird\nsim\ndiet_frogs,0.08,0.08,0.08,1.0,12.5,0.0736,inf,1.0,0.25
3545,f\nr\no\nz\ne\nn\ns\ne\nt\n(\n{\n'\nd\ni\ne\nt\n_\nf\nr\no\ng\ns\n'\n\n\n \n'\nl\no\nn\ng\n-\nb\ni\nl\nl\ne\nd\n'\n}\n),wading-bird\ndiet_small-rodents\nsim,0.08,0.08,0.08,1.0,12.5,0.0736,inf,1.0,0.25
1862,f\nr\no\nz\ne\nn\ns\ne\nt\n(\n{\n'\nf\na\nm\ni\nl\ny\n_\n2\n.\n1\n \nA\nn\na\nt\ni\nn\na\ne\n'\n}\n),diet_plants\ntop80_mid_eggs\nnon_sim,0.14,0.14,0.14,1.0,7.142857,0.1204,inf,1.0,0.25
1863,f\nr\no\nz\ne\nn\ns\ne\nt\n(\n{\n'\nd\ni\ne\nt\n_\np\nl\na\nn\nt\ns\n'\n\n\n \n'\nt\no\np\n8\n0\n_\nm\ni\nd\n_\ne\ng\ng\ns\n'\n\n\n \n'\nw\ne\nb\nb\ne\nd\n-\nf\ne\ne\nt\n'\n}\n),family_2.1 Anatinae,0.14,0.14,0.14,1.0,7.142857,0.1204,inf,1.0,0.25
3616,f\nr\no\nz\ne\nn\ns\ne\nt\n(\n{\n'\nd\ni\ne\nt\n_\np\nl\na\nn\nt\ns\n'\n\n\n \n'\nt\no\np\n8\n0\n_\nm\ni\nd\n_\ne\ng\ng\ns\n'\n\n\n \n'\nn\no\nn\n_\ns\ni\nm\n'\n\n\n \n'\nw\ne\nb\nb\ne\nd\n-\nf\ne\ne\nt\n'\n}\n),family_2.1 Anatinae,0.14,0.14,0.14,1.0,7.142857,0.1204,inf,1.0,0.25
3620,f\nr\no\nz\ne\nn\ns\ne\nt\n(\n{\n'\nd\ni\ne\nt\n_\np\nl\na\nn\nt\ns\n'\n\n\n \n'\nt\no\np\n8\n0\n_\nm\ni\nd\n_\ne\ng\ng\ns\n'\n\n\n \n'\nn\no\nn\n_\ns\ni\nm\n'\n}\n),webbed-feet\nfamily_2.1 Anatinae,0.14,0.14,0.14,1.0,7.142857,0.1204,inf,1.0,0.25
3622,f\nr\no\nz\ne\nn\ns\ne\nt\n(\n{\n'\nd\ni\ne\nt\n_\np\nl\na\nn\nt\ns\n'\n\n\n \n'\nn\no\nn\n_\ns\ni\nm\n'\n\n\n \n'\nw\ne\nb\nb\ne\nd\n-\nf\ne\ne\nt\n'\n}\n),top80_mid_eggs\nfamily_2.1 Anatinae,0.14,0.14,0.14,1.0,7.142857,0.1204,inf,1.0,0.25
