In [71]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Numerical features engineering

### Data Summary

In [72]:
data = pd.read_csv('birds2024.csv', delimiter=';')

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 26 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   species       50 non-null     object 
 1   group         50 non-null     object 
 2   length        50 non-null     object 
 3   wspan         50 non-null     object 
 4   weight        50 non-null     object 
 5   AR            50 non-null     float64
 6   wload         50 non-null     float64
 7   back          50 non-null     object 
 8   belly         50 non-null     object 
 9   ftype         50 non-null     object 
 10  sim           50 non-null     object 
 11  billcol       50 non-null     object 
 12  legcol        50 non-null     object 
 13  arrives       50 non-null     object 
 14  leaves        50 non-null     object 
 15  eggs          50 non-null     object 
 16  incub         50 non-null     object 
 17  ccare         50 non-null     object 
 18  biotope       50 non-null     ob

In [73]:
print(data.iloc[:5, :10])
data.iloc[:5, 10:]

       species      group length    wspan     weight    AR  wload  \
0   naurulokki    laridae  34-38    86-99    200-350  8.13   0.31   
1  harmaalokki    laridae  55-65  123-148   800-1300  8.24   0.64   
2     isolokki    laridae  63-68  138-158  1000-1800  8.24   0.66   
3    kalatiira  sternidae  36-42    70-80    100-145  9.14   0.24   
4   lapintiira  sternidae  33-37    66-77     90-130  8.97   0.20   

          back  belly ftype  
0   light grey  white     B  
1  bluish grey  white     B  
2  bluish grey  white     B  
3         grey  white     B  
4         grey  white     B  


Unnamed: 0,sim,billcol,legcol,arrives,leaves,eggs,incub,ccare,biotope,diet,diver,long-billed,webbed-feet,long-legs,wading-bird,plunge-dives
0,Yes,red,red,March-April,July,1-3,both,both,"lakes,sea-bays","fish,invertebrates,garbage",No,No,Yes,No,No,Yes
1,Yes,yellow,reddish,March-April,August-December,1-3,both,both,"lakes,sea-coast,marshland","fish,garbage,chicks,grain",No,No,Yes,No,No,Yes
2,Yes,yellow,reddish,December,March,3,both,both,"sea-coast, harbours","fish,eggs,chicks,garbage,carrion",No,No,Yes,No,No,Yes
3,Yes,red,red,May,August-September,1-3,both,both,"lakes,archipelago",fish,No,No,Yes,No,No,Yes
4,Yes,red,red,May,July-August,1-3,both,both,"archipelago,lakes,marshland",fish,No,No,Yes,No,No,Yes


In [74]:
print(data.iloc[0:2, :].T) 


                                       0                          1
species                       naurulokki                harmaalokki
group                            laridae                    laridae
length                             34-38                      55-65
wspan                              86-99                    123-148
weight                           200-350                   800-1300
AR                                  8.13                       8.24
wload                               0.31                       0.64
back                          light grey                bluish grey
belly                              white                      white
ftype                                  B                          B
sim                                  Yes                        Yes
billcol                              red                     yellow
legcol                               red                    reddish
arrives                      March-April        

In [75]:
data.head(5)

Unnamed: 0,species,group,length,wspan,weight,AR,wload,back,belly,ftype,...,incub,ccare,biotope,diet,diver,long-billed,webbed-feet,long-legs,wading-bird,plunge-dives
0,naurulokki,laridae,34-38,86-99,200-350,8.13,0.31,light grey,white,B,...,both,both,"lakes,sea-bays","fish,invertebrates,garbage",No,No,Yes,No,No,Yes
1,harmaalokki,laridae,55-65,123-148,800-1300,8.24,0.64,bluish grey,white,B,...,both,both,"lakes,sea-coast,marshland","fish,garbage,chicks,grain",No,No,Yes,No,No,Yes
2,isolokki,laridae,63-68,138-158,1000-1800,8.24,0.66,bluish grey,white,B,...,both,both,"sea-coast, harbours","fish,eggs,chicks,garbage,carrion",No,No,Yes,No,No,Yes
3,kalatiira,sternidae,36-42,70-80,100-145,9.14,0.24,grey,white,B,...,both,both,"lakes,archipelago",fish,No,No,Yes,No,No,Yes
4,lapintiira,sternidae,33-37,66-77,90-130,8.97,0.2,grey,white,B,...,both,both,"archipelago,lakes,marshland",fish,No,No,Yes,No,No,Yes


## Iteration-1 features

start with the habitat, group, diet as the first features.

In [76]:
# More descriptive naming
feature_subset = data[['species', 'group', 'biotope', 'diet']]
feature_subset.head(5)


Unnamed: 0,species,group,biotope,diet
0,naurulokki,laridae,"lakes,sea-bays","fish,invertebrates,garbage"
1,harmaalokki,laridae,"lakes,sea-coast,marshland","fish,garbage,chicks,grain"
2,isolokki,laridae,"sea-coast, harbours","fish,eggs,chicks,garbage,carrion"
3,kalatiira,sternidae,"lakes,archipelago",fish
4,lapintiira,sternidae,"archipelago,lakes,marshland",fish


In [77]:
def count_unique_items(df, columns):
    """
    Counts the number of unique items in each specified column of the DataFrame,
    considering that some cell values contain comma-separated items.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the data.
    - columns (list): List of column names to analyze.

    Returns:
    - dict: A dictionary with column names as keys and unique counts as values.
    """
    unique_counts = {}
    for column in columns:
        unique_items = set()
        for items in df[column].dropna():
            # Split the string by commas and strip whitespace
            split_items = [item.strip() for item in items.split(',')]
            unique_items.update(split_items)
        unique_counts[column] = len(unique_items)
    return unique_counts


In [78]:
# Calculate unique counts
unique_counts = count_unique_items(feature_subset, feature_subset.columns.tolist()[1:])
sum = 0
# Print unique counts
print("\nNumber of Unique Values in Each Column:")
for column, count in unique_counts.items():
    sum += count
    print(f"Column '{column}': {count} unique value(s)")
print(f"total count: {sum}")



Number of Unique Values in Each Column:
Column 'species': 50 unique value(s)
Column 'group': 14 unique value(s)
Column 'biotope': 21 unique value(s)
Column 'diet': 28 unique value(s)
total count: 113


### One-Hot Encoding

The Apriori algorithm needs the data to be in a 1 hot encoded format with binary values for each feature. thus the 3 features are now subdivided into binary features



go through groups, if not in array, add it. 
then loop through that arr, and add that to columns. 
then loop through data and add 1 to that col if group is that 
then delete group column


pd.concat() in pandas is used to concatenate two or more DataFrames or Series along a particular axis (either rows or columns). You can specify how you want to join the data (e.g., along rows or columns) and whether to keep all data or just the common parts.

In [79]:
pd.set_option('display.max_rows', None)  # This will show all rows

# One-Hot Encode the 'group' Column
group_dummies = pd.get_dummies(feature_subset['group'], prefix='group')

# One-Hot Encode the 'biotope' Column
biotope_expanded = feature_subset['biotope'].str.get_dummies(sep=',').rename(columns=lambda x: f'biotope_{x.strip()}')

# One-Hot Encode the 'diet' Column
diet_expanded = feature_subset['diet'].str.get_dummies(sep=',').rename(columns=lambda x: f'diet_{x.strip()}')

# Concatenate all dummy DataFrames with the original subset, but drop the original 'group', 'biotope', 'diet' columns
encoded_features = pd.concat([feature_subset.drop(columns=['group', 'biotope', 'diet']), group_dummies, biotope_expanded, diet_expanded], axis=1)
# Display the Encoded Features
print(encoded_features.head(1).T)
print("encoded_features.head(1).T.shape", " : " , encoded_features.head(1).T.shape)


                                      0
species                      naurulokki
group_accipitridae                False
group_anserinae                   False
group_ardeidae                    False
group_charadriidae                False
group_dabbling ducks              False
group_diving ducks                False
group_gaviidae                    False
group_gruifores                   False
group_laridae                      True
group_phalacrocoracidae           False
group_podicipedidae               False
group_rallidae                    False
group_scolopacidae                False
group_sternidae                   False
biotope_forest-edges                  0
biotope_harbours                      0
biotope_archipelago                   0
biotope_coastal-meadows               0
biotope_fells                         0
biotope_fields                        0
biotope_forest-edges                  0
biotope_forests                       0
biotope_islets                        0
