In [52]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules


In [152]:
# Load the bird data
df = pd.read_csv("birds2024ext.csv", delimiter=';')

df.head()

Unnamed: 0,species,group,length,wspan,weight,AR,wload,back,belly,ftype,...,incub,ccare,biotope,diet,diver,long-billed,webbed-feet,long-legs,wading-bird,plunge-dives
0,naurulokki,laridae,34-38,86-99,200-350,8.13,0.31,light grey,white,B,...,both,both,"lakes,sea-bays","fish,invertebrates,garbage",No,No,Yes,No,No,Yes
1,harmaalokki,laridae,55-65,123-148,800-1300,8.24,0.64,bluish grey,white,B,...,both,both,"lakes,sea-coast,marshland","fish,garbage,chicks,grain",No,No,Yes,No,No,Yes
2,isolokki,laridae,63-68,138-158,1000-1800,8.24,0.66,bluish grey,white,B,...,both,both,"sea-coast, harbours","fish,eggs,chicks,garbage,carrion",No,No,Yes,No,No,Yes
3,kalatiira,sternidae,36-42,70-80,100-145,9.14,0.24,grey,white,B,...,both,both,"lakes,archipelago",fish,No,No,Yes,No,No,Yes
4,lapintiira,sternidae,33-37,66-77,90-130,8.97,0.2,grey,white,B,...,both,both,"archipelago,lakes,marshland",fish,No,No,Yes,No,No,Yes


In [153]:
# Feature extraction (example features based on the task description)
# This will involve creating features like group, habitat, diet, etc.
# May need to preprocess certain columns, especially categorical and binary features.

# Example of handling categorical features
# Assuming the dataset has columns like 'group', 'habitat', 'diet', etc.

# Habitat is mentioned here as biotope

df['group'] = df['group'].fillna('Unknown')
df['biotope'] = df['biotope'].fillna('Unknown')


In [154]:
# Convert binary features (e.g., long-billed)
df['long-billed'] = df['long-billed'].apply(lambda x: 1 if x == 'Yes' else 0)
df['webbed-feet'] = df['webbed-feet'].apply(lambda x: 1 if x == 'Yes' else 0)
df['long-legs'] = df['long-legs'].apply(lambda x: 1 if x == 'Yes' else 0)
df['wading-bird'] = df['wading-bird'].apply(lambda x: 1 if x == 'Yes' else 0)
df['plunge-dives'] = df['plunge-dives'].apply(lambda x: 1 if x == 'Yes' else 0)
df['diver'] = df['diver'].apply(lambda x: 1 if x == 'Yes' else 0)


In [155]:
# Handle multi-valued categorical features
# Split columns like 'biotope' into multiple binary features
biotope_dummies = df['biotope'].str.get_dummies(',')
biotope_dummies.columns = biotope_dummies.columns.str.strip()+'-bio'

diet_dummies = df['diet'].str.get_dummies(',')
diet_dummies.columns = diet_dummies.columns.str.strip()+'-diet'


In [156]:
# Merge dummy columns back into the main dataframe
df = pd.concat([df, biotope_dummies, diet_dummies], axis=1)


In [157]:
df.tail()

Unnamed: 0,species,group,length,wspan,weight,AR,wload,back,belly,ftype,...,plants-diet,rodents-diet,seeds-diet,shellfish-diet,small-rodents-diet,snails-diet,snakes-diet,squirrels-diet,vertebrae-diet,worms-diet
45,mehiläishaukka,accipitridae,52-59,110-132,510-1160,5.48,0.38,greyish brown,brown-white,A,...,0,0,0,0,1,0,0,0,0,0
46,ruskosuohaukka,accipitridae,43-55,115-140,480-750,6.37,0.34,dark brown,brown,A,...,0,0,0,0,1,0,0,0,0,0
47,sinisuohaukka,accipitridae,42-55,97-118,285-630,5.81,0.21,brown,brown-white,A,...,0,0,0,0,1,0,0,0,0,0
48,haarahaukka,accipitridae,48-58,130-155,560-940,7.14,0.33,dark brown,dark brown,A,...,0,0,0,0,1,0,0,0,0,0
49,merikotka,accipitridae,78-100,190-240,4000-7000,5.18,0.68,brown,brown,A,...,0,0,0,0,0,0,0,0,0,0


In [158]:
df.columns

Index(['species', 'group', 'length', 'wspan', 'weight', 'AR', 'wload', 'back',
       'belly', 'ftype', 'sim', 'billcol', 'legcol', 'arrives', 'leaves',
       'eggs', 'incub', 'ccare', 'biotope', 'diet', 'diver', 'long-billed',
       'webbed-feet', 'long-legs', 'wading-bird', 'plunge-dives',
       'forest-edges-bio', 'harbours-bio', 'archipelago-bio',
       'coastal-meadows-bio', 'fells-bio', 'fields-bio', 'forest-edges-bio',
       'forests-bio', 'islets-bio', 'lakes-bio', 'marshland-bio',
       'meadows-bio', 'nutrient-rich-lakes-bio', 'pastures-bio', 'ponds-bio',
       'reedbeds-bio', 'sea-bays-bio', 'sea-coast-bio', 'shores-bio',
       'shrub-tundra-bio', 'streams-bio', 'wetlands-bio', 'algae-diet',
       'berries-diet', 'birds-diet', 'carrion-diet', 'chicks-diet',
       'clams-diet', 'eggs-diet', 'fish-diet', 'frogs-diet', 'garbage-diet',
       'grain-diet', 'grass-diet', 'hares-diet', 'insects-diet',
       'invertebrates-diet', 'larvae-diet', 'lizards-diet', 'molluscs-

In [159]:
# Removing original columns as we now have dummies
df.drop(columns=['biotope', 'diet'], inplace=True)


In [160]:
df['eggs'].head()

0    1-3
1    1-3
2      3
3    1-3
4    1-3
Name: eggs, dtype: object

In [161]:
def range_to_mid(r):
    '''
    Return the mid value of a numerical range
    '''
    if isinstance(r, str) and '-' in r:
        lower_bound, upper_bound = list(map(int,r.split('-')))
        return (lower_bound + upper_bound) / 2
    else:
        return r

In [162]:
num_range_cols = ['length', 'wspan', 'weight', 'eggs']
# map numerical ranges to their mid points
df[num_range_cols] = df[num_range_cols].map(range_to_mid)
df.rename(columns={col: f'mid-{col}' for col in num_range_cols}, inplace=True)


In [163]:
df.head()

Unnamed: 0,species,group,mid-length,mid-wspan,mid-weight,AR,wload,back,belly,ftype,...,plants-diet,rodents-diet,seeds-diet,shellfish-diet,small-rodents-diet,snails-diet,snakes-diet,squirrels-diet,vertebrae-diet,worms-diet
0,naurulokki,laridae,36.0,92.5,275.0,8.13,0.31,light grey,white,B,...,0,0,0,0,0,0,0,0,0,0
1,harmaalokki,laridae,60.0,135.5,1050.0,8.24,0.64,bluish grey,white,B,...,0,0,0,0,0,0,0,0,0,0
2,isolokki,laridae,65.5,148.0,1400.0,8.24,0.66,bluish grey,white,B,...,0,0,0,0,0,0,0,0,0,0
3,kalatiira,sternidae,39.0,75.0,122.5,9.14,0.24,grey,white,B,...,0,0,0,0,0,0,0,0,0,0
4,lapintiira,sternidae,35.0,71.5,110.0,8.97,0.2,grey,white,B,...,0,0,0,0,0,0,0,0,0,0


In [164]:
df.columns

Index(['species', 'group', 'mid-length', 'mid-wspan', 'mid-weight', 'AR',
       'wload', 'back', 'belly', 'ftype', 'sim', 'billcol', 'legcol',
       'arrives', 'leaves', 'mid-eggs', 'incub', 'ccare', 'diver',
       'long-billed', 'webbed-feet', 'long-legs', 'wading-bird',
       'plunge-dives', 'forest-edges-bio', 'harbours-bio', 'archipelago-bio',
       'coastal-meadows-bio', 'fells-bio', 'fields-bio', 'forest-edges-bio',
       'forests-bio', 'islets-bio', 'lakes-bio', 'marshland-bio',
       'meadows-bio', 'nutrient-rich-lakes-bio', 'pastures-bio', 'ponds-bio',
       'reedbeds-bio', 'sea-bays-bio', 'sea-coast-bio', 'shores-bio',
       'shrub-tundra-bio', 'streams-bio', 'wetlands-bio', 'algae-diet',
       'berries-diet', 'birds-diet', 'carrion-diet', 'chicks-diet',
       'clams-diet', 'eggs-diet', 'fish-diet', 'frogs-diet', 'garbage-diet',
       'grain-diet', 'grass-diet', 'hares-diet', 'insects-diet',
       'invertebrates-diet', 'larvae-diet', 'lizards-diet', 'molluscs-die

In [165]:
df['mid-eggs'] = pd.to_numeric(df['mid-eggs'], errors='coerce')

In [166]:
df['mid-eggs']

0      2.0
1      2.0
2      3.0
3      2.0
4      2.0
5      4.0
6      4.0
7      4.0
8      4.0
9      4.0
10     4.0
11     4.0
12     4.0
13     4.0
14     4.0
15     3.0
16     4.0
17     8.0
18     9.0
19     8.5
20     9.0
21    10.0
22     9.0
23     9.5
24     6.0
25     6.5
26     2.0
27     8.5
28    10.0
29     7.0
30     8.0
31    10.0
32     3.5
33     4.5
34     4.5
35     5.0
36     5.0
37     2.0
38     2.0
39     6.0
40     5.0
41     4.0
42     3.5
43     5.0
44     2.5
45     1.5
46     4.5
47     5.0
48     2.5
49     2.0
Name: mid-eggs, dtype: float64

In [167]:
# Handling numerical features: create features for extremes (e.g., eggs)
df['few-eggs'] = df['mid-eggs'].apply(lambda x: 1 if x < 3 else 0)
df['many-eggs'] = df['mid-eggs'].apply(lambda x: 1 if x > 6 else 0)


In [168]:
df['mid-eggs']

0      2.0
1      2.0
2      3.0
3      2.0
4      2.0
5      4.0
6      4.0
7      4.0
8      4.0
9      4.0
10     4.0
11     4.0
12     4.0
13     4.0
14     4.0
15     3.0
16     4.0
17     8.0
18     9.0
19     8.5
20     9.0
21    10.0
22     9.0
23     9.5
24     6.0
25     6.5
26     2.0
27     8.5
28    10.0
29     7.0
30     8.0
31    10.0
32     3.5
33     4.5
34     4.5
35     5.0
36     5.0
37     2.0
38     2.0
39     6.0
40     5.0
41     4.0
42     3.5
43     5.0
44     2.5
45     1.5
46     4.5
47     5.0
48     2.5
49     2.0
Name: mid-eggs, dtype: float64

In [169]:
# Removing mid-eggs
df.drop(columns=['mid-eggs'], inplace=True)


In [170]:
# df_new = df.drop(['back', 'belly', 'ftype', 'sim', 'billcol', 'legcol', 'arrives', 'leaves', 'incub', 'ccare'], axis=1)

df_new = df.drop(['back', 'belly', 'ftype', 'sim', 'billcol', 'legcol', 'arrives', 'leaves', 'incub', 'ccare'], axis=1)

In [171]:
df_new.to_csv('output1.csv', index=False)

In [174]:
df_binary = df_new.applymap(lambda x: 1 if x != 0 else 0) 

# Use the apriori function with the binary DataFrame
frequent_itemsets = apriori(df_binary, min_support=0.05, use_colnames=True)

  df_binary = df_new.applymap(lambda x: 1 if x != 0 else 0)  # Adjust this logic based on your data


In [175]:
# Generate association rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.5)

# Filter the rules for more interesting results
significant_rules = rules[(rules['confidence'] > 0.6) & (rules['lift'] > 1.5)]

# Output the most significant and interesting rules
significant_rules = significant_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]

# Display the resulting rules
significant_rules.head()

Unnamed: 0,antecedents,consequents,support,confidence,lift
0,(diver),(webbed-feet),0.18,1.0,2.272727
2,(diver),(lakes-bio),0.14,0.777778,2.287582
8,(diver),(fish-diet),0.14,0.777778,2.430556
13,(shellfish-diet),(diver),0.08,1.0,5.555556
15,(long-legs),(long-billed),0.12,0.666667,2.564103


In [177]:
significant_rules.head(20)

Unnamed: 0,antecedents,consequents,support,confidence,lift
0,(diver),(webbed-feet),0.18,1.0,2.272727
2,(diver),(lakes-bio),0.14,0.777778,2.287582
8,(diver),(fish-diet),0.14,0.777778,2.430556
13,(shellfish-diet),(diver),0.08,1.0,5.555556
15,(long-legs),(long-billed),0.12,0.666667,2.564103
17,(long-billed),(wading-bird),0.2,0.769231,1.831502
29,(plunge-dives),(webbed-feet),0.1,0.833333,1.893939
31,(archipelago-bio),(webbed-feet),0.1,0.833333,1.893939
32,(webbed-feet),(lakes-bio),0.3,0.681818,2.005348
33,(lakes-bio),(webbed-feet),0.3,0.882353,2.005348


In [185]:
randoms = significant_rules.sort_values(by=['lift', 'confidence'], ascending=False)

In [186]:
randoms.head(20)

Unnamed: 0,antecedents,consequents,support,confidence,lift
25292,"(small-rodents-diet, long-legs)","(frogs-diet, long-billed)",0.06,1.0,12.5
25294,"(frogs-diet, long-legs)","(small-rodents-diet, long-billed)",0.06,1.0,12.5
25318,"(small-rodents-diet, wading-bird)","(frogs-diet, long-billed)",0.08,1.0,12.5
25320,"(small-rodents-diet, long-billed)","(wading-bird, frogs-diet)",0.08,1.0,12.5
25321,"(wading-bird, frogs-diet)","(small-rodents-diet, long-billed)",0.08,1.0,12.5
25323,"(frogs-diet, long-billed)","(small-rodents-diet, wading-bird)",0.08,1.0,12.5
25540,"(small-rodents-diet, long-legs)","(wading-bird, frogs-diet)",0.06,1.0,12.5
25543,"(frogs-diet, long-legs)","(small-rodents-diet, wading-bird)",0.06,1.0,12.5
59897,"(small-rodents-diet, species, long-legs)","(frogs-diet, long-billed)",0.06,1.0,12.5
59900,"(species, frogs-diet, long-legs)","(small-rodents-diet, long-billed)",0.06,1.0,12.5
