In [1]:
pip install mlxtend

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Import necessary libraries:
import pandas as pd
import numpy as np
import mlxtend
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [3]:
# Import dataset(This csv fiel is created from Project_1_ETL_part,ipynb):
df_merged_medals_hosts_continent = pd.read_csv("merged_medals_hosts_continent.csv")

## 1. Association rule mining for China:

**- Filter dataset:**

In [4]:
# Only keep China's related data:
df_association_rule_China = df_merged_medals_hosts_continent[df_merged_medals_hosts_continent["Country Name"].isin(["China"])].copy()

In [5]:
# Drop the columns that I don't need for this time's association rule mining:
df_association_rule_China.drop(["Country Name", "Continent", "Game Season", "Slug Game", 
                                "Participant Type", "Event Title", "Game Year", "Athlete Full Name"], axis=1, inplace=True)

In [6]:
df_association_rule_China

Unnamed: 0,Discipline Title,Event Gender,Medal Type
25,Freestyle Skiing,Women,GOLD
31,Freestyle Skiing,Mixed,SILVER
37,Freestyle Skiing,Men,GOLD
40,Freestyle Skiing,Women,GOLD
43,Freestyle Skiing,Women,GOLD
...,...,...,...
12177,Diving,Women,GOLD
12185,Diving,Men,BRONZE
12187,Diving,Men,SILVER
12241,Archery,Women,SILVER


In [7]:
# Check missing values in the dataframe:
df_association_rule_China.isna

<bound method DataFrame.isna of        Discipline Title Event Gender Medal Type
25     Freestyle Skiing        Women       GOLD
31     Freestyle Skiing        Mixed     SILVER
37     Freestyle Skiing          Men       GOLD
40     Freestyle Skiing        Women       GOLD
43     Freestyle Skiing        Women       GOLD
...                 ...          ...        ...
12177            Diving        Women       GOLD
12185            Diving          Men     BRONZE
12187            Diving          Men     SILVER
12241           Archery        Women     SILVER
12251          Handball        Women     BRONZE

[807 rows x 3 columns]>

In [8]:
# Count total missing values at each column in the dataframe:
df_association_rule_China.isna().sum()

Discipline Title    0
Event Gender        0
Medal Type          0
dtype: int64

**- Data Transformation:**

In [9]:
# Convert the data in new_df_association_rule_China to string type since TransactionEncoder() function only can handle string type:
new_df_association_rule_China = df_association_rule_China.astype(str)

In [10]:
# Convert the data from dataframe new_df_association_rule_China into a list:
list_China = new_df_association_rule_China.values.tolist()

In [11]:
# Covert the list to one-hot encoded boolean numpy array. 
# Apriori function allows boolean data type only, such as 1 and 0, or FALSE and TRUE.
te_China = TransactionEncoder()
array_te_China = te_China.fit(list_China).transform(list_China)

In [12]:
# check the array:
array_te_China

array([[False, False, False, ..., False,  True, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False,  True, False, ..., False,  True, False],
       [False, False, False, ..., False,  True, False]])

In [13]:
# check the columns:
te_China.columns_

['3x3 Basketball',
 'Archery',
 'Artistic Gymnastics',
 'Artistic Swimming',
 'Athletics',
 'BRONZE',
 'Badminton',
 'Basketball',
 'Beach Volleyball',
 'Boxing',
 'Canoe Sprint',
 'Curling',
 'Cycling Track',
 'Diving',
 'Fencing',
 'Figure skating',
 'Football',
 'Freestyle Skiing',
 'GOLD',
 'Golf',
 'Gymnastics Artistic',
 'Gymnastics Rhythmic',
 'Handball',
 'Hockey',
 'Judo',
 'Karate',
 'Men',
 'Mixed',
 'Modern Pentathlon',
 'Open',
 'Rowing',
 'SILVER',
 'Sailing',
 'Shooting',
 'Short Track',
 'Short Track Speed Skating',
 'Skeleton',
 'Snowboard',
 'Softball',
 'Speed skating',
 'Swimming',
 'Synchronized Swimming',
 'Table Tennis',
 'Taekwondo',
 'Tennis',
 'Trampoline',
 'Trampoline Gymnastics',
 'Volleyball',
 'Weightlifting',
 'Women',
 'Wrestling']

In [14]:
# Covert the array to a dataframe since Apriori function can handle dataframe only:
arm_df_China = pd.DataFrame(array_te_China, columns = te_China.columns_)

**- Association Rule Generation:**

In [15]:
#Find the frequent itemsets
frequent_itemsets_China = apriori(arm_df_China,min_support=0.05,use_colnames =True)

In [16]:
#Check the length of rules
frequent_itemsets_China["length"]=frequent_itemsets_China["itemsets"].apply(lambda x: len(x))

In [17]:
# Use confidence to filter out association rules that are not strong enough:
# Assume the min confidence is 0.5
rules_con_China = association_rules(frequent_itemsets_China, metric="confidence",min_threshold=0.5)

In [18]:
# Use lift to filter out association rules
# Assume the min lift is 1
rules_lift_China = association_rules(frequent_itemsets_China, metric="lift",min_threshold=1)

**- Result:**

In [19]:
# Based on min confidence (=0.5), 
# output antecedents, consequents, support, confidence and lift.
result_arm_China = rules_con_China[["antecedents","consequents","support","confidence","lift"]]

In [20]:
result_arm_China

Unnamed: 0,antecedents,consequents,support,confidence,lift
0,(BRONZE),(Women),0.149938,0.565421,1.009501
1,(Diving),(GOLD),0.081784,0.634615,1.537942
2,(Diving),(Women),0.065675,0.509615,0.909866
3,(Table Tennis),(GOLD),0.050805,0.532468,1.290394
4,(GOLD),(Women),0.223048,0.540541,0.96508
5,(Weightlifting),(Men),0.053284,0.693548,1.788158
6,(SILVER),(Women),0.187113,0.580769,1.036904
7,(Table Tennis),(Women),0.050805,0.532468,0.950667


## 2. Association rule mining for Australia:

**- Filter dataset:**

In [21]:
# Only keep Australia's related data:
df_association_rule_Australia = df_merged_medals_hosts_continent[df_merged_medals_hosts_continent["Country Name"].isin(["Australia"])].copy()

In [22]:
# Drop the columns that I don't need for this time's association rule mining:
df_association_rule_Australia.drop(["Country Name", "Continent", "Game Season", "Slug Game", "Participant Type", 
                                    "Event Title", "Game Year", "Athlete Full Name"], axis=1, inplace=True)

In [23]:
df_association_rule_Australia

Unnamed: 0,Discipline Title,Event Gender,Medal Type
28,Freestyle Skiing,Women,GOLD
104,Snowboard,Men,SILVER
105,Snowboard,Women,BRONZE
238,Skeleton,Women,SILVER
437,Diving,Women,BRONZE
...,...,...,...
21401,Athletics,Men,BRONZE
21539,Swimming,Men,GOLD
21545,Swimming,Men,GOLD
21570,Athletics,Men,GOLD


In [24]:
# Check missing values in the dataframe:
df_association_rule_Australia.isna

<bound method DataFrame.isna of        Discipline Title Event Gender Medal Type
28     Freestyle Skiing        Women       GOLD
104           Snowboard          Men     SILVER
105           Snowboard        Women     BRONZE
238            Skeleton        Women     SILVER
437              Diving        Women     BRONZE
...                 ...          ...        ...
21401         Athletics          Men     BRONZE
21539          Swimming          Men       GOLD
21545          Swimming          Men       GOLD
21570         Athletics          Men       GOLD
21597         Athletics          Men       GOLD

[627 rows x 3 columns]>

In [25]:
# Count total missing values at each column in the dataframe:
df_association_rule_Australia.isna().sum()

Discipline Title    0
Event Gender        0
Medal Type          0
dtype: int64

**- Data Transformation:**

In [26]:
# Convert the data in new_df_association_rule_Australia to string type since TransactionEncoder() function only can handle string type:
new_df_association_rule_Australia = df_association_rule_Australia.astype(str)

In [27]:
# Convert the data from dataframe new_df_association_rule_Autralia into a list:
list_Australia = new_df_association_rule_Australia.values.tolist()

In [28]:
# Covert the list to one-hot encoded boolean numpy array. 
# Apriori function allows boolean data type only, such as 1 and 0, or FALSE and TRUE.
te_Australia = TransactionEncoder()
array_te_Australia = te_Australia.fit(list_Australia).transform(list_Australia)

In [29]:
# check the array:
array_te_Australia

array([[False, False, False, ..., False,  True, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False,  True, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False,  True, ..., False, False, False],
       [False, False,  True, ..., False, False, False]])

In [30]:
# check the columns:
te_Australia.columns_

['Alpine Skiing',
 'Archery',
 'Athletics',
 'BRONZE',
 'Baseball',
 'Basketball',
 'Beach Volleyball',
 'Boxing',
 'Canoe Marathon',
 'Canoe Slalom',
 'Canoe Sprint',
 'Cycling BMX',
 'Cycling BMX Freestyle',
 'Cycling Road',
 'Cycling Track',
 'Diving',
 'Equestrian',
 'Equestrian Eventing',
 'Freestyle Skiing',
 'GOLD',
 'Hockey',
 'Judo',
 'Marathon Swimming',
 'Men',
 'Mixed',
 'Modern Pentathlon',
 'Open',
 'Rowing',
 'Rugby',
 'SILVER',
 'Sailing',
 'Shooting',
 'Short Track Speed Skating',
 'Skateboarding',
 'Skeleton',
 'Snowboard',
 'Softball',
 'Surfing',
 'Swimming',
 'Taekwondo',
 'Tennis',
 'Trampoline',
 'Triathlon',
 'Water Polo',
 'Weightlifting',
 'Women',
 'Wrestling']

In [31]:
# Covert the array to a dataframe since Apriori function can handle dataframe only:
arm_df_Australia = pd.DataFrame(array_te_Australia, columns = te_Australia.columns_)

**- Association Rule Generation:**

In [32]:
#Find the frequent itemsets
frequent_itemsets_Australia = apriori(arm_df_Australia,min_support=0.05,use_colnames =True)

In [33]:
#Check the length of rules
frequent_itemsets_Australia["length"]=frequent_itemsets_Australia["itemsets"].apply(lambda x: len(x))

In [34]:
# Use confidence to filter out association rules that are not strong enough:
# Assume the min confidence is 0.5
rules_con_Australia = association_rules(frequent_itemsets_Australia, metric="confidence",min_threshold=0.5)

In [35]:
# Use lift to filter out association rules
# Assume the min lift is 1
rules_lift_Australia = association_rules(frequent_itemsets_Australia, metric="lift",min_threshold=1)

**- Result:**

In [36]:
#Based on min confidence (=0.5), 
#output antecedents, consequents, support, confidence and lift.
result_arm_Australia = rules_con_Australia[["antecedents","consequents","support","confidence","lift"]]

In [38]:
result_arm_Australia

Unnamed: 0,antecedents,consequents,support,confidence,lift
0,(Athletics),(Women),0.063796,0.526316,1.352459
1,(BRONZE),(Men),0.205742,0.53527,0.975622
2,(Cycling Track),(Men),0.062201,0.75,1.367006
3,(GOLD),(Men),0.165869,0.547368,0.997674
4,(Rowing),(Men),0.065391,0.719298,1.311047
5,(SILVER),(Men),0.177033,0.566327,1.032229
6,(Swimming),(Men),0.185008,0.54717,0.997312
7,"(Swimming, BRONZE)",(Men),0.068581,0.589041,1.07363
8,"(GOLD, Swimming)",(Women),0.055821,0.507246,1.303457
9,"(Swimming, SILVER)",(Men),0.062201,0.557143,1.01549
