In [1]:
import numpy as np
import pandas as pd

from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

### 1. Choose a transactional dataset which is suitable for association rule analysis problem from the UCI Machine Learning Repository or from any other dataset repositories.
### 2. Download the selected dataset – you are required to provide a link to the download page for your selected dataset.
The dataset can be found [here](https://archive.ics.uci.edu/ml/datasets/online%20retail)

In [2]:

# Load the raw Online Retail workbook into a DataFrame; openpyxl is the engine used to read the .xlsx file.
df = pd.read_excel(
    io='dataset/Online Retail.xlsx',
    engine='openpyxl',
)


In [3]:
# Preview the first five rows to check how the columns were parsed.
df.head(n=5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


### 3. Describe the dataset and the data mining task.
The dataset is a transnational data set that contains online retail transactions. The task is to build association rules that uncover relationships between products that are frequently purchased together.

In [14]:
# Report the period covered by the invoices and the number of distinct stock codes.
first_day = pd.to_datetime(df['InvoiceDate'].min()).date()
last_day = pd.to_datetime(df['InvoiceDate'].max()).date()
n_items = df['StockCode'].nunique()
# The original message was garbled ("containe recorders start form"); the wording is corrected here.
print(f'The dataset contains records from {first_day} to {last_day} and has {n_items} unique items')

The dataset containe recorders start form 2010-12-01 to 2011-12-09 and has 4070 unique item


### 4. Display the number of instances.
### 5. Display the number of attributes.


In [None]:
# DataFrame.shape is (rows, columns): rows are the instances, columns the attributes.
n_records, n_attributes = df.shape
print(f"The data has {n_records} records, {n_attributes} attributes")

### 6. Display a statistical summary for all the attributes.

In [None]:
# The default describe() leaves out the text columns; include='all' summarises
# every attribute, as this step asks for.
df.describe(include='all')

### 7. Check whether the selected dataset has any data quality issues and choose suitable strategies to deal with any issue (if exists). 
The dataset does not appear to have any major issues, but to be safe, the rows that do not contain an invoice number are dropped.

In [None]:
# Drop only the rows that have no invoice number. A bare dropna() would also discard
# every row with any other missing field, which is not the cleaning step described here.
# Reassigning instead of inplace=True avoids mutating the frame loaded earlier.
df = df.dropna(subset=['InvoiceNo'])
# Exclude invoices whose number contains 'C' (presumably cancellations -- confirm against
# the dataset description). Casting to str first keeps the check working even when the
# column holds plain integers, where the .str accessor would otherwise fail.
df = df[~df['InvoiceNo'].astype(str).str.contains('C')]

In this dataset, InvoiceNo is the order ID and StockCode identifies the individual products. To reorganize the dataset for basket analysis, orders are arranged as rows and products as columns.

In [None]:
# Reshape to one row per invoice and one column per product description. Each cell holds
# the summed quantity for that invoice/product pair; pairs that never occur are filled with 0.
df = (
    df.groupby(by=['InvoiceNo', 'Description'])['Quantity']
    .sum()
    .unstack(level='Description')
    .fillna(value=0)
)

In [None]:
def encoding(x):
    """Binarise a basket cell: return 0 when ``x <= 0``, otherwise 1."""
    return 0 if x <= 0 else 1

# Binarise every cell column by column (DataFrame.applymap is deprecated in recent pandas)
# and store the flags as booleans: mlxtend's apriori warns that non-boolean frames are
# slower and may stop being supported.
basket_final = df.apply(lambda column: column.map(encoding)).astype(bool)

In [None]:
# drop "POSTAGE" since it's not a real product.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell safe to re-run:
# a second run no longer raises KeyError once the column is already gone.
basket_final = basket_final.drop(columns='POSTAGE', errors='ignore')

basket_final.head()

### 8. Generate Rules using Apriori Algorithm 


In [None]:
# Mine the itemsets whose support is at least 0.02, labelled with the basket's column names.
frequent_itemsets = apriori(
    basket_final,
    min_support=0.02,
    use_colnames=True,
)

In [None]:
# Derive association rules from the frequent itemsets, using lift with a threshold of 1.
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
# sort_values returns a new frame; assign it back so that `rules` is actually ordered by
# lift for the cells that follow, instead of only displaying a sorted copy here.
rules = rules.sort_values('lift', ascending=False)
rules

In [None]:
# Show the first five rows of the rules table.
rules.head(n=5)

From the result above, I observe that:

1. **ALARM CLOCK BAKELIKE RED** and **ALARM CLOCK BAKELIKE GREEN** are purchased together.
2. **ALARM CLOCK BAKELIKE PINK** and **ALARM CLOCK BAKELIKE RED** are purchased together.
3. **DOLLY GIRL LUNCH BOX** and **SPACEBOY LUNCH BOX** are purchased together.
