In [1]:
import pandas as pd

from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')
# Make the project folder the working directory so that later relative paths
# (e.g. "transactions_cleaned.pkl") resolve against it.
%cd /content/drive/MyDrive/SMU_MITB_NLP/project/

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/SMU_MITB_NLP/project


In [2]:
# Load the previously cleaned transactions table from the working directory.
# NOTE: unpickling can execute arbitrary code, so only load a pickle file
# that you produced yourself or otherwise trust.
transactions_cleaned = pd.read_pickle("transactions_cleaned.pkl")

### Cleaning up "Unknown" product_type_name ###
# Drop rows whose product type is the placeholder value 'Unknown'.
transactions_cleaned = transactions_cleaned[transactions_cleaned['product_type_name'] != 'Unknown']

# Preview the result. This must be the cell's final expression: a bare
# expression in the middle of a cell is evaluated but never displayed.
transactions_cleaned.head()

In [4]:
### Keep only July, August and September 2020 transactions because of limited computing resources ###
purchase_dates = transactions_cleaned['t_dat'].dt
in_2020 = purchase_dates.year == 2020
in_jul_to_sep = purchase_dates.month.isin([7, 8, 9])
transactions_cleaned_limited = transactions_cleaned[in_2020 & in_jul_to_sep]

### Build a list of lists: one list of purchased product types per customer ###
transactions_list = (
    transactions_cleaned_limited
    .groupby('customer_id')['product_type_name']
    .apply(list)
    .values
    .tolist()
)

In [6]:
### Removing repeated product_type_name values within each customer's list ###
# Each list is turned into a set (dropping duplicates) and back into a list.
unique_transactions = list(map(list, map(set, transactions_list)))

# Frequent Pattern Mining

In [8]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [None]:
# One-hot encode the customer lists with TransactionEncoder:
# fit on the transactions first, then transform the same transactions.
te = TransactionEncoder()
te.fit(unique_transactions)
te_ary = te.transform(unique_transactions)

# Wrap the encoded array in a DataFrame whose columns are the encoder's column labels
df = pd.DataFrame(data=te_ary, columns=te.columns_)

In [18]:
# Mining thresholds, named so they can be found and tuned in one place
# instead of being buried as magic numbers in the calls below.
MIN_SUPPORT = 0.05      # minimum support passed to apriori
MIN_CONFIDENCE = 0.25   # minimum confidence passed to association_rules

# Frequent itemsets (use_colnames=True reports item names rather than column indices)
frequent_itemsets = apriori(df, min_support=MIN_SUPPORT, use_colnames=True)

# Association rules, filtered on the confidence metric
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=MIN_CONFIDENCE)

In [22]:
# Rank itemsets by support (highest first), then keep only those with more than one item.
# The sort is done before the filter, in that order.
frequent_itemsets = frequent_itemsets.sort_values('support', ascending=False)
has_multiple_items = frequent_itemsets['itemsets'].apply(lambda itemset: len(itemset) > 1)
frequent_itemsets = frequent_itemsets[has_multiple_items]

# Print frequent itemsets (heading and table, one per line)
print("Frequent Itemsets:", frequent_itemsets, sep="\n")

Frequent Itemsets:
     support                       itemsets
34  0.122378              (Trousers, Dress)
44  0.115995            (T-shirt, Trousers)
42  0.105433            (Trousers, Sweater)
46  0.103287                (Trousers, Top)
20  0.091822  (Bikini top, Swimwear bottom)
32  0.089486               (T-shirt, Dress)
33  0.089324                   (Dress, Top)
43  0.083086                 (T-shirt, Top)
24  0.081956             (Trousers, Blouse)
21  0.080015                (Dress, Blouse)
48  0.074883           (Trousers, Vest top)
40  0.074494             (T-shirt, Sweater)
31  0.072944               (Dress, Sweater)
41  0.072266                 (Sweater, Top)
45  0.069543            (T-shirt, Vest top)
35  0.067664              (Dress, Vest top)
47  0.063950                (Vest top, Top)
23  0.063227                  (Top, Blouse)
36  0.061980              (Trousers, Shirt)
27  0.061385                (Trousers, Bra)
28  0.060787        (Underwear bottom, Bra)
30  0.060582 

In [27]:
# frequent_itemsets.to_pickle('frequent_itemsets.pkl')
# frequent_itemsets.to_excel('frequent_itemsets.xlsx')

In [23]:
# Order the rules by their 'support' column, highest first
rules = rules.sort_values('support', ascending=False)

# Print association rules (heading and table, one per line)
print("Association Rules:", rules, sep="\n")

Association Rules:
            antecedents         consequents  antecedent support  \
21              (Dress)          (Trousers)            0.297836   
20           (Trousers)             (Dress)            0.355470   
36           (Trousers)           (T-shirt)            0.355470   
35            (T-shirt)          (Trousers)            0.253765   
32            (Sweater)          (Trousers)            0.205147   
31           (Trousers)           (Sweater)            0.355470   
40                (Top)          (Trousers)            0.220443   
39           (Trousers)               (Top)            0.355470   
1     (Swimwear bottom)        (Bikini top)            0.114977   
0          (Bikini top)   (Swimwear bottom)            0.117891   
16            (T-shirt)             (Dress)            0.253765   
17              (Dress)           (T-shirt)            0.297836   
18              (Dress)               (Top)            0.297836   
19                (Top)             (Dress)

In [28]:
# rules.to_pickle('rules.pkl')
# rules.to_excel('rules.xlsx')