In [65]:
import pandas as pd
from mlxtend.frequent_patterns import association_rules, apriori
import json

# Read dataset

In [17]:
# Load the sales-receipt line items from the local dataset folder.
sales_receipts = pd.read_csv(
    filepath_or_buffer="dataset/201904 sales reciepts.csv",
)

In [18]:
# Load the product catalogue that is joined onto the receipts later on.
products = pd.read_csv(
    filepath_or_buffer="dataset/product.csv",
)

# Data merge

In [19]:
# Narrow both frames down to the columns the rest of the notebook works with.
sales_receipts = sales_receipts.loc[
    :,
    ["transaction_id", "transaction_date", "customer_id", "product_id", "sales_outlet_id", "quantity"],
]
products = products.loc[:, ["product_id", "product", "product_category"]]

In [20]:
# Attach product name and category to every receipt line. A left join keeps
# all receipt rows, including any whose product_id has no catalogue match.
dataset = sales_receipts.merge(products, how="left", on="product_id")

In [None]:
# Preview the first rows of the merged frame.
dataset.head()

## Remove sizes

In [22]:
# List the distinct product names containing "Dark chocolate" to show how
# size variants appear in the data.
is_dark_chocolate = dataset["product"].str.contains("Dark chocolate")
dataset.loc[is_dark_chocolate, "product"].unique()

array(['Dark chocolate Lg', 'Dark chocolate Rg', 'Dark chocolate'],
      dtype=object)

In [23]:
# Number of distinct product names before the size markers are removed.
dataset["product"].nunique()

80

In [24]:
# Strip the size markers from the product names so that every size of a
# product shares one name. Each marker is removed as a literal substring,
# one pass per marker, in the same order as before.
for size_marker in (" Rg", " Lg", " Sm"):
    dataset["product"] = dataset["product"].str.replace(size_marker, "", regex=False)


In [25]:
# Inspect the result of the size removal: a preview of the rows plus the new
# number of distinct product names. A notebook cell renders only its last
# expression automatically, so the preview is displayed explicitly; otherwise
# its output is silently dropped.
display(dataset.head())
dataset["product"].nunique()

45

In [None]:
# Print every distinct product name so the subset below can be picked by hand.
distinct_products = dataset["product"].unique()
print(distinct_products)

## Choose product subset

In [27]:
# Hand-picked products the recommendation engines are restricted to.
# Names are the size-less product names and are matched exactly.
products_to_keep = [
    "Cappuccino",
    "Latte",
    "Espresso shot",
    "Dark chocolate",
    "Sugar Free Vanilla syrup",
    "Carmel syrup",
    "Chocolate syrup",
    "Hazelnut syrup",
    "Ginger Scone",
    "Chocolate Croissant",
    "Jumbo Savory Scone",
    "Cranberry Scone",
    "Hazelnut Biscotti",
    "Croissant",
    "Almond Croissant",
    "Oatmeal Scone",
    "Chocolate Chip Biscotti",
    "Ginger Biscotti",
]

In [34]:
# Keep only the rows whose product is in the hand-picked subset.
is_kept_product = dataset["product"].isin(products_to_keep)
dataset = dataset.loc[is_kept_product]

In [None]:
# Preview the first rows after restricting to the selected products.
dataset.head()

## Clean transactions

In [None]:
# Build a per-basket key by joining the transaction id and the customer id.
# `assign` returns a new frame instead of writing a column into `dataset`,
# which at this point is a filtered selection of the merged frame; writing
# into such a selection triggers pandas' SettingWithCopyWarning.
# NOTE(review): presumably transaction_id alone is not unique per basket,
# hence the composite key — confirm against the dataset description.
dataset = dataset.assign(
    transaction=dataset["transaction_id"].astype(str) + "_" + dataset["customer_id"].astype(str)
)

In [None]:
# Count the rows (line items) belonging to each transaction key. The output
# columns are named explicitly: "transaction" for the key, "count" for the rows.
num_of_items_per_transaction = (
    dataset["transaction"]
    .value_counts()
    .rename_axis("transaction")
    .reset_index(name="count")
)
num_of_items_per_transaction

In [38]:
# Transactions with more than one line item are the only ones that can
# contribute co-occurrence information.
has_multiple_items = num_of_items_per_transaction["count"] > 1
valid_transaction = num_of_items_per_transaction.loc[has_multiple_items, "transaction"].tolist()

In [39]:
# Drop every row that belongs to a single-item transaction.
in_valid_transaction = dataset["transaction"].isin(valid_transaction)
dataset = dataset.loc[in_valid_transaction]

In [40]:
# (rows, columns) of the cleaned dataset.
dataset.shape

(10189, 9)

In [None]:
# Row counts per product and per product category in the cleaned dataset.
# A notebook cell renders only its last expression automatically, so the
# per-product counts are displayed explicitly; otherwise they are dropped.
display(dataset["product"].value_counts())
dataset["product_category"].value_counts()

# Popularity recommendation engine

In [42]:
# For each (product, product_category) pair, count the non-null entries of
# every remaining column, then turn the group keys back into ordinary columns.
grouped_by_product = dataset.groupby(["product", "product_category"])
product_recommendation = grouped_by_product.count().reset_index()

In [None]:
# Preview the per-product counts.
product_recommendation.head()

In [44]:
# Keep the product identity plus one count column and give that column a
# descriptive name.
# NOTE(review): the value is the number of dataset rows with a transaction_id
# per product, not the number of distinct transactions — confirm that is the
# intended popularity measure.
product_recommendation = (
    product_recommendation
    .loc[:, ["product", "product_category", "transaction_id"]]
    .rename(columns={"transaction_id": "num_of_transactions"})
)

In [46]:
# Write the popularity table as CSV, leaving out the index column.
product_recommendation.to_csv(
    path_or_buf="api/recommendation_dataset/popularity_recommendation.csv",
    index=False,
)

# Apriori recommendation engine

In [47]:
# One row per (transaction, product) pair, with the number of line items of
# that product in that transaction stored in "count".
products_per_transaction = dataset.groupby(["transaction", "product"])["product"]
train_basket = products_per_transaction.count().reset_index(name="count")

In [50]:
# Reshape to one row per transaction and one column per product; a product
# that does not occur in a transaction gets 0.
basket_counts = train_basket.pivot_table(values="count", index="transaction", columns="product")
wide_basket = basket_counts.fillna(0)
wide_basket.head()

In [53]:
def encode_units(x):
    """Map a basket count to a presence flag.

    Returns 0 when ``x <= 0`` holds and 1 in every other case.
    """
    return 0 if x <= 0 else 1

# Turn the per-product counts into presence flags, then cast to bool:
# mlxtend's apriori expects a boolean one-hot frame and emits a deprecation
# warning (and runs slower) when handed a numeric 0/1 frame instead.
wide_basket = wide_basket.map(encode_units).astype(bool)

In [None]:
# Preview the encoded transaction-by-product matrix.
wide_basket.head()

In [55]:
# Mine the itemsets whose support is at least 0.05; use_colnames=True reports
# itemsets by product name rather than by column index.
frequent_items = apriori(
    wide_basket,
    use_colnames=True,
    min_support=0.05,
)

In [None]:
# Show the first 20 frequent itemsets.
frequent_items.head(20)

In [59]:
# Derive association rules from the frequent itemsets, keeping those whose
# lift reaches the threshold of 1.
rules_basket = association_rules(
    frequent_items,
    min_threshold=1,
    metric="lift",
)

In [None]:
# Preview the generated association rules.
rules_basket.head()

In [None]:
# Spot check: rules whose antecedent is exactly {"Latte"}, strongest
# confidence first.
latte_rules = rules_basket[rules_basket["antecedents"] == {"Latte"}]
latte_rules.sort_values(by="confidence", ascending=False)

## Save in JSON format

In [62]:
# Lookup table from product name to product category, built from the distinct
# (product, product_category) pairs present in the cleaned dataset.
product_category_pairs = dataset[["product", "product_category"]].drop_duplicates()
product_categories = product_category_pairs.set_index("product")["product_category"].to_dict()

In [63]:
# Build the recommendation lookup written to disk below:
#   {antecedent key: [{"product", "product_category", "confidence"}, ...]}
# Each list is ordered by descending rule confidence and holds every
# recommended product at most once, with the confidence of the strongest rule
# that recommends it.
recommendations_json = {}
antecedents = rules_basket["antecedents"].unique()
for antecedent in antecedents:
    # Antecedents are frozensets of product names. Their iteration order can
    # differ between interpreter runs (string hashing is randomised), so the
    # names are sorted to give multi-item antecedents a reproducible key.
    key = "_".join(sorted(antecedent))

    rules_for_antecedent = rules_basket[rules_basket["antecedents"] == antecedent]
    rules_for_antecedent = rules_for_antecedent.sort_values("confidence", ascending=False)

    recommendations = []
    seen_products = set()  # products already recommended under this key
    for consequents, confidence in zip(
        rules_for_antecedent["consequents"], rules_for_antecedent["confidence"]
    ):
        # Sorted for the same reproducibility reason as the key above.
        for product in sorted(consequents):
            if product in seen_products:
                continue
            seen_products.add(product)
            recommendations.append(
                {
                    "product": product,
                    "product_category": product_categories[product],
                    # Plain Python float so the value serialises to JSON as-is.
                    "confidence": float(confidence),
                }
            )
    recommendations_json[key] = recommendations



In [None]:
# Display the assembled recommendation mapping for a visual check.
recommendations_json

In [66]:
# Serialise the recommendation mapping to a JSON file next to the popularity
# table.
recommendations_path = "api/recommendation_dataset/apriori_recommendations.json"
with open(recommendations_path, mode="w") as json_file:
    json.dump(obj=recommendations_json, fp=json_file)