# Treinamento do Mecanismo de Recomendação

*Dataset: https://www.kaggle.com/datasets/ylchang/coffee-shop-sample-data-1113*

In [94]:
# Bibliotecas
import pandas as pd
from mlxtend.frequent_patterns import association_rules, apriori

# Leitura do dataset

In [95]:
# Sales receipts
sales_reciepts = pd.read_csv(filepath_or_buffer="dataset/201904 sales reciepts.csv")
sales_reciepts.head(n=5)  # preview the first five rows

Unnamed: 0,transaction_id,transaction_date,transaction_time,sales_outlet_id,staff_id,customer_id,instore_yn,order,line_item_id,product_id,quantity,line_item_amount,unit_price,promo_item_yn
0,7,2019-04-01,12:04:43,3,12,558,N,1,1,52,1,2.5,2.5,N
1,11,2019-04-01,15:54:39,3,17,781,N,1,1,27,2,7.0,3.5,N
2,19,2019-04-01,14:34:59,3,17,788,Y,1,1,46,2,5.0,2.5,N
3,32,2019-04-01,16:06:04,3,12,683,N,1,1,23,2,5.0,2.5,N
4,33,2019-04-01,19:18:37,3,17,99,Y,1,1,34,1,2.45,2.45,N


In [96]:
# Products
product = pd.read_csv(filepath_or_buffer="dataset/product.csv")
product.head(n=5)  # preview the first five rows

Unnamed: 0,product_id,product_group,product_category,product_type,product,product_description,unit_of_measure,current_wholesale_price,current_retail_price,tax_exempt_yn,promo_yn,new_product_yn
0,1,Whole Bean/Teas,Coffee beans,Organic Beans,Brazilian - Organic,It's like Carnival in a cup. Clean and smooth.,12 oz,14.4,$18.00,Y,N,N
1,2,Whole Bean/Teas,Coffee beans,House blend Beans,Our Old Time Diner Blend,Out packed blend of beans that is reminiscent ...,12 oz,14.4,$18.00,Y,N,N
2,3,Whole Bean/Teas,Coffee beans,Espresso Beans,Espresso Roast,Our house blend for a good espresso shot.,1 lb,11.8,$14.75,Y,N,N
3,4,Whole Bean/Teas,Coffee beans,Espresso Beans,Primo Espresso Roast,Our primium single source of hand roasted beans.,1 lb,16.36,$20.45,Y,N,N
4,5,Whole Bean/Teas,Coffee beans,Gourmet Beans,Columbian Medium Roast,A smooth cup of coffee any time of day.,1 lb,12.0,$15.00,Y,N,N


# Manipulação dos dados

## Filtragem dos dados
Filtrar as colunas necessárias nos dataframes.

In [97]:
# Keep only the needed receipt columns
sales_reciepts = sales_reciepts.loc[
    :,
    [
        "transaction_id",
        "transaction_date",
        "sales_outlet_id",
        "customer_id",
        "product_id",
        "quantity",
    ],
]
sales_reciepts.head(n=5)

Unnamed: 0,transaction_id,transaction_date,sales_outlet_id,customer_id,product_id,quantity
0,7,2019-04-01,3,558,52,1
1,11,2019-04-01,3,781,27,2
2,19,2019-04-01,3,788,46,2
3,32,2019-04-01,3,683,23,2
4,33,2019-04-01,3,99,34,1


In [98]:
# Keep only the needed product columns
product = product.loc[:, ["product_id", "product_category", "product"]]
product.head(n=5)

Unnamed: 0,product_id,product_category,product
0,1,Coffee beans,Brazilian - Organic
1,2,Coffee beans,Our Old Time Diner Blend
2,3,Coffee beans,Espresso Roast
3,4,Coffee beans,Primo Espresso Roast
4,5,Coffee beans,Columbian Medium Roast


## Merge Data
Mescla os dados dos dataframes.

In [99]:
# Merge the two dataframes: every receipt line is kept (LEFT JOIN) and gets
# the product columns that share its product_id
dataset = sales_reciepts.merge(
    right=product,
    how="left",
    on="product_id",
)
dataset.head(n=5)

Unnamed: 0,transaction_id,transaction_date,sales_outlet_id,customer_id,product_id,quantity,product_category,product
0,7,2019-04-01,3,558,52,1,Tea,Traditional Blend Chai Rg
1,11,2019-04-01,3,781,27,2,Coffee,Brazilian Lg
2,19,2019-04-01,3,788,46,2,Tea,Serenity Green Tea Rg
3,32,2019-04-01,3,683,23,2,Coffee,Our Old Time Diner Blend Rg
4,33,2019-04-01,3,99,34,1,Coffee,Jamaican Coffee River Sm


## Remover tamanhos
Alguns produtos possuem seus tamanhos no nome, como "Lg", "Rg" e "Sm".

In [100]:
# List every name variant of "Dark chocolate" to see which size markers
# exist (Lg, Rg, and the bare name)
dataset.loc[dataset["product"].str.contains("Dark chocolate"), "product"].unique()

array(['Dark chocolate Lg', 'Dark chocolate Rg', 'Dark chocolate'],
      dtype=object)

In [101]:
# Number of distinct product names (size variations included)
dataset["product"].nunique(dropna=True)

80

In [102]:
# Distinct product names, size variations included
print(dataset.loc[:, "product"].unique())

['Traditional Blend Chai Rg' 'Brazilian Lg' 'Serenity Green Tea Rg'
 'Our Old Time Diner Blend Rg' 'Jamaican Coffee River Sm' 'Ethiopia Rg'
 'English Breakfast Lg' 'Sustainably Grown Organic Rg' 'Earl Grey Lg'
 'Jamaican Coffee River Rg' 'Serenity Green Tea Lg' 'Brazilian Sm'
 'English Breakfast Rg' 'Traditional Blend Chai Lg' 'Cappuccino'
 'Espresso shot' 'Cappuccino Lg' 'Latte' 'Earl Grey Rg'
 'Dark chocolate Lg' 'Columbian Medium Roast Sm' 'Oatmeal Scone'
 'Morning Sunrise Chai Lg' 'Morning Sunrise Chai Rg' 'Peppermint Lg'
 'Jumbo Savory Scone' 'Lemon Grass Lg' 'Sustainably Grown Organic Lg'
 'Dark chocolate Rg' 'Lemon Grass Rg' 'Ethiopia Sm' 'Latte Rg'
 'Our Old Time Diner Blend Sm' 'Chocolate Chip Biscotti'
 'Columbian Medium Roast Rg' 'Ethiopia Lg' 'Brazilian Rg'
 'Columbian Medium Roast Lg' 'Spicy Eye Opener Chai Rg' 'Ginger Biscotti'
 'Our Old Time Diner Blend Lg' 'Chocolate Croissant'
 'Jamaican Coffee River Lg' 'Hazelnut Biscotti' 'Spicy Eye Opener Chai Lg'
 'Cranberry Scone'

In [103]:
# Remove the size marker from the product name so that every size of a
# product collapses into a single item.

sizes = ["Rg", "Sm", "Lg"]  # size tokens; must be plain alphanumeric strings

# Match a size token only as a whole word preceded by a space. A plain
# substring replace of " Sm" would also mangle names such as "Latte Small"
# or "Berry Smoothie". regex=True is explicit because the default of
# Series.str.replace changed from regex to literal matching in pandas 2.0.
size_pattern = rf" (?:{'|'.join(sizes)})\b"

dataset["product"] = dataset["product"].str.replace(
    size_pattern, "", regex=True  # drop the size token in a single pass
)

In [104]:
# Number of distinct product names (size variations no longer counted)
dataset["product"].nunique(dropna=True)

45

In [105]:
# Distinct product names, without size variations
print(dataset.loc[:, "product"].unique())

['Traditional Blend Chai' 'Brazilian' 'Serenity Green Tea'
 'Our Old Time Diner Blend' 'Jamaican Coffee River' 'Ethiopia'
 'English Breakfast' 'Sustainably Grown Organic' 'Earl Grey' 'Cappuccino'
 'Espresso shot' 'Latte' 'Dark chocolate' 'Columbian Medium Roast'
 'Oatmeal Scone' 'Morning Sunrise Chai' 'Peppermint' 'Jumbo Savory Scone'
 'Lemon Grass' 'Chocolate Chip Biscotti' 'Spicy Eye Opener Chai'
 'Ginger Biscotti' 'Chocolate Croissant' 'Hazelnut Biscotti'
 'Cranberry Scone' 'Scottish Cream Scone ' 'Croissant' 'Almond Croissant'
 'Ginger Scone' 'Ouro Brasileiro shot' 'Organic Decaf Blend'
 'Chocolate syrup' 'Hazelnut syrup' 'Carmel syrup'
 'Sugar Free Vanilla syrup' 'Jamacian Coffee River'
 'Guatemalan Sustainably Grown' 'Civet Cat' 'Chili Mayan'
 'Primo Espresso Roast' 'Brazilian - Organic' 'I Need My Bean! Diner mug'
 'Espresso Roast' 'I Need My Bean! T-shirt' 'I Need My Bean! Latte cup']


## Filtrar produtos que serão realmente vendidos

In [106]:
# Products that will be sold
products_to_take = [
    "Cappuccino",
    "Latte",
    "Espresso shot",
    "Dark chocolate",
    "Sugar Free Vanilla syrup",
    "Chocolate syrup",
    "Carmel syrup",
    "Hazelnut syrup",
    "Ginger Scone",
    "Chocolate Croissant",
    "Jumbo Savory Scone",
    "Cranberry Scone",
    "Hazelnut Biscotti",
    "Croissant",
    "Almond Croissant",
    "Oatmeal Scone",
    "Chocolate Chip Biscotti",
    "Ginger Biscotti",
]

In [107]:
dataset.size # Size before filtering; DataFrame.size counts cells (rows x columns), not rows

399152

In [108]:
# Filter the products: keep only the rows whose product is in products_to_take
dataset = dataset.loc[dataset["product"].isin(products_to_take)]
dataset.head(n=5)

Unnamed: 0,transaction_id,transaction_date,sales_outlet_id,customer_id,product_id,quantity,product_category,product
16,108,2019-04-01,3,65,40,1,Coffee,Cappuccino
17,112,2019-04-01,3,90,37,2,Coffee,Espresso shot
20,127,2019-04-01,3,116,41,2,Coffee,Cappuccino
21,134,2019-04-01,3,189,38,2,Coffee,Latte
22,135,2019-04-01,3,131,40,1,Coffee,Cappuccino


In [109]:
dataset.size  # Size after filtering; DataFrame.size counts cells (rows x columns), not rows

127432

## Listar Categorias

In [121]:
# List each product with its category: one row per distinct
# (product, product_category) pair, with the index reset to 0..n-1
products_categories = (
    dataset.loc[:, ["product", "product_category"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

display(products_categories)

Unnamed: 0,product,product_category
0,Cappuccino,Coffee
1,Espresso shot,Coffee
2,Latte,Coffee
3,Dark chocolate,Drinking Chocolate
4,Oatmeal Scone,Bakery
5,Jumbo Savory Scone,Bakery
6,Chocolate Chip Biscotti,Bakery
7,Ginger Biscotti,Bakery
8,Chocolate Croissant,Bakery
9,Hazelnut Biscotti,Bakery
