In [195]:
source("../Code/importa_dados.R")

## Ativação dos pacotes necessários:

In [196]:
require(tidyverse, quietly = T)
require(arules)
require(reticulate)
use_python("/opt/anaconda3/bin/python3")

## Clientes

In [198]:
glimpse(customers)

Observations: 99,441
Variables: 5
$ customer_id              [3m[90m<chr>[39m[23m "06b8999e2fba1a1fbc88172c00ba8bc7", "18955e8…
$ customer_unique_id       [3m[90m<chr>[39m[23m "861eff4711a542e4b93843c6dd7febb0", "290c77b…
$ customer_zip_code_prefix [3m[90m<int>[39m[23m 14409, 9790, 1151, 8775, 13056, 89254, 4534,…
$ customer_city            [3m[90m<chr>[39m[23m "franca", "sao bernardo do campo", "sao paul…
$ customer_state           [3m[90m<chr>[39m[23m "SP", "SP", "SP", "SP", "SP", "SC", "SP", "M…


In [200]:
customers %>% 
  count(customer_unique_id, sort = T) %>% 
  filter(n > 1) %>% 
  head()

customer_unique_id,n
<chr>,<int>
8d50f5eadf50201ccdcedfb9e2ac8455,17
3e43e6105506432c953e165fb2acf44c,9
1b6c7548a2a1f9037c1fd3ddfed95f33,7
6469f99c1f9dfae7733b25662e7f1782,7
ca77025e7201e3b30c44b472ff346268,7
12f5d6e1cbf93dafd9dcc19095df0b3d,6


In [201]:
customers %>% 
  count(customer_state, sort = T) %>% 
  head()

customer_state,n
<chr>,<int>
SP,41746
RJ,12852
MG,11635
RS,5466
PR,5045
SC,3637


## Produtos

In [202]:
glimpse(products)

Observations: 32,951
Variables: 9
$ product_id                 [3m[90m<chr>[39m[23m "1e9e8ef04dbcff4541ed26657ea517e5", "3aa07…
$ product_category_name      [3m[90m<chr>[39m[23m "perfumaria", "artes", "esporte_lazer", "b…
$ product_name_lenght        [3m[90m<int>[39m[23m 40, 44, 46, 27, 37, 60, 56, 56, 57, 36, 54…
$ product_description_lenght [3m[90m<int>[39m[23m 287, 276, 250, 261, 402, 745, 1272, 184, 1…
$ product_photos_qty         [3m[90m<int>[39m[23m 1, 1, 1, 1, 4, 1, 4, 2, 1, 1, 1, 4, 3, 2, …
$ product_weight_g           [3m[90m<int>[39m[23m 225, 1000, 154, 371, 625, 200, 18350, 900,…
$ product_length_cm          [3m[90m<int>[39m[23m 16, 30, 18, 26, 20, 38, 70, 40, 27, 17, 16…
$ product_height_cm          [3m[90m<int>[39m[23m 10, 18, 9, 4, 17, 5, 24, 8, 13, 10, 10, 19…
$ product_width_cm           [3m[90m<int>[39m[23m 14, 20, 15, 26, 13, 11, 44, 40, 17, 12, 16…


In [203]:
products %>% 
  count(product_category_name, sort = T) %>% 
  head()

product_category_name,n
<chr>,<int>
cama_mesa_banho,3029
esporte_lazer,2867
moveis_decoracao,2657
beleza_saude,2444
utilidades_domesticas,2335
automotivo,1900


## Pedidos

In [204]:
glimpse(orders)

Observations: 99,441
Variables: 8
$ order_id                      [3m[90m<chr>[39m[23m "e481f51cbdc54678b7cc49136f2d6af7", "53…
$ customer_id                   [3m[90m<chr>[39m[23m "9ef432eb6251297304e76186b10a928d", "b0…
$ order_status                  [3m[90m<chr>[39m[23m "delivered", "delivered", "delivered", …
$ order_purchase_timestamp      [3m[90m<chr>[39m[23m "2017-10-02 10:56:33", "2018-07-24 20:4…
$ order_approved_at             [3m[90m<chr>[39m[23m "2017-10-02 11:07:15", "2018-07-26 03:2…
$ order_delivered_carrier_date  [3m[90m<chr>[39m[23m "2017-10-04 19:55:00", "2018-07-26 14:3…
$ order_delivered_customer_date [3m[90m<chr>[39m[23m "2017-10-10 21:25:13", "2018-08-07 15:2…
$ order_estimated_delivery_date [3m[90m<chr>[39m[23m "2017-10-18 00:00:00", "2018-08-13 00:0…


In [205]:
orders %>% 
  count(order_status, sort = T)

order_status,n
<chr>,<int>
delivered,96478
shipped,1107
canceled,625
unavailable,609
invoiced,314
processing,301
created,5
approved,2


In [212]:
tempo_medio <- orders %>% 
  mutate_at(vars(4:8), lubridate::as_datetime) %>% 
  mutate(tempo_aprovacao_entrega = difftime(order_delivered_customer_date, order_approved_at, units = "days") %>% 
           as.numeric() %>% 
           floor(),
         tempo_estimado_realizado = difftime(order_estimated_delivery_date, order_delivered_customer_date, units = "days") %>% 
           as.numeric() %>% 
           floor()) %>%  
  summarise(Tempo_medio_aprovacao_entrega = mean(tempo_aprovacao_entrega, na.rm = T),
            Tempo_medio_estimado_realizado = mean(tempo_estimado_realizado, na.rm = T)) %>%
gather(key = 'chave', value = 'valor') %>%
    as.data.frame()

In [213]:
tempo_medio %>% head()

chave,valor
<chr>,<dbl>
Tempo_medio_aprovacao_entrega,11.64297
Tempo_medio_estimado_realizado,10.87688


In [210]:
plt <- import('matplotlib.pyplot')

In [231]:
graf = plt$bar(tempo_medio$chave, tempo_medio$valor)
plt$savefig('tempo_medio.png')

## Análise de cesta

In [178]:
orders = r_to_py(orders)
#items = r_to_py(items)
products = r_to_py(products)



In [179]:
class(orders)

In [180]:
class(items)

##### Importando pacote do python para ajudar no processo de análise

In [181]:
pd <- import('pandas')

In [182]:
orders = pd$merge(orders, items, left_on = "order_id", right_on = "order_id", how = "left")

In [183]:
orders = pd$merge(orders, products, left_on = "product_id", right_on = "product_id", how = "left")

In [184]:
orders$product_category_name <- as.character(orders$product_category_name)

In [185]:
orders <- orders %>% 
    distinct(order_id, product_category_name) %>%
    group_by(order_id) %>%
    filter(n() > 1, product_category_name != "")

In [186]:
# Transformando em lista por transações, para realizar basket analysis

lista_transacoes <- split(orders[["product_category_name"]], orders[["order_id"]])

# require(arules)

lista_transacoes <- lista_transacoes %>% 
    as("transactions")

# Minerando regras de associação 

regras <- apriori(lista_transacoes, parameter = list(support = 0.0005, confidence = 0.025,
                                          minlen = 2, maxlen = 2))



regras

Apriori

Parameter specification:
 confidence minval smax arem  aval originalSupport maxtime support minlen
      0.025    0.1    1 none FALSE            TRUE       5   5e-04      2
 maxlen target   ext
      2  rules FALSE

Algorithmic control:
 filter tree heap memopt load sort verbose
    0.1 TRUE TRUE  FALSE TRUE    2    TRUE

Absolute minimum support count: 0 

set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[63 item(s), 786 transaction(s)] done [0.00s].
sorting and recoding items ... [63 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2

“Mining stopped (maxlen reached). Only patterns up to a length of 2 returned!”

 done [0.00s].
writing ... [354 rule(s)] done [0.00s].
creating S4 object  ... done [0.00s].


set of 354 rules 

In [187]:
redundant <- which(colSums(is.subset(regras, regras)) > 1)
regras <- regras[-redundant]

In [188]:
regras

set of 110 rules 

In [189]:

# 
basket <- regras %>% 
  as("data.frame")

basket <- basket %>% 
  tidyr::separate(rules, into = c("Antecedente", "Consequente"), sep = "=>") %>% 
  mutate_at(vars(1:2), function(x){stringr::str_remove_all(x, "\\{") %>% 
      stringr::str_remove_all("\\}")}) %>% 
  arrange(-lift, -support, -confidence) %>%
  as_tibble()

basket  %>% 
  head()


Antecedente,Consequente,support,confidence,lift,count
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
pc_gamer,cool_stuff,0.001272265,1.0,11.731343,1
fashion_calcados,bebes,0.002544529,1.0,8.451613,2
sinalizacao_e_seguranca,bebes,0.001272265,1.0,8.451613,1
artes_e_artesanato,brinquedos,0.001272265,0.5,7.705882,1
climatizacao,informatica_acessorios,0.001272265,0.5,7.557692,1
dvds_blu_ray,informatica_acessorios,0.001272265,0.5,7.557692,1


##### Agradecemos a todos e esperamos que o curso ajude nos seus trabalhos com R e python

##### Contatos

- Bruno Lucian:
    - [Github](https://github.com/brunolucian)
    - [LinkedIn](https://www.linkedin.com/in/bruno-lucian/)
- Leonardo Filgueira:
    - [Github](https://github.com/leo-filgueira)
    - [LinkedIn](https://www.linkedin.com/in/leonardo-filgueira-b1815b163/)