# Read the data😂

In [229]:
import pandas as pd
import numpy as np

In [230]:
data = pd.read_csv('./data/BreadBasket_DMS.csv')
data.head()

Unnamed: 0,Date,Time,Transaction,Item
0,2016-10-30,09:58:11,1,Bread
1,2016-10-30,10:05:34,2,Scandinavian
2,2016-10-30,10:05:34,2,Scandinavian
3,2016-10-30,10:07:57,3,Hot chocolate
4,2016-10-30,10:07:57,3,Jam


In [231]:
data.shape

(21293, 4)

# Clead the data😥

In [232]:
len(data[data["Item"] == "NONE"])

786

In [233]:
data['Item'] = data['Item'].replace("NONE",np.nan).dropna()

In [234]:
data = data.dropna()
data.shape

(20507, 4)

# Extract Hour & insert into column😀

In [235]:
import re
from pandas import Series

pattern_hour = '[0-9][0-9]'

hourlist = []
for item in data['Time']:
    hour = re.findall(pattern_hour,item)[0]
    hourlist.append(hour)
#hourlist

In [236]:
data.insert(2,'Hour',Series(hourlist))

In [237]:
data.head()

Unnamed: 0,Date,Time,Hour,Transaction,Item
0,2016-10-30,09:58:11,9,1,Bread
1,2016-10-30,10:05:34,10,2,Scandinavian
2,2016-10-30,10:05:34,10,2,Scandinavian
3,2016-10-30,10:07:57,10,3,Hot chocolate
4,2016-10-30,10:07:57,10,3,Jam


---

# Analysis methods😀

- Busy Hours Analysis
- Popular Items Analysis
- Transaction Analysis

---

# Busy Hours Analysis

In [238]:
hour_counts = data['Hour'].value_counts().sort_index()

## Busy Hour Bar Chart

In [296]:
from pyecharts import Bar

attr = list(map(int, hour_counts.index))
v =  hour_counts.values

bar = Bar("Busy Hour")
bar.add("", attr, v, mark_line=["max"],is_label_show=True,is_more_utils=True)

bar.render('./charts/Busy-Hour.html')
bar

- Busy Hours start at **10a.m.** ,end at **14p.m.**

---

# Popular Items Analysis

### TOP 10

In [240]:
item_counts = data['Item'].value_counts()

In [241]:
hot_items = item_counts[:10]
hot_items

Coffee           5471
Bread            3325
Tea              1435
Cake             1025
Pastry            856
Sandwich          771
Medialuna         616
Hot chocolate     590
Cookies           540
Brownie           379
Name: Item, dtype: int64

### Others

In [242]:
other_items_count = data.Item.count() - hot_items.sum()
item_list = hot_items.append(pd.Series([other_items_count], index=["Others"]))

In [243]:
item_list

Coffee           5471
Bread            3325
Tea              1435
Cake             1025
Pastry            856
Sandwich          771
Medialuna         616
Hot chocolate     590
Cookies           540
Brownie           379
Others           5499
dtype: int64

## Popular Items Pie Chart

In [295]:
from pyecharts import Pie, Grid

attr_items = item_list.index
count_items = item_list.values

pie1 = Pie("Popular Items",title_pos='center')
pie1.add("",attr_items,count_items,
         is_label_show=True,legend_orient="vertical",legend_pos="left")

pie1.render('./charts/Popular-Items.html')
pie1

- Apperently, **Coffee** is the most popular and the **bread** is second.

---

# Transaction Combination Analysis

In [245]:
coffee_transaction_list = data[data['Item'] == "Coffee"]["Transaction"].tolist()

In [246]:
# make a copy
data_copy = data.copy()

In [247]:
data_copy = data_copy[data_copy['Transaction'].isin(coffee_transaction_list)]

In [248]:
data_copy.head()

Unnamed: 0,Date,Time,Hour,Transaction,Item
7,2016-10-30,10:13:03,10,5,Coffee
8,2016-10-30,10:13:03,10,5,Pastry
9,2016-10-30,10:13:03,10,5,Bread
13,2016-10-30,10:19:12,10,7,Medialuna
14,2016-10-30,10:19:12,10,7,Pastry


### with-coffee-item list

In [249]:
with_coffee_items = data_copy.Item.value_counts()[1:10]
with_coffee_items

Bread            923
Cake             540
Tea              482
Pastry           474
Sandwich         421
Medialuna        345
Hot chocolate    293
Cookies          283
Toast            224
Name: Item, dtype: int64

### hot-item list

In [250]:
#item_list[1:10]

In [251]:
hot_items = item_list.drop(labels=["Coffee"])[:9]
hot_items

Bread            3325
Tea              1435
Cake             1025
Pastry            856
Sandwich          771
Medialuna         616
Hot chocolate     590
Cookies           540
Brownie           379
dtype: int64

In [252]:
values = hot_items.tolist()
values

[3325, 1435, 1025, 856, 771, 616, 590, 540, 379]

In [253]:
values_coffee_combine = with_coffee_items.tolist()
values_coffee_combine

[923, 540, 482, 474, 421, 345, 293, 283, 224]

In [254]:
values_without_coffee = [values[i]-v for i,v in enumerate(values_coffee_combine)]
values_without_coffee

[2402, 895, 543, 382, 350, 271, 297, 257, 155]

In [255]:
from pandas import DataFrame
coffee_compare = pd.DataFrame({'with_coffee':values_coffee_combine, 'without_coffee':values_without_coffee})
coffee_compare.index = with_coffee_items.index

In [256]:
coffee_compare

Unnamed: 0,with_coffee,without_coffee
Bread,923,2402
Cake,540,895
Tea,482,543
Pastry,474,382
Sandwich,421,350
Medialuna,345,271
Hot chocolate,293,297
Cookies,283,257
Toast,224,155


## Coffee Combination Line Chart

In [294]:
from pyecharts import Line

x1 = coffee_compare.index
y1 =  (coffee_compare['with_coffee'].values/coffee_compare['without_coffee'].values).round(2)

x2 = coffee_compare.index
y2 = coffee_compare['without_coffee'].values

line = Line("Coffee Combination")
line.add("with coffee", x1, y1,mark_point=["max"],is_smooth=True,xaxis_interval=0, xaxis_rotate=30, yaxis_rotate=30)
#line.add("without coffee", x2, y2,mark_point=["max"],is_smooth=True)

line.render('./charts/Coffee-Combination.html')
line

- Clearly, customers are most likely to buy **toast** with **coffee**
- Customers don't like **bread**/**Cake** plus **coffee** combine

---

# Predict Coffee Sales

In [273]:
coffee_data = data_copy[data_copy['Item']=='Coffee']
coffee_data.head()

Unnamed: 0,Date,Time,Hour,Transaction,Item
7,2016-10-30,10:13:03,10,5,Coffee
15,2016-10-30,10:19:12,10,7,Coffee
28,2016-10-30,10:30:14,10,12,Coffee
34,2016-10-30,10:31:24,10,13,Coffee
44,2016-10-30,10:37:08,10,16,Coffee


In [274]:
grouped = coffee_data['Item'].groupby(coffee_data['Date'])
pd_coffee = DataFrame(grouped.count())

In [275]:
pd_coffee.columns = ['Sales']

In [293]:
pd_coffee.head()

Unnamed: 0_level_0,Sales
Date,Unnamed: 1_level_1
2016-10-30,33
2016-10-31,60
2016-11-01,38
2016-11-02,42
2016-11-03,40


## Coffee-Sales Line Chart

In [297]:
from pyecharts import Line

date = pd_coffee.index
sales =  pd_coffee.values


line = Line("Coffee-Sales")
line.add("", date, sales,mark_point=["max"],mark_line=["average"],xaxis_interval=7, xaxis_rotate=30, yaxis_rotate=30)

line.render('./charts/Coffee-Sales.html')
line