# Intro
Initial exploration of the dataset, checking integrity

# Imports

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

ModuleNotFoundError: No module named 'pandas'

# Dataset

In [2]:
sales = (
    pd.read_csv("../data/sales_train.csv", parse_dates=["date"])
    .merge(pd.read_csv("../data/items_en.csv"), on="item_id")
    .merge(pd.read_csv("../data/item_categories_en.csv"), on="item_category_id")
)

NameError: name 'pd' is not defined

In [10]:
sales.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_name,item_category_id,item_category_name
0,2013-02-01,0,59,22154,999.0,1.0,Scene 2012 (BD),37,Movies - Blu-Ray
1,2013-01-23,0,24,22154,999.0,1.0,Scene 2012 (BD),37,Movies - Blu-Ray
2,2013-01-20,0,27,22154,999.0,1.0,Scene 2012 (BD),37,Movies - Blu-Ray
3,2013-02-01,0,25,22154,999.0,1.0,Scene 2012 (BD),37,Movies - Blu-Ray
4,2013-03-01,0,25,22154,999.0,1.0,Scene 2012 (BD),37,Movies - Blu-Ray


In [11]:
sales.isnull().sum()

date                  0
date_block_num        0
shop_id               0
item_id               0
item_price            0
item_cnt_day          0
item_name             0
item_category_id      0
item_category_name    0
dtype: int64

## Orders per date
Each row of the dataset is an order

In [28]:
px.line(sales.date.value_counts().sort_index().to_frame("orders"))

### Date block num
The dataset contains a column called `date_block_num`, which supposedly means:

*a consecutive month number, used for convenience. January 2013 is 0, February 2013 is 1,..., October 2015 is 33*

However, this seems to be misleading

In [29]:
px.line(
    sales.rename_axis("order_id")
    .reset_index()
    .groupby(["date", "date_block_num"])
    .order_id.size()
    .rename("orders")
    .reset_index(),
    x="date",
    y="orders",
    color="date_block_num",
)

#### Orders with date_block_num == 0

In [33]:
px.line(
    sales[sales.date_block_num.eq(0)]
    .date.value_counts()
    .sort_index()
    .to_frame("orders")
)

## Shop ID
Curiously, 0 is a valid shop id.
All values from 0 to 59 seem to be present in the dataset

In [46]:
sales.shop_id.drop_duplicates().sort_values().unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59])

In [41]:
sales.shop_id.nunique(), sales.shop_id.max()

(60, 59)

### Shops timelines
Some shops are born and die at different times

In [62]:
px.timeline(
    sales.groupby("shop_id").date.agg(["min", "max"]).reset_index(),
    x_start="min",
    x_end="max",
    y="shop_id",
)

## Percentage of orders per shop

In [75]:
px.bar(
    sales.shop_id.value_counts(normalize=True).sort_values(ascending=False)
).update_xaxes(type="category")

## Item ID

In [10]:
sales.item_id.value_counts().head(1000).plot.bar(
    title="Sales of Top 1000 most sold items",
    labels={"value": "sales", "index": "item_id"},
).update_layout(xaxis={"type": "category"})

In [11]:
sales.item_id.value_counts().tail(1000).plot.bar(
    title="Sales of Top 1000 least sold items",
    labels={"value": "sales", "index": "item_id"},
).update_layout(xaxis={"type": "category"})

In [None]:
sales.item_id.value_counts().value_counts().head(100).plot.bar(
    title="How many items were sold how many times?",
    labels={"index": "number of times sold", "value": "number of items "},
)

In [90]:
y = sales.item_id.value_counts(normalize=True).cumsum()
y.index = list(range(len(y)))
y.plot.line(title="Cumulative of sales per number of diff items")

## Item Price

In [13]:
sales.item_price.quantile(np.linspace(0, 1, 200)).plot.line(
    title="Percentage of sales per item price"
)

## Item Count per Day

In [43]:
sales.item_cnt_day.describe

count    2.935849e+06
mean     1.242641e+00
std      2.618834e+00
min     -2.200000e+01
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      2.169000e+03
Name: item_cnt_day, dtype: float64

In [44]:
sales.item_cnt_day.lt(1).value_counts()

False    2928493
True        7356
Name: item_cnt_day, dtype: int64

In [47]:
sales.item_cnt_day.gt(1).value_counts(normalize=True)

False    0.898114
True     0.101886
Name: item_cnt_day, dtype: float64

In [61]:
pd.cut(sales.item_cnt_day, np.arange(-1, 10, 1)).value_counts(
    normalize=True, dropna=False, sort=False
)

(-1.0, 0.0]    0.000000
(0.0, 1.0]     0.895609
(1.0, 2.0]     0.066148
(2.0, 3.0]     0.016128
(3.0, 4.0]     0.006705
(4.0, 5.0]     0.003568
(5.0, 6.0]     0.002159
(6.0, 7.0]     0.001382
(7.0, 8.0]     0.000989
(8.0, 9.0]     0.000742
NaN            0.006571
Name: item_cnt_day, dtype: float64

## Item Category


In [93]:
sales.item_category_name.value_counts()

Кино - DVD                             564652
Игры PC - Стандартные издания          351591
Музыка - CD локального производства    339585
Игры - PS3                             208219
Кино - Blu-Ray                         192674
                                        ...  
Книги - Путеводители                        3
Книги - Открытки                            2
Аксессуары - PS2                            2
Игровые консоли - PS2                       1
Книги - Познавательная литература           1
Name: item_category_name, Length: 84, dtype: int64

In [98]:
sales.groupby(
    ["date_block_num", "item_category_name"]
).item_cnt_day.sum().reset_index().plot.hist(
    x="date_block_num", y="item_cnt_day", color="item_category_name"
)