In [6]:
import os

import pandas as pd
import requests

In [7]:
# Local file the downloaded dataset is written to and later read from.
FILE_NAME = "products.csv"

# Keyword arguments shared by every pandas plot in this notebook.
PLOT_CONF = dict(grid=False, figsize=(20, 8), color="#86bf91", zorder=2)

# Remote location of the products CSV.
# NOTE(review): this looks like a pre-signed Yandex.Disk download link; presumably
# it expires -- confirm and replace with a stable public link if so.
DATA_URL = "https://downloader.disk.yandex.ru/disk/aef2aa4330747fa29567141d5e4ce1b8f9f3d680f4ae8047508428619430d03e/5e9dde3b/JdhhT8jvCMu_Pgugn6PhgikDN1Cl0gyob4EIPIlrJUuCBY5pV_cIfkKHe8ufmUwgSPrz6Q5GZ-5bc3hWDPazsg%3D%3D?uid=301383560&filename=do_10_products_20190924_185230.csv&disposition=attachment&hash=&limit=0&content_type=text%2Fplain&owner_uid=301383560&fsize=175856061&hid=3a2c438ebf3814aa05ee1ec6bc1649ab&media_type=spreadsheet&tknv=v2&etag=e5fc3320c27b0391270a8cfdd6a725ea"

In [8]:
# Download the dataset to FILE_NAME unless a previous run already fetched it,
# so that "Restart & Run All" does not re-download the file every time.
if not os.path.exists(FILE_NAME):
    # Stream into a temporary sibling file first: an interrupted or failed
    # download must not leave a truncated FILE_NAME that later cells would read.
    partial_file_name = FILE_NAME + '.part'

    # (connect, read) timeouts in seconds, so a stalled server cannot hang the cell.
    with requests.get(DATA_URL, stream=True, timeout=(10, 60)) as r:
        r.raise_for_status()

        with open(partial_file_name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:  # skip empty keep-alive chunks
                    f.write(chunk)

    # Atomically publish the finished download under its final name.
    os.replace(partial_file_name, FILE_NAME)

KeyboardInterrupt: 

In [None]:
# Read the semicolon-separated products file, then replace `modified_time`
# (interpreted as seconds since the Unix epoch) with pandas timestamps.
df = pd.read_csv(FILE_NAME, sep=';')
df = df.assign(modified_time=pd.to_datetime(df['modified_time'], unit='s'))

# Data Overview

### Number of Rows

In [None]:
# Number of rows in the loaded products table.
df.shape[0]

### Columns

In [None]:
# All column names in alphabetical order, shown as one comma-separated string.
', '.join(sorted(df.columns))

 ### Categories Description

Per-category price summary, sorted by ascending mean price.

In [None]:
# Per-category price summary: number of rows plus min / mean / median / max of
# `price`, computed in a single vectorised groupby pass and ordered by mean price.
# 'size' counts every row of the category (including rows with a missing price),
# matching a plain len() of the group.
df_categories = (
    df.groupby('categoryId')['price']
    .agg(['size', 'min', 'mean', 'median', 'max'])
    .rename(columns={
        'size': 'cat_items',
        'min': 'price_min',
        'mean': 'price_mean',
        'median': 'price_median',
        'max': 'price_max',
    })
    .rename_axis('cat_name')
    .sort_values('price_mean')
)

# Append a 'Total' row holding the overall item count; the price columns of
# that row are intentionally left empty (NaN).
df_categories.loc['Total', 'cat_items'] = df_categories['cat_items'].sum()

df_categories

### Overall Price Distribution 

In [None]:
# Histogram of the `price` column: 20 bins, log-scaled counts, bars drawn at
# 90% of the bin width, on top of the notebook-wide plot settings.
price_hist_kwargs = dict(PLOT_CONF, log=True, bins=20, rwidth=0.9)
_ = df['price'].hist(**price_hist_kwargs)

### Overall Price Growth Rate
The plot indicates that most of the price growth falls in the first month of the year, whereas by the end of
the year price growth is nearly zero.

In [None]:
# Monthly price growth rate: (price - oldprice) / oldprice.
#
# The monthly aggregate must be built from the prices themselves. Aggregating
# with count() only compares how many rows have a price with how many have an
# old price, which says nothing about price changes. Only rows that carry both
# prices (and a timestamp) are used, so both monthly means describe the same
# set of products.
# NOTE(review): assumes `price` and `oldprice` are numeric columns -- confirm.
priced_rows = df.dropna(subset=['modified_time', 'price', 'oldprice'])

df_month_sample = (
    priced_rows.set_index('modified_time')[['price', 'oldprice']]
    .sort_index()
    .resample('M')
    .mean()
)

price_df = pd.DataFrame()
price_df['pgr'] = (df_month_sample['price'] - df_month_sample['oldprice']) / df_month_sample['oldprice']

# Months without any comparable rows have a NaN rate and are left out of the plot.
_ = price_df.dropna().plot(**PLOT_CONF)

### Categories Price Correlation

Highlights price correlation between product categories. According to the data, prices in categories such as
 `furniture` / `beauty&health` and `Electrical equipment&materials` / `Clocks` are positively correlated.

In [None]:
# Build one column of prices per category, aligned by each row's position
# inside its category, then correlate the columns over the positions for which
# every category has a price.
# NOTE(review): rows are paired purely by position within each category, so the
# i-th product of one category is compared with the i-th product of another --
# confirm this pairing is the intended basis for the correlation.
corr_df = pd.DataFrame()

for category_id, category_rows in df.groupby('categoryId'):
    positional_prices = category_rows['price'].reset_index(drop=True)
    corr_df[category_id] = positional_prices

complete_rows = corr_df.dropna()
c = complete_rows.corr()

In [None]:
# Implemented in https://seaborn.pydata.org/examples/many_pairwise_correlations.html

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Mask the upper triangle (diagonal included) so each category pair is drawn once.
# The builtin `bool` is used as dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
mask = np.triu(np.ones_like(c, dtype=bool))
f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw onto the axes created above instead of relying on the implicit current axes.
_ = sns.heatmap(c, mask=mask, cmap=cmap, vmax=.3, center=0, square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)


### Overall Modification Time Distribution

Shows the distribution of product modification times. The plot shows that most product changes
occurred at the end of the year.

In [None]:
# Histogram of the `modified_time` column: 20 bins, log-scaled counts, bars
# drawn at 90% of the bin width, on top of the notebook-wide plot settings.
modified_time_hist_kwargs = dict(PLOT_CONF, log=True, bins=20, rwidth=0.9)
_ = df['modified_time'].hist(**modified_time_hist_kwargs)