In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import warnings
import seaborn as sns
import matplotlib.pyplot as plt


# Silence every warning from here on: no category, message or module filter is
# given, so the 'ignore' action applies to all of them.
# NOTE(review): this also hides warnings raised by pandas/seaborn in later cells
# (e.g. about chained assignment); consider narrowing it to specific categories.
warnings.filterwarnings(action='ignore')

In [3]:
# Folder holding the competition CSV files; the later loading cells reuse it.
data_path = '/kaggle/input/competitive-data-science-predict-future-sales/'

# Read the training sales file, report its (rows, columns) and preview the top rows.
df_train = pd.read_csv(f'{data_path}sales_train.csv')
print(df_train.shape)
df_train.head()

In [4]:
# Read the test file from the same folder and preview its first rows.
df_test = pd.read_csv(f'{data_path}test.csv')
df_test.head()

In [5]:
# Read the sample submission file from the same folder and preview its first rows.
df_samplesub = pd.read_csv(f'{data_path}sample_submission.csv')
df_samplesub.head()

## Total Sales per Month

In [6]:
# Sum item_cnt_day within each date_block_num and draw the totals as a line.
monthly_totals = df_train.groupby('date_block_num')['item_cnt_day'].sum()
ax = monthly_totals.plot()
ax.set_title('monthly sales vs month')
plt.show()

## Sales per Item per Month

In [7]:
def top_item_sales(part = (0,10), df = None):
    """Build a month-by-item table of summed ``item_cnt_day`` for a slice of the item ranking.

    Items are ranked by their total ``item_cnt_day`` in descending order and the
    positional slice ``part[0]:part[1]`` of that ranking is selected.

    Parameters
    ----------
    part : tuple of (start, stop)
        Positional slice bounds into the descending ranking. Negative values
        count from the end, as with any Python slice.
    df : pandas.DataFrame, optional
        Frame with ``item_id``, ``date_block_num`` and ``item_cnt_day`` columns.
        Defaults to the notebook-level ``df_train``.

    Returns
    -------
    (pandas.DataFrame, pandas.Index)
        The frame has a ``date_block_num`` column (in order of first appearance
        in ``df``) followed by one column per selected item holding that item's
        summed ``item_cnt_day`` for the month, or 0 where it has no rows.
        The index holds the selected item ids in ranking order.
    """
    if df is None:
        df = df_train

    start, stop = part

    # Rank items by total units; .iloc makes the positional slicing explicit.
    totals = df.groupby('item_id')['item_cnt_day'].sum().sort_values(ascending=False)
    top_items_id = totals.iloc[start:stop].index
    print(top_items_id)

    months = df['date_block_num'].unique()
    monthly = df.groupby(['item_id', 'date_block_num'])['item_cnt_day'].sum()

    # Align each item's series on the month *values* rather than on row labels.
    # The previous chained assignment (frame[item][pos] = vals) used the month
    # numbers as row labels, which misplaces values whenever `months` is not
    # exactly 0..n-1 in order, and does nothing under pandas Copy-on-Write.
    columns = {'date_block_num': months}
    for item in top_items_id:
        columns[item] = monthly.loc[item].reindex(months, fill_value=0).to_numpy()

    df_itemVSmonth = pd.DataFrame(columns)
    return df_itemVSmonth, top_items_id

In [8]:
def plot_itemsVSmonth(shape=(10,1), title='title', items=(0,1), data=None):
    """Draw one line plot per item on an ``m x n`` grid of subplots.

    Parameters
    ----------
    shape : tuple of (m, n)
        Number of subplot rows and columns.
    title : str
        Title placed above the whole figure.
    items : iterable
        Column labels of ``data`` to plot, one per subplot, filled row by row.
    data : pandas.DataFrame, optional
        Frame with a ``date_block_num`` column and one column per item.
        Defaults to the notebook-level ``df_itemVSmonth``.

    Raises
    ------
    ValueError
        If there are more items than subplots in the grid.
    """
    if data is None:
        data = df_itemVSmonth

    m, n = shape
    items = list(items)
    if len(items) > m * n:
        raise ValueError(
            f'{len(items)} items do not fit in a {m}x{n} grid of subplots'
        )

    # squeeze=False always yields a 2-D array of axes, so single-row,
    # single-column and 1x1 grids need no special-casing.
    fig, axes = plt.subplots(m, n, figsize=(5*n, 5*m), squeeze=False)
    x = data['date_block_num']
    for ax, item in zip(axes.flat, items):
        sns.lineplot(x=x, y=data[item], ax=ax, marker='o')

    # Title the figure as a whole; plt.title would only label the last subplot.
    fig.suptitle(title)
    plt.show()

In [9]:
%%time
# Select the first `num` positions of the descending item ranking and plot
# each item's monthly series on a 10x5 grid.
num = 50
part = (0, num)
df_itemVSmonth, top_items_id = top_item_sales(part=part)
title = f'Sale of Top {num} items Vs Month'
print(title)
plot_itemsVSmonth(shape=(10, 5), title=title, items=top_items_id)

In [10]:
%%time
num = 50
# A negative start counts back from the end of the descending ranking, so this
# slice selects its last `num` entries; the stop is the count of distinct item ids.
part = (-num, len(df_train['item_id'].unique()))
df_itemVSmonth, bottom_items_id = top_item_sales(part=part)
title = f'Sale of Bottom {num} items Vs Month'
print(title)
plot_itemsVSmonth(shape=(10, 5), title=title, items=bottom_items_id)

In [11]:
# Shape of the per-item totals, i.e. the number of distinct item ids in df_train.
(
    df_train
    .groupby('item_id')['item_cnt_day']
    .sum()
    .sort_values(ascending=False)
    .shape
)

In [12]:
%%time
num = 50
# Positions 1000..1049 of the descending item ranking
# (original author's note: the number of unique item_id values is 21807).
part = (1000, 1000 + num)
df_itemVSmonth, items_id = top_item_sales(part=part)
title = f'Sale of middle {num} items Vs Month'
print(title)
plot_itemsVSmonth(shape=(10, 5), title=title, items=items_id)

## Sales per Shop per Month

In [13]:
def top_shop_sales(part = (0,10), df = None):
    """Build a month-by-shop table of summed ``item_cnt_day`` for a slice of the shop ranking.

    Shops are ranked by their total ``item_cnt_day`` in descending order and the
    positional slice ``part[0]:part[1]`` of that ranking is selected.

    Parameters
    ----------
    part : tuple of (start, stop)
        Positional slice bounds into the descending ranking. Negative values
        count from the end, as with any Python slice.
    df : pandas.DataFrame, optional
        Frame with ``shop_id``, ``date_block_num`` and ``item_cnt_day`` columns.
        Defaults to the notebook-level ``df_train``.

    Returns
    -------
    (pandas.DataFrame, pandas.Index)
        The frame has a ``date_block_num`` column (in order of first appearance
        in ``df``) followed by one column per selected shop holding that shop's
        summed ``item_cnt_day`` for the month, or 0 where it has no rows.
        The index holds the selected shop ids in ranking order.
    """
    if df is None:
        df = df_train

    start, stop = part

    # Rank shops by total units; .iloc makes the positional slicing explicit.
    totals = df.groupby('shop_id')['item_cnt_day'].sum().sort_values(ascending=False)
    top_shops_id = totals.iloc[start:stop].index
    print(top_shops_id)

    months = df['date_block_num'].unique()
    monthly = df.groupby(['shop_id', 'date_block_num'])['item_cnt_day'].sum()

    # Align each shop's series on the month *values* rather than on row labels.
    # The previous chained assignment (frame[shop][pos] = vals) used the month
    # numbers as row labels, which misplaces values whenever `months` is not
    # exactly 0..n-1 in order, and does nothing under pandas Copy-on-Write.
    columns = {'date_block_num': months}
    for shop in top_shops_id:
        columns[shop] = monthly.loc[shop].reindex(months, fill_value=0).to_numpy()

    df_shopVSmonth = pd.DataFrame(columns)
    return df_shopVSmonth, top_shops_id

In [17]:
def plot_shopsVSmonth(shape=(10,1), title='title', items=(0,1), data=None):
    """Draw one line plot per shop on an ``m x n`` grid of subplots.

    Parameters
    ----------
    shape : tuple of (m, n)
        Number of subplot rows and columns.
    title : str
        Title placed above the whole figure.
    items : iterable
        Column labels of ``data`` to plot, one per subplot, filled row by row.
    data : pandas.DataFrame, optional
        Frame with a ``date_block_num`` column and one column per shop.
        Defaults to the notebook-level ``df_shopVSmonth``.

    Raises
    ------
    ValueError
        If there are more items than subplots in the grid.
    """
    if data is None:
        data = df_shopVSmonth

    m, n = shape
    items = list(items)
    if len(items) > m * n:
        raise ValueError(
            f'{len(items)} items do not fit in a {m}x{n} grid of subplots'
        )

    # squeeze=False always yields a 2-D array of axes, so single-row,
    # single-column and 1x1 grids need no special-casing.
    fig, axes = plt.subplots(m, n, figsize=(5*n, 5*m), squeeze=False)
    x = data['date_block_num']
    for ax, item in zip(axes.flat, items):
        sns.lineplot(x=x, y=data[item], ax=ax, marker='o')

    # Title the figure as a whole; plt.title would only label the last subplot.
    fig.suptitle(title)
    plt.show()

In [18]:
%%time
#plot of all shops 
# Count the distinct shops so the slice below covers the whole ranking;
# the 10x6 grid used for plotting has room for at most 60 of them.
num = len(df_train['shop_id'].unique())
print(num)
part = (0, num)
df_shopVSmonth, top_shops_id = top_shop_sales(part=part)
title = 'Sale of All shops Vs Month'
print(title)
plot_shopsVSmonth(shape=(10, 6), title=title, items=top_shops_id)