In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [14]:
DIRECTORY = './data/'
categories = pd.read_csv(DIRECTORY + 'item_categories.csv')
items = pd.read_csv(DIRECTORY + 'items.csv')
shops = pd.read_csv(DIRECTORY + 'shops.csv')
train = pd.read_csv(DIRECTORY + 'sales_train.csv', parse_dates=['date'], date_format='%d.%m.%Y')
test = pd.read_csv(DIRECTORY + 'test.csv')
train['month'] = train['date'].dt.to_period('M')

# 1. Test new items

In [15]:
# get new items in 
new_items = np.setdiff1d(test['item_id'].unique(), train['item_id'].unique())
len(new_items)

363

In [16]:
# Count of each 
items['subcategory'] = items['item_name'].str.split().str[0]
count_categories = items['subcategory'].value_counts()
pd.cut(count_categories, bins=[0,1,2,3,5,10,100,1000]).value_counts(normalize=True)

count
(0, 1]         0.472391
(1, 2]         0.195925
(2, 3]         0.098819
(3, 5]         0.091965
(5, 10]        0.075400
(10, 100]      0.063024
(100, 1000]    0.002475
Name: proportion, dtype: float64

# Nov. 2014 Testing

In [114]:
# Try testing on Nov. 2014
# predict the avg. of the ones with the same first name
# predict avg. of ones in same category

nov = pd.to_datetime('2014-11-01').to_period('M')
nov_train = train[train['month'] == nov]
nov_new_items = np.setdiff1d(nov_train['item_id'].unique(), train[train['month'] < nov]['item_id'].unique())
nov_stores = nov_train['shop_id'].unique()
nov_train_new = nov_train[nov_train['item_id'].isin(nov_new_items)]

In [115]:
nov_train_aggregated = nov_train_new.groupby(['shop_id', 'item_id'])['item_cnt_day'].sum().to_frame('answers').reset_index()
nov_train_aggregated['clipped'] = np.clip(nov_train_aggregated['answers'], 0, 20)
np.square(nov_train_aggregated['clipped']).sum() / len(nov_stores) / len(nov_new_items)

10.711347826086957

In [116]:
# Guessing a constant value (only down to 9.7)
guess = nov_train_aggregated['clipped'].sum() / len(nov_stores) / len(nov_new_items)
np.square(nov_train_aggregated['clipped'] - 1).sum() / len(nov_stores) / len(nov_new_items) + 1 - len(nov_train_aggregated) / len(nov_stores) / len(nov_new_items)

9.694565217391306

In [118]:
# Guess based on previous first month sales
before_nov = train[train['month'] < nov].copy()

# calculate first month sales for each item, before grouping by item first name
first_month_map = before_nov.groupby(['item_id'])['month'].min() # for each item, get its first month
before_nov['first_month'] = before_nov['item_id'].map(first_month_map)
first_month_sales = before_nov[before_nov['month'] == before_nov['first_month']].groupby(['item_id', 'shop_id'])['item_cnt_day'].sum().to_frame('first_month_sales').reset_index()
first_month_sales = first_month_sales.merge(items, on=['item_id'], how='left')
first_month_sales['first_month_sales'] = np.clip(first_month_sales['first_month_sales'], 0, 20)
# by category, and by subcategory

by_category = first_month_sales.groupby(['item_category_id', 'shop_id'])['first_month_sales'].mean().to_frame('prediction_category').reset_index()
by_category['prediction_category'] = np.clip(by_category['prediction_category'], 0, 20)

by_subcategory = first_month_sales.groupby(['subcategory', 'shop_id'])['first_month_sales'].mean().to_frame('prediction_subcategory').reset_index()
by_subcategory['prediction_subcategory'] = np.clip(by_subcategory['prediction_subcategory'], 0, 20)

In [119]:
# guessing avg. of first month for same category
prediction_df = nov_train_aggregated.merge(items, on=['item_id'], how='left').merge(by_category, on=['item_category_id', 'shop_id'], how='outer')
prediction_df = prediction_df.merge(by_subcategory, on=['subcategory', 'shop_id'], how='outer')
print("Baseline:", np.square(prediction_df['clipped'].fillna(0)).sum() / len(nov_stores) / len(nov_new_items))
print("Category:", np.square(prediction_df['clipped'].fillna(0) - prediction_df['prediction_category'].fillna(0)).sum() / len(nov_stores) / len(nov_new_items))
print("Subcategory:", np.square(prediction_df['clipped'].fillna(0) - prediction_df['prediction_subcategory'].fillna(0)).sum() / len(nov_stores) / len(nov_new_items))

Baseline: 10.711347826086957
Category: 5.794799709517537
Subcategory: 39.57677661994185


In [120]:
# either or
prediction_df['na_conditional'] = np.where(prediction_df['prediction_subcategory'].isna(), prediction_df['prediction_category'], prediction_df['prediction_subcategory'])
print("Conditional:", np.square(prediction_df['clipped'].fillna(0) - prediction_df['na_conditional'].fillna(0)).sum() / len(nov_stores) / len(nov_new_items))

Conditional: 39.72369839036163


# Investigating shortcomings

In [102]:
prediction_df['error'] = prediction_df['clipped'] - prediction_df['prediction_category']

Unnamed: 0,shop_id,item_id,answers,clipped,item_name,item_category_id,subcategory,prediction_category,prediction_subcategory,na_conditional
0,2,1534,1.0,1.0,"Assassin's Creed. Сага о Новом Свете [PC, русс...",28,Assassin's,6.228070,5.863636,5.863636
1,2,1542,3.0,3.0,Assassin's Creed: Единство. Bastille Edition [...,20,Assassin's,10.931818,5.863636,5.863636
2,2,1548,4.0,4.0,Assassin's Creed: Единство. Notre Dame Edition...,29,Assassin's,7.714286,5.863636,5.863636
3,2,1549,6.0,6.0,Assassin's Creed: Единство. Notre Dame Edition...,20,Assassin's,10.931818,5.863636,5.863636
4,2,1555,29.0,20.0,Assassin's Creed: Единство. Специальное издани...,28,Assassin's,6.228070,5.863636,5.863636
...,...,...,...,...,...,...,...,...,...,...
5352,59,21387,1.0,1.0,ХОББИТ: ПУСТОШЬ СМАУГА (реж.версия) (2 диска 3...,38,ХОББИТ:,1.490566,6.727273,6.727273
5353,59,21389,1.0,1.0,ХОББИТ: ПУСТОШЬ СМАУГА (реж.версия) (5DVD),40,ХОББИТ:,2.407246,6.727273,6.727273
5354,59,21430,4.0,4.0,ХРАБРОЕ СЕРДЦЕ (КОЛЛЕКЦИОННОЕ ИЗДАНИЕ) (2BD),37,ХРАБРОЕ,1.617512,,1.617512
5355,59,21627,1.0,1.0,ЧЕЛОВЕК НОЯБРЯ,40,ЧЕЛОВЕК,2.407246,2.000000,2.000000
