In [106]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Render floats with two decimal places in DataFrame/Series output.
pd.set_option('display.float_format', lambda x: '%.2f' % x)
# Load the autotime extension; the "time: ..." line under each cell comes from it.
%load_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 1.99 ms


In [146]:
def write_to_submission(y, name, sample_path='sample_submission.csv'):
    """Write predictions to a submission CSV based on the sample submission.

    Parameters
    ----------
    y : array-like
        Predicted `item_cnt_month` values, one per row of the sample
        submission. Arrays with more than one dimension (e.g. the (n, 1)
        output of a model fitted on a 2-D target) are flattened first.
    name : str
        Path of the CSV file to write.
    sample_path : str, optional
        Path of the sample submission that provides the row layout.

    Raises
    ------
    ValueError
        Raised by pandas when the number of predictions does not match the
        number of rows in the sample submission.
    """
    submission = pd.read_csv(sample_path)
    # Use the argument that was passed in, not whatever `y_test` happens to
    # exist in the notebook's global namespace.
    predictions = np.ravel(y) if np.ndim(y) > 1 else y
    submission['item_cnt_month'] = predictions
    submission.to_csv(name, index=False)

time: 997 µs


# Data Description
* ID - an Id that represents a (Shop, Item) tuple within the test set
* shop_id - unique identifier of a shop
* item_id - unique identifier of a product
* item_category_id - unique identifier of item category
* item_cnt_day - number of products sold. You are predicting a monthly amount of this measure
* item_price - current price of an item
* date - date in format dd.mm.yyyy (e.g. 02.01.2013)
* date_block_num - a consecutive month number, used for convenience. January 2013 is 0, February 2013 is 1,..., October 2015 is 33
* item_name - name of item
* shop_name - name of shop
* item_category_name - name of item category

In [33]:
# Load the daily sales records used for training.
train_df = pd.read_csv('sales_train.csv')

time: 1.39 s


In [46]:
# Load the sample submission to see the expected output columns (ID, item_cnt_month).
sample_submission = pd.read_csv('sample_submission.csv')
sample_submission.head()

Unnamed: 0,ID,item_cnt_month
0,0,0.5
1,1,0.5
2,2,0.5
3,3,0.5
4,4,0.5


time: 62.8 ms


In [44]:
# List the working directory to check which competition files are present.
# NOTE(review): this shells out to the OS, so the output is platform-specific.
!dir

 Volume in drive D is Programs
 Volume Serial Number is BEA1-4357

 Directory of D:\Projects\predict-future-sales

22-09-2020  18:42    <DIR>          .
22-09-2020  18:42    <DIR>          ..
21-09-2020  22:17                 7 .gitignore
21-09-2020  22:18    <DIR>          .ipynb_checkpoints
22-09-2020  18:42            14,073 Base Line Model.ipynb
15-12-2019  22:01         1,568,417 items.csv
15-12-2019  22:01             3,573 item_categories.csv
21-09-2020  22:13               173 README.md
15-12-2019  22:01        94,603,866 sales_train.csv
15-12-2019  22:01         2,245,108 sample_submission.csv
15-12-2019  22:01             2,977 shops.csv
15-12-2019  22:01         3,182,735 test.csv
               9 File(s)    101,620,929 bytes
               3 Dir(s)  139,299,561,472 bytes free
time: 21.9 ms


In [47]:
# Load the test set: one row per (shop_id, item_id) pair to predict, keyed by ID.
test_df = pd.read_csv('test.csv')
test_df.head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


time: 78.8 ms


In [36]:
# Preview the first training rows (one row per sale record).
train_df.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


time: 24.9 ms


`item_price` shows an outlier (a minimum value of -1), but only one such record is present.

In [37]:
# Summary statistics for the numeric columns; check min/max for outliers.
train_df.describe()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day
count,2935849.0,2935849.0,2935849.0,2935849.0,2935849.0
mean,14.57,33.0,10197.23,890.85,1.24
std,9.42,16.23,6324.3,1729.8,2.62
min,0.0,0.0,0.0,-1.0,-22.0
25%,7.0,22.0,4476.0,249.0,1.0
50%,14.0,31.0,9343.0,399.0,1.0
75%,23.0,47.0,15684.0,999.0,1.0
max,33.0,59.0,22169.0,307980.0,2169.0


time: 859 ms


In [38]:
# Column dtypes and memory footprint of the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2935849 entries, 0 to 2935848
Data columns (total 6 columns):
date              object
date_block_num    int64
shop_id           int64
item_id           int64
item_price        float64
item_cnt_day      float64
dtypes: float64(2), int64(3), object(1)
memory usage: 134.4+ MB
time: 6.98 ms


In [39]:
# Non-null count per column; compare against the total row count to spot missing values.
train_df.count()

date              2935849
date_block_num    2935849
shop_id           2935849
item_id           2935849
item_price        2935849
item_cnt_day      2935849
dtype: int64

time: 171 ms


# Baseline 1 

Features Used
* shop_id
* item_id
* item_cnt_day (Aggregated)

# Result = 8.69909

In [122]:
# Total units sold per (shop_id, item_id) pair. The groupby is computed once
# and both arrays are sliced from the same frame, so rows of X and y line up.
# NOTE(review): this sums item_cnt_day over every month in the training data,
# while the data description says the target is a monthly amount -- confirm
# that this aggregation is what the baseline intends.
pair_totals = train_df.groupby(['shop_id','item_id'])['item_cnt_day'].sum().reset_index()
X = pair_totals[['shop_id','item_id']].values   # shape (n_pairs, 2)
y = pair_totals[['item_cnt_day']].values        # shape (n_pairs, 1), kept 2-D as before

time: 566 ms


In [98]:
# Sanity check: X and y must have the same number of rows.
X.shape, y.shape

((424124, 2), (424124, 1))

time: 1.99 ms


In [124]:
# Hold out 20% of the (shop, item) pairs for validation; the seed is fixed
# so the split is reproducible.
X_train, X_cross, y_train, y_cross = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=17,
)

time: 65.8 ms


In [125]:
# Sanity check the sizes of the train and validation splits.
X_train.shape, X_cross.shape, y_train.shape, y_cross.shape

((339299, 2), (84825, 2), (339299, 1), (84825, 1))

time: 2 ms


In [126]:
clf = LinearRegression(n_jobs = -1)
clf.fit(X_train,y_train)
y_cross = clf.predict(X_cross)

time: 56.8 ms


In [127]:
mean_squared_error(y_test, y_cross)

1468.3679228534943

time: 4.99 ms


In [128]:
# Build the test features in the same column order used for training.
X_test = test_df[['shop_id', 'item_id']].values
# Despite the name, `y_test` holds the model's predictions for the test rows.
y_test = clf.predict(X_test)

time: 13 ms


In [147]:
# Save the Baseline 1 test predictions as a submission file.
write_to_submission(y_test, 'Baseline1.csv')

time: 783 ms


# Baseline 2
Same as Baseline 1, but with `shop_id` and `item_id` one-hot encoded.

# Result = 200+
Performed a lot worse than Baseline 1.

**TODO**: investigate why.

In [148]:
# Total units sold per (shop_id, item_id) pair. The groupby is computed once
# and both arrays are sliced from the same frame, so rows of X and y line up.
# NOTE(review): this sums item_cnt_day over every month in the training data,
# while the data description says the target is a monthly amount -- confirm
# that this aggregation is what the baseline intends.
pair_totals = train_df.groupby(['shop_id','item_id'])['item_cnt_day'].sum().reset_index()
X = pair_totals[['shop_id','item_id']].values   # shape (n_pairs, 2)
y = pair_totals[['item_cnt_day']].values        # shape (n_pairs, 1), kept 2-D as before

time: 1.07 s


In [149]:
# One-hot encode shop_id and item_id. The fitted encoder is kept in `enc`
# so the test set can be transformed with the same categories later.
# NOTE(review): with handle_unknown='ignore', categories not seen here should
# encode as all zeros at transform time -- worth checking how many test items
# fall into that case.
enc = OneHotEncoder(handle_unknown='ignore')
X = enc.fit_transform(X)

time: 90.7 ms


In [153]:
# Hold out 20% of the encoded rows for validation; the seed is fixed so the
# split is reproducible.
X_train, X_cross, y_train, y_cross_true = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=17,
)

time: 71.8 ms


In [151]:
# Fit the linear model on the one-hot features, then predict the held-out rows.
clf = LinearRegression(n_jobs=-1).fit(X_train, y_train)
y_cross = clf.predict(X_cross)

time: 5.64 s


In [154]:
# Validation MSE: held-out labels vs. the model's predictions for them.
mean_squared_error(y_cross_true, y_cross)

2343.112703405918

time: 4.99 ms


In [155]:
# Encode the test pairs with the encoder fitted on the training pairs, then
# predict. Despite the name, `y_test` holds predictions, not labels.
test_pairs = test_df[['shop_id', 'item_id']].values
X_test = enc.transform(test_pairs)
y_test = clf.predict(X_test)

time: 79.8 ms


In [156]:
# Save the Baseline 2 test predictions as a submission file.
write_to_submission(y_test, 'BaseLine2.csv')

time: 791 ms
