In [70]:
import pandas as pd
import numpy as np
from math import sqrt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

pd.set_option('display.float_format', lambda x: '%.2f' % x)
%load_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 2.99 ms


In [2]:
def write_to_submission(y, name):
    sample_submission = pd.read_csv('sample_submission.csv')
    sample_submission['item_cnt_month'] = y_test
    sample_submission.to_csv(name, index=False)
    
def rmse_score(y_true, y_predict):
    return sqrt(mean_squared_error(y_true, y_predict))
    

time: 997 µs


# Data Description
* ID - an Id that represents a (Shop, Item) tuple within the test set
* shop_id - unique identifier of a shop
* item_id - unique identifier of a product
* item_category_id - unique identifier of item category
* item_cnt_day - number of products sold. You are predicting a monthly amount of this measure
* item_price - current price of an item
* date - date in format dd/mm/yyyy
* date_block_num - a consecutive month number, used for convenience. January 2013 is 0, February 2013 is 1,..., October 2015 is 33
* item_name - name of item
* shop_name - name of shop
* item_category_name - name of item category

In [3]:
train_df = pd.read_csv('sales_train.csv')

time: 1.29 s


In [36]:
test_df = pd.read_csv('test.csv')

time: 113 ms


In [35]:
!dir

 Volume in drive D is Programs
 Volume Serial Number is BEA1-4357

 Directory of D:\Projects\predict-future-sales

23-09-2020  20:10    <DIR>          .
23-09-2020  20:10    <DIR>          ..
21-09-2020  22:17                 7 .gitignore
21-09-2020  22:18    <DIR>          .ipynb_checkpoints
23-09-2020  20:10            23,019 Base Line Model.ipynb
22-09-2020  19:58         5,484,377 Baseline1.csv
22-09-2020  20:05         5,632,658 BaseLine2.csv
23-09-2020  10:25         2,696,903 BaseLine3.csv
15-12-2019  22:01         1,568,417 items.csv
15-12-2019  22:01             3,573 item_categories.csv
21-09-2020  22:13               173 README.md
15-12-2019  22:01        94,603,866 sales_train.csv
15-12-2019  22:01         2,245,108 sample_submission.csv
15-12-2019  22:01             2,977 shops.csv
15-12-2019  22:01         3,182,735 test.csv
              12 File(s)    115,443,813 bytes
               3 Dir(s)  139,285,684,224 bytes free
time: 33.9 ms


In [36]:
train_df.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


time: 24.9 ms


Item Price Shows outliers (Value -1) But only one such record is present

In [37]:
train_df.describe()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day
count,2935849.0,2935849.0,2935849.0,2935849.0,2935849.0
mean,14.57,33.0,10197.23,890.85,1.24
std,9.42,16.23,6324.3,1729.8,2.62
min,0.0,0.0,0.0,-1.0,-22.0
25%,7.0,22.0,4476.0,249.0,1.0
50%,14.0,31.0,9343.0,399.0,1.0
75%,23.0,47.0,15684.0,999.0,1.0
max,33.0,59.0,22169.0,307980.0,2169.0


time: 859 ms


In [38]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2935849 entries, 0 to 2935848
Data columns (total 6 columns):
date              object
date_block_num    int64
shop_id           int64
item_id           int64
item_price        float64
item_cnt_day      float64
dtypes: float64(2), int64(3), object(1)
memory usage: 134.4+ MB
time: 6.98 ms


In [39]:
train_df.count()

date              2935849
date_block_num    2935849
shop_id           2935849
item_id           2935849
item_price        2935849
item_cnt_day      2935849
dtype: int64

time: 171 ms


# Baseline 1 

Features Used
* shop_id
* item_id
* item_cnt_day (Aggregated)

# Result = 8.69909

In [46]:
y = train_df[['shop_id','item_id','item_cnt_day']].groupby(['shop_id','item_id']).sum().values
X = train_df[['shop_id','item_id','item_cnt_day']].groupby(['shop_id','item_id']).sum().reset_index()[['shop_id','item_id']].values

time: 1.31 s


In [173]:
X.shape, y.shape

((424124, 2), (424124, 1))

time: 2.98 ms


In [47]:
X_train, X_cross, y_train, y_cross_true = train_test_split(X, y, test_size = 0.2 ,random_state = 17)

time: 66.8 ms


In [181]:
X_train.shape, X_cross.shape, y_train.shape, y_cross.shape

((339299, 2), (84825, 2), (339299, 1), (84825, 1))

time: 2.99 ms


In [182]:
clf = LinearRegression(n_jobs = -1)
clf.fit(X_train,y_train)
y_cross_pred = clf.predict(X_cross)

time: 39.9 ms


In [183]:
rmse_score(y_cross_true, y_cross_pred)

38.31928917468974

time: 3.99 ms


In [184]:
X_test = test_df[['shop_id', 'item_id']].values
y_test = clf.predict(X_test)

time: 23.9 ms


In [147]:
write_to_submission(y_test, 'Baseline1.csv')

time: 783 ms


Seems to still be overfitting

In [66]:
clf = Ridge(random_state = 17)
clf.fit(X_train,y_train)
y_cross_pred = clf.predict(X_cross)

time: 29.9 ms


In [67]:
rmse_score(y_cross, y_cross_pred)

5.193806138928656

time: 4.01 ms


In [68]:
X_test = test_df[['shop_id', 'item_id']].values
y_test = clf.predict(X_test)

time: 11 ms


In [69]:
write_to_submission(y_test, 'Baseline5.csv')

time: 824 ms


This submission gave the same score 

# Base Line 2
Same as above just one hot encoding data

# Result = 200+
Performed worse(A LOT)

Why?

Hypothesis

Posible Reason Due to extremely sparse features, model cannot seem to find any pattern

In [185]:
y = train_df[['shop_id','item_id','item_cnt_day']].groupby(['shop_id','item_id']).sum().values
X = train_df[['shop_id','item_id','item_cnt_day']].groupby(['shop_id','item_id']).sum().reset_index()[['shop_id','item_id']].values

time: 1.14 s


In [186]:
enc = OneHotEncoder(handle_unknown='ignore')
enc = enc.fit(X)
X = enc.transform(X)

time: 108 ms


In [188]:
X_train, X_cross, y_train, y_cross_true = train_test_split(X, y, test_size = 0.2 ,random_state = 17)

time: 81.8 ms


In [189]:
clf = LinearRegression(n_jobs = -1)
clf.fit(X_train,y_train)
y_cross = clf.predict(X_cross)

time: 7.1 s


In [191]:
rmse_score(y_cross_true, y_cross)

48.40570940917939

time: 4.99 ms


In [192]:
X_test = test_df[['shop_id', 'item_id']].values
X_test = enc.transform(X_test)
y_test = clf.predict(X_test)

time: 86.8 ms


In [156]:
write_to_submission(y_test, 'BaseLine2.csv')

time: 791 ms


# BaseLine 3

Using Random Forest Classifier

Doesnt seem to work

In [5]:
y = train_df[['shop_id','item_id','item_cnt_day']].groupby(['shop_id','item_id']).sum().values
X = train_df[['shop_id','item_id','item_cnt_day']].groupby(['shop_id','item_id']).sum().reset_index()[['shop_id','item_id']].values

time: 1.06 s


In [11]:
X_train, X_cross, y_train, y_cross_true = train_test_split(X, y, test_size = 0.2 ,random_state = 17)

time: 76.8 ms


In [164]:
clf = RandomForestRegressor(n_jobs = -1, random_state = 17)
clf.fit(X_train,y_train.ravel())
y_cross = clf.predict(X_cross)

time: 35.2 s


In [171]:
rmse_score(y_cross_true, y_cross)

24.875971041518625

time: 23.9 ms


In [167]:
X_test = test_df[['shop_id', 'item_id']].values
y_test = clf.predict(X_test)
write_to_submission(y_test, 'BaseLine3.csv')

time: 3.48 s


Above Submission scores - 66

Model Seems to be overfitting

In [43]:
clf = RandomForestRegressor(n_jobs = -1, random_state = 17, n_estimators = 6, max_depth = 2)
clf.fit(X_train,y_train.ravel())
y_cross = clf.predict(X_cross)

time: 421 ms


In [44]:
rmse_score(y_cross_true, y_cross)

36.64203981741157

time: 2.99 ms


In [42]:
X_test = test_df[['shop_id', 'item_id']].values
y_test = clf.predict(X_test)
write_to_submission(y_test, 'BaseLine4.csv')

time: 889 ms


# BaseLine 4
Trying Gradient boosting Regressor

In [71]:
y = train_df[['shop_id','item_id','item_cnt_day']].groupby(['shop_id','item_id']).sum().values
X = train_df[['shop_id','item_id','item_cnt_day']].groupby(['shop_id','item_id']).sum().reset_index()[['shop_id','item_id']].values

time: 1.17 s


In [72]:
X_train, X_cross, y_train, y_cross_true = train_test_split(X, y, test_size = 0.2 ,random_state = 17)

time: 69.8 ms


In [74]:
clf = GradientBoostingRegressor(random_state = 17)
clf.fit(X_train,y_train.ravel())
y_cross = clf.predict(X_cross)

time: 23.4 s


In [75]:
rmse_score(y_cross_true, y_cross)

26.2692760638036

time: 4.99 ms


Still Over Fitting

# Conclusion

After attempting all methods max score was 8.69 ....

All Models seem to be overfitting

Start incorporating Date Time Features