In [1]:
import os
import numpy as np
import pandas as pd
import platform
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
%qtconsole

In [3]:
if platform.system() == 'Linux':
    data_dir = '/home/alin/Data/DS_Competition'
else:
    data_dir = 'C:/Users/alin/Documents/Data/DS_Competition'

In [4]:
item_categories = pd.read_csv(data_dir + '/item_categories.csv')
items = pd.read_csv(data_dir + '/items.csv')
shops = pd.read_csv(data_dir + '/shops.csv')

sales_train_dump = data_dir + '/sales_train.p'
if Path(sales_train_dump).is_file():
    print('load previous dump')
    sales_train = pickle.load(open(sales_train_dump, 'rb'))
else:
    print('load from original csv')
    sales_train = pd.read_csv(data_dir + '/sales_train.csv')
    sales_train['year'] = sales_train.apply(lambda r: int(r['date'][6:]), axis=1)
    sales_train['month'] = sales_train.apply(lambda r: int(r['date'][3:5]), axis=1)
    sales_train['day'] = sales_train.apply(lambda r: int(r['date'][:2]), axis=1)
    sales_train['year_month'] = sales_train.apply(lambda r: r['year']*100 + r['month'], axis=1)
    sales_train['weekday'] = sales_train.apply(lambda r: datetime.datetime(r['year'], r['month'], r['day']).weekday() + 1, axis=1)
    sales_train['transaction'] = 1
    sales_train['money'] = sales_train.apply(lambda r: r['item_price'] * r['item_cnt_day'], axis=1)
    sales_train = sales_train.join(items, on='item_id', how='inner', rsuffix='_r')
    sales_train = sales_train.drop(['item_id_r', 'item_name'], axis=1)
    sales_train['weekend'] = sales_train.apply(lambda r: 1 if r['weekday'] >= 6 else 0, axis=1)
    sales_train['year'] = sales_train['year'] - 2013
    sales_train = sales_train.rename(columns={'item_cnt_day': 'item_cnt'})
    pickle.dump(sales_train, open(sales_train_dump, 'wb'))

load previous dump


In [5]:
test = pd.read_csv(data_dir + '/test.csv')

In [6]:
test= test.join(items, on='item_id',  rsuffix='_r' )[['ID', 'shop_id', 'item_id', 'item_category_id']]

test['year'] = 2
test['month'] = 11

In [7]:
#output0 = pd.DataFrame({'ID':test.ID, 'item_cnt_month': 0.5})

In [8]:
#output0.to_csv(data_dir + '/submission0.csv', index=False)

### Idea1: break into categories, only keep shop_id, year, month, item_id, and item_counts,

In [9]:
from sklearn.base import BaseEstimator, TransformerMixin

In [10]:
class Transformer1(BaseEstimator, TransformerMixin):
    def __init__(self, category):
        self.category = category
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X = X[X['item_category_id'] == self.category][['shop_id', 'item_id', 'item_cnt', 'year', 'month',
                                                      'year_month', 'weekend']]
        X = X.groupby(by = ['shop_id', 'year', 'month', 'year_month', 'item_id']).sum()['item_cnt'].reset_index()
        #X['year'] = X.apply(lambda r: int(r['year_month']/100) - 2013, axis=1)
        #X['month'] = X.apply(lambda r: r['year_month'] - 201300- r['year'] * 100, axis=1)
        X = X.drop(['year_month'], axis=1)
        return X

In [33]:
transformer = Transformer1(3)
X = transformer.fit_transform(sales_train)

In [34]:
X.head()

Unnamed: 0,shop_id,year,month,item_id,item_cnt
0,0,0,1,13071,28.0
1,0,0,2,13071,24.0
2,1,0,1,13071,13.0
3,1,0,2,13071,5.0
4,2,0,1,13071,9.0


In [37]:
X = X.set_index(['shop_id', 'year', 'month', 'item_id'])
X.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,item_cnt
shop_id,year,month,item_id,Unnamed: 4_level_1
0,0,1,13071,28.0
0,0,2,13071,24.0
1,0,1,13071,13.0
1,0,2,13071,5.0
2,0,1,13071,9.0


In [42]:
all_shops = list(shops.shop_id)

In [48]:
all_year = range(0,3)
all_month = range(1,13)

In [50]:
items0 = items[items.item_category_id ==3]

In [53]:
all_items = list(items0.item_id)

In [55]:
import itertools

In [58]:
cb = list(itertools.product(all_shops, all_year, all_month, all_items))

In [64]:
all_shops = [e[0] for e in cb]
all_years = [e[1] for e in cb]
all_months = [e[2] for e in cb]
all_items = [e[3] for e in cb]

In [69]:
X1 = pd.DataFrame({'shop_id': all_shops, 'year': all_years, 'month': all_months, 'item_id': all_items})

In [72]:
X1['item_cnt'] = 0

In [74]:
X2 = X1.join(X, on=['shop_id', 'year', 'month', 'item_id'],  how='left', rsuffix='_r')

In [94]:
X2['item_cnt'] = X2.apply(lambda r: 0 if pd.isnull(r['item_cnt_r']) else r['item_cnt_r'], axis=1)

In [102]:
X2 = X2.drop('item_cnt_r', axis=1)

In [113]:
from sklearn.ensemble import RandomForestRegressor


In [None]:
import itertools

In [108]:
all_shops = list(shops.shop_id)
all_year = range(0,3)
all_month = range(1,13)

In [125]:
models = {}
for category in range(item_categories.shape[0]):
    transformer = Transformer1(category)
    A = transformer.fit_transform(sales_train)
    A = A.set_index(['shop_id', 'year', 'month', 'item_id'])
    
    items0 = items[items.item_category_id ==category]
    all_items = list(items0.item_id)
    cb = list(itertools.product(all_shops, all_year, all_month, all_items))
    shop_lst = [e[0] for e in cb]
    year_lst = [e[1] for e in cb]
    month_lst = [e[2] for e in cb]
    item_lst = [e[3] for e in cb]
    A1 = pd.DataFrame({'shop_id': shop_lst, 'year': year_lst, 'month': month_lst, 'item_id': item_lst})
    A1 = A1[(A1.year < 2) | (A1.month < 11)]
    A1['item_cnt'] = 0
    A2 = A1.join(A, on=['shop_id', 'year', 'month', 'item_id'],  how='left', rsuffix='_r')
    A2['item_cnt'] = A2.apply(lambda r: 0 if pd.isnull(r['item_cnt_r']) else r['item_cnt_r'], axis=1)
    A2 = A2.drop('item_cnt_r', axis=1)
    y = A2['item_cnt']
    X = A2.drop('item_cnt', axis=1)
    rr = RandomForestRegressor(n_estimators=300, max_depth=5, random_state=42)
    rr.fit(X, y)
    models[category] = rr
#pickle.dump(models, open(data_dir + '/models_1.p', 'wb'))

In [126]:
pickle.dump(models, open(data_dir + '/models_1.p', 'wb'))

In [127]:
test_by_category = {}
for category in range(item_categories.shape[0]):
    tc = test[test.item_category_id == category][['ID', 'shop_id', 'year', 'month', 'item_id']]
    test_by_category[category] = (tc.ID, tc[['shop_id', 'year', 'month', 'item_id']])

In [128]:
result_lst = []
for category in range(item_categories.shape[0]):
    Ids = test_by_category[category][0]
    if Ids.shape[0] > 0:
        X = test_by_category[category][1]
        result = models[category].predict(X)
        result_lst.append(pd.DataFrame({'ID': Ids, 'item_cnt_month': result}))
output = pd.concat(result_lst, axis=0)

In [129]:
output.sort_values(by='ID', inplace=True)

In [133]:
output.to_csv(data_dir + '/submission2.csv', index=False)

In [12]:
models[0]

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=300, n_jobs=1, oob_score=False, random_state=42,
           verbose=0, warm_start=False)

In [129]:
transformer1 = Transformer1(40)

In [134]:
A = transformer1.fit_transform(sales_train)

In [135]:
A.head()

Unnamed: 0,shop_id,year,month,item_id,item_cnt
0,0,0,1,32,6.0
1,0,0,1,35,1.0
2,0,0,1,43,1.0
3,0,0,1,75,1.0
4,0,0,1,88,1.0


In [136]:
A_train = X[(X.year < 2) | ((X.year == 2) & (X.month < 10))]
A_val = X[(X.year == 2) & (X.month == 10)]

In [141]:
y_train = A_train['item_cnt']
X_train = A_train.drop('item_cnt', axis=1)

In [145]:
y_val = A_val['item_cnt']
X_val = A_val.drop('item_cnt', axis=1)

In [156]:
from sklearn.ensemble import RandomForestRegressor

In [157]:
rr = RandomForestRegressor(max_depth=5, random_state=0)

In [158]:
rr.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=0,
           verbose=0, warm_start=False)

In [159]:
y_pred = rr.predict(X_val)

In [165]:
from sklearn.metrics import  mean_squared_error

In [167]:
print(np.sqrt(mean_squared_error(y_val, y_pred)))

1.62894710662


In [168]:
from sklearn.model_selection import GridSearchCV

In [169]:
parameters = {'max_depth': [3, 5, 7, 9]}

In [176]:
rr3 = RandomForestRegressor(n_estimators=500, max_depth=3)
rr5 = RandomForestRegressor(n_estimators=500, max_depth=5)
rr7 = RandomForestRegressor(n_estimators=500, max_depth=7)
rr9 = RandomForestRegressor(n_estimators=500, max_depth=9)


In [177]:
rr3.fit(X_train, y_train)
rr5.fit(X_train, y_train)
rr7.fit(X_train, y_train)
rr9.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=9,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [178]:
y_pred3 = rr3.predict(X_val)
y_pred5 = rr5.predict(X_val)
y_pred7 = rr7.predict(X_val)
y_pred9 = rr9.predict(X_val)

In [179]:
print(np.sqrt(mean_squared_error(y_pred3, y_val)))
print(np.sqrt(mean_squared_error(y_pred5, y_val)))
print(np.sqrt(mean_squared_error(y_pred7, y_val)))
print(np.sqrt(mean_squared_error(y_pred9, y_val)))


1.62467374273
1.62670638015
1.62892115378
1.63174236757


In [180]:
np.max(y_pred3)

2.9640348262290703

In [181]:
np.max(y_pred5)

5.3130374692645743

In [182]:
np.max(y_pred9)

9.6075721497960007

In [183]:
np.min(y_pred9)

1.1630781011235689

In [184]:
np.min(y_val)

0.0

In [185]:
np.max(y_val)

29.0

In [186]:
pickle.dump(rr3, open(data_dir + '/rf.p', 'wb'))

In [188]:
rf = pickle.load(open(data_dir + '/rf.p', 'rb'))

In [190]:
y_pred3a = rf.predict(X_val)

In [197]:
models = {}
for category in range(2):
    transformer = Transformer1(category)
    A = transformer.fit_transform(sales_train)
    y = A['item_cnt']
    X = A.drop('item_cnt', axis=1)
    rr = RandomForestRegressor(n_estimators=300, max_depth=5, random_state=42)
    rr.fit(X, y)
    models[category] = rr

In [198]:
models

{0: RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
            max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=300, n_jobs=1, oob_score=False, random_state=42,
            verbose=0, warm_start=False),
 1: RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
            max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=300, n_jobs=1, oob_score=False, random_state=42,
            verbose=0, warm_start=False)}

In [199]:
transformer0 = Transformer1(0)
transformer1 = Transformer1(1)

In [200]:
A0 = transformer0.fit_transform(sales_train)
A1 = transformer1.fit_transform(sales_train)

In [201]:
y0 = A0['item_cnt']
X0 = A0.drop('item_cnt', axis=1)
y1 = A1['item_cnt']
X1 = A1.drop('item_cnt', axis=1)

In [202]:
y_pred0 = models[0].predict(X0)

In [203]:
y_pred1 = models[1].predict(X1)

In [204]:
pickle.dump(models, open(data_dir + '/models.p', 'wb'))

In [205]:
new_model = pickle.load( open(data_dir + '/models.p', 'rb'))

In [206]:
y_pred0n = new_model[0].predict(X0)
y_pred1n = new_model[1].predict(X1)

In [207]:
np.linalg.norm(y_pred0 - y_pred0n)

0.0