In [5]:
import pandas as pd
import numpy as np

import preprocessing as pp
import model
from train import train_model
from predict import predict

# Disable pandas' chained-assignment (SettingWithCopy) check for the whole notebook,
# so column assignments on sliced frames in the cells below raise no warning.
pd.set_option('mode.chained_assignment', None)

In [6]:
# Read the store table and the training table from the data directory
df_store, df_train = (
    pd.read_csv('../data/store.csv'),
    pd.read_csv('../data/train.csv', low_memory=False),
)

## General pipeline test

### Model selection (Time-split cross-validation)

In [7]:
from sklearn.model_selection import TimeSeriesSplit
from train import rmse, r2

# Work on a reversed, independent copy so df_train itself is never modified
# (the reversal presumably puts the rows in chronological order - TODO confirm).
train = df_train.iloc[::-1].copy()
train['Date'] = pd.to_datetime(train['Date'])

n_splits = 10
test_size = 42  # number of days scored in every fold
tscv = TimeSeriesSplit(n_splits=n_splits)

# NOTE(review): the same Regressor instance is re-trained on every fold; confirm
# that train_model() starts from scratch rather than continuing from the previous fold.
reg_model = model.Regressor()
rmse_scores = []
r2_scores = []

# One group per calendar day; groupby sorts its keys, so date_list is chronological.
# The folds are therefore built over days, not over individual rows.
date_grouping = train.groupby(train.Date)['Store']
date_list = [day for day, _ in date_grouping]
for train_index, test_index in tscv.split(date_grouping):

    # Fixed test set cardinality (in number of days): only the last `test_size`
    # days of the fold are scored, every earlier day goes to the training set.
    # The split point is clamped to the start of the fold so that a fold holding
    # `test_size` days or fewer is used as-is. The index arrays are rebuilt with
    # np.arange so they always stay integers (appending an empty range with
    # np.append yields floats, which cannot index date_list).
    split_point = max(int(test_index[0]), int(test_index[-1]) + 1 - test_size)
    train_index = np.arange(train_index[0], split_point)
    test_index = np.arange(split_point, test_index[-1] + 1)

    # Translate the day indices into inclusive date ranges and row masks
    train_dates = [date_list[train_index[0]], date_list[train_index[-1]]]
    test_dates = [date_list[test_index[0]], date_list[test_index[-1]]]
    train_mask = (train.Date >= train_dates[0]) & (train.Date <= train_dates[1])
    test_mask = (train.Date >= test_dates[0]) & (train.Date <= test_dates[1])

    # Train and test sets (each one goes through its own Preprocessor instance)
    X_train, y_train, X_PCA_train = pp.Preprocessor().transform(df_store, train.loc[train_mask])
    X_test, y_test, X_PCA_test = pp.Preprocessor().transform(df_store, train.loc[test_mask])

    # Dummy variables can induce differences in the schemas:
    # add every column that exists on one side only, filled with zeros
    missing_test = set(X_train.columns) - set(X_test.columns)
    missing_train = set(X_test.columns) - set(X_train.columns)
    for c in missing_test:
        X_test[c] = 0
    for c in missing_train:
        X_train[c] = 0
    # Reorder to match columns order in train and test
    X_test = X_test[X_train.columns]

    # Model fitting on training set
    train_model(reg_model, X_train, y_train)

    # Scoring on test set
    y_pred = reg_model.predict(X_test)
    rmse_scores.append(rmse(y_test, y_pred))
    r2_scores.append(r2(y_test, y_pred))

# Final display: one line per fold, in fold order
for fold, (fold_rmse, fold_r2) in enumerate(zip(rmse_scores, r2_scores), start=1):
    print("FOLD " + str(fold) + ": " + "RSME = " + str(fold_rmse) +
          " | R² = " + str(fold_r2))

# Overall scores: weighted average whose weights grow linearly with the fold
# number (1.5, 2.0, ...), so the later folds count more than the early ones
w = [1 + 0.5 * i for i in range(1, n_splits + 1)]
print("--- OVERALL ---")
print("RSME = " + '{0:.2f}'.format(np.average(rmse_scores, weights=w)) + " | R² = " + '{0:.2f}'.format(np.average(r2_scores, weights=w)))

  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)


FOLD 1: RSME = 1231.3391854777265 | R² = 0.8216264570573546
FOLD 2: RSME = 1286.9352966696458 | R² = 0.8050295593719106
FOLD 3: RSME = 1629.850235073431 | R² = 0.7282426717737905
FOLD 4: RSME = 1294.3491347753384 | R² = 0.8059383491898818
FOLD 5: RSME = 1553.8094595255377 | R² = 0.7380040545139812
FOLD 6: RSME = 1551.0148798335006 | R² = 0.7204446268154161
FOLD 7: RSME = 1479.35319217069 | R² = 0.7256451998099787
FOLD 8: RSME = 1415.9167252460059 | R² = 0.7548224752943464
FOLD 9: RSME = 1965.437493636001 | R² = 0.6544917914087907
FOLD 10: RSME = 1724.6604317543934 | R² = 0.6812301906237108
--- OVERALL ---
RSME = 1571.08 | R² = 0.73


  Xt = transform.transform(Xt)


### Model selection (Held-out test set of the last 6 weeks)

In [8]:
from sklearn.model_selection import TimeSeriesSplit
from train import rmse, r2

# Work on a reversed, independent copy so df_train itself is never modified
train = df_train.iloc[::-1].copy()
train['Date'] = pd.to_datetime(train['Date'])

# First day of the held-out period: rows strictly before it are used for fitting,
# rows from that day onwards are used for scoring
held_out_start = '2015-06-19'
train_set = train[train.Date < held_out_start]
test_set = train[train.Date >= held_out_start]

reg_model = model.Regressor()

# Each set goes through its own Preprocessor instance
X_train, y_train, X_PCA_train = pp.Preprocessor().transform(df_store, train_set)
X_test, y_test, X_PCA_test = pp.Preprocessor().transform(df_store, test_set)

# Dummy variables can induce differences in the schemas:
# add every column that exists on one side only, filled with zeros
missing_test = set(X_train.columns) - set(X_test.columns)
missing_train = set(X_test.columns) - set(X_train.columns)
for c in missing_test:
    X_test[c] = 0
for c in missing_train:
    X_train[c] = 0
# Reorder to match columns order in train and test
X_test = X_test[X_train.columns]

# Model fitting on training set
train_model(reg_model, X_train, y_train)

# Scoring on test set (a single split, so each name holds one score, not a list)
y_pred = reg_model.predict(X_test)
rmse_scores = rmse(y_test, y_pred)
r2_scores = r2(y_test, y_pred)

print("RSME = " + str(rmse_scores) + " | R² = " + str(r2_scores))



RSME = 1712.404206894745 | R² = 0.6835392522629897


  Xt = transform.transform(Xt)


### Model selection (Algorithm comparison: linear regression, XGBoost, random forest)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

# Candidate algorithms, each compared with its library default hyper-parameters.
# NOTE(review): RandomForestRegressor is not seeded, so its scores vary between runs.
models = [
    ('LR', LinearRegression()),
    ('XGB', XGBRegressor()),
    ('RF', RandomForestRegressor()),
]

std = StandardScaler()

results = []  # one list of per-fold R² scores per algorithm
names = []    # algorithm labels, in the same order as `results`

# Work on a reversed, independent copy so df_train itself is never modified
train = df_train.iloc[::-1].copy()
train['Date'] = pd.to_datetime(train['Date'])

n_splits = 5
test_size = 42  # number of days scored in every fold
# TimeSeriesSplit and r2 are the names imported by the cross-validation cells above
tscv = TimeSeriesSplit(n_splits=n_splits)

# One group per calendar day; groupby sorts its keys, so date_list is chronological
date_grouping = train.groupby(train.Date)['Store']
date_list = [day for day, _ in date_grouping]

# The loop variable must not be called `model`: that name is the project module
# imported at the top of the notebook and is still needed for model.Regressor().
for name, estimator in models:
    r2_scores = []

    for train_index, test_index in tscv.split(date_grouping):
        # Fixed test set cardinality (in number of days): only the last `test_size`
        # days of the fold are scored, every earlier day goes to the training set.
        # The split point is clamped to the start of the fold, and the index arrays
        # are rebuilt with np.arange so they always stay integers.
        split_point = max(int(test_index[0]), int(test_index[-1]) + 1 - test_size)
        train_index = np.arange(train_index[0], split_point)
        test_index = np.arange(split_point, test_index[-1] + 1)

        # Translate the day indices into inclusive date ranges and row masks
        train_dates = [date_list[train_index[0]], date_list[train_index[-1]]]
        test_dates = [date_list[test_index[0]], date_list[test_index[-1]]]
        train_mask = (train.Date >= train_dates[0]) & (train.Date <= train_dates[1])
        test_mask = (train.Date >= test_dates[0]) & (train.Date <= test_dates[1])

        # Train and test sets (preprocessed once per fold)
        X_train, y_train, X_PCA_train = pp.Preprocessor().transform(df_store, train.loc[train_mask])
        X_test, y_test, X_PCA_test = pp.Preprocessor().transform(df_store, train.loc[test_mask])

        # Dummy variables can induce differences in the schemas:
        # add every column that exists on one side only, filled with zeros
        missing_test = set(X_train.columns) - set(X_test.columns)
        missing_train = set(X_test.columns) - set(X_train.columns)
        for c in missing_test:
            X_test[c] = 0
        for c in missing_train:
            X_train[c] = 0
        # Reorder to match columns order in train and test
        X_test = X_test[X_train.columns]

        # Standardise every column except 'Date'; the scaler is fitted on the
        # training fold only and then applied unchanged to the test fold
        X_train = std.fit_transform(X_train.loc[:, X_train.columns != 'Date'])
        X_test = std.transform(X_test.loc[:, X_test.columns != 'Date'])

        # Model fitting on training set
        estimator.fit(X_train, y_train)

        # Scoring on test set
        y_pred = estimator.predict(X_test)
        r2_scores.append(r2(y_test, y_pred))

    results.append(r2_scores)
    names.append(name)
    # Mean and standard deviation of the R² over the folds
    msg = "%s: %f (%f)" % (name, np.mean(r2_scores), np.std(r2_scores))
    print(msg)



LR: 0.224845 (0.041672)




In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# One box per algorithm, drawn from the per-fold R² scores gathered in `results`
fig, ax = plt.subplots(figsize=(15, 10))
fig.suptitle('Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

## Preprocessing tests

In [17]:
# Read the store table and the training table again for the preprocessing tests
df_store, df_train = (
    pd.read_csv('../data/store.csv'),
    pd.read_csv('../data/train.csv', low_memory=False),
)

In [18]:
# Preprocessing
X, y, X_PCA = pp.Preprocessor().transform(df_store, df_train)
print(X.shape, y.shape, X_PCA.shape)
X.head()

(844392, 22) (844392,) (844392, 13)


Unnamed: 0,DayOfWeek,Date,Open,Promo,SchoolHoliday,StateHoliday_a,StateHoliday_b,StateHoliday_c,cos_DayOfWeek,sin_DayOfWeek,...,"PromoInterval_Feb,May,Aug,Nov","PromoInterval_Jan,Apr,Jul,Oct","PromoInterval_Mar,Jun,Sept,Dec",StoreType_b,StoreType_c,StoreType_d,Assortment_b,Assortment_c,CompetitionOpenSince,Promo2Since
0,5,2015-07-31,1,1,1,0,0,0,-0.222521,-0.974928,...,0,0,0,0,1,0,0,0,105.0,648.142857
1,4,2015-07-30,1,1,1,0,0,0,-0.900969,-0.433884,...,0,0,0,0,1,0,0,0,105.0,648.142857
2,3,2015-07-29,1,1,1,0,0,0,-0.900969,0.433884,...,0,0,0,0,1,0,0,0,105.0,648.142857
3,2,2015-07-28,1,1,1,0,0,0,-0.222521,0.974928,...,0,0,0,0,1,0,0,0,105.0,648.142857
4,1,2015-07-27,1,1,1,0,0,0,0.62349,0.781831,...,0,0,0,0,1,0,0,0,105.0,648.142857


In [20]:
# Build a fresh project Regressor and hand it, together with the features X and
# targets y produced by the preprocessing cell, to train_model.
# NOTE(review): train_model is project code; presumably it fits reg_model in place
# (its return value is discarded here) - confirm against train.py.
reg_model = model.Regressor()
train_model(reg_model, X, y)

In [21]:
# Show the text representation of the regressor after the training call
print(reg_model)

Regressor()
