In [1]:
import pandas as pd

import preprocessing as pp
import model
from train import train_model
from predict import predict

In [2]:
# Data loading
df_store = pd.read_csv('../data/store.csv')
df_train = pd.read_csv('../data/train.csv', low_memory=False)

## General pipeline test

### Model selection (Time-split cross-validation)

In [4]:
from sklearn.model_selection import TimeSeriesSplit
from train import rmse, r2

train = df_train.copy().iloc[::-1]
train.Date = pd.to_datetime(train.Date)

n_splits = 10
tscv = TimeSeriesSplit(n_splits=n_splits)

reg_model = model.Regressor()
rmse_scores = []
r2_scores = []

date_grouping = train.groupby(train.Date)['Store']
date_list = [g[0] for g in list(date_grouping)[:]]
for train_index, test_index in tscv.split(date_grouping):
    train_dates = [date_list[train_index[0]], date_list[train_index[-1]]]
    test_dates = [date_list[test_index[0]], date_list[test_index[-1]]]
    train_mask = (train.Date >= train_dates[0]) & (train.Date <= train_dates[1])
    test_mask = (train.Date >= test_dates[0]) & (train.Date <= test_dates[1])
    
    train.loc[test_mask]
    
    # Train and test sets
    X_train, y_train, X_PCA_train = pp.Preprocessor().transform(df_store, train.loc[train_mask])
    X_test, y_test, X_PCA_test = pp.Preprocessor().transform(df_store, train.loc[test_mask])
    
    # Dummy variables can induce differences in the schemas
    missing_test = set(X_train.columns) - set(X_test.columns)
    missing_train = set(X_test.columns) - set(X_train.columns)
    for c in missing_test:
        X_test[c] = 0
    for c in missing_train:
        X_train[c] = 0
    # Reorder to match columns order in train and test
    X_test = X_test[X_train.columns]
    
    # Model fitting on training set
    train_model(reg_model, X_train, y_train)

    # Scoring on test set
    y_pred = reg_model.predict(X_test)
    rmse_scores.append(rmse(y_test, y_pred))
    r2_scores.append(r2(y_test, y_pred))
        
# Final display
for i in range(n_splits):
    print("FOLD " + str(i + 1) + ": " + "RSME = " + str(rmse_scores[i]) + 
      " | R² = " + str(r2_scores[i]))

  df_store_bis = scale(df_store_bis)
  df_store_bis = scale(df_store_bis)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


TypeError: float() argument must be a string or a number, not 'Timestamp'

### Model selection (Held-out test set of the last 6 weeks)

In [None]:
from sklearn.model_selection import TimeSeriesSplit
from train import rmse, r2

train = df_train.copy().iloc[::-1]
train.Date = pd.to_datetime(train.Date)
train_set = train[train.Date < '2015-06-19']
test_set = train[train.Date >= '2015-06-19']

reg_model = model.Regressor()
rmse_scores = []
r2_scores = []

X_train, y_train, X_¨ = pp.Preprocessor().transform(df_store, train_set)
X_test, y_test = pp.Preprocessor().transform(df_store, test_set)

# Dummy variables can induce differences in the schemas
missing_test = set(X_train.columns) - set(X_test.columns)
missing_train = set(X_test.columns) - set(X_train.columns)
for c in missing_test:
    X_test[c] = 0
for c in missing_train:
    X_train[c] = 0
# Reorder to match columns order in train and test
X_test = X_test[X_train.columns]

# Model fitting on training set
train_model(reg_model, X_train, y_train)

# Scoring on test set
y_pred = reg_model.predict(X_test)
rmse_scores = rmse(y_test, y_pred)
r2_scores = r2(y_test, y_pred)

print("RSME = " + str(rmse_scores) + " | R² = " + str(r2_scores))

## Preprocessing tests

In [3]:
# Data loading
df_store = pd.read_csv('../data/store.csv')
df_train = pd.read_csv('../data/train.csv', low_memory=False)

In [4]:
# Preprocessing
X, y, X_PCA = pp.Preprocessor().transform(df_store, df_train)
print(X.shape, y.shape, X_PCA.shape)
X.head()

  df_store_bis = scale(df_store_bis)


(1017209, 22) (1017209,) (1017209, 13)


Unnamed: 0,DayOfWeek,Date,Open,Promo,SchoolHoliday,StateHoliday_a,StateHoliday_b,StateHoliday_c,cos_DayOfWeek,sin_DayOfWeek,...,"PromoInterval_Feb,May,Aug,Nov","PromoInterval_Jan,Apr,Jul,Oct","PromoInterval_Mar,Jun,Sept,Dec",StoreType_b,StoreType_c,StoreType_d,Assortment_b,Assortment_c,CompetitionOpenSince,Promo2Since
0,5,2015-07-31,1,1,1,0,0,0,-0.222521,-0.974928,...,0,0,0,0,1,0,0,0,105.0,648.142857
1,4,2015-07-30,1,1,1,0,0,0,-0.900969,-0.433884,...,0,0,0,0,1,0,0,0,105.0,648.142857
2,3,2015-07-29,1,1,1,0,0,0,-0.900969,0.433884,...,0,0,0,0,1,0,0,0,105.0,648.142857
3,2,2015-07-28,1,1,1,0,0,0,-0.222521,0.974928,...,0,0,0,0,1,0,0,0,105.0,648.142857
4,1,2015-07-27,1,1,1,0,0,0,0.62349,0.781831,...,0,0,0,0,1,0,0,0,105.0,648.142857


In [None]:
reg_model = model.Regressor()
train_model(reg_model, X, y)

  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


In [None]:
print(reg_model)