Developing a simple model based solely on features generated from item activation date. We separate this model since this data is available for all the training and test data.

In [5]:
import pandas as pd
import numpy as np
import gc
import matplotlib.pyplot as plt
import seaborn as sns
from utils import featurize_date_col, TpotAutoml
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

sns.set()
SEED = 13
%matplotlib inline
pd.options.mode.chained_assignment = None

In [6]:
train = pd.read_csv('data/train.csv', usecols=['item_id', 'deal_probability', 'activation_date'],
                   parse_dates=['activation_date'], infer_datetime_format=True)
test = pd.read_csv('data/test.csv', usecols=['item_id', 'activation_date'],
                  parse_dates=['activation_date'], infer_datetime_format=True)

In [7]:
train = train.rename(columns={'activation_date': 'item_activation_date'})
train = featurize_date_col(train, 'item_activation_date', remove_when_done=True)

In [8]:
train = train.set_index('item_id')

In [9]:
target = 'deal_probability'
SCORING = 'r2'
X = (train.drop(target, axis=1)).values
y = train[target].values
tss = TimeSeriesSplit(n_splits=4)
train_index, test_index = list(tss.split(X))[-1]
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

In [12]:
from utils import config_dict
base_model = RandomForestRegressor()
model = RandomizedSearchCV(estimator=base_model, random_state=SEED,
                          param_distributions=config_dict['sklearn.ensemble.RandomForestRegressor'],
                         n_iter=7,
                         scoring='r2',
                         cv=TimeSeriesSplit(n_splits=4),
                         verbose=2,
                         n_jobs=6)

In [13]:
model.fit(X_train, y_train)

Fitting 4 folds for each of 7 candidates, totalling 28 fits
[CV] n_estimators=200, min_samples_split=17, min_samples_leaf=1, max_features=0.5, bootstrap=False 
[CV] n_estimators=200, min_samples_split=17, min_samples_leaf=1, max_features=0.5, bootstrap=False 
[CV] n_estimators=200, min_samples_split=17, min_samples_leaf=1, max_features=0.5, bootstrap=False 
[CV] n_estimators=200, min_samples_split=17, min_samples_leaf=1, max_features=0.5, bootstrap=False 
[CV] n_estimators=200, min_samples_split=5, min_samples_leaf=17, max_features=0.45, bootstrap=False 
[CV] n_estimators=200, min_samples_split=5, min_samples_leaf=17, max_features=0.45, bootstrap=False 
[CV]  n_estimators=200, min_samples_split=17, min_samples_leaf=1, max_features=0.5, bootstrap=False, total=  18.7s
[CV] n_estimators=200, min_samples_split=5, min_samples_leaf=17, max_features=0.45, bootstrap=False 
[CV]  n_estimators=200, min_samples_split=5, min_samples_leaf=17, max_features=0.45, bootstrap=False, total=  18.9s
[CV] n

[Parallel(n_jobs=6)]: Done  28 out of  28 | elapsed:  5.1min finished


RandomizedSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=4),
          error_score='raise',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=7, n_jobs=6,
          param_distributions={'n_estimators': [100, 200, 400], 'max_features': array([0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 , 0.55,
       0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95, 1.  ]), 'min_samples_split': range(2, 21), 'min_samples_leaf': range(1, 21), 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=13, refit=True,
          return_train_score='warn', scoring='r2', verbose

In [14]:
feat_importance_entropy = list(zip(train.drop(target, axis=1).columns.values, model.best_estimator_.feature_importances_))

In [15]:
sorted(feat_importance_entropy, key=lambda x:x[1], reverse=True)

[('item_activation_date_yday', 0.9306328561719555),
 ('item_activation_date_wday', 0.06936714382804414),
 ('item_activation_date_isholiday', 0.0)]

In [16]:
model.score(X_test, y_test)

0.00019463016228937757

In [17]:
rmse = np.sqrt(np.mean((model.predict(X_test) - y_test) ** 2))
rmse

0.2604694388939306

### Featurize test data and predict

In [18]:
test = test.rename(columns={'activation_date': 'item_activation_date'})
test = featurize_date_col(test, 'item_activation_date', remove_when_done=True)

In [19]:
test = test.set_index('item_id')

In [20]:
test['deal_probability'] = model.predict(test)

In [21]:
results = test[['deal_probability']]

In [22]:
results.to_csv('predictions/activation_date.csv')

## Provide predictions for train data to be used in ensembling

In [27]:
train['deal_probability'] = model.predict(train.drop('deal_probability', axis=1))

In [29]:
train[['deal_probability']].to_csv('predictions/activation_date_train.csv')