This notebook develops a simple model based solely on features generated from the item activation date. We build it as a separate model because the activation date is available for every row of both the training and the test data.

In [70]:
import pandas as pd
import numpy as np
import gc
import matplotlib.pyplot as plt
import seaborn as sns
from utils import featurize_date_col, TpotAutoml
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

# Apply seaborn's default plot styling to matplotlib figures.
sns.set()
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Turn off pandas' chained-assignment warning for the rest of the notebook.
pd.options.mode.chained_assignment = None

In [39]:
# Both files are read with the same date handling; only the column subset differs
# (the target `deal_probability` is requested for the training file only).
date_kwargs = dict(parse_dates=['activation_date'], infer_datetime_format=True)

train = pd.read_csv(
    'data/train.csv',
    usecols=['item_id', 'deal_probability', 'activation_date'],
    **date_kwargs,
)
test = pd.read_csv(
    'data/test.csv',
    usecols=['item_id', 'activation_date'],
    **date_kwargs,
)

In [45]:
# Rename the date column, then expand it into features with the project helper.
# remove_when_done=True is passed so the helper drops the raw date column afterwards
# (presumably -- the helper lives in utils and is not visible here).
train = featurize_date_col(
    train.rename(columns={'activation_date': 'item_activation_date'}),
    'item_activation_date',
    remove_when_done=True,
)

In [53]:
# Move item_id into the index so it is not picked up as a feature column
# when the feature matrix is built from the remaining columns.
train = train.set_index('item_id')

In [54]:
target = 'deal_probability'
SCORING = 'r2'

# Plain numpy arrays: y is the target column, X is every other column.
y = train[target].values
X = train.drop(columns=[target]).values

# Use the last of the 4 time-series folds as the hold-out split.
# NOTE(review): TimeSeriesSplit splits by row position, so this assumes the rows
# of `train` are in chronological order -- no sort is visible here; confirm.
tss = TimeSeriesSplit(n_splits=4)
folds = list(tss.split(X))
train_index, test_index = folds[-1]

X_train, y_train = X[train_index], y[train_index]
X_test, y_test = X[test_index], y[test_index]

In [68]:
# Hyper-parameter search space for the random forest, keyed by the estimator's
# import path so the same dict layout can hold several estimators.
rf_param_space = dict(
    n_estimators=[100, 200, 400],
    max_features=np.arange(0.05, 1.01, 0.05),
    min_samples_split=range(2, 21),
    min_samples_leaf=range(1, 21),
    bootstrap=[True, False],
)
config_dict = {'sklearn.ensemble.RandomForestRegressor': rf_param_space}

In [71]:
# Fixed seed for reproducibility. SEED was referenced below but never defined in
# this notebook, so the cell raised NameError on a fresh kernel.
SEED = 42

# Seed the forest as well as the search: otherwise the same sampled
# hyper-parameters still give different fits from run to run.
base_model = RandomForestRegressor(random_state=SEED)

# Randomized search over the random-forest space in config_dict, scored with the
# notebook-wide SCORING metric and validated with time-ordered folds.
model = RandomizedSearchCV(
    estimator=base_model,
    param_distributions=config_dict['sklearn.ensemble.RandomForestRegressor'],
    n_iter=10,
    scoring=SCORING,
    cv=TimeSeriesSplit(n_splits=4),
    verbose=1,
    n_jobs=1,
    random_state=SEED,
)

In [None]:
# Run the randomized hyper-parameter search on the training part of the split.
model.fit(X_train, y_train)

Fitting 4 folds for each of 10 candidates, totalling 40 fits


In [None]:
# Pair each feature column with its importance. RandomizedSearchCV itself has no
# feature_importances_ attribute; the values live on the refit best estimator.
feature_names = train.drop(target, axis=1).columns.values
feat_importance_entropy = list(zip(feature_names, model.best_estimator_.feature_importances_))

In [None]:
# Show (feature, importance) pairs, most important first.
sorted(
    feat_importance_entropy,
    key=lambda pair: pair[1],
    reverse=True,
)

In [None]:
# Evaluate the fitted search on the held-out split (the search was configured
# with scoring='r2').
model.score(X_test, y_test)

In [None]:
# Root-mean-squared error of the predictions on the held-out split.
residuals = model.predict(X_test) - y_test
rmse = np.sqrt((residuals ** 2).mean())
rmse

### Featurize test data and predict

In [None]:
# Apply the same date featurization to the test frame as was applied to train.
# The original passed `train` to featurize_date_col here, which replaced the test
# set with training rows instead of featurizing the test set.
test = test.rename(columns={'activation_date': 'item_activation_date'})
test = featurize_date_col(test, 'item_activation_date', remove_when_done=True)

In [None]:
# Move item_id into the index so that it is not passed to the model as a column.
test = test.set_index('item_id')

In [None]:
# Predict from exactly the columns the model was trained on, in the same order,
# and as a plain array (the model was fit on `.values`). Selecting the feature
# columns explicitly also keeps this cell safe to re-run: the deal_probability
# column added below is never fed back into the model.
feature_cols = train.drop(target, axis=1).columns
test['deal_probability'] = model.predict(test[feature_cols].values)

In [None]:
# Preview the first rows of the test frame, including the predicted deal_probability.
test.head()