In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from utils import TpotAutoml
from sklearn.model_selection import TimeSeriesSplit

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

%matplotlib inline
pd.options.mode.chained_assignment = None

In [None]:
# User-tunable settings.
SEED = 13
np.random.seed(SEED)

# Fraction of rows to keep when loading the feature table.
# 1.0 keeps every row; a smaller value such as 0.05 keeps roughly 5%.
KEEP_ROWS_FRAC = 1.0


def skiprows_func(i):
    """Decide whether row number ``i`` should be skipped while reading a CSV.

    Row 0 is never skipped. Every other row is skipped when a fresh uniform
    draw from the global NumPy RNG exceeds ``KEEP_ROWS_FRAC``.
    """
    return i > 0 and np.random.rand() > KEEP_ROWS_FRAC

In [None]:
# Columns read from the feature table: the two merge keys plus the
# tabular features used for modelling.
usecols = [
    'item_id',
    'activation_date',
    'city',
    'region',
    'parent_category_name',
    'category_name',
    'item_seq_number',
    'user_type',
    'price_norm',
]

In [None]:
# Load the feature table, restricted to `usecols`; `skiprows_func`
# optionally subsamples rows while reading.
all_samples = pd.read_csv(
    'data/all_samples_no_nulls.csv',
    usecols=usecols,
    skiprows=skiprows_func,
)

In [None]:
# Load only the merge keys and the regression target from the training file.
train = pd.read_csv(
    'data/train.csv',
    usecols=['item_id', 'activation_date', 'deal_probability'],
)

In [None]:
# Preview the first rows of the loaded feature table.
all_samples.head()

In [None]:
# Attach the tabular features to each training row. A left join keeps
# every training row, including those with no match in `all_samples`.
train = pd.merge(
    train,
    all_samples,
    how='left',
    on=['item_id', 'activation_date'],
)

In [None]:
# Discard every row that contains at least one missing value.
train = train.dropna()

In [None]:
# Order rows chronologically (needed by the time-series split below), then
# drop the date column and index the frame by item id.
train = (
    train
    .sort_values(by='activation_date')
    .drop('activation_date', axis=1)
    .set_index('item_id')
)

In [None]:
# One-hot encode the remaining categorical columns of the training frame.
train = pd.get_dummies(train)

In [None]:
# ---- Modelling setup ----
target = 'deal_probability'
TIMEOUT_MINS = None
SCORING = 'r2'

# Plain NumPy feature matrix and target vector.
X = train.drop(target, axis=1).values
y = train[target].values

# Hold-out split: take only the last fold of a 4-way time-series split,
# so the evaluation rows come after all training rows in row order.
tss = TimeSeriesSplit(n_splits=4)
splits = list(tss.split(X))
train_index, test_index = splits[-1]
X_train, y_train = X[train_index], y[train_index]
X_test, y_test = X[test_index], y[test_index]

# Hyper-parameter search space for gradient boosting.
gbr_search_space = {
    'n_estimators': [100, 200, 400],
    'loss': ["ls", "lad", "huber", "quantile"],
    'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.],
    'max_depth': range(1, 11),
    'min_samples_split': range(2, 21),
    'min_samples_leaf': range(1, 21),
    'subsample': np.arange(0.05, 1.01, 0.05),
    'max_features': np.arange(0.05, 1.01, 0.05),
    'alpha': [0.75, 0.8, 0.85, 0.9, 0.95, 0.99],
}

# Hyper-parameter search space for random forests.
rf_search_space = {
    'n_estimators': [100, 200, 400],
    'max_features': np.arange(0.05, 1.01, 0.05),
    'min_samples_split': range(2, 21),
    'min_samples_leaf': range(1, 21),
    'bootstrap': [True, False],
}

# Search spaces keyed by the estimator's import path.
config_dict = {
    'sklearn.ensemble.GradientBoostingRegressor': gbr_search_space,
    'sklearn.ensemble.RandomForestRegressor': rf_search_space,
}

# AutoML wrapper restricted to the two estimators above; its inner
# cross-validation is a 3-fold time-series split.
tpot = TpotAutoml(
    mode='regression',
    max_time_mins=TIMEOUT_MINS,
    generations=1,
    population_size=1,
    scoring=SCORING,
    random_state=SEED,
    n_jobs=1,
    verbosity=2,
    cv=TimeSeriesSplit(n_splits=3),
    config_dict=config_dict,
)

In [None]:
# Base estimator for the optional randomized search sketched below.
# base_model = GradientBoostingRegressor()
base_model = RandomForestRegressor()

# Optional hyper-parameter search (kept disabled). Uncomment to replace the
# fixed model below; `SEED` is the notebook-wide random seed.
# tpot = RandomizedSearchCV(estimator=base_model, random_state=SEED,
# #                           param_distributions=config_dict['sklearn.ensemble.GradientBoostingRegressor'],
#                           param_distributions=config_dict['sklearn.ensemble.RandomForestRegressor'],
#                          n_iter=15,
#                          scoring='r2',
#                          cv=TimeSeriesSplit(n_splits=4),
#                          verbose=2,
#                          n_jobs=4)

# Final model: a random forest with hand-fixed hyper-parameters
# (presumably taken from an earlier search run -- TODO confirm provenance).
# Only non-default arguments are passed. The previous call also spelled out
# library defaults such as criterion='mse' and min_impurity_split=None, which
# newer scikit-learn releases no longer accept.
# random_state is tied to SEED so that refitting reproduces the same forest.
tpot = RandomForestRegressor(
    bootstrap=False,
    max_features=0.2,
    min_samples_leaf=20,
    min_samples_split=4,
    n_estimators=200,
    n_jobs=6,
    random_state=SEED,
    verbose=1,
)

In [None]:
# Fit the model on the training portion of the time-ordered split.
tpot.fit(X_train, y_train)

In [None]:
# Pair each feature column name with the fitted model's importance score.
# (When `tpot` is a search wrapper, read the scores from
# `tpot.best_estimator_.feature_importances_` instead.)
feature_names = train.drop(target, axis=1).columns.values
feat_importance_entropy = list(zip(feature_names, tpot.feature_importances_))

In [None]:
# Ten (feature, importance) pairs with the largest importance, descending.
sorted(feat_importance_entropy, key=lambda pair: pair[1], reverse=True)[:10]

In [None]:
# Score the fitted model on the hold-out split using the estimator's own
# `score` method (R^2 for scikit-learn regressors).
test_score = tpot.score(X_test, y_test)
print(test_score)

In [None]:
# Root-mean-squared error of the hold-out predictions.
residuals = tpot.predict(X_test) - y_test
rmse = np.sqrt(np.mean(residuals ** 2))
rmse

In [None]:
# Predict once and reuse the result: the previous version ran inference on
# the whole hold-out set three separate times.
test_predictions = tpot.predict(X_test)

# Distribution of predicted values (log-scaled counts).
plt.hist(test_predictions, log=True, bins=100)
plt.xlabel('predicted deal_probability')
plt.ylabel('count (log scale)')

# Compare the mean prediction with the mean of the true targets.
print(test_predictions.mean())
print(y_test.mean())

In [None]:
from analysis import Analysis

# Wrap the fitted model and both data splits in the project's Analysis helper.
# `random_state` now uses the notebook-wide SEED; it previously referenced
# `RS`, a name that is never defined, and raised NameError on a fresh kernel.
ea = Analysis(tpot, X_train, y_train, X_test, y_test,
              mode='regression', target=target,
              features=train.drop(target, axis=1).columns,
              test_samples_index=test_index, random_state=SEED)

In [None]:
# Ask the Analysis helper for sorted feature importances; the trailing
# semicolon suppresses the cell output.
# NOTE(review): presumably this populates `ea.feature_importance`, which the
# next cell reads -- confirm against analysis.Analysis.
ea.get_feature_importance(sort=True);

In [None]:
# Show the first ten entries of the helper's feature-importance mapping.
list(ea.feature_importance.items())[:10]

In [None]:
# Display the fitted estimator. Search wrappers (e.g. RandomizedSearchCV)
# expose the winning model as `best_estimator_`; a plain estimator has no
# such attribute, so fall back to the model itself instead of raising
# AttributeError.
getattr(tpot, 'best_estimator_', tpot)

In [None]:
# Load only the merge keys of the rows to be scored, then report the row count.
test = pd.read_csv(
    'data/test.csv',
    usecols=['item_id', 'activation_date'],
)
len(test)

In [None]:
# Attach the tabular features to each test row with a left join, then show
# the row count after the merge.
test = pd.merge(
    test,
    all_samples,
    how='left',
    on=['item_id', 'activation_date'],
)
len(test)

In [None]:
# Same reshaping as the training frame: order by date, drop the date column,
# and index by item id.
test = (
    test
    .sort_values(by='activation_date')
    .drop('activation_date', axis=1)
    .set_index('item_id')
)

In [None]:
# One-hot encode the categorical columns of the test frame.
# NOTE(review): the resulting columns depend on the categories present in
# the test data and may not match the training feature columns.
test = pd.get_dummies(test)

In [None]:
# Align the encoded test frame with the exact feature columns (and order)
# the model was fitted on. One-hot encoding train and test separately can
# yield different column sets. Categories unseen in test become all-zero
# columns, and columns the model never saw are dropped -- including a
# `deal_probability` column left over from a previous run of this cell.
feature_columns = train.drop(target, axis=1).columns
test_features = test.reindex(columns=feature_columns, fill_value=0)

test['deal_probability'] = tpot.predict(test_features.values)

In [None]:
import pickle

# Persist the fitted model and immediately reload it as a round-trip check.
# The path is built from `model_name`; the previous code formatted it with
# `rf_tabular`, an undefined name, and raised NameError.
model_name = 'rf_tabular'
model_path = 'predictions/{}.pickle'.format(model_name)

# Context managers make sure the file handles are flushed and closed.
with open(model_path, 'wb') as model_file:
    pickle.dump(tpot, model_file)

# Unpickling executes arbitrary code: only load files this notebook produced.
with open(model_path, 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
# Sanity check: score the reloaded (unpickled) model on the hold-out split.
test_score = model.score(X_test, y_test)
print(test_score)

In [None]:
# Display the predicted target column for the test rows.
test['deal_probability']