In [1]:
import gc
import os
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
from matplotlib_venn import venn2, venn2_circles # requires pip install matplotlib_venn
from nltk.corpus import stopwords
from sklearn.model_selection import TimeSeriesSplit
from tqdm import tqdm
# import lightgbm as lgb

from utils import featurize_date_col, TpotAutoml

sns.set()
%matplotlib inline
pd.options.mode.chained_assignment = None

In [2]:
# User-tunable settings.
RS = 13  # random seed
np.random.seed(RS)
KEEP_ROWS_FRAC = 1.0  # fraction of CSV rows to keep; set to 1 to keep all rows


def skiprows_func(i):
    """Decide whether row index `i` should be skipped when reading a CSV.

    Row index 0 is never skipped (presumably the header line -- confirm against
    how pandas numbers rows for a `skiprows` callable). Every other row is
    skipped when a fresh uniform draw exceeds KEEP_ROWS_FRAC, so roughly
    KEEP_ROWS_FRAC of the rows are kept.
    """
    return i > 0 and np.random.rand() > KEEP_ROWS_FRAC

In [3]:
# Load the aggregated periods features (joined to the ads on 'user_id' later);
# skiprows_func optionally subsamples the rows.
periods_aggregate = pd.read_csv(
    'data/periods_aggregate_features.csv',
    skiprows=skiprows_func,
)

In [4]:
# Only the join key, the target and the activation date are read from the training set.
train = pd.read_csv(
    'data/train.csv',
    usecols=['user_id', 'deal_probability', 'activation_date'],
    parse_dates=['activation_date'],
    infer_datetime_format=True,
)

We use only the aggregated periods data here, to see which features are the most important.

In [5]:
# Left join: every training row is kept; rows whose user_id has no match in
# periods_aggregate get NaN in the periods columns.
all_train = pd.merge(train, periods_aggregate, how='left', on='user_id')

In [6]:
# all_train.isna().sum() / len(all_train)
# Keep only the rows that have no missing value in any column.
all_train = all_train.dropna(axis=0, how='any')

In [7]:
# Order rows chronologically so the TimeSeriesSplit used later splits along time.
all_train = all_train.sort_values('activation_date')

In [8]:
# all_train = all_train.drop('activation_date', axis=1)
# Rename the date column, then let featurize_date_col derive features from it;
# remove_when_done=True presumably drops the raw date column afterwards (see utils).
all_train = featurize_date_col(
    all_train.rename(columns={'activation_date': 'item_activation_date'}),
    'item_activation_date',
    remove_when_done=True,
)

In [9]:
# Move user_id into the index so it is not part of the feature columns.
all_train = all_train.set_index(keys='user_id')

In [None]:
#******** NOTE ***** if you use this scorer, you would also need to transform the predict() results the same way
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import make_scorer
def score_func(y_true, y_pred):
    """Negative RMSE between `y_true` and a squashed version of `y_pred`.

    The predictions are standardised (zero mean, unit std, with a small epsilon
    guarding a zero std), passed through a logistic sigmoid, min-max rescaled
    to [0, 1] and only then compared with `y_true`.

    Returns the RMSE with its sign flipped, so that larger values are better.
    """
    preds = np.array(y_pred)
    standardized = (preds - preds.mean()) / (preds.std()+1e-6)
    squashed = 1 / (1 + np.exp(-standardized))
    # MinMaxScaler expects a 2-D column, hence the reshape and the later flatten.
    column = squashed.reshape((len(squashed), 1))
    rescaled = MinMaxScaler(feature_range=(0, 1)).fit_transform(column)
    errors = rescaled.flatten() - y_true
    return -np.sqrt(np.mean(errors ** 2))


# The function already returns a "higher is better" value, so no sign flip is requested.
scorer = make_scorer(score_func, greater_is_better=True)

In [10]:
target = 'deal_probability'
TIMEOUT_MINS = None
# SCORING = 'neg_mean_squared_error'
# SCORING = scorer
SCORING = 'r2'
# NOTE: this replaces the seed set in the user-inputs cell for everything below.
RS=27

# Features and target must come from the same frame. all_train has been merged,
# NaN-filtered and sorted by date, so the rows of the raw `train` frame no longer
# line up with it (different order and, in general, a different length).
X = (all_train.drop(target, axis=1)).values
y = all_train[target].values

# Hold out the last (most recent) fold of a 4-way time-series split as the test set;
# tss.split(X) is a generator of (train_index, test_index) pairs.
tss = TimeSeriesSplit(n_splits=4)
train_index, test_index = list(tss.split(X))[-1]
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

# Hyper-parameter search space for the gradient-boosting regressor.
# NOTE(review): "ls" and "lad" are the legacy loss names; recent scikit-learn
# releases call them "squared_error" and "absolute_error" -- confirm against the
# installed version. `alpha` is only used by the "huber" and "quantile" losses.
config_dict = {'sklearn.ensemble.GradientBoostingRegressor': {
        'n_estimators': [100, 200, 400],
        'loss': ["ls", "lad", "huber", "quantile"],
        'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.],
        'max_depth': range(1, 11),
        'min_samples_split': range(2, 21),
        'min_samples_leaf': range(1, 21),
        'subsample': np.arange(0.05, 1.01, 0.05),
        'max_features': np.arange(0.05, 1.01, 0.05),
        'alpha': [0.75, 0.8, 0.85, 0.9, 0.95, 0.99]
    }}


# AutoML search restricted to the config above, cross-validated along time.
tpot = TpotAutoml(mode='regression',
                  max_time_mins=TIMEOUT_MINS,
                  generations = 1, population_size=1,
                  scoring=SCORING,
                  random_state=RS,
                  n_jobs=1,
                  verbosity=2,
                  cv=TimeSeriesSplit(n_splits=3),
                  config_dict=config_dict,
                 )

In [12]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV

# Seed the estimator as well as the search: with subsample / max_features below 1
# each fit is stochastic, and RandomizedSearchCV's random_state only controls
# which parameter combinations are sampled.
gbr = GradientBoostingRegressor(random_state=RS)

# NOTE: this rebinds `tpot`, so the cells below use the randomized search instead
# of the TpotAutoml object created earlier.
tpot = RandomizedSearchCV(estimator=gbr, random_state=RS,
                          param_distributions=config_dict['sklearn.ensemble.GradientBoostingRegressor'],
                          n_iter=50,
                          scoring='r2',
                          cv=TimeSeriesSplit(n_splits=4),
#                           cv=2,
                          verbose=2)

In [None]:
# Fit the search object on the training part of the hold-out split.
# `tpot` is whichever object was bound last (TpotAutoml or RandomizedSearchCV).
tpot.fit(X_train, y_train)

Fitting 4 folds for each of 50 candidates, totalling 200 fits
[CV] subsample=0.6500000000000001, n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=0.35000000000000003, max_depth=9, loss=ls, learning_rate=0.001, alpha=0.9 
[CV]  subsample=0.6500000000000001, n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=0.35000000000000003, max_depth=9, loss=ls, learning_rate=0.001, alpha=0.9, total= 5.6min
[CV] subsample=0.6500000000000001, n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=0.35000000000000003, max_depth=9, loss=ls, learning_rate=0.001, alpha=0.9 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  5.6min remaining:    0.0s


[CV]  subsample=0.6500000000000001, n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=0.35000000000000003, max_depth=9, loss=ls, learning_rate=0.001, alpha=0.9, total=13.1min
[CV] subsample=0.6500000000000001, n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=0.35000000000000003, max_depth=9, loss=ls, learning_rate=0.001, alpha=0.9 
[CV]  subsample=0.6500000000000001, n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=0.35000000000000003, max_depth=9, loss=ls, learning_rate=0.001, alpha=0.9, total=21.4min
[CV] subsample=0.6500000000000001, n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=0.35000000000000003, max_depth=9, loss=ls, learning_rate=0.001, alpha=0.9 
[CV]  subsample=0.6500000000000001, n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=0.35000000000000003, max_depth=9, loss=ls, learning_rate=0.001, alpha=0.9, total=32.4min
[CV] subsample=0.9000000000000001, n_estimato

In [None]:
# The commented-out lines use TpotAutoml-specific helpers (get_top_models, top_models);
# presumably they are disabled because `tpot` may be a RandomizedSearchCV -- confirm.
# top_scores = tpot.get_top_models(return_scores=True)
# print('\ntop cv scores:')
# print(top_scores)
# print('\ntop models')
# print(tpot.top_models)
# print('\nthe best test score:')
# Score the fitted search object on the held-out (most recent) fold.
test_score = tpot.score(X_test, y_test)
print(test_score)

In [None]:
# Root-mean-squared error of the predictions on the held-out fold.
test_residuals = tpot.predict(X_test) - y_test
rmse = np.sqrt(np.mean(np.square(test_residuals)))
rmse

In [None]:
# Predict once and reuse the result for both the histogram and the mean.
test_preds = tpot.predict(X_test)

# Log-scaled counts make the sparse tail of the prediction distribution visible.
plt.hist(test_preds, log=True, bins=100)
plt.xlabel('predicted deal_probability')
plt.ylabel('count (log scale)')

# Compare the average prediction with the average true target on the held-out fold.
print(test_preds.mean())
print(y_test.mean())

In [None]:
from analysis import Analysis

# Feature names in the same column order as the matrix X built from all_train.
feature_names = all_train.drop(target, axis=1).columns

ea = Analysis(
    tpot, X_train, y_train, X_test, y_test,
    mode='regression',
    target=target,
    features=feature_names,
    test_samples_index=test_index,
    random_state=RS,
)

In [None]:
# Display the feature importances reported by the Analysis helper
# (presumably ordered by importance because of sort=True -- see analysis.py).
ea.get_feature_importance(sort=True)

In [None]:
# Best pipeline: XGBRegressor(FastICA(input_matrix, tol=0.7000000000000001), learning_rate=0.1, max_depth=2, min_child_weight=5, n_estimators=100, nthread=1, subsample=1.0)

In [None]:
# Read the ad id, the join key and the activation date from the test set.
test = pd.read_csv(
    'data/test.csv',
    usecols=['item_id', 'user_id', 'activation_date'],
    parse_dates=['activation_date'],
    infer_datetime_format=True,
)

In [None]:
# Same left join as for the training data: attach the periods features by user_id.
all_test = pd.merge(test, periods_aggregate, how='left', on='user_id')

In [None]:
# Keep only the rows that have no missing value in any column.
# NOTE(review): test rows whose user_id has no periods features are removed here,
# so they get no prediction in the written file -- confirm this is intended.
all_test = all_test.dropna(axis=0, how='any')

In [None]:
# Mirror the training-side preprocessing: rename the date column, then derive
# the date features from it with the same helper and arguments.
all_test = featurize_date_col(
    all_test.rename(columns={'activation_date': 'item_activation_date'}),
    'item_activation_date',
    remove_when_done=True,
)

In [None]:
# Index by item_id (the key of the written predictions) and remove user_id,
# which is not a feature column on the training side either.
all_test = all_test.set_index('item_id').drop(columns='user_id')

In [None]:
# Sanity check: the column count should equal the number of features the model was trained on.
all_test.shape

In [None]:
# Predict from the feature columns only. Dropping an already present
# 'deal_probability' column keeps the cell safe to re-run: otherwise the
# predictions written by a previous run would be passed to the model as an
# extra feature column.
test_features = all_test.drop(columns='deal_probability', errors='ignore')
all_test['deal_probability'] = tpot.predict(test_features.values)

In [None]:
# Make sure the output directory exists so the write cannot fail after the long fit.
os.makedirs('predictions', exist_ok=True)
# Write one row per item_id (the index) with its predicted deal probability.
all_test[['deal_probability']].to_csv('predictions/periods.csv')