In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
from tqdm import tqdm_notebook, tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.neural_network import MLPClassifier
import catboost as cbst
import gensim
# Show up to 100 columns when a DataFrame is displayed in the notebook.
pd.options.display.max_columns=100
# Apply seaborn's default plot styling to subsequent matplotlib figures.
sns.set()
%matplotlib inline
# Registers tqdm's `progress_apply` on pandas objects; groupby cells in this
# notebook call `progress_apply`, so this must run before them.
tqdm_notebook().pandas()

In [None]:
# Read the four raw CSV inputs from the working directory, in the same order
# as the names on the left-hand side.
orig_train, orig_view_log, orig_item_data, orig_test = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "view_log.csv", "item_data.csv", "test.csv")
)

In [None]:
# All three timestamp columns share one textual layout; parse with an explicit
# format so malformed values fail loudly instead of being guessed.
TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S"
# Rows of the view log up to this instant are the ones used for train-side features.
# NOTE(review): a bare date compares as midnight, so almost all of 2018-11-14
# itself is excluded -- confirm that is the intended cutoff.
TRAIN_VIEW_LOG_CUTOFF = "2018-11-14"

orig_train['impression_time'] = pd.to_datetime(orig_train['impression_time'], format=TIMESTAMP_FORMAT)
orig_test['impression_time'] = pd.to_datetime(orig_test['impression_time'], format=TIMESTAMP_FORMAT)
orig_view_log['server_time'] = pd.to_datetime(orig_view_log['server_time'], format=TIMESTAMP_FORMAT)

# Sort once, then filter: the filtered frame inherits the chronological order.
# `.copy()` makes train_view_log an independent frame; the original code sorted
# a boolean-filtered slice in place, which triggers SettingWithCopyWarning.
orig_view_log = orig_view_log.sort_values("server_time")
train_view_log = orig_view_log[orig_view_log['server_time'] <= TRAIN_VIEW_LOG_CUTOFF].copy()

In [None]:
def encode_cat_cols(df, cat_cols):
    """Replace each listed column with integer codes, in place.

    Codes are assigned in order of first appearance within ``df`` (the first
    distinct value becomes 0, the next 1, ...). Because the mapping is derived
    from the frame passed in, two separate calls on different frames are not
    guaranteed to assign the same code to the same value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode; it is modified in place.
    cat_cols : iterable of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    for col in cat_cols:
        code_of = {value: code for code, value in enumerate(df[col].unique())}
        df[col] = df[col].map(code_of)
    return df

In [None]:
def build_train_features(df):
    """Add minute, hour and weekday columns derived from ``impression_time``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-typed ``impression_time`` column; it is modified
        in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``impression_min``, ``impression_hour`` and
        ``impression_wkday`` appended, in that order.
    """
    # (new column, datetime accessor attribute) -- order fixes the column order.
    time_parts = (
        ('impression_min', 'minute'),
        ('impression_hour', 'hour'),
        ('impression_wkday', 'weekday'),
    )
    for column, part in time_parts:
        df[column] = getattr(df['impression_time'].dt, part)
    return df

In [None]:
def calc_click_td(train_, test_, cols, is_local=True, suffix=""):
    """Add a time-ordered historical click-rate feature for the groups in ``cols``.

    Train and test are stacked and sorted by ``impression_time``. For every row
    the new column ``'click_td' + suffix`` holds the mean of the labels of the
    earlier labelled rows in the same group: an expanding mean shifted by one
    row, so a row never sees its own label. The first row of each group gets NaN.

    Parameters
    ----------
    train_, test_ : pandas.DataFrame
        Frames with ``impression_time`` and the ``cols`` columns; ``train_``
        must also have ``is_click``. Neither input is modified.
    cols : list of str
        Grouping key(s).
    is_local : bool, default True
        If True, the labels are taken from a temporary copy of the *train*
        ``is_click`` column, so rows coming from ``test_`` count as unlabelled
        even when ``test_`` carries its own ``is_click`` (e.g. a local
        hold-out). The temporary column is removed before returning.
        If False, ``is_click`` itself is used and rows whose ``is_click`` is
        null are treated as the test rows.
    suffix : str, default ""
        Appended to ``'click_td'`` to name the new column.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train_new, test_new)``: the labelled and unlabelled rows of the
        stacked frame, ordered by ``impression_time`` and including the new
        column.
    """
    train = train_.copy()
    test = test_.copy()

    if is_local:
        # Shadow label column: present (non-null) only for rows coming from train.
        label_col = 'is_click_dup'
        train[label_col] = train['is_click'].copy()
    else:
        label_col = 'is_click'

    concat_df = pd.concat([train, test], ignore_index=True)
    concat_df = concat_df.sort_values("impression_time")

    # `transform` always returns a result aligned to concat_df's index.
    # `apply` with a like-indexed result prepends the group keys to the index on
    # pandas >= 2.0, which breaks the column assignment.
    concat_df['click_td' + suffix] = concat_df.groupby(by=cols)[label_col].transform(
        lambda labels: labels.expanding().mean().shift()
    )

    labelled = concat_df[label_col].notnull()
    train_new = concat_df[labelled]
    test_new = concat_df[~labelled]
    if is_local:
        train_new = train_new.drop(label_col, axis=1)
        test_new = test_new.drop(label_col, axis=1)
    return train_new, test_new

In [None]:
# Encode `os_version` with ONE mapping fitted on train + test together.
# encode_cat_cols numbers values by order of first appearance in the frame it
# is given, so encoding train and test separately can assign different integers
# to the same os_version in the two frames. Train rows come first in the
# stacked column, so the train codes are the same as when train is encoded alone.
_n_train_rows = len(orig_train)
_os_version_codes = encode_cat_cols(
    pd.concat([orig_train[['os_version']], orig_test[['os_version']]], ignore_index=True),
    cat_cols=['os_version'],
)['os_version'].values
orig_train['os_version'] = _os_version_codes[:_n_train_rows]
orig_test['os_version'] = _os_version_codes[_n_train_rows:]

# As before, train_df / test_df are the (now encoded) raw frames themselves.
train_df = orig_train
test_df = orig_test

In [None]:
# Order both frames chronologically. Rebinding instead of `inplace=True` keeps
# the frames these names were derived from untouched and makes the cell safe to
# re-run.
train_df = train_df.sort_values("impression_time")
test_df = test_df.sort_values("impression_time")

In [None]:
# Add the minute / hour / weekday columns to train first, then test.
train_df, test_df = map(build_train_features, (train_df, test_df))

In [None]:
def _user_view_time_span(view_log):
    """Return one row per user with ``time_ptp``: seconds between that user's
    earliest and latest ``server_time`` in ``view_log``.

    Uses the vectorised group max/min instead of a per-group Python lambda
    calling ``ndarray.ptp()``; that array method was removed in NumPy 2.0.
    """
    server_time = view_log.groupby(by='user_id')['server_time']
    span_seconds = (server_time.max() - server_time.min()).dt.total_seconds()
    return span_seconds.rename("time_ptp").reset_index()


# Train rows use the view log restricted to the training cutoff; test rows use
# the full view log.
user_p2p_tr = _user_view_time_span(train_view_log)
user_p2p_te = _user_view_time_span(orig_view_log)
train_df = train_df.merge(user_p2p_tr, on='user_id', how='left')
test_df = test_df.merge(user_p2p_te, on='user_id', how='left')

In [None]:
# Per-user count of view-log rows with a non-null session_id, stored as
# `n_views_user`. Train uses the cutoff-restricted log, test the full log.
views_of_user_tr = (
    train_view_log.groupby(by='user_id')['session_id']
    .count()
    .rename("n_views_user")
    .reset_index()
)
views_of_user_te = (
    orig_view_log.groupby(by='user_id')['session_id']
    .count()
    .rename("n_views_user")
    .reset_index()
)
train_df = train_df.merge(views_of_user_tr, on='user_id', how='left')
test_df = test_df.merge(views_of_user_te, on='user_id', how='left')

In [None]:
# Stack train and test into one chronologically ordered frame with a fresh
# 0..n-1 index; rows from test have a null `is_click`.
concat_df = (
    pd.concat([train_df, test_df], sort=True, ignore_index=True)
    .sort_values("impression_time")
    .reset_index(drop=True)
)

In [None]:
# Per-user neighbouring impressions. concat_df is ordered by impression_time,
# so shifting inside each user's group gives that user's next / previous row.
# GroupBy.shift is vectorised and always index-aligned. The previous
# `progress_apply(lambda x: x.shift(...))` returns a group-key-prefixed index on
# pandas >= 2.0, which breaks the column assignment.
concat_df['next_impression_time'] = concat_df.groupby(by='user_id')['impression_time'].shift(-1)
concat_df['time_to_next_visit'] = (concat_df['next_impression_time'] - concat_df['impression_time']).dt.total_seconds()
concat_df['last_app_code'] = concat_df.groupby(by='user_id')['app_code'].shift()

# Number of impressions per user across train and test combined.
user_counts = concat_df.groupby("user_id", as_index=False)['impression_id'].count().rename(columns={"impression_id": "user_counts"})

concat_df['prev_impression_time'] = concat_df.groupby(by='user_id')['impression_time'].shift()
concat_df['sec_since_prev_impression'] = (concat_df['impression_time'] - concat_df['prev_impression_time']).dt.total_seconds()
concat_df = concat_df.merge(user_counts, on='user_id', how='left')

In [None]:
# Seconds between each user's first and last impression (train + test).
# Vectorised max - min replaces a per-group lambda calling `ndarray.ptp()`;
# that array method was removed in NumPy 2.0.
_impression_time_by_user = concat_df.groupby(by='user_id')['impression_time']
impression_time_ptp = (
    (_impression_time_by_user.max() - _impression_time_by_user.min())
    .dt.total_seconds()
    .rename("impression_time_ptp")
    .reset_index()
)
# NOTE(review): within the visible notebook, train_df / test_df are later
# rebuilt from concat_df, which was assembled before this merge, so this column
# does not appear to reach the models. Confirm whether it should be merged into
# concat_df instead.
train_df = train_df.merge(impression_time_ptp, on='user_id', how='left')
test_df = test_df.merge(impression_time_ptp, on='user_id', how='left')

In [None]:
# Distinct sessions and distinct items per user. Train uses the
# cutoff-restricted view log, test the full view log.
# NOTE(review): within the visible notebook, train_df / test_df are later
# rebuilt from concat_df, which was assembled before these merges, so
# `unique_sessions` / `unique_items` do not appear to reach the models. Confirm
# whether they should be merged into concat_df instead.
session_id_nunique_tr = (
    train_view_log.groupby(by='user_id')['session_id'].nunique().rename("unique_sessions").reset_index()
)
item_id_nunique_tr = (
    train_view_log.groupby(by='user_id')['item_id'].nunique().rename("unique_items").reset_index()
)
session_id_nunique_te = (
    orig_view_log.groupby(by='user_id')['session_id'].nunique().rename("unique_sessions").reset_index()
)
item_id_nunique_te = (
    orig_view_log.groupby(by='user_id')['item_id'].nunique().rename("unique_items").reset_index()
)

train_df = train_df.merge(session_id_nunique_tr, on='user_id', how='left')
test_df = test_df.merge(session_id_nunique_te, on='user_id', how='left')

train_df = train_df.merge(item_id_nunique_tr, on='user_id', how='left')
test_df = test_df.merge(item_id_nunique_te, on='user_id', how='left')

In [None]:
# Learn an embedding for each app_code from the sequence of app codes every
# user was shown, then attach the embedding to each row as vec_1..vec_<w2v_size>.
w2v_size = 100
concat_df['app_code_str'] = concat_df.app_code.astype(str)

# One "sentence" per user: that user's app codes in concat_df row order.
w2v_app_codes = concat_df.groupby(by='user_id')['app_code_str'].apply(list)

# Use the longest sentence as the window so every code in a user's sequence is
# context for every other one.
longest = np.max(w2v_app_codes.apply(len))
# NOTE(review): `size=` is the gensim 3.x keyword (gensim 4 calls it
# `vector_size`) -- confirm against the installed version. With workers > 1 the
# seed alone is presumably not enough for bit-identical vectors between runs.
model = gensim.models.Word2Vec(w2v_app_codes, size=w2v_size, window=longest, workers=4, seed=123)

w2v_cols = ["vec_{}".format(i+1) for i in range(w2v_size)]
w2v_data = []
for acode in concat_df['app_code_str'].unique():
    try:
        vector = model.wv.get_vector(acode)
    except KeyError:
        # Code is not in the trained vocabulary: fall back to a zero vector.
        # Only KeyError is caught so unrelated failures are no longer silently
        # turned into zeros.
        vector = np.zeros(w2v_size)
    w2v_data.append([acode] + list(vector))

w2v_acode = pd.DataFrame(w2v_data, columns=["app_code_str"] + w2v_cols)
concat_df = concat_df.merge(w2v_acode, on='app_code_str', how='left')

In [None]:
# Minute / hour / weekday of each row's next and previous impression by the
# same user. Tuples are (new column, source column, datetime attribute); their
# order fixes the order in which the columns are appended.
_neighbour_time_parts = (
    ('next_impression_min', 'next_impression_time', 'minute'),
    ('next_impression_hour', 'next_impression_time', 'hour'),
    ('next_impression_wkday', 'next_impression_time', 'weekday'),
    ('prev_time_hr', 'prev_impression_time', 'hour'),
    ('prev_time_min', 'prev_impression_time', 'minute'),
    ('prev_time_wkday', 'prev_impression_time', 'weekday'),
)
for _new_col, _source_col, _part in _neighbour_time_parts:
    concat_df[_new_col] = getattr(concat_df[_source_col].dt, _part)

In [None]:
# Split the stacked frame back apart: rows with a label are train, rows
# without one are test. Both get a fresh 0..n-1 index.
_has_label = concat_df['is_click'].notnull()
train_df = concat_df[_has_label].reset_index(drop=True)
test_df = concat_df[~_has_label].reset_index(drop=True)

In [None]:
# X_train = train_df[train_df["impression_time"] < "2018-12-03"]
# X_test = train_df[train_df["impression_time"] >= "2018-12-03"]
# print(X_train.shape, X_test.shape)

In [None]:
# X_train, X_test = calc_click_td(X_train, X_test, cols=['user_id'], suffix="_user_id")
# X_train, X_test = calc_click_td(X_train, X_test, cols=['app_code'], suffix="_app_code")
# X_train, X_test = calc_click_td(X_train, X_test, cols=['user_id', 'app_code'], suffix="_user_app")

In [None]:
# Historical click-rate features at three granularities, applied in sequence:
# per user, per app code, and per (user, app code) pair.
for _group_cols, _col_suffix in (
    (['user_id'], "_user_id"),
    (['app_code'], "_app_code"),
    (['user_id', 'app_code'], "_user_app"),
):
    train_df, test_df = calc_click_td(train_df, test_df, cols=_group_cols, suffix=_col_suffix)

In [None]:
# Feature columns for CatBoost: everything except identifiers, the label and
# raw timestamps. Unlike the LightGBM list, `app_code_str` is kept.
_excluded_for_catboost = ['impression_id', 'impression_time','is_click', 'app_code', 'next_impression_time', 'prev_impression_time']
predictors_cb = train_df.columns.drop(_excluded_for_catboost)
print(list(predictors_cb))

In [None]:
# Feature columns for LightGBM: everything except identifiers, the label, raw
# timestamps and the string copy of app_code.
_excluded_for_lightgbm = ['impression_id', 'impression_time','is_click', 'app_code_str', 'app_code', 'next_impression_time', 'prev_impression_time']
predictors = train_df.columns.drop(_excluded_for_lightgbm)
print(list(predictors))

In [None]:
# Seed averaging: fit the same LightGBM configuration with different random
# seeds and average the predicted click probabilities on the test rows.
# The ensemble size is named once so the loop bound and the divisor cannot
# drift apart.
N_LGB_SEEDS = 15

preds = np.zeros((len(test_df), 1))  # running sum of class-1 probabilities, shape (n_test, 1)
for i in range(N_LGB_SEEDS):
    print("training LGBC model {}".format(i))
    lgbc = lgb.LGBMClassifier(n_estimators=1000, max_depth=5, learning_rate=0.01, random_state=i, colsample_bytree=0.2, reg_lambda=15, reg_alpha=10)
    lgbc.fit(train_df[predictors], train_df['is_click'])
    preds = preds + lgbc.predict_proba(test_df[predictors])[:,1].reshape(-1, 1)
preds = preds/N_LGB_SEEDS

In [None]:
# Single CatBoost model trained on the CatBoost feature list.
# NOTE(review): predictors_cb keeps the string column `app_code_str`, and no
# `cat_features` is passed here -- confirm how CatBoost is meant to treat it.
_cbc_params = dict(
    random_seed=123,
    eval_metric='AUC',
    n_estimators=1100,
    max_depth=7,
    learning_rate=0.03,
    colsample_bylevel=0.1,
    reg_lambda=70,
)
cbc = cbst.CatBoostClassifier(**_cbc_params)

cbc.fit(train_df[predictors_cb], train_df['is_click'])

In [None]:
# CatBoost probability of the positive class (column 1) for every test row, as a 1-D array.
preds_cb = cbc.predict_proba(test_df[predictors_cb])[:,1]

In [None]:
# preds = lgbc.predict_proba(X_test[predictors])[:,1]
# preds_tr = lgbc.predict_proba(X_train[predictors])[:,1]

In [None]:
# metrics.roc_auc_score(X_train['is_click'], preds_tr), metrics.roc_auc_score(X_test['is_click'], preds)

In [None]:
# lgb.plot_importance(lgbc, importance_type='gain', figsize=(10, 35))

In [None]:
# Blend the two models with a row-wise harmonic mean and write the submission file.
from scipy.stats import hmean
sub = pd.DataFrame()
# Assigning the Series first gives `sub` the same index and row order as test_df.
sub['impression_id'] = test_df['impression_id'].copy()
# `preds` (LightGBM seed average, shape (n_test, 1)) and `preds_cb` (CatBoost) are
# plain arrays, so they are attached positionally; both were predicted from
# test_df in its current row order.
sub['is_click_x'] = preds
sub['is_click_y'] = preds_cb
# Harmonic mean across the two model columns for each row.
hmean_preds = hmean(sub[['is_click_x', 'is_click_y']].values, axis=1)
sub['is_click'] = hmean_preds
# Only the id and the blended probability are written; the index is omitted.
sub[['impression_id', 'is_click']].to_csv("sub_1.csv", index=False)