In [1]:
import time

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

from models.base import ClickRateEncoder
from tools.cv_tools import (
    fit_and_score, neg_log_loss_score,
    train_test_split, score_one_param, score_one_test_day,
    score_params, best_param
)

## Get stats
- 'both' on 50%:
        Private 0.4111518 (1162th); Public 0.4131281

        Tuning time:  9895.018527507782
        Best C:  {'logistic_regression__C': 1.9306977288832496e-05}
        {1e-05: -0.4224259270214176, 1.9306977288832496e-05: -0.4208304486561515, 3.727593720314938e-05: -0.4217097630340418, 7.196856730011514e-05: -0.4247162512096456, 0.00013894954943731373: -0.4291760863913517, 0.00026826957952797245: -0.43419233319954553, 0.0005179474679231213: -0.43884428513031326, 0.001: -0.4425155070224663}
        Test score:  -0.4199406310504107
- 'both' on mid (10%):
        Train time:  1078.5528402328491
        {'logistic_regression__C': 7.196856730011514e-05},
            {1e-05: -0.43486416112736803,
            1.9306977288832496e-05: -0.42865810789142617,
            3.727593720314938e-05: -0.42415665503160155,
            7.196856730011514e-05: -0.4220119611241461,
            0.00013894954943731373: -0.4224520487674228,
            0.00026826957952797245: -0.4253182420463687,
            0.0005179474679231213: -0.43024372965093827,
            0.001: -0.43664129249309813},
            -0.42239338987263836
- 'both' on small (2.5%):
        Private score: 0.4123864 (1178th); Public 0.4142476
        
        Train time:  235.1989951133728 for 8 parameters
        {'logistic_regression__C': 0.00026826957952797245},
         {1e-05: -0.45157329102889265,
          1.9306977288832496e-05: -0.4427007303707594,
          3.727593720314938e-05: -0.4352654288116506,
          7.196856730011514e-05: -0.42906363703874895,
          0.00013894954943731373: -0.4246260851551441,
          0.00026826957952797245: -0.4226054421117388,
          0.0005179474679231213: -0.42321956497464086,
          0.001: -0.42634305732673894},
         -0.41905488258751256
- 'user-site' on small (slightly better than 'both'):
        {'logistic_regression__C': 0.0005179474679231213},
         {1e-05: -0.4519688499795095,
          1.9306977288832496e-05: -0.4433754893213299,
          3.727593720314938e-05: -0.43615234190368524,
          7.196856730011514e-05: -0.4298023929125085,
          0.00013894954943731373: -0.4244771101575616,
          0.00026826957952797245: -0.420876018386614,
          0.0005179474679231213: -0.4196717499105797,
          0.001: -0.42093989821434175},
         -0.4180792146119277
- 'user-app' on small (scores identical to 'both' for every C — TODO: re-run to confirm the 'user-app' setting was actually applied):
        {'logistic_regression__C': 0.00026826957952797245},
         {1e-05: -0.45157329102889265,
          1.9306977288832496e-05: -0.4427007303707594,
          3.727593720314938e-05: -0.4352654288116506,
          7.196856730011514e-05: -0.42906363703874895,
          0.00013894954943731373: -0.4246260851551441,
          0.00026826957952797245: -0.4226054421117388,
          0.0005179474679231213: -0.42321956497464086,
          0.001: -0.42634305732673894},
         -0.41905488258751256
- 'both' without site/app_id on small (worse than with the id cols):
        {'logistic_regression__C': 0.00026826957952797245},
         {1e-05: -0.45416005751851846,
          1.9306977288832496e-05: -0.4455526673441733,
          3.727593720314938e-05: -0.4381691637960564,
          7.196856730011514e-05: -0.4317755399506975,
          0.00013894954943731373: -0.4270948231851417,
          0.00026826957952797245: -0.4249472014637636,
          0.0005179474679231213: -0.4256511483857503,
          0.001: -0.42909917307589296},
         -0.42207645174265146

In [2]:
class ClickRateBySiteEncoder(BaseEstimator, TransformerMixin):
    """
    Replace the id columns with mean click rates per (site_id, device_id)
    and/or (app_id, device_id) pair.

    fit() learns the mean of 'click' for every pair present in X.
    transform() left-joins those means onto X as 'click_rate_site' and/or
    'click_rate_app', fills pairs that were not present during fit with 0,
    and drops 'click' (when present), 'device_id', 'site_id' and 'app_id'.
    """

    user_site_interaction_cols = ['site_id', 'device_id']
    user_app_interaction_cols = ['app_id', 'device_id']

    def __init__(self, interaction='both'):
        """
        interaction can be 'user-site', 'user-app', or 'both' (default).
        """
        assert interaction in ['user-app','user-site','both']
        self.interaction = interaction
        self.click_rates_by_site_id = None
        self.click_rates_by_app_id = None

    def _uses_site(self):
        # Site click rates are wanted for 'user-site' and 'both'.
        return self.interaction != 'user-app'

    def _uses_app(self):
        # App click rates are wanted for 'user-app' and 'both'.
        return self.interaction != 'user-site'

    @staticmethod
    def _click_rate_table(X, key_cols, rate_col):
        # Mean of 'click' per key combination, exposed under the name rate_col.
        return X.groupby(key_cols)\
            .agg({'click': 'mean'}).rename({'click': rate_col}, axis=1)

    def fit(self, X, y=None):
        """
        Learn the click-rate lookup tables selected by `interaction`.

        X must have the following columns: 'click', 'device_id', plus 'site_id'
        and/or 'app_id' depending on `interaction`. y is ignored.
        Returns self.
        """
        # Forget tables from an earlier fit so a changed `interaction` cannot
        # leave a stale table behind.
        self.click_rates_by_site_id = None
        self.click_rates_by_app_id = None

        if self._uses_site():
            self.click_rates_by_site_id = self._click_rate_table(
                X, ClickRateBySiteEncoder.user_site_interaction_cols, 'click_rate_site')

        if self._uses_app():
            self.click_rates_by_app_id = self._click_rate_table(
                X, ClickRateBySiteEncoder.user_app_interaction_cols, 'click_rate_app')

        return self

    def transform(self, X):
        """
        Return a copy of X with the learned click-rate column(s) added and the
        'click' (if present), 'device_id', 'site_id' and 'app_id' columns dropped.
        """
        # Rows whose (device_id, site/app_id) pair was not present during fit get
        # no match from the left join; their click rate is filled with 0.
        if self._uses_site():
            X = pd.merge(X, self.click_rates_by_site_id, how='left',
                      on=ClickRateBySiteEncoder.user_site_interaction_cols)
            X = X.fillna({'click_rate_site': 0})

        if self._uses_app():
            X = pd.merge(X, self.click_rates_by_app_id, how='left',
                      on=ClickRateBySiteEncoder.user_app_interaction_cols)
            X = X.fillna({'click_rate_app': 0})

        # test sets don't have a click column
        if 'click' in X.columns:
            X = X.drop('click', axis=1)

        return X.drop(['device_id','site_id','app_id'], axis=1)

In [3]:
def eval_model_three(df, params):
    """
    Tune and evaluate "model three": one-hot encoded categorical columns plus
    two click-rate features, fed into a liblinear logistic regression.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'hour' and every column in `model_three_cols` below
        (including the 'click' target).
    params : iterable of float
        Candidate values for the logistic regression's C.

    Returns
    -------
    tuple
        (params_dict_ls, scores, test_score): the parameter dicts that were
        tried, the result of `score_params` for them, and the result of
        `fit_and_score` for the selected parameter.
    """
    model_three_cols = ['C1',
                        'click',
                        'banner_pos',
                        'app_id',
                        'site_id',
                        'device_id',
                        'device_type',
                        'device_conn_type',
                        'C15',
                        'C16',
                        'C18',
                        'C19',
                        'C21']

    # Everything except the target and the raw device id is one-hot encoded.
    # Derived from model_three_cols so the two lists cannot drift apart.
    categorical_features = [col for col in model_three_cols
                            if col not in ('click', 'device_id')]

    clicks = df.click
    df = df[model_three_cols + ['hour']]
    # NOTE(review): 30 is presumably the held-out test day and 25-29 the
    # validation days — confirm against tools.cv_tools.
    X_train, y_train, X_test, y_test = train_test_split(df, clicks, 30)
    test_day_ls = [25,26,27,28,29]

    cr_site_encoder = ClickRateEncoder(['site_id','device_id'], 'click_rate_by_site_id')
    cr_app_encoder = ClickRateEncoder(['app_id','device_id'], 'click_rate_by_app_id')
    oh_encoder = OneHotEncoder(handle_unknown='ignore')
    preprocessor = ColumnTransformer([
        ('one_hot_encoding', oh_encoder, categorical_features),
        ('click_rate_encoding_site', cr_site_encoder, ['click','site_id','device_id']),
        ('click_rate_encoding_app', cr_app_encoder, ['click','app_id','device_id'])
    ])

    lg = LogisticRegression(solver='liblinear')
    pipeline = Pipeline([
        ('preprocessing', preprocessor),
        ('logistic_regression', lg)])

    # Pipeline parameter name for the regression's C ("<step name>__<param>").
    C_kwd = 'logistic_regression__C'
    params_dict_ls = [{C_kwd: p} for p in params]

    train_begin = time.time()
    scores = score_params(X_train, y_train, pipeline, params_dict_ls, test_day_ls)
    train_time = time.time() - train_begin
    print("Train time: ", train_time)
    best_C = best_param(scores, params_dict_ls)
    print("Best C: ", best_C)

    # Use the best parameter to evaluate the model on the test set.
    test_begin = time.time()
    test_score = fit_and_score(X_train, y_train, X_test, y_test, pipeline, best_C)
    test_time = time.time() - test_begin
    print("Test time: ", test_time)

    return params_dict_ls, scores, test_score

In [4]:
# Load the small training sample and parse 'hour' (YYMMDDHH) into timestamps.
df = pd.read_csv('data/train_small.csv')
df['hour'] = pd.to_datetime(df['hour'], format="%y%m%d%H")

# Candidate C values: 8 points log-spaced between 1e-5 and 1e-3.
params = np.logspace(start=-5, stop=-3, num=8)
params

array([1.00000000e-05, 1.93069773e-05, 3.72759372e-05, 7.19685673e-05,
       1.38949549e-04, 2.68269580e-04, 5.17947468e-04, 1.00000000e-03])

In [5]:
# Sweep C, then show the selected parameter, the score for each C and the test score.
params_dict_ls, scores, test_score = eval_model_three(df, params)
best_C = best_param(scores, params_dict_ls)
best_C, {c: score for c, score in zip(params, scores)}, test_score

Train time:  236.44029760360718
Best C:  {'logistic_regression__C': 0.00026826957952797245}
Test time:  9.750065565109253


({'logistic_regression__C': 0.00026826957952797245},
 {1e-05: -0.45157329102889265,
  1.9306977288832496e-05: -0.4427007303707594,
  3.727593720314938e-05: -0.4352654288116506,
  7.196856730011514e-05: -0.42906363703874895,
  0.00013894954943731373: -0.4246260851551441,
  0.00026826957952797245: -0.4226054421117388,
  0.0005179474679231213: -0.42321956497464086,
  0.001: -0.42634305732673894},
 -0.41905488258751256)

## Test ClickRateEncoder

In [2]:
from models.base import ClickRateEncoder

In [3]:
def eval_model_three(df, params):
    """
    Tune and evaluate "model three": one-hot encoded categorical columns plus
    two click-rate features, fed into a liblinear logistic regression.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'hour' and every column in `model_three_cols` below
        (including the 'click' target).
    params : iterable of float
        Candidate values for the logistic regression's C.

    Returns
    -------
    tuple
        (params_dict_ls, scores, test_score): the parameter dicts that were
        tried, the result of `score_params` for them, and the result of
        `fit_and_score` for the selected parameter.
    """
    model_three_cols = ['C1',
                        'click',
                        'banner_pos',
                        'app_id',
                        'site_id',
                        'device_id',
                        'device_type',
                        'device_conn_type',
                        'C15',
                        'C16',
                        'C18',
                        'C19',
                        'C21']

    # Everything except the target and the raw device id is one-hot encoded.
    # Derived from model_three_cols so the two lists cannot drift apart.
    categorical_features = [col for col in model_three_cols
                            if col not in ('click', 'device_id')]

    clicks = df.click
    df = df[model_three_cols + ['hour']]
    # NOTE(review): 30 is presumably the held-out test day and 25-29 the
    # validation days — confirm against tools.cv_tools.
    X_train, y_train, X_test, y_test = train_test_split(df, clicks, 30)
    test_day_ls = [25,26,27,28,29]

    cr_site_encoder = ClickRateEncoder(['site_id','device_id'], 'click_rate_by_site_id')
    cr_app_encoder = ClickRateEncoder(['app_id','device_id'], 'click_rate_by_app_id')
    oh_encoder = OneHotEncoder(handle_unknown='ignore')
    preprocessor = ColumnTransformer([
        ('one_hot_encoding', oh_encoder, categorical_features),
        ('click_rate_encoding_site', cr_site_encoder, ['click','site_id','device_id']),
        ('click_rate_encoding_app', cr_app_encoder, ['click','app_id','device_id'])
    ])

    lg = LogisticRegression(solver='liblinear')
    pipeline = Pipeline([
        ('preprocessing', preprocessor),
        ('logistic_regression', lg)])

    # Pipeline parameter name for the regression's C ("<step name>__<param>").
    C_kwd = 'logistic_regression__C'
    params_dict_ls = [{C_kwd: p} for p in params]

    train_begin = time.time()
    scores = score_params(X_train, y_train, pipeline, params_dict_ls, test_day_ls)
    train_time = time.time() - train_begin
    print("Train time: ", train_time)
    best_C = best_param(scores, params_dict_ls)
    print("Best C: ", best_C)

    # Use the best parameter to evaluate the model on the test set.
    test_begin = time.time()
    test_score = fit_and_score(X_train, y_train, X_test, y_test, pipeline, best_C)
    test_time = time.time() - test_begin
    print("Test time: ", test_time)

    return params_dict_ls, scores, test_score

In [4]:
# Load the small training sample and parse 'hour' (YYMMDDHH) into timestamps.
df = pd.read_csv('data/train_small.csv')
df['hour'] = pd.to_datetime(df['hour'], format="%y%m%d%H")

# Candidate C values: 8 points log-spaced between 1e-5 and 1e-3.
params = np.logspace(start=-5, stop=-3, num=8)

# Sweep C, then show the selected parameter, the score for each C and the test score.
params_dict_ls, scores, test_score = eval_model_three(df, params)
best_C = best_param(scores, params_dict_ls)
best_C, {c: score for c, score in zip(params, scores)}, test_score

Train time:  239.5106909275055
Best C:  {'logistic_regression__C': 0.00026826957952797245}
Test time:  9.722646474838257


({'logistic_regression__C': 0.00026826957952797245},
 {1e-05: -0.45157329102889265,
  1.9306977288832496e-05: -0.4427007303707594,
  3.727593720314938e-05: -0.4352654288116506,
  7.196856730011514e-05: -0.42906363703874895,
  0.00013894954943731373: -0.4246260851551441,
  0.00026826957952797245: -0.4226054421117388,
  0.0005179474679231213: -0.42321956497464086,
  0.001: -0.42634305732673894},
 -0.41905488258751256)