In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from models.model_three import ClickRateBySiteEncoder, tune_model_three, get_model_three_pipeline
from models.base import tune_logistic_regression_pipeline

from tools.cv_tools import (
    fit_and_score, neg_log_loss_score,
    train_test_split, score_one_param, score_one_test_day,
    score_one_param, score_params, best_param
)

import time

## Standardize features
Centering the features (mean = 0) would destroy the sparsity of the one-hot matrix, so only scale each feature to unit standard deviation (stddev = 1).

- 'both' on small: shows some improvement over the non-standardized model.
        Tuning time:  267.5878794193268
        {'logistic_regression__C': 1.3894954943731361e-05},
        {1e-06: -0.44103828048443294,
        1.9306977288832498e-06: -0.4302102379517666,
        3.727593720314938e-06: -0.4222077887874803,
        7.196856730011514e-06: -0.4177093412638249,
        1.3894954943731361e-05: -0.4170384850937549,
        2.6826957952797274e-05: -0.4198264908550959,
        5.1794746792312125e-05: -0.42536875085310255,
        0.0001: -0.4329831633998218},
        -0.4131179379019657

In [2]:
# Load the small training sample and parse `hour` (stored as YYMMDDHH) into datetimes.
df = pd.read_csv('data/train_small.csv')
df['hour'] = pd.to_datetime(df['hour'], format="%y%m%d%H")

In [3]:
# Take the stock model-three pipeline and detach its final step, keeping it
# in `lg_step` so it can be re-attached later.
pipeline = get_model_three_pipeline()
lg_step = pipeline.steps.pop(-1)

In [4]:
# Scale features to unit variance without centering: with_mean=False avoids
# subtracting the mean, which would break the sparsity of the feature matrix.
scaler = StandardScaler(copy=False, with_mean=False)
# Pipeline steps are (name, estimator) tuples; use a tuple here as well so the
# new step is consistent with the existing ones.
pipeline.steps.append(('standardize', scaler))
# Re-attach the step that was popped off earlier so it runs after the scaler.
pipeline.steps.append(lg_step)
pipeline.steps

[('preprocessing',
  ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
           transformer_weights=None,
           transformers=[('one_hot_encoding', OneHotEncoder(categorical_features=None, categories=None,
         dtype=<class 'numpy.float64'>, handle_unknown='ignore',
         n_values=None, sparse=True), ['C1', 'banner_pos', 'app_id', 'site_id', 'device_type', 'device_conn_type', 'C15', 'C16', 'C18', 'C19', 'C2...e='click_rate_by_app_id',
           cols=['app_id', 'device_id']), ['click', 'app_id', 'device_id'])])),
 ['standardize', StandardScaler(copy=False, with_mean=False, with_std=True)],
 ('logistic_regression',
  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
            intercept_scaling=1, max_iter=100, multi_class='warn',
            n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
            tol=0.0001, verbose=0, warm_start=False))]

In [5]:
params = np.logspace(-6, -4, num=8)
params

array([1.00000000e-06, 1.93069773e-06, 3.72759372e-06, 7.19685673e-06,
       1.38949549e-05, 2.68269580e-05, 5.17947468e-05, 1.00000000e-04])

In [6]:
best_C, params_dict_ls, scores, test_score = tune_logistic_regression_pipeline(df, pipeline, params)
best_C, dict(zip(params, scores)), test_score

Tuning time:  267.5878794193268


({'logistic_regression__C': 1.3894954943731361e-05},
 {1e-06: -0.44103828048443294,
  1.9306977288832498e-06: -0.4302102379517666,
  3.727593720314938e-06: -0.4222077887874803,
  7.196856730011514e-06: -0.4177093412638249,
  1.3894954943731361e-05: -0.4170384850937549,
  2.6826957952797274e-05: -0.4198264908550959,
  5.1794746792312125e-05: -0.42536875085310255,
  0.0001: -0.4329831633998218},
 -0.4131179379019657)

In [7]:
# NOTE(review): bare `raise` with no active exception; it fails with the
# RuntimeError recorded below. Presumably a deliberate "stop here" marker so
# that Run All does not continue into the following cells — confirm.
raise

RuntimeError: No active exception to reraise

## Study categorical features vs click rates
Question: Is the scale of click rates harming the model, which employs regularization?

Result:
- most categorical features are very sparse.
    The median of means of columns is 1e-6.
- some categorical features aren't so sparse.
    sum(avg > 0.5), sum(avg > 0.25), sum(avg > 0.1), sum(avg > 0.01) = (7, 12, 20, 72)
- the column means of the click-rate features lie above the 99.5th percentile of all column means.

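Answer: since the values of click rates are relatively large, their coefficients should be small.
    It is the categorical features that are sparse that ... (TODO: this sentence was left unfinished in the original notes; presumably these are the features most affected by regularization — confirm.)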

In [None]:
# Reload the small training sample and parse `hour` (stored as YYMMDDHH) into datetimes.
df = pd.read_csv('data/train_small.csv')
df['hour'] = pd.to_datetime(df['hour'], format="%y%m%d%H")

In [None]:
pipeline = get_model_three_pipeline()
# Discard the last pipeline step so that only the remaining steps are applied
# when the pipeline is fit-transformed in the next cell.
# NOTE(review): presumably the dropped step is the logistic regression — confirm.
pipeline.steps.pop()
# Optional toggle, currently disabled. NOTE(review): sparse_threshold=0
# presumably forces the preprocessing step to return a dense array — verify.
#pipeline.set_params(**{'preprocessing__sparse_threshold': 0})
pipeline.steps

In [None]:
X = pipeline.fit_transform(df)
X.shape

In [None]:
# Show the last four columns of the first five rows (converted to dense for
# display); only the last two of these columns are the click rates.
X[:5, -4:].todense()

In [None]:
avg = np.asarray(X.mean(axis=0))[0] # when the output is a matrix
#avg = X.mean(axis=0)
avg.shape

In [None]:
# Per-column (population) standard deviation of the transformed features.
# np.std cannot reduce a scipy sparse matrix, so for sparse input derive it
# from the first two moments: Var[x] = E[x^2] - E[x]^2.
if hasattr(X, "multiply"):  # scipy sparse matrices expose element-wise multiply()
    col_mean = np.asarray(X.mean(axis=0)).ravel()
    col_mean_sq = np.asarray(X.multiply(X).mean(axis=0)).ravel()
    # Floating-point cancellation can leave tiny negative variances; floor at 0.
    std = np.sqrt(np.maximum(col_mean_sq - col_mean ** 2, 0.0))
else:
    # Dense input: numpy handles it directly.
    std = np.std(X, axis=0)
std

In [None]:
# click rate avg
avg[-2:]

In [None]:
pd.Series(avg).describe()

In [None]:
# Number of columns whose mean exceeds each threshold, counted with the
# vectorised array reduction.
(avg > 0.5).sum(), (avg > 0.25).sum(), (avg > 0.1).sum(), (avg > 0.01).sum()

In [None]:
np.quantile(avg, 0.997)

In [None]:
import matplotlib.pyplot as plt

In [None]:
plt.scatter(range(len(avg)),avg)

## Without regularization
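... not possible with this setup: scikit-learn's `LogisticRegression` with the `liblinear` solver cannot be fit without regularization; a very large `C` only approximates it.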

In [None]:
pipeline = get_model_three_pipeline()

In [None]:
pipeline.get_params()

In [None]:
best_C, params_dict_ls, scores, test_score = tune_logistic_regression_pipeline(df, pipeline, params)

## Get stats
- 'both' on 50%:
        Private 0.4111518 (1162th); Public 0.4131281

        Tuning time:  9895.018527507782
        Best C:  {'logistic_regression__C': 1.9306977288832496e-05}
        {1e-05: -0.4224259270214176, 1.9306977288832496e-05: -0.4208304486561515, 3.727593720314938e-05: -0.4217097630340418, 7.196856730011514e-05: -0.4247162512096456, 0.00013894954943731373: -0.4291760863913517, 0.00026826957952797245: -0.43419233319954553, 0.0005179474679231213: -0.43884428513031326, 0.001: -0.4425155070224663}
        Test score:  -0.4199406310504107
- 'both' on mid (10%):
        Train time:  1078.5528402328491
        {'logistic_regression__C': 7.196856730011514e-05},
            {1e-05: -0.43486416112736803,
            1.9306977288832496e-05: -0.42865810789142617,
            3.727593720314938e-05: -0.42415665503160155,
            7.196856730011514e-05: -0.4220119611241461,
            0.00013894954943731373: -0.4224520487674228,
            0.00026826957952797245: -0.4253182420463687,
            0.0005179474679231213: -0.43024372965093827,
            0.001: -0.43664129249309813},
            -0.42239338987263836
- 'both' on small (2.5%):
        Private score: 0.4123864 (1178th); Public 0.4142476
        
        Train time:  235.1989951133728 for 8 parameters
        {'logistic_regression__C': 0.00026826957952797245},
         {1e-05: -0.45157329102889265,
          1.9306977288832496e-05: -0.4427007303707594,
          3.727593720314938e-05: -0.4352654288116506,
          7.196856730011514e-05: -0.42906363703874895,
          0.00013894954943731373: -0.4246260851551441,
          0.00026826957952797245: -0.4226054421117388,
          0.0005179474679231213: -0.42321956497464086,
          0.001: -0.42634305732673894},
         -0.41905488258751256
- 'user-site' on small (slightly better than 'both'):
        {'logistic_regression__C': 0.0005179474679231213},
         {1e-05: -0.4519688499795095,
          1.9306977288832496e-05: -0.4433754893213299,
          3.727593720314938e-05: -0.43615234190368524,
          7.196856730011514e-05: -0.4298023929125085,
          0.00013894954943731373: -0.4244771101575616,
          0.00026826957952797245: -0.420876018386614,
          0.0005179474679231213: -0.4196717499105797,
          0.001: -0.42093989821434175},
         -0.4180792146119277
- 'user-app' on small (about the same as 'both'):
        {'logistic_regression__C': 0.00026826957952797245},
         {1e-05: -0.45157329102889265,
          1.9306977288832496e-05: -0.4427007303707594,
          3.727593720314938e-05: -0.4352654288116506,
          7.196856730011514e-05: -0.42906363703874895,
          0.00013894954943731373: -0.4246260851551441,
          0.00026826957952797245: -0.4226054421117388,
          0.0005179474679231213: -0.42321956497464086,
          0.001: -0.42634305732673894},
         -0.41905488258751256
- 'both' without site/app_id on small (worse than with the id cols):
        {'logistic_regression__C': 0.00026826957952797245},
         {1e-05: -0.45416005751851846,
          1.9306977288832496e-05: -0.4455526673441733,
          3.727593720314938e-05: -0.4381691637960564,
          7.196856730011514e-05: -0.4317755399506975,
          0.00013894954943731373: -0.4270948231851417,
          0.00026826957952797245: -0.4249472014637636,
          0.0005179474679231213: -0.4256511483857503,
          0.001: -0.42909917307589296},
         -0.42207645174265146

In [None]:
# Reload the small training sample and parse `hour` (stored as YYMMDDHH) into datetimes.
df = pd.read_csv('data/train_small.csv')
df['hour'] = pd.to_datetime(df['hour'], format="%y%m%d%H")

# Eight candidate values for C, log-spaced between 1e-5 and 1e-3.
params = np.logspace(start=-5, stop=-3, num=8)
params

In [None]:
best_C, params_dict_ls, scores, test_score = tune_model_three(df, params)
best_C, dict(zip(params, scores)), test_score

## Test tune_logistic_regression_pipeline

In [None]:
# NOTE(review): a bare `raise` with no active exception fails with
# "RuntimeError: No active exception to reraise". Presumably a deliberate
# "stop here" marker so that Run All halts at this cell — confirm.
raise

## Test ClickRateEncoder

In [None]:
from models.base import ClickRateEncoder

In [None]:
def eval_model_three(df, params):
    """Tune ``C`` for a model-three style pipeline and score the best value.

    The pipeline one-hot encodes the categorical columns, adds two
    ``ClickRateEncoder`` outputs (one built from site_id/device_id, one from
    app_id/device_id) and fits a liblinear ``LogisticRegression``. Every
    candidate ``C`` is scored with ``score_params``; the best one (according
    to ``best_param``) is then scored on the held-out split with
    ``fit_and_score``. Timings and the best parameter are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``click``, ``hour`` and the feature columns listed in
        ``model_three_cols`` below.
    params : iterable of float
        Candidate values for ``logistic_regression__C``.

    Returns
    -------
    tuple
        ``(params_dict_ls, scores, test_score)``: the list of parameter
        dicts that were tried, the value returned by ``score_params`` for
        them, and the value returned by ``fit_and_score`` for the best one.
    """
    # These estimators are not imported at notebook level; import them here so
    # the function runs on a fresh kernel instead of failing with NameError.
    from sklearn.compose import ColumnTransformer
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import OneHotEncoder

    # Columns kept from the input frame ('hour' is appended below).
    model_three_cols = ['C1', 'click', 'banner_pos', 'app_id', 'site_id',
                        'device_id', 'device_type', 'device_conn_type',
                        'C15', 'C16', 'C18', 'C19', 'C21']

    # Columns that are one-hot encoded (everything except click/device_id).
    categorical_features = ['C1', 'banner_pos', 'app_id', 'site_id',
                            'device_type', 'device_conn_type',
                            'C15', 'C16', 'C18', 'C19', 'C21']

    clicks = df.click
    df = df[model_three_cols + ['hour']]
    # NOTE(review): 30 is presumably the day held out as the test set —
    # confirm against tools.cv_tools.train_test_split.
    X_train, y_train, X_test, y_test = train_test_split(df, clicks, 30)
    # NOTE(review): presumably the days used as validation folds while
    # tuning — confirm against tools.cv_tools.score_params.
    test_day_ls = [25, 26, 27, 28, 29]

    cr_site_encoder = ClickRateEncoder(['site_id', 'device_id'], 'click_rate_by_site_id')
    cr_app_encoder = ClickRateEncoder(['app_id', 'device_id'], 'click_rate_by_app_id')
    oh_encoder = OneHotEncoder(handle_unknown='ignore')
    preprocessor = ColumnTransformer([
        ('one_hot_encoding', oh_encoder, categorical_features),
        ('click_rate_encoding_site', cr_site_encoder, ['click', 'site_id', 'device_id']),
        ('click_rate_encoding_app', cr_app_encoder, ['click', 'app_id', 'device_id'])
    ])

    lg = LogisticRegression(solver='liblinear')
    pipeline = Pipeline([
        ('preprocessing', preprocessor),
        ('logistic_regression', lg)
    ])

    # One parameter dict per candidate C, addressed through the pipeline step name.
    C_kwd = 'logistic_regression__C'
    params_dict_ls = [{C_kwd: p} for p in params]

    train_begin = time.time()
    scores = score_params(X_train, y_train, pipeline, params_dict_ls, test_day_ls)
    train_time = time.time() - train_begin
    print("Train time: ", train_time)
    best_C = best_param(scores, params_dict_ls)
    print("Best C: ", best_C)

    # Use the best parameter to evaluate the model on the test set.
    test_begin = time.time()
    test_score = fit_and_score(X_train, y_train, X_test, y_test, pipeline, best_C)
    test_time = time.time() - test_begin
    print("Test time: ", test_time)

    return params_dict_ls, scores, test_score

In [None]:
# Reload the small training sample and parse `hour` (stored as YYMMDDHH) into datetimes.
df = pd.read_csv('data/train_small.csv')
df['hour'] = pd.to_datetime(df['hour'], format="%y%m%d%H")

# Eight candidate values for C, log-spaced between 1e-5 and 1e-3.
params = np.logspace(start=-5, stop=-3, num=8)

# Run the evaluation, then recover the best parameter from the returned scores
# (eval_model_three prints it but does not return it).
params_dict_ls, scores, test_score = eval_model_three(df, params)
best_C = best_param(scores, params_dict_ls)
best_C, dict(zip(params, scores)), test_score

## Develop test_model_three_cols

In [None]:
import pandas as pd
from models.model_three import categorical_features
df = pd.read_csv('data/train_tiny.csv')

In [None]:
def get_nuniques(df):
    """Map every column label of ``df`` to that column's ``nunique()`` value."""
    return {column: df[column].nunique() for column in df}

nuniques = get_nuniques(df[categorical_features])
sum(nuniques.values())