# Model 2

Logistic regression model using columns 'C1',
                 'banner_pos',
                 'site_category',
                 'app_category',
                 'device_id' (or 'device_ip'),
                 'device_type',
                 'device_conn_type',
                 'C15',
                 'C16',
                 'C18',
                 'C19',
                 'C21'.
                 
TODO:
- Try device_ip instead of device_id.
- Keep site/app_category columns or not.
    Score both options.
- What to do with click rates when a category is null?

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

from tools.cv_tools import (
    fit_and_score, neg_log_loss_score,
    train_test_split, score_one_param, score_one_test_day,
    score_one_param, score_params, best_param
)
from models.model_two import ClickRateByCategoryEncoder
import time

## Get Stats
'user-app' without site_category, and 'user-app' with both category columns do equally well.

- 'both' on small: 
        Train time:  308.1253786087036 for 10 params.
        {'logistic_regression__C': 0.0001668100537200059},
         {1e-06: -0.5259801028792899,
      2.782559402207126e-06: -0.4789365908110825,
      7.742636826811277e-06: -0.4553710654798575,
      2.1544346900318823e-05: -0.44309836969574495,
      5.994842503189409e-05: -0.43532359556541256,
      0.0001668100537200059: -0.4314043467010145,
      0.00046415888336127773: -0.43220038494735136,
      0.0012915496650148827: -0.4374468348132501,
      0.003593813663804626: -0.4461029965405728,
      0.01: -0.4567780399908202},
         -0.4269088061828156
- 'user-app' on small (does slightly better than 'both'):
        Train time:  268.7218544483185
        {'logistic_regression__C': 0.00046415888336127773},
         {1e-06: -0.5259995434042378,
          2.782559402207126e-06: -0.4789478288366912,
          7.742636826811277e-06: -0.45552751310826645,
          2.1544346900318823e-05: -0.4434527413812714,
          5.994842503189409e-05: -0.43562661677200004,
          0.0001668100537200059: -0.43069380196244567,
          0.00046415888336127773: -0.42957727353281694,
          0.0012915496650148827: -0.4331400060813224,
          0.003593813663804626: -0.4405536832083817,
          0.01: -0.4504950454573378},
         -0.424894260803548
- 'user-site' on small (about the same as 'both'):
        {'logistic_regression__C': 0.00046415888336127773},
         {1e-06: -0.5260020677273898,
          2.782559402207126e-06: -0.4789420973060702,
          7.742636826811277e-06: -0.4555078885594261,
          2.1544346900318823e-05: -0.4434335000627961,
          5.994842503189409e-05: -0.4356378182723383,
          0.0001668100537200059: -0.43069260155100286,
          0.00046415888336127773: -0.4294539029722898,
          0.0012915496650148827: -0.43287661291967944,
          0.003593813663804626: -0.4402268603570036,
          0.01: -0.4501753929856996},
         -0.42632065113113626
- 'user-app' w/o site_category, on small:
        {'logistic_regression__C': 0.00046415888336127773},
         {1e-06: -0.5295815822182071,
          2.782559402207126e-06: -0.48200872728139166,
          7.742636826811277e-06: -0.4581443428821463,
          2.1544346900318823e-05: -0.44557855129477486,
          5.994842503189409e-05: -0.4369826281652913,
          0.0001668100537200059: -0.4313523670534364,
          0.00046415888336127773: -0.42981673074962073,
          0.0012915496650148827: -0.4332602245723451,
          0.003593813663804626: -0.44077409504598986,
          0.01: -0.4509166345753231},
         -0.42463570544774826
- 'user-app' w/o app_category on small:
        {'logistic_regression__C': 0.00026826957952797245},
         {1e-05: -0.4525667020251255,
          1.9306977288832496e-05: -0.44508917084082017,
          3.727593720314938e-05: -0.43935208154680716,
          7.196856730011514e-05: -0.43488424473766035,
          0.00013894954943731373: -0.4316798976813601,
          0.00026826957952797245: -0.4299865187950174,
          0.0005179474679231213: -0.4301291664156602,
          0.001: -0.4321171786896193},
         -0.42727808818222285

In [2]:
class ColumnInspector(BaseEstimator, TransformerMixin):
    """Pass-through pipeline step that prints the shape of the data it sees.

    Insert it between two pipeline steps to check how many rows/columns the
    previous step produced. The data itself is returned untouched.
    """

    def __init__(self):
        # No hyper-parameters to store.
        pass

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X):
        """Print ``X.shape`` and return ``X`` unchanged."""
        shape = X.shape
        print(shape)
        return X

In [3]:
def eval_model_two(df, params, interaction='user-app', test_day=30,
                   cv_days=(25, 26, 27, 28, 29)):
    """Tune C for the "model two" logistic regression and score the best value.

    The pipeline one-hot encodes the categorical columns, adds the features
    produced by ``ClickRateByCategoryEncoder`` and fits a liblinear
    ``LogisticRegression``. Every candidate C is scored with ``score_params``;
    the value picked by ``best_param`` is then passed to ``fit_and_score``
    together with the held-out split.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``click``, ``hour`` and every column listed in
        ``model_two_cols`` below.
    params : iterable of float
        Candidate values for ``logistic_regression__C``.
    interaction : str, default 'user-app'
        Mode passed to ``ClickRateByCategoryEncoder``. This notebook uses
        'both', 'user-app' and 'user-site'.
    test_day : int, default 30
        Third argument of ``train_test_split``
        (presumably the day held out for testing - confirm in tools.cv_tools).
    cv_days : iterable of int, default (25, 26, 27, 28, 29)
        Days handed to ``score_params`` as a list
        (presumably one validation fold per day - confirm in tools.cv_tools).

    Returns
    -------
    tuple
        ``(params_dict_ls, scores, test_score)``: the list of
        ``{'logistic_regression__C': value}`` dicts, whatever ``score_params``
        returned for them, and the result of ``fit_and_score`` for the best C.

    Notes
    -----
    Prints the tuning time, the selected C and the final fit/score time.
    """
    model_two_cols = [
        'C1',
        'click',
        'banner_pos',
        'app_category',
        'site_category',
        'device_id',
        'device_type',
        'device_conn_type',
        'C15',
        'C16',
        'C18',
        'C19',
        'C21',
    ]

    # all except click_rate are categorical features
    categorical_features = [
        'C1',
        'banner_pos',
        #'app_category',
        'site_category',
        'device_type',
        'device_conn_type',
        'C15',
        'C16',
        'C18',
        'C19',
        'C21',
    ]

    # Columns routed to the click-rate encoder ('click' is the target it averages).
    click_rate_cols = ['click', 'app_category', 'site_category', 'device_id']

    clicks = df.click
    # 'hour' is kept alongside the model columns for the split/scoring helpers.
    df = df[model_two_cols + ['hour']]
    X_train, y_train, X_test, y_test = train_test_split(df, clicks, test_day)
    test_day_ls = list(cv_days)

    cr_encoder = ClickRateByCategoryEncoder(interaction)
    oh_encoder = OneHotEncoder(handle_unknown='ignore')
    preprocessor = ColumnTransformer([
        ('one_hot_encoding', oh_encoder, categorical_features),
        ('click_rate_encoding', cr_encoder, click_rate_cols),
    ])

    lg = LogisticRegression(solver='liblinear')
    pipeline = Pipeline([
        ('preprocessing', preprocessor),
        ('logistic_regression', lg),
    ])

    C_kwd = 'logistic_regression__C'
    params_dict_ls = [{C_kwd: p} for p in params]

    # perf_counter is monotonic, so elapsed times are immune to clock adjustments.
    train_begin = time.perf_counter()
    scores = score_params(X_train, y_train, pipeline, params_dict_ls, test_day_ls)
    train_time = time.perf_counter() - train_begin
    print("Train time: ", train_time)
    best_C = best_param(scores, params_dict_ls)
    print("Best C: ", best_C)

    # Use the best parameter to evaluate the model on the test set.
    test_begin = time.perf_counter()
    test_score = fit_and_score(X_train, y_train, X_test, y_test, pipeline, best_C)
    test_time = time.perf_counter() - test_begin
    print("Test time: ", test_time)

    return params_dict_ls, scores, test_score

In [4]:
df = pd.read_csv('data/train_tiny.csv')
df.hour = pd.to_datetime(df.hour, format="%y%m%d%H")
params = np.logspace(-5, -3, num=8)
params

array([1.00000000e-05, 1.93069773e-05, 3.72759372e-05, 7.19685673e-05,
       1.38949549e-04, 2.68269580e-04, 5.17947468e-04, 1.00000000e-03])

In [5]:
params_dict_ls, scores, test_score = eval_model_two(df, params)
best_C = best_param(scores, params_dict_ls)
best_C, dict(zip(params, scores)), test_score

Train time:  1.7852497100830078
Best C:  {'logistic_regression__C': 0.001}
Test time:  0.04359626770019531


({'logistic_regression__C': 0.001},
 {1e-05: -0.688522562115143,
  1.9306977288832496e-05: -0.6843469427644564,
  3.727593720314938e-05: -0.6766170252637685,
  7.196856730011514e-05: -0.6628246813814747,
  0.00013894954943731373: -0.6397767633077331,
  0.00026826957952797245: -0.6052270266107701,
  0.0005179474679231213: -0.5611377052203533,
  0.001: -0.5152737610899033},
 -0.4894974870332799)

In [6]:
# A bare `raise` with no active exception fails with RuntimeError (see the output below).
# NOTE(review): presumably a deliberate stop so that "Run All" halts before the
# exploratory cells that follow - confirm.
raise

RuntimeError: No active exception to reraise

## Study ClickRateByCategoryEncoder
Expect 10-15% of the rows in a test set to have no corresponding row in the train set.

In [None]:
model_two_cols = ['C1',
                  'click',
                 'banner_pos',
                  'app_category',
                  'site_category',
                  'device_id',
                 'device_type',
                 'device_conn_type',
                 'C15',
                 'C16',
                 'C18',
                 'C19',
                 'C21']

In [None]:
df = pd.read_csv('data/train_small.csv')
df.hour = pd.to_datetime(df.hour, format="%y%m%d%H")
clicks = df.click
df = df[model_two_cols + ['hour']]
X_train, y_train, X_test, y_test = train_test_split(df, clicks, 30)

In [None]:
df.shape

In [None]:
# TODO: summary statistics of click_rate columns
cr_encoder = ClickRateByCategoryEncoder('both')
encoded_train = cr_encoder.fit_transform(X_train)
encoded_train.head(1)

In [None]:
# Study null values
encoded_test = cr_encoder.transform(X_test)

In [None]:
# More than 10% of the rows in the test set do not have a corresponding row in the train set,
# so their click-rate features come out as NaN; the tuple below is the NaN fraction per feature.
encoded_test.click_rate_app.isna().mean(), encoded_test.click_rate_site.isna().mean()

In [None]:
df_test = pd.read_csv('data/test_small.csv')

In [None]:
df_test_encoded = cr_encoder.transform(df_test)

In [None]:
df_test_encoded.click_rate_site.isna().mean(), df_test_encoded.click_rate_app.isna().mean()

## ClickRateByCategoryEncoder seems broken (fixed)

In [None]:
model_two_cols = ['C1',
                  'click',
                 'banner_pos',
                  'app_category',
                  'site_category',
                  'device_id',
                 'device_type',
                 'device_conn_type',
                 'C15',
                 'C16',
                 'C18',
                 'C19',
                 'C21']

categorical_features = ['C1',
                  'banner_pos',
                 'device_type',
                 'app_category',
                  'site_category',
                'device_conn_type',
                 'C15',
                 'C16',
                 'C18',
                 'C19',
                 'C21']

click_rate_cols = ['click', 'app_category',
                  'site_category',
                  'device_id']

In [None]:
df = pd.read_csv('data/train_tiny.csv')
df.hour = pd.to_datetime(df.hour, format="%y%m%d%H")
clicks = df.click
df = df[model_two_cols + ['hour']]
X_train, y_train, X_test, y_test = train_test_split(df, clicks, 30)

In [None]:
cr_encoder = ClickRateByCategoryEncoder('both')
oh_encoder = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer([
    ('one_hot_encoding', oh_encoder, categorical_features),
    ('click_rate_encoding', cr_encoder, click_rate_cols)
])

lg = LogisticRegression(solver='liblinear')
pipeline = Pipeline([('preprocessing', preprocessor),
                     ('inspect', ColumnInspector()),
                 ('logistic_regression', lg)])

In [None]:
param = {'logistic_regression__C': 0.01584893192461114}
test_score = fit_and_score(X_train, y_train, X_test, y_test, pipeline, param)

In [None]:
cr_encoder = ClickRateByCategoryEncoder('both')
encoded = cr_encoder.fit_transform(X_train)
encoded.head()

In [None]:
cr_encoder = ClickRateByCategoryEncoder('user-app')
encoded = cr_encoder.fit_transform(X_train)
encoded.head()

In [None]:
cr_encoder = ClickRateByCategoryEncoder('user-site')
encoded = cr_encoder.fit_transform(X_train)
encoded.head()

## Develop a Pipeline model

In [None]:
class ClickRateEncoder(BaseEstimator, TransformerMixin):
    """Add per-(category, device_id) mean click rates as numeric features.

    ``fit`` learns the mean of ``click`` for every (site_category, device_id)
    and every (app_category, device_id) pair; ``transform`` left-joins those
    means onto the input as ``click_rate_site`` and ``click_rate_app``.
    Pairs that were not seen during ``fit`` get NaN.
    """

    user_site_interaction_cols = ['site_category', 'device_id']
    user_app_interaction_cols = ['app_category', 'device_id']

    def __init__(self):
        # Lookup tables built by fit(); None until then.
        self.click_rates_by_site_category = None
        self.click_rates_by_app_category = None

    def fit(self, X, y=None):
        """
        Learn the click-rate lookup tables.

        X must have the following columns: 'click', 'site_category',
        'app_category', and 'device_id'. ``y`` is ignored; the target is read
        from X's 'click' column.

        Returns self.
        """
        self.click_rates_by_site_category = (
            X.groupby(self.user_site_interaction_cols)
            .agg({'click': 'mean'})
            .rename({'click': 'click_rate_site'}, axis=1)
        )
        self.click_rates_by_app_category = (
            X.groupby(self.user_app_interaction_cols)
            .agg({'click': 'mean'})
            .rename({'click': 'click_rate_app'}, axis=1)
        )
        return self

    def transform(self, X):
        """
        Return a copy of X with 'click_rate_site' and 'click_rate_app' added.

        X must have 'site_category', 'app_category' and 'device_id'.
        The 'device_id' column is dropped, and so is 'click' when present
        (unlabeled data without a 'click' column is accepted).
        The result keeps X's index.
        """
        encoded = pd.merge(X, self.click_rates_by_site_category, how='left',
                           on=self.user_site_interaction_cols)
        encoded = pd.merge(encoded, self.click_rates_by_app_category, how='left',
                           on=self.user_app_interaction_cols)
        # Each lookup table has one row per key, so the left joins keep X's row
        # count and order; restore X's index so the output stays aligned with it.
        encoded.index = X.index
        # Debug trace kept from the original implementation.
        print('transform called.', encoded.columns)
        # errors='ignore': 'click' is absent when transforming unlabeled data.
        return encoded.drop(columns=['click', 'device_id'], errors='ignore')

In [None]:
df = pd.read_csv('data/train_tiny.csv')
df.hour = pd.to_datetime(df.hour, format="%y%m%d%H")

In [None]:
model_two_cols = ['C1',
                  'click',
                 'banner_pos',
                  'app_category',
                  'site_category',
                  'device_id',
                 'device_type',
                 'device_conn_type',
                 'C15',
                 'C16',
                 'C18',
                 'C19',
                 'C21']

# all except click_rate are categorical features
categorical_features = ['C1',
                  'banner_pos',
                    'app_category',
                  'site_category',
                 'device_type',
                 'device_conn_type',
                 'C15',
                 'C16',
                 'C18',
                 'C19',
                 'C21']

In [None]:
clicks = df.click
df = df[model_two_cols + ['hour']]
X_train, y_train, X_test, y_test = train_test_split(df, clicks, 30)

In [None]:
X_train.head(1)

In [None]:
# Build the pipeline: click-rate encoding -> one-hot encoding -> logistic regression.
cr_encoder = ClickRateEncoder()
oh_encoder = OneHotEncoder(handle_unknown='ignore')
# NOTE(review): ColumnTransformer's `remainder` defaults to 'drop', so only the
# one-hot encoded `categorical_features` reach the classifier. The
# click_rate_site / click_rate_app columns added by ClickRateEncoder appear to
# be discarded at this step - confirm whether that is intended.
preprocessor = ColumnTransformer([
    ('one_hot_encoding', oh_encoder, categorical_features)
])

# Transformer-only variant (no classifier), kept for inspecting intermediate output.
#transformer = Pipeline([('click_rate_encoder', cr_encoder),
#    ('preprocessing', preprocessor)])

lg = LogisticRegression(solver='liblinear')
pipeline = Pipeline([('click_rate_encoder', cr_encoder),
    ('preprocessing', preprocessor),
                 ('logistic_regression', lg)])


In [None]:
#pipeline.fit(X_train, y_train)
#transformer.fit(X_train, y_train)
#transformer.transform(X_test)

In [None]:
param = {'logistic_regression__C': 0.21544346900318823}
test_score = fit_and_score(X_train, y_train, X_test, y_test, pipeline, param)

In [None]:
test_score

In [None]:
# A bare `raise` with no active exception fails with RuntimeError.
# NOTE(review): presumably a deliberate stop so that "Run All" halts before the
# older "Develop model" cells below - confirm.
raise

## Develop model

In [None]:
# How do the "null" placeholder categories of site_category and app_category co-occur?
df = pd.read_csv('data/train_smaller.csv')

site_category_null = '50e219e0'
app_category_null = '07d7df22'

# Row-wise flags: does the column hold its "null" placeholder value?
site_is_null = df.site_category == site_category_null
app_is_null = df.app_category == app_category_null

print((site_is_null | app_is_null).mean())
# => 1. In every row at least one of site_category / app_category is null.
print((~site_is_null | ~app_is_null).mean())
# => 0.933. In most rows at least one of the two is not null.
print((site_is_null & app_is_null).mean())
# => 0.067. A small percentage of rows has both features null.
print((site_is_null ^ app_is_null).mean())
# => 0.933. In these rows exactly one of the two is null.

In [None]:
model_two_cols = ['C1',
                  'click_rate_site',
                  'click_rate_app',
                 'banner_pos',
                  'app_category',
                  'site_category',
                 'device_type',
                 'device_conn_type',
                 'C15',
                 'C16',
                 'C18',
                 'C19',
                 'C21']

# all except click_rate are categorical features
categorical_features = ['C1',
                  'banner_pos',
                    'app_category',
                  'site_category',
                 'device_type',
                 'device_conn_type',
                 'C15',
                 'C16',
                 'C18',
                 'C19',
                 'C21']

In [None]:
df = pd.read_csv('data/train_small.csv')
df.hour = pd.to_datetime(df.hour, format="%y%m%d%H")

In [None]:
# Day 30 is for testing
# NOTE(review): every other cell in this notebook unpacks four values from
# train_test_split(df, clicks, 30); confirm that passing None as the target
# really returns a (train, test) pair of DataFrames.
df_train, df_test = train_test_split(df, None, 30)
y_train = df_train.click
test_day_ls = [25,26,27,28,29]

# TODO: turn these into a pipeline step (how? They need access to the target variable.)
# Key columns used below to group rows when averaging `click`.
user_site_interaction_cols = ['site_category', 'device_id']
user_app_interaction_cols = ['app_category', 'device_id']
df_train.head(1)

In [None]:
click_rates_by_site_category = df_train.groupby(user_site_interaction_cols).agg({'click': 'mean'})\
    .rename({'click': 'click_rate_site'}, axis=1)
df_train = pd.merge(df_train, click_rates_by_site_category, how='left',
                  on=user_site_interaction_cols)
df_train.head()

In [None]:
df_train.columns

In [None]:
click_rates_by_app_category = df_train.groupby(user_app_interaction_cols).agg({'click': 'mean'})\
    .rename({'click': 'click_rate_app'}, axis=1)
click_rates_by_app_category.head()

In [None]:
df_train = pd.merge(df_train, click_rates_by_app_category, how='left',
                  on=user_app_interaction_cols)

X_train = df_train[model_two_cols]
y_train = df_train.click
X_train.head()

In [None]:
y_test = df_test.click
X_test = df_test
X_test = pd.merge(X_test, click_rates_by_site_category, how='left',
                  on=user_site_interaction_cols)
X_test = pd.merge(X_test, click_rates_by_app_category, how='left',
                  on=user_app_interaction_cols)
X_test = X_test[model_two_cols]
X_test.head()

In [None]:
X_test.click_rate_app.isna().mean(), X_test.click_rate_site.isna().mean()

In [None]:
# One-hot encode the categorical columns, then fit a logistic regression.
oh_encoder = OneHotEncoder(handle_unknown='ignore')
# NOTE(review): ColumnTransformer's `remainder` defaults to 'drop', so columns not
# listed in `categorical_features` (including click_rate_site / click_rate_app,
# which are in model_two_cols) do not reach the classifier - confirm whether
# that is intended.
preprocessor = ColumnTransformer([
    ('one_hot_encoding', oh_encoder, categorical_features)
])

lg = LogisticRegression(solver='liblinear')
pipeline = Pipeline([('preprocessing', preprocessor),
                 ('logistic_regression', lg)])

In [None]:
%%time
param = {'logistic_regression__C': 0.21544346900318823}
# TODO: append click_rate_app/site to X_test.
test_score = fit_and_score(X_train, y_train,
                           X_test, y_test, pipeline, param)

In [None]:
test_score