TODO: upgrade the model to support sklearn 0.20. Use the new OneHotEncoder, which supports string features, and ColumnTransformer.

Logistic regression model using columns 'C1',
                 'banner_pos',
                 'site_category',
                 'app_category',
                 'device_type',
                 'device_conn_type',
                 'C15',
                 'C16',
                 'C18',
                 'C19',
                 'C21'.
                 
### Model Stats
- 01m55s to train 'small' for 2 parameters.
- 05m22s to train 'small' for 6 parameters.
- 07m21s to train 'small' for 10 parameters.
- {0.0001: -0.43356715906855536,
  0.001: -0.42629521272437554,
  0.01: -0.4242657276526611,
  0.1: -0.42427721774829286,
  1.0: -0.4245292557309675,
  10.0: -0.4246627635916426,
  100.0: -0.42472018597787536,
  1000.0: -0.42473047099609146,
  10000.0: -0.4247623788457422}
- {0.001: -0.42629521272437554,
  0.0021544346900318843: -0.4252131028945746,
  0.004641588833612777: -0.42457692978190453,
  0.01: -0.4242657276526611,
  0.021544346900318832: -0.4241657671706842,
  0.046415888336127774: -0.42418784783505004,
  0.1: -0.42427721774829286,
  0.21544346900318823: -0.4243908104081527,
  0.46415888336127775: -0.4244788065512767,
  1.0: -0.4245292557309675},
  Test score = -0.4251768174185952
- Submission trained on train_small:
    + Private 0.4168748
    + Public 0.4184437
- Train on 0.5 of the data:
    + Time: 10m10s.
    + Used around 20GB of memory.
    + Private 0.4172254
    + Public 0.4188042

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from tools.cv_tools import fit_encoders, neg_log_loss_score, train_test_split

import operator
import time

## Develop cross validation
Test set = Day 30. Days 20-29 are partitioned into train and dev sets.

| Train | Dev |
|:-----:|:----:|
| 20-28 | 29 |
| 20-27 | 28 |
| ...   | ...|
| 20-24 | 25 |

In [2]:
# helper functions

# this is specific to the model
def fit_transform_train(X_train, site_category_encoder, app_category_encoder):
    """Label-encode the string columns, then fit and apply a one-hot encoder.

    The caller's DataFrame is left untouched: the encoded columns are written
    to a new frame instead of being assigned in place.

    Parameters
    ----------
    X_train : pd.DataFrame
        Training features. Must contain ``site_category`` and
        ``app_category`` columns.
    site_category_encoder, app_category_encoder
        Already-fitted encoders exposing ``transform``; applied to the
        ``site_category`` and ``app_category`` columns respectively.

    Returns
    -------
    tuple
        ``(X_encoded, oh_encoder)``: the one-hot encoded training matrix and
        the fitted ``OneHotEncoder`` (to be reused on the dev/test set).
    """
    # step 0: apply site_category encoder
    # step 1: apply app_category encoder
    # `assign` returns a new DataFrame, so the input frame is not mutated.
    encoded = X_train.assign(
        site_category=site_category_encoder.transform(X_train.site_category),
        app_category=app_category_encoder.transform(X_train.app_category),
    )
    # step 2: apply one-hot encoding
    # when transforming, an unknown categorical feature is mapped to a zero vector
    oh_encoder = OneHotEncoder(handle_unknown='ignore')
    return oh_encoder.fit_transform(encoded.values), oh_encoder

# this is specific to the model
def transform_dev(X_dev, site_category_encoder, app_category_encoder, oh_encoder):
    """Apply the already-fitted encoders to a dev/test feature frame.

    The caller's DataFrame is left untouched: the encoded columns are written
    to a new frame instead of being assigned in place.

    Parameters
    ----------
    X_dev : pd.DataFrame
        Features to transform. Must contain ``site_category`` and
        ``app_category`` columns.
    site_category_encoder, app_category_encoder
        Fitted encoders exposing ``transform`` for the two string columns.
    oh_encoder
        Fitted one-hot encoder returned by ``fit_transform_train``.

    Returns
    -------
    The result of ``oh_encoder.transform`` on the label-encoded values.
    """
    # step 0: apply site_category encoder
    # step 1: apply app_category encoder
    # `assign` returns a new DataFrame, so the input frame is not mutated.
    encoded = X_dev.assign(
        site_category=site_category_encoder.transform(X_dev.site_category),
        app_category=app_category_encoder.transform(X_dev.app_category),
    )
    # step 2: one-hot encode. Pass a plain array, matching what the encoder
    # was fitted on in fit_transform_train (which uses `.values`).
    return oh_encoder.transform(encoded.values)

In [3]:
def _fit_and_score(X_train, y_train, X_dev, y_dev,
                   site_category_encoder, app_category_encoder, param):
    """Fit a logistic regression with ``C=param`` and score it on the dev set.

    Both feature frames still carry the ``hour`` column (it is needed for
    splitting by day), so it is dropped here, right before training.

    Returns the value of ``neg_log_loss_score`` for the fitted model on the
    transformed dev set.
    """
    train_features = X_train.drop('hour', axis=1)
    dev_features = X_dev.drop('hour', axis=1)

    # Encode the training features and keep the fitted one-hot encoder.
    train_matrix, oh_encoder = fit_transform_train(
        train_features, site_category_encoder, app_category_encoder)

    model = LogisticRegression(C=param)
    model.fit(train_matrix, y_train)

    # Reuse the encoders fitted above for the dev features.
    dev_matrix = transform_dev(
        dev_features, site_category_encoder, app_category_encoder, oh_encoder)
    return neg_log_loss_score(model, dev_matrix, y_dev)
    
def score_one_test_day(X_train, y_train,
                       site_category_encoder, app_category_encoder,
                       test_day, param):
    """Split the data on ``test_day`` and return the dev score for ``param``.

    The split is delegated to ``train_test_split(X_train, y_train, test_day)``;
    fitting and scoring are delegated to ``_fit_and_score``.
    """
    split = train_test_split(X_train, y_train, test_day)
    fold_X_train, fold_y_train, fold_X_dev, fold_y_dev = split
    return _fit_and_score(
        fold_X_train, fold_y_train, fold_X_dev, fold_y_dev,
        site_category_encoder, app_category_encoder, param)

def score_one_param(X_train, y_train,
                    site_category_encoder, app_category_encoder,
                    param, test_day_ls):
    """Return the mean score of ``param`` over every day in ``test_day_ls``.

    Each day is scored independently with ``score_one_test_day``; the result
    is the arithmetic mean of those per-day scores.
    """
    per_day_scores = [
        score_one_test_day(X_train, y_train,
                           site_category_encoder, app_category_encoder,
                           day, param)
        for day in test_day_ls
    ]

    # One score per requested day.
    assert len(per_day_scores) == len(test_day_ls)

    return sum(per_day_scores) / len(per_day_scores)
    
def score_params(X_train, y_train,
                 site_category_encoder, app_category_encoder,
                 param_ls, test_day_ls):
    """
    Score every parameter in ``param_ls`` with ``score_one_param``.

    Returns a map: parameter -> mean score.
    """
    return {
        candidate: score_one_param(X_train, y_train,
                                   site_category_encoder, app_category_encoder,
                                   candidate, test_day_ls)
        for candidate in param_ls
    }

def best_param(score_dict):
    """Return the key of ``score_dict`` whose value (score) is the largest.

    On ties the first such key in iteration order wins; an empty mapping
    raises ``ValueError`` (from ``max``).
    """
    return max(score_dict, key=score_dict.get)

In [4]:
def eval_model_one(df, params):
    """
    Tune C for model one by day-based cross validation, then test on day 30.

    df: pd.DataFrame. An output of pd.read_csv('train.csv') with its hour column formatted.
    params: candidate values of C to try.

    Prints the search time, the best C and the test time.
    Returns (scores, test_score): the map parameter -> mean dev score, and the
    score of the best parameter on the day-30 test set.
    """
    feature_cols = [
        'C1',
        'banner_pos',
        'site_category',
        'app_category',
        'device_type',
        'device_conn_type',
        'C15',
        'C16',
        'C18',
        'C19',
        'C21',
    ]

    # Encoders for the two string columns are fitted on the whole frame.
    label_encoders = fit_encoders(df, ['site_category', 'app_category'])
    site_category_encoder = label_encoders['site_category']
    app_category_encoder = label_encoders['app_category']

    labels = df.click
    # The hour column is kept alongside the features because the split needs it.
    features = df[feature_cols + ['hour']]

    # Day 30 is for testing; days 25-29 each serve once as the dev day.
    X_train, y_train, X_test, y_test = train_test_split(features, labels, 30)
    dev_days = [25, 26, 27, 28, 29]

    search_started = time.time()
    scores = score_params(X_train, y_train,
                          site_category_encoder, app_category_encoder,
                          params, dev_days)
    search_elapsed = time.time() - search_started
    print("Train time: ", search_elapsed)

    best_C = best_param(scores)
    print("Best C: ", best_C)

    # Use the best parameter to evaluate the model on the test set.
    test_started = time.time()
    test_score = _fit_and_score(X_train, y_train, X_test, y_test,
                                site_category_encoder, app_category_encoder, best_C)
    test_elapsed = time.time() - test_started
    print("Test time: ", test_elapsed)

    return scores, test_score

In [5]:
# Load data/train_tiny.csv and parse the `hour` column (format yymmddHH) into datetimes;
# the day of `hour` is what the train/dev/test splits are based on.
df = pd.read_csv('data/train_tiny.csv')
df.hour = pd.to_datetime(df.hour, format="%y%m%d%H")

In [6]:
# Candidate values of C (passed to LogisticRegression) to search over.
# Earlier alternatives, kept for reference:
#params = [0.001]
#K = 4
#params = np.logspace(-K, K, num=K*2+1)
# Current grid: 10 log-spaced values from 1e-3 to 1 (shown in the output below).
params = np.logspace(-3, 0, num=10)
params

array([0.001     , 0.00215443, 0.00464159, 0.01      , 0.02154435,
       0.04641589, 0.1       , 0.21544347, 0.46415888, 1.        ])

In [7]:
%%time
# Run the parameter search, then evaluate the best parameter on the day-30 test set.
scores, test_score = eval_model_one(df, params)

Train time:  1.184729814529419
Best C:  0.46415888336127775
Test time:  0.016676902770996094
CPU times: user 1.21 s, sys: 2.23 ms, total: 1.21 s
Wall time: 1.22 s


In [8]:
# Pick the parameter with the highest mean score and display it together
# with the full score map and the test score.
best_C = best_param(scores)
best_C, scores, test_score

(0.46415888336127775,
 {0.001: -0.5112606484015165,
  0.0021544346900318843: -0.4688670121876982,
  0.004641588833612777: -0.44216972031340884,
  0.01: -0.4271120181641944,
  0.021544346900318832: -0.4184112774561518,
  0.046415888336127774: -0.4131558089535118,
  0.1: -0.41001835673647247,
  0.21544346900318823: -0.4082597092380721,
  0.46415888336127775: -0.40789482838251995,
  1.0: -0.41009488890773527},
 -0.470478664934517)

In [9]:
# A bare `raise` with no active exception fails with RuntimeError (see the output below).
# NOTE(review): presumably a deliberate stop so that running all cells does not
# execute the older "Model 1 (baseline)" cells that follow; confirm.
raise

RuntimeError: No active exception to reraise

## Model 1 (baseline)

In [None]:
def build_colname_idx(df):
    """Create a map: column name -> positional index of that column in ``df``."""
    return {name: position for position, name in enumerate(df.columns)}

# Feature columns used by model 1; train_dev_test_split selects these plus 'click'.
model1_cols = ['C1',
             'banner_pos',
             'site_category',
             'app_category',
             'device_type',
             'device_conn_type',
             'C15',
             'C16',
             'C18',
             'C19',
             'C21']

# The two columns that get label-encoded before one-hot encoding
# (presumably the string-valued ones; confirm against the data).
str_cols = ['site_category', 'app_category']

In [None]:
# Load data/train_tiny.csv and parse the `hour` column (format yymmddHH) into datetimes;
# train_dev_test_split uses the day of `hour` to hold out the last day.
df = pd.read_csv('data/train_tiny.csv')
df.hour = pd.to_datetime(df.hour, format="%y%m%d%H")

In [None]:
def fit_encoders(df):
    """Fit one LabelEncoder each for the site_category and app_category columns.

    NOTE: this definition rebinds the name ``fit_encoders`` that the top of
    the file imports from ``tools.cv_tools``. That imported version is called
    with a column list and its result is indexed by column name in
    ``eval_model_one``, so the two are not interchangeable.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing ``site_category`` and ``app_category`` columns.

    Returns
    -------
    tuple
        ``(site_category_encoder, app_category_encoder)``, both fitted.
    """
    # LabelEncoder is not among the file's top-level imports, so bring it
    # into scope here; otherwise this function fails with NameError.
    from sklearn.preprocessing import LabelEncoder

    site_category_encoder = LabelEncoder()
    site_category_encoder.fit(df.site_category)
    app_category_encoder = LabelEncoder()
    app_category_encoder.fit(df.app_category)
    return site_category_encoder, app_category_encoder

In [None]:
# LabelEncoder doesn't handle unknown values, so the encoders are fitted on the
# entire dataset (before it is split into train/dev/test).
site_category_encoder, app_category_encoder = fit_encoders(df)

In [None]:
def train_dev_test_split(df):
    """Split ``df`` into train, dev and test sets for model 1.

    Rows whose ``hour`` falls on day 30 are held out and split 50/50 (with a
    fixed random seed) into dev and test; all other rows form the train set.
    Only the ``model1_cols`` features and the ``click`` label are kept.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a datetime ``hour`` column, the ``model1_cols`` columns and
        a ``click`` column.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_dev, y_dev, X_test, y_test)``.
    """
    # The module-level name `train_test_split` is rebound by the later
    # `from tools.cv_tools import ..., train_test_split`, which this file calls
    # as (X, y, test_day). The keyword call below needs sklearn's splitter, so
    # import it explicitly under its own name.
    from sklearn.model_selection import train_test_split as sk_train_test_split

    last_day = 30
    last_day_mask = df.hour.dt.day == last_day
    keep_cols = model1_cols + ['click']

    # Held-out day: half becomes the dev set, the other half the test set.
    df_held_out = df[last_day_mask][keep_cols]
    df_dev, df_test = sk_train_test_split(df_held_out, test_size=0.5, random_state=23)

    df_train = df[~last_day_mask][keep_cols]

    X_train = df_train.drop('click', axis=1)
    y_train = df_train.click
    X_dev = df_dev.drop('click', axis=1)
    y_dev = df_dev.click
    X_test = df_test.drop('click', axis=1)
    y_test = df_test.click
    return X_train, y_train, X_dev, y_dev, X_test, y_test

In [None]:
# train/dev/test split: day 30 is held out and halved into dev and test,
# every other day goes to train.
X_train, y_train, X_dev, y_dev, X_test, y_test = train_dev_test_split(df)

In [None]:
def fit_transform_train(X_train, site_category_encoder, app_category_encoder):
    """Label-encode the string columns, then fit and apply a one-hot encoder.

    The caller's DataFrame is left untouched: the encoded columns are written
    to a new frame instead of being assigned in place.

    Parameters
    ----------
    X_train : pd.DataFrame
        Training features. Must contain ``site_category`` and
        ``app_category`` columns.
    site_category_encoder, app_category_encoder
        Already-fitted encoders exposing ``transform``; applied to the
        ``site_category`` and ``app_category`` columns respectively.

    Returns
    -------
    tuple
        ``(X_encoded, oh_encoder)``: the one-hot encoded training matrix and
        the fitted ``OneHotEncoder`` (to be reused on the dev/test set).
    """
    # `assign` returns a new DataFrame, so the input frame is not mutated.
    encoded = X_train.assign(
        site_category=site_category_encoder.transform(X_train.site_category),
        app_category=app_category_encoder.transform(X_train.app_category),
    )
    # when transforming, an unknown categorical feature is mapped to a zero vector
    oh_encoder = OneHotEncoder(handle_unknown='ignore')
    return oh_encoder.fit_transform(encoded.values), oh_encoder

In [None]:
# Record each feature column's position while X_train is still a DataFrame
# (a later cell replaces X_train with its encoded form), and display the map.
train_colname_idx = build_colname_idx(X_train)
train_colname_idx

In [None]:
# prepare the train set: label-encode the two category columns, then fit and apply
# the one-hot encoder. X_train is rebound to the encoded matrix; oh_encoder is kept
# so the dev set can be transformed the same way.
X_train, oh_encoder = fit_transform_train(X_train, site_category_encoder, app_category_encoder)

In [None]:
# Baseline fit with the default LogisticRegression settings.
# tune: C. Maybe use class_weight. Try different solvers.
# Can be parallelized if multi-class.
lg = LogisticRegression()
lg.fit(X_train, y_train)

In [None]:
def transform_dev(X_dev, site_category_encoder, app_category_encoder, oh_encoder):
    """Apply the already-fitted encoders to a dev/test feature frame.

    The caller's DataFrame is left untouched: the encoded columns are written
    to a new frame instead of being assigned in place.

    Parameters
    ----------
    X_dev : pd.DataFrame
        Features to transform. Must contain ``site_category`` and
        ``app_category`` columns.
    site_category_encoder, app_category_encoder
        Fitted encoders exposing ``transform`` for the two string columns.
    oh_encoder
        Fitted one-hot encoder returned by ``fit_transform_train``.

    Returns
    -------
    The result of ``oh_encoder.transform`` on the label-encoded values.
    """
    # `assign` returns a new DataFrame, so the input frame is not mutated.
    encoded = X_dev.assign(
        site_category=site_category_encoder.transform(X_dev.site_category),
        app_category=app_category_encoder.transform(X_dev.app_category),
    )
    # Pass a plain array, matching what the encoder was fitted on in
    # fit_transform_train (which uses `.values`).
    return oh_encoder.transform(encoded.values)

In [None]:
# prepare the dev set with the encoders fitted on the train set;
# X_dev is rebound to the encoded result.
X_dev = transform_dev(X_dev, site_category_encoder, app_category_encoder, oh_encoder)

In [None]:
# Score the fitted model on the dev set. For scikit-learn classifiers `score`
# is mean accuracy, not the log-loss-based score used in the cross-validation cells above.
lg.score(X_dev, y_dev)