Cross-validation for time-series data (forward chaining):
for k in range(2, 10):
    Train on days 1 through k, test on day k+1.


In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression

import operator

## Develop cross validation

In [2]:
# helper functions

def fit_encoders(df):
    """Fit one LabelEncoder per label-encoded category column.

    Args:
        df: frame exposing `site_category` and `app_category` columns.

    Returns:
        Tuple `(site_category_encoder, app_category_encoder)` of fitted
        LabelEncoder instances.
    """
    # LabelEncoder.fit returns the encoder itself, so construction and fitting chain.
    site_category_encoder = LabelEncoder().fit(df.site_category)
    app_category_encoder = LabelEncoder().fit(df.app_category)
    return site_category_encoder, app_category_encoder

def fit_transform_train(X_train, site_category_encoder, app_category_encoder):
    """Label-encode the category columns, then fit a one-hot encoder on the result.

    The input frame is left untouched: the encoded columns are written to a
    new frame instead of being assigned back onto `X_train`.

    Args:
        X_train: DataFrame of features with `site_category` and `app_category` columns.
        site_category_encoder: fitted encoder exposing `transform` for `site_category`.
        app_category_encoder: fitted encoder exposing `transform` for `app_category`.

    Returns:
        Tuple `(X_encoded, oh_encoder)`: the one-hot encoded training matrix and
        the fitted OneHotEncoder, to be reused on dev/test data.
    """
    # `assign` replaces the two columns in place (same column order) on a copy.
    encoded = X_train.assign(
        site_category=site_category_encoder.transform(X_train.site_category),
        app_category=app_category_encoder.transform(X_train.app_category),
    )
    # when transforming, an unknown categorical feature is mapped to a zero vector
    oh_encoder = OneHotEncoder(handle_unknown='ignore')
    return oh_encoder.fit_transform(encoded.values), oh_encoder

def transform_dev(X_dev, site_category_encoder, app_category_encoder, oh_encoder):
    """Apply already-fitted encoders to a dev/test feature frame.

    The input frame is left untouched: the encoded columns are written to a
    new frame instead of being assigned back onto `X_dev`.

    Args:
        X_dev: DataFrame of features with `site_category` and `app_category` columns.
        site_category_encoder: fitted encoder exposing `transform` for `site_category`.
        app_category_encoder: fitted encoder exposing `transform` for `app_category`.
        oh_encoder: fitted one-hot encoder exposing `transform`.

    Returns:
        Whatever `oh_encoder.transform` returns for the label-encoded values.
    """
    encoded = X_dev.assign(
        site_category=site_category_encoder.transform(X_dev.site_category),
        app_category=app_category_encoder.transform(X_dev.app_category),
    )
    # fit_transform_train fits the one-hot encoder on a plain array (`.values`),
    # so hand it a plain array here as well rather than a DataFrame.
    return oh_encoder.transform(encoded.values)

def train_test_split(X, y, test_day):
    """Split by day of month: rows on `test_day` are test, rows on earlier days are train.

    X must have an `hour` column of datetime type (accessed through `.dt.day`);
    y is indexed with the same boolean masks as X, so it must be aligned with X.
    Rows on days after `test_day` land in neither split.

    Note: this definition rebinds the name `train_test_split` that the import
    cell brings in from sklearn.model_selection.

    Returns:
        Tuple `(X_train, y_train, X_test, y_test)`.
    """
    day_of_month = X.hour.dt.day
    is_test = day_of_month == test_day
    is_train = day_of_month < test_day
    return X[is_train], y[is_train], X[is_test], y[is_test]

In [3]:
# Load the 'small' training sample and parse `hour` (encoded as YYMMDDHH) into timestamps.
df = pd.read_csv('data/train_small.csv')
df.hour = pd.to_datetime(df.hour, format="%y%m%d%H")
# Fit the label encoders once on the whole frame; every fold below reuses them.
site_category_encoder, app_category_encoder = fit_encoders(df)

In [4]:
def neg_log_loss_score(lg, X_dev, y_dev):
    """Return the negated log-loss of `lg` on `(X_dev, y_dev)`; higher is better.

    Args:
        lg: fitted classifier exposing `predict_proba` and `classes_`.
        X_dev: feature matrix accepted by `lg.predict_proba`.
        y_dev: true labels for `X_dev`.

    Returns:
        `-log_loss(...)` as a float.
    """
    # Pass the classifier's own label set explicitly: without it, log_loss infers the
    # labels from y_dev and raises when a fold happens to contain a single class.
    return -log_loss(y_dev, lg.predict_proba(X_dev), labels=lg.classes_)

In [5]:
def score_one_test_day(X_train, y_train, test_day, param):
    """Fit on the days before `test_day`, then score on `test_day`.

    Reads the module-level `site_category_encoder` and `app_category_encoder`.

    Args:
        X_train: feature frame that still contains the `hour` column.
        y_train: labels aligned with `X_train`.
        test_day: day of month used as the validation fold.
        param: value passed to LogisticRegression as `C`.

    Returns:
        Negative log-loss on the validation fold.
    """
    X_fit, y_fit, X_val, y_val = train_test_split(X_train, y_train, test_day)

    # `hour` is only needed for the day-based split above; remove it before training.
    X_fit = X_fit.drop(columns='hour')
    X_val = X_val.drop(columns='hour')

    X_fit, oh_encoder = fit_transform_train(X_fit, site_category_encoder, app_category_encoder)
    model = LogisticRegression(C=param)
    model.fit(X_fit, y_fit)

    X_val = transform_dev(X_val, site_category_encoder, app_category_encoder, oh_encoder)
    return neg_log_loss_score(model, X_val, y_val)

def score_one_param(X_train, y_train, param, test_days=(25, 26, 27, 28, 29)):
    """Return the mean validation score of `param` over several validation days.

    Args:
        X_train: feature frame that still contains the `hour` column.
        y_train: labels aligned with `X_train`.
        param: value forwarded to `score_one_test_day`.
        test_days: days of month used, one at a time, as the validation fold.
            The default stops at 29 because day 30 goes to the test set.

    Returns:
        Arithmetic mean of the per-day scores.

    Raises:
        ValueError: if `test_days` is empty.
    """
    test_days = list(test_days)
    if not test_days:
        raise ValueError("test_days must contain at least one day")

    scores = [score_one_test_day(X_train, y_train, day, param) for day in test_days]
    return sum(scores) / len(scores)
    
def score_params(X_train, y_train, param_ls):
    """Score every candidate parameter.

    Returns:
        dict mapping each parameter in `param_ls` to its mean score.
    """
    return {p: score_one_param(X_train, y_train, p) for p in param_ls}

def best_param(score_dict):
    """Return the key of `score_dict` with the highest value (first one wins on ties)."""
    winner, _ = max(score_dict.items(), key=lambda item: item[1])
    return winner

In [6]:
# Feature columns used by model 1.
model1_cols = [
    'C1', 'banner_pos',
    'site_category', 'app_category',
    'device_type', 'device_conn_type',
    'C15', 'C16', 'C18', 'C19', 'C21',
]

# Keep `hour` next to the features: the day-based split needs it.
model1_df = df[[*model1_cols, 'hour']]
# Day 30 is held out as the test set; all earlier days form the training pool.
X_train, y_train, X_test, y_test = train_test_split(model1_df, df.click, test_day=30)

In [7]:

# Candidate values of C for LogisticRegression: one per decade from 1e-4 to 1 (5 values).
# A wider grid, np.logspace(-4, 4, num=9), appears to have been tried as well;
# its scores are listed under "Model Stats".
params = np.logspace(-4, 0, 5)
params

array([1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00])

### Model Stats
- 01m55s to cross-validate on the 'small' dataset with 2 parameter values.
- 03m11s to cross-validate on the 'small' dataset with 5 parameter values.
- {0.0001: -0.43356715906855536,
  0.001: -0.42629521272437554,
  0.01: -0.4242657276526611,
  0.1: -0.42427721774829286,
  1.0: -0.4245292557309675,
  10.0: -0.4246627635916426,
  100.0: -0.42472018597787536,
  1000.0: -0.42473047099609146,
  10000.0: -0.4247623788457422}

In [8]:
%%time
# Cross-validate every candidate C, then keep the one with the highest mean negative log-loss.
scores = score_params(X_train, y_train, params)
best_C = best_param(scores)

CPU times: user 3min 7s, sys: 3.71 s, total: 3min 10s
Wall time: 3min 11s


In [9]:
# Selected C, followed by the mean negative log-loss of every candidate.
best_C, scores

(0.01,
 {0.0001: -0.43356715906855536,
  0.001: -0.42629521272437554,
  0.01: -0.4242657276526611,
  0.1: -0.42427721774829286,
  1.0: -0.4245292557309675})

In [10]:
%%time
# Final fit: train on every day before the held-out day using the selected C, then encode
# the held-out test set. This mirrors score_one_test_day, applied to the train/test split.
# NOTE: X_train and X_test are rebound here (`hour` dropped, then one-hot encoded), so this
# cell cannot be re-run without first re-running the cell that creates the split.
X_train = X_train.drop('hour', axis=1)
X_test = X_test.drop('hour', axis=1)
X_train, oh_encoder = fit_transform_train(X_train, site_category_encoder, app_category_encoder)
lg = LogisticRegression(C=best_C)
lg.fit(X_train, y_train)
X_test = transform_dev(X_test, site_category_encoder, app_category_encoder, oh_encoder)

CPU times: user 10.4 s, sys: 156 ms, total: 10.6 s
Wall time: 10.6 s


In [11]:
# Negative log-loss of the final model on the held-out test day.
neg_log_loss_score(lg, X_test, y_test)

-0.42472333212391294

In [12]:
# NOTE(review): a bare `raise` with no active exception raises RuntimeError. Presumably left
# here on purpose to stop "Run All" before the older baseline cells below — confirm.
raise

RuntimeError: No active exception to reraise

## Model 1 (baseline)

In [None]:
def build_colname_idx(df):
    """Return a dict mapping each column name of `df` to its positional index."""
    return {name: position for position, name in enumerate(df.columns)}

# Feature columns for the baseline model (same list as in the cross-validation section).
model1_cols = ['C1',
             'banner_pos',
             'site_category',
             'app_category',
             'device_type',
             'device_conn_type',
             'C15',
             'C16',
             'C18',
             'C19',
             'C21']

# The two columns that get label-encoded; not referenced elsewhere in the visible notebook.
str_cols = ['site_category', 'app_category']

In [None]:
# NOTE: rebinds `df` to the 'tiny' sample; the cross-validation section loaded 'train_small'.
df = pd.read_csv('data/train_tiny.csv')
# `hour` is encoded as YYMMDDHH.
df.hour = pd.to_datetime(df.hour, format="%y%m%d%H")

In [None]:
def fit_encoders(df):
    """Fit one LabelEncoder per label-encoded category column.

    Args:
        df: frame exposing `site_category` and `app_category` columns.

    Returns:
        Tuple `(site_category_encoder, app_category_encoder)` of fitted
        LabelEncoder instances.
    """
    # LabelEncoder.fit returns the encoder itself, so construction and fitting chain.
    site_category_encoder = LabelEncoder().fit(df.site_category)
    app_category_encoder = LabelEncoder().fit(df.app_category)
    return site_category_encoder, app_category_encoder

In [None]:
# LabelEncoder doesn't handle unknown values, so fit them to the entire dataset
# (before it is split into train/dev/test).
site_category_encoder, app_category_encoder = fit_encoders(df)

In [None]:
def train_dev_test_split(df):
    """Split `df` into train / dev / test feature frames and label series.

    Rows from day 30 are shuffled and divided evenly into dev and test; every
    other row is train. Only `model1_cols` (read from module scope) are kept as
    features, with `click` as the label.

    Args:
        df: frame with a datetime `hour` column, the `model1_cols` columns and `click`.

    Returns:
        Tuple `(X_train, y_train, X_dev, y_dev, X_test, y_test)`.
    """
    # Import sklearn's splitter under an alias: this notebook rebinds the bare name
    # `train_test_split` to a day-based helper with a different signature.
    from sklearn.model_selection import train_test_split as random_split

    last_day = 30
    keep_cols = model1_cols + ['click']
    last_day_mask = df.hour.dt.day == last_day

    df_holdout = df[last_day_mask][keep_cols]
    df_dev, df_test = random_split(df_holdout, test_size=0.5, random_state=23)
    df_train = df[~last_day_mask][keep_cols]

    X_train, y_train = df_train.drop('click', axis=1), df_train.click
    X_dev, y_dev = df_dev.drop('click', axis=1), df_dev.click
    X_test, y_test = df_test.drop('click', axis=1), df_test.click
    return X_train, y_train, X_dev, y_dev, X_test, y_test

In [None]:
# train/dev/test split: day 30 is divided evenly into dev and test; every other day is train
X_train, y_train, X_dev, y_dev, X_test, y_test = train_dev_test_split(df)

In [None]:
def fit_transform_train(X_train, site_category_encoder, app_category_encoder):
    """Label-encode the category columns, then fit a one-hot encoder on the result.

    The input frame is left untouched: the encoded columns are written to a
    new frame instead of being assigned back onto `X_train`.

    Args:
        X_train: DataFrame of features with `site_category` and `app_category` columns.
        site_category_encoder: fitted encoder exposing `transform` for `site_category`.
        app_category_encoder: fitted encoder exposing `transform` for `app_category`.

    Returns:
        Tuple `(X_encoded, oh_encoder)`: the one-hot encoded training matrix and
        the fitted OneHotEncoder, to be reused on dev/test data.
    """
    # `assign` replaces the two columns in place (same column order) on a copy.
    encoded = X_train.assign(
        site_category=site_category_encoder.transform(X_train.site_category),
        app_category=app_category_encoder.transform(X_train.app_category),
    )
    # when transforming, an unknown categorical feature is mapped to a zero vector
    oh_encoder = OneHotEncoder(handle_unknown='ignore')
    return oh_encoder.fit_transform(encoded.values), oh_encoder

In [None]:
# Record each feature column's position while X_train is still a DataFrame
# (the next step replaces it with the one-hot encoded matrix).
train_colname_idx = build_colname_idx(X_train)
train_colname_idx

In [None]:
# prepare the train set; keep the fitted one-hot encoder so the dev set can be transformed with it
X_train, oh_encoder = fit_transform_train(X_train, site_category_encoder, app_category_encoder)

In [None]:
# Baseline fit with LogisticRegression's default hyperparameters.
# tune: C. Maybe use class_weight. Try different solvers.
# Can be parallelized if multi-class.
lg = LogisticRegression()
lg.fit(X_train, y_train)

In [None]:
def transform_dev(X_dev, site_category_encoder, app_category_encoder, oh_encoder):
    """Apply already-fitted encoders to a dev/test feature frame.

    The input frame is left untouched: the encoded columns are written to a
    new frame instead of being assigned back onto `X_dev`.

    Args:
        X_dev: DataFrame of features with `site_category` and `app_category` columns.
        site_category_encoder: fitted encoder exposing `transform` for `site_category`.
        app_category_encoder: fitted encoder exposing `transform` for `app_category`.
        oh_encoder: fitted one-hot encoder exposing `transform`.

    Returns:
        Whatever `oh_encoder.transform` returns for the label-encoded values.
    """
    encoded = X_dev.assign(
        site_category=site_category_encoder.transform(X_dev.site_category),
        app_category=app_category_encoder.transform(X_dev.app_category),
    )
    # fit_transform_train fits the one-hot encoder on a plain array (`.values`),
    # so hand it a plain array here as well rather than a DataFrame.
    return oh_encoder.transform(encoded.values)

In [None]:
# prepare the dev set with the encoders fitted earlier (nothing is refit on dev)
X_dev = transform_dev(X_dev, site_category_encoder, app_category_encoder, oh_encoder)

In [None]:
# Mean accuracy on the dev set (what LogisticRegression.score reports); note this is
# a different metric from the negative log-loss used in the cross-validation section.
lg.score(X_dev, y_dev)