<div class="alert alert-block alert-info"><b>Modeling Strategy:</b><br>
    - Instead of exploring blindly, I decided to first study and evaluate some top notebooks on the Kaggle Public Leaderboard.<br>
    - Second, I need to uncover where those models could be improved.<br>
    - Finally, I will come up with my models, hopefully with better performance.</div>

[**Reference 1: Kernel Logistic Regression**](https://www.kaggle.com/gogo827jz/kernel-logistic-regression-one-for-206-targets)

In [37]:
# Setup the environment

import os
import gc
import datetime
import numpy as np
import pandas as pd
from sklearn.kernel_approximation import Nystroem
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LogisticRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from tqdm.notebook import tqdm
from time import time

In [38]:
# Data Preparation

# Load the three input tables: training features, the scored binary MoA
# targets, and the test features.
train_features = pd.read_csv('train_features.csv')
train_targets = pd.read_csv('train_targets_scored.csv')
test_features = pd.read_csv('test_features.csv')

# The sample submission supplies the expected output layout. Two independent
# frames are kept (the names suggest one per model: kernel ridge regression
# and logistic regression).
ss_krr = pd.read_csv('sample_submission.csv')
ss_lr = ss_krr.copy()

# Column names of the submission file, excluding the 'sig_id' identifier.
cols = [name for name in ss_krr.columns.values if name != 'sig_id']

In [39]:
# Define a function preprocess:
# 1. encode the columns `cp_type` and `cp_dose`
# 2. drop col `sig_id`

def preprocess(df):
    """Return an encoded copy of a feature table without its id column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'cp_type', 'cp_dose' and 'sig_id'.

    Returns
    -------
    pandas.DataFrame
        A new frame in which
        * 'cp_type' is encoded as 'trt_cp' -> 0, 'ctl_vehicle' -> 1,
        * 'cp_dose' is encoded as 'D1' -> 0, 'D2' -> 1,
        * 'sig_id' has been removed.
        Values not listed in the mappings become NaN (Series.map behaviour).

    Raises
    ------
    KeyError
        If any of the three required columns is missing.
    """
    # Work on a copy so the changes do not apply to the caller's frame.
    df = df.copy()

    # Plain column assignment replaces the whole column, so the result takes
    # the numeric dtype produced by .map(). Assigning through
    # df.loc[:, col] instead writes into the existing object-dtype column on
    # recent pandas versions, leaving the encoded values stored as `object`.
    df['cp_type'] = df['cp_type'].map({'trt_cp': 0, 'ctl_vehicle': 1})
    df['cp_dose'] = df['cp_dose'].map({'D1': 0, 'D2': 1})

    del df['sig_id']
    return df

# Log loss is the metric used to evaluate the predictions.
# Define the function log_loss_metric to compute it.

def log_loss_metric(y_true, y_pred):
    """Compute the mean binary log loss (cross-entropy) of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels. Converted with ``np.asarray(..., dtype=float)``,
        so pandas inputs are compared by position, not by label.
    y_pred : array-like
        Predicted probabilities, broadcastable against ``y_true``.

    Returns
    -------
    numpy.float64
        For inputs with two or more dimensions: the per-entry loss is
        averaged along axis 1 and the result is averaged again. For scalar
        or 1-D inputs: the plain mean of the per-entry loss.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Clamp predictions into [1e-15, 1 - 1e-15]: values below the lower bound
    # become 1e-15 and values above the upper bound become 1 - 1e-15, so
    # np.log never receives exactly 0.
    y_pred_clip = np.clip(y_pred, 1e-15, 1 - 1e-15)

    # Per-entry log-likelihood: log(p) where the label is 1, log(1 - p) where
    # the label is 0.
    a = y_true * np.log(y_pred_clip) + (1 - y_true) * np.log(1 - y_pred_clip)

    # Scalar / 1-D input has no axis 1 to average over.
    if a.ndim < 2:
        return -np.mean(a)

    b = np.mean(a, axis=1)
    loss = -np.mean(b)
    return loss

<div class="alert alert-block alert-info"><b>Notes:</b><br>
    - Why encode trt_cp as 0 and ctl_vehicle as 1?<br>
    - Why define this function instead of using log_loss from sklearn?<br></div>

In [40]:
# Run both feature tables through preprocess (cp_type/cp_dose encoded, sig_id removed).

train, test = map(preprocess, (train_features, test_features))

In [41]:
train_features.shape, train.shape

((23814, 876), (23814, 875))

In [42]:
train_features.head()

Unnamed: 0,sig_id,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,...,c-90,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99
0,id_000644bb2,trt_cp,24,D1,1.062,0.5577,-0.2479,-0.6208,-0.1944,-1.012,...,0.2862,0.2584,0.8076,0.5523,-0.1912,0.6584,-0.3981,0.2139,0.3801,0.4176
1,id_000779bfc,trt_cp,72,D1,0.0743,0.4087,0.2991,0.0604,1.019,0.5207,...,-0.4265,0.7543,0.4708,0.023,0.2957,0.4899,0.1522,0.1241,0.6077,0.7371
2,id_000a6266a,trt_cp,48,D1,0.628,0.5817,1.554,-0.0764,-0.0323,1.239,...,-0.725,-0.6297,0.6103,0.0223,-1.324,-0.3174,-0.6417,-0.2187,-1.408,0.6931
3,id_0015fd391,trt_cp,48,D1,-0.5138,-0.2491,-0.2656,0.5288,4.062,-0.8095,...,-2.099,-0.6441,-5.63,-1.378,-0.8632,-1.288,-1.621,-0.8784,-0.3876,-0.8154
4,id_001626bd3,trt_cp,72,D2,-0.3254,-0.4009,0.97,0.6919,1.418,-0.8244,...,0.0042,0.0048,0.667,1.069,0.5523,-0.3031,0.1094,0.2885,-0.3786,0.7125


In [43]:
train.head()

Unnamed: 0,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,...,c-90,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99
0,0,24,0,1.062,0.5577,-0.2479,-0.6208,-0.1944,-1.012,-1.022,...,0.2862,0.2584,0.8076,0.5523,-0.1912,0.6584,-0.3981,0.2139,0.3801,0.4176
1,0,72,0,0.0743,0.4087,0.2991,0.0604,1.019,0.5207,0.2341,...,-0.4265,0.7543,0.4708,0.023,0.2957,0.4899,0.1522,0.1241,0.6077,0.7371
2,0,48,0,0.628,0.5817,1.554,-0.0764,-0.0323,1.239,0.1715,...,-0.725,-0.6297,0.6103,0.0223,-1.324,-0.3174,-0.6417,-0.2187,-1.408,0.6931
3,0,48,0,-0.5138,-0.2491,-0.2656,0.5288,4.062,-0.8095,-1.959,...,-2.099,-0.6441,-5.63,-1.378,-0.8632,-1.288,-1.621,-0.8784,-0.3876,-0.8154
4,0,72,1,-0.3254,-0.4009,0.97,0.6919,1.418,-0.8244,-0.28,...,0.0042,0.0048,0.667,1.069,0.5523,-0.3031,0.1094,0.2885,-0.3786,0.7125


In [44]:
# Print the shape of train_targets

train_targets.shape

(23814, 207)

In [45]:
# Remove the 'sig_id' column from train_targets.
# drop(..., errors='ignore') keeps this cell safe to re-run: the original
# `del train_targets['sig_id']` raises KeyError on a second execution because
# the column is already gone.

train_targets = train_targets.drop(columns=['sig_id'], errors='ignore')
train_targets.shape

(23814, 206)

In [46]:
# Keep only the rows whose cp_type is 0 (i.e. drop the controls) in both frames.
# Order matters: train_targets is filtered first, while `train` still holds
# every row and its cp_type column can serve as the row mask.
train_targets = (
    train_targets
    .loc[train['cp_type'].eq(0)]
    .reset_index(drop=True)
)

# Now apply the same filter to the features themselves.
train = (
    train
    .loc[train['cp_type'].eq(0)]
    .reset_index(drop=True)
)

In [47]:
# Check the shape of train_targets and train
train_targets.shape, train.shape

((21948, 206), (21948, 875))