In [3]:
import pandas as pd
import numpy as np
import math
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import resample

import scipy.sparse as sp

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ParameterGrid
from sklearn import metrics
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from lightfm import LightFM

from fastFM import sgd, als



In [4]:
train_data = pd.read_csv('../data/train.csv')
dev_data = pd.read_csv('../data/validation.csv')
test_data = pd.read_csv('../data/test.csv')

# Proprecess Function

In [5]:
def preprocess(train, dev, test):
    
    #merge the data first:
    data = [train, dev, test]
    data = pd.concat(data)
    
    #drop the useless columns
    drop_cols = ['bidid', 'userid', 'IP', 'domain', 'url', 'urlid', 'slotid', 'bidprice', 'payprice']
    data = data.drop(drop_cols, axis=1)
    
    #encode the useragent features
    data['OS'], data['Browser'] = data['useragent'].str.split('_').str
    data = data.drop(['useragent'], axis=1)
    
    #add slot_size = slotwidth * slotheight
    data['slotsize'] = data['slotwidth'] * data['slotheight']
    
    #encode the usertag features
    usertag = []
    s = list(data.usertag.value_counts().to_dict().keys())
    for o in s:
        l = o.split(',')
        for n in l:
            if n not in usertag:
                usertag.append(n)
    
    #add new feature
    for n in usertag:
        mask = data['usertag'].str.contains(n)
        data['usertag_' + str(n)] = mask * 1 #transform boolean to integer
        
    #drop the usertag column
    data = data.drop(['usertag'], axis=1)
    
    #encode the categorical feature
    cate_columns = ['weekday', 'hour', 'region','city', 'adexchange', 'slotvisibility',
                   'slotformat', 'creative', 'keypage','advertiser', 'OS', 'Browser']
    data = pd.get_dummies(data, columns=cate_columns)
    
    #scale the numerical feature
    numeric_cols=['slotwidth', 'slotheight', 'slotprice','slotsize']
    scaler = MinMaxScaler()
    data[numeric_cols] = pd.DataFrame(scaler.fit_transform(data[numeric_cols]))
    
    #split the train, dev, test set
    m_train = train.shape[0]
    m_dev = dev.shape[0]
    m_test = test.shape[0]
    
    train = data[:m_train]
    dev = data[m_train:m_train+m_dev]
    test = data[m_train+m_dev:]
    
    #split the features and the labels
    x_train = train.drop(['click'], axis=1)
    y_train = pd.DataFrame(train['click'])
    
    x_dev = dev.drop(['click'], axis=1)
    y_dev = pd.DataFrame(dev['click'])
    
    x_test = test.drop(['click'], axis=1)
    
    return (x_train, y_train), (x_dev, y_dev), x_test

In [6]:
def downsample(x, y, ratio, seed):
    data = x
    data['click'] = y
    majority = data[data.click == 0]
    minority = data[data.click == 1]
    
    m_resample = int(minority.shape[0] / ratio) - minority.shape[0]
    
    majority_downsampled = resample(majority, replace=False, n_samples = m_resample, random_state=seed)
    
    data_downsampled = pd.concat([minority, majority_downsampled])
    
    x_train = data_downsampled.drop(['click'], axis=1)
    y_train = data_downsampled['click'].values.ravel()
    
    
    
    down_ratio = majority_downsampled.shape[0] / majority.shape[0]
    
    return x_train, y_train, down_ratio  

# process and under-sample data

In [7]:
train, dev, test = preprocess(train_data, dev_data, test_data)

In [8]:
x_train, y_train = train
x_dev, y_dev = dev
x_test = test

In [9]:
x_train_sampled, y_train_sampled, down_ratio = downsample(x_train, y_train, ratio=0.3, seed=3693)

# LR

In [10]:
lr = LogisticRegression(class_weight='balanced', C=0.2, penalty='l1')
lr.fit(x_train_sampled, y_train_sampled)

LogisticRegression(C=0.2, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [11]:
pred_l = lr.predict_proba(x_dev)[:,1]
pred_l = pred_l / (pred_l + (1-pred_l) / down_ratio)
print('auc:' + str(metrics.roc_auc_score(y_score=pred_l, y_true=y_dev.values.ravel())))
print('logloss:' + str(metrics.log_loss(y_pred=pred_l, y_true=y_dev.values.ravel())))

auc:0.8480463063937591
logloss:0.005759289109539074


In [48]:
result = pd.DataFrame(pred_l)
result.to_csv('../prediction/pred_dev_l.csv', index_label=False)

# GDBT

In [12]:
lgb_params = {}
lgb_params['learning_rate'] = 0.05
lgb_params['n_estimators'] = 1000
lgb_params['subsample'] = 0.6
lgb_params['colsample_bytree'] = 0.6
lgb_params['max_depth'] = 19
lgb_params['min_child_weight'] = 1

In [13]:
lgb = LGBMClassifier(**lgb_params, class_weight='balanced')
lgb.fit(x_train_sampled, y_train_sampled, early_stopping_rounds=50, eval_metric='auc', eval_set=[(x_dev, y_dev.values.ravel())])

[1]	valid_0's auc: 0.779452
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's auc: 0.820624
[3]	valid_0's auc: 0.821607
[4]	valid_0's auc: 0.829767
[5]	valid_0's auc: 0.829867
[6]	valid_0's auc: 0.843367
[7]	valid_0's auc: 0.840773
[8]	valid_0's auc: 0.843592
[9]	valid_0's auc: 0.845412
[10]	valid_0's auc: 0.845196
[11]	valid_0's auc: 0.84867
[12]	valid_0's auc: 0.853399
[13]	valid_0's auc: 0.853511
[14]	valid_0's auc: 0.8531
[15]	valid_0's auc: 0.855533
[16]	valid_0's auc: 0.85567
[17]	valid_0's auc: 0.856217
[18]	valid_0's auc: 0.857587
[19]	valid_0's auc: 0.856769
[20]	valid_0's auc: 0.856996
[21]	valid_0's auc: 0.859243
[22]	valid_0's auc: 0.858366
[23]	valid_0's auc: 0.860605
[24]	valid_0's auc: 0.86088
[25]	valid_0's auc: 0.861739
[26]	valid_0's auc: 0.863182
[27]	valid_0's auc: 0.863997
[28]	valid_0's auc: 0.864211
[29]	valid_0's auc: 0.865197
[30]	valid_0's auc: 0.863549
[31]	valid_0's auc: 0.86471
[32]	valid_0's auc: 0.864922
[33]	valid_0's auc: 0.864

LGBMClassifier(boosting_type='gbdt', class_weight='balanced',
        colsample_bytree=0.6, learning_rate=0.05, max_depth=19,
        min_child_samples=20, min_child_weight=1, min_split_gain=0.0,
        n_estimators=1000, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=0.6, subsample_for_bin=200000, subsample_freq=1)

In [14]:
pred_lgb = lgb.predict_proba(x_dev)[:,1]
pred_lgb = pred_lgb / (pred_lgb + (1-pred_lgb) / down_ratio)
print('auc:' + str(metrics.roc_auc_score(y_score=pred_lgb, y_true=y_dev.values.ravel())))
print('logloss:' + str(metrics.log_loss(y_pred=pred_lgb, y_true=y_dev.values.ravel())))

auc:0.8776247820651326
logloss:0.0043774014569099235


In [15]:
pred_test_lgb = lgb.predict_proba(x_test)[:,1]
pred_test_lgb = pred_test_lgb / (pred_test_lgb + (1-pred_test_lgb) / down_ratio)

# GDBT + LR

In [16]:
gdbt_lr_params = {}
gdbt_lr_params['learning_rate'] = 0.05
gdbt_lr_params['n_estimators'] = 1000
gdbt_lr_params['subsample'] = 0.6
gdbt_lr_params['colsample_bytree'] = 0.6
gdbt_lr_params['max_depth'] = 19
gdbt_lr_params['min_child_weight'] = 1
gdbt_lr_params['num_leaves'] = 25

In [18]:
gdbt = LGBMClassifier(**gdbt_lr_params, class_weight='balanced')
gdbt.fit(x_train_sampled, y_train_sampled, early_stopping_rounds=50, eval_metric='auc', eval_set=[(x_dev, y_dev.values.ravel())])

[1]	valid_0's auc: 0.780917
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's auc: 0.819247
[3]	valid_0's auc: 0.820068
[4]	valid_0's auc: 0.82838
[5]	valid_0's auc: 0.828843
[6]	valid_0's auc: 0.840768
[7]	valid_0's auc: 0.838267
[8]	valid_0's auc: 0.842315
[9]	valid_0's auc: 0.844738
[10]	valid_0's auc: 0.844251
[11]	valid_0's auc: 0.848542
[12]	valid_0's auc: 0.853678
[13]	valid_0's auc: 0.852971
[14]	valid_0's auc: 0.851344
[15]	valid_0's auc: 0.852538
[16]	valid_0's auc: 0.851469
[17]	valid_0's auc: 0.853238
[18]	valid_0's auc: 0.852863
[19]	valid_0's auc: 0.852753
[20]	valid_0's auc: 0.853379
[21]	valid_0's auc: 0.855981
[22]	valid_0's auc: 0.854863
[23]	valid_0's auc: 0.857665
[24]	valid_0's auc: 0.858037
[25]	valid_0's auc: 0.858978
[26]	valid_0's auc: 0.860419
[27]	valid_0's auc: 0.861714
[28]	valid_0's auc: 0.861919
[29]	valid_0's auc: 0.861626
[30]	valid_0's auc: 0.860253
[31]	valid_0's auc: 0.861682
[32]	valid_0's auc: 0.862459
[33]	valid_0's auc: 

LGBMClassifier(boosting_type='gbdt', class_weight='balanced',
        colsample_bytree=0.6, learning_rate=0.05, max_depth=19,
        min_child_samples=20, min_child_weight=1, min_split_gain=0.0,
        n_estimators=1000, n_jobs=-1, num_leaves=25, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=0.6, subsample_for_bin=200000, subsample_freq=1)

In [19]:
x_train_gdbt = gdbt.apply(x_train_sampled)
x_dev_gdbt = gdbt.apply(x_dev)
x_test_gdbt = gdbt.apply(x_test)

In [20]:
x_train_gdbt = pd.DataFrame(x_train_gdbt)
x_dev_gdbt = pd.DataFrame(x_dev_gdbt)
x_test_gdbt = pd.DataFrame(x_test_gdbt)

In [21]:
data = pd.concat([x_train_gdbt, x_dev_gdbt, x_test_gdbt])

In [22]:
col = data.columns.tolist()

In [23]:
data = pd.get_dummies(data, columns=col)

In [24]:
x_train_new = data[:x_train_gdbt.shape[0]]
x_dev_new = data[x_train_gdbt.shape[0]:x_train_gdbt.shape[0]+x_dev_gdbt.shape[0]]
x_test_new = data[x_train_gdbt.shape[0]+x_dev_gdbt.shape[0]:]

In [25]:
lr = LogisticRegression(C=0.015, class_weight='balanced')
lr.fit(x_train_new, y_train_sampled)

LogisticRegression(C=0.015, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [26]:
pred_gl = lr.predict_proba(x_dev_new)[:,1]
pred_gl = pred_gl / (pred_gl + (1-pred_gl) / down_ratio)
print('auc:' + str(metrics.roc_auc_score(y_score=pred_gl, y_true=y_dev.values.ravel())))
print('logloss:' + str(metrics.log_loss(y_pred=pred_gl, y_true=y_dev.values.ravel())))

auc:0.8778242293663686
logloss:0.004327498123507622


In [27]:
pred_test_gl = lr.predict_proba(x_test_new)[:,1]
pred_test_gl = pred_test_gl / (pred_test_gl + (1-pred_test_gl) / down_ratio)

# GBDT + FM

In [28]:
yTrain = y_train_sampled.copy()

In [29]:
yTrain[yTrain == 0] = -1

In [30]:
xTrain = sp.csc_matrix(x_train_new)
xDev = sp.csc_matrix(x_dev_new)

In [31]:
fm = als.FMClassification(n_iter=200, rank=7, l2_reg_w=3000, l2_reg_V=1500)
fm.fit(xTrain, yTrain)

FMClassification(init_stdev=0.1, l2_reg=None, l2_reg_V=1500, l2_reg_w=3000,
         n_iter=200, random_state=123, rank=7)

In [32]:
pred_gf = fm.predict_proba(xDev)
pred_gf = pred_gf / (pred_gf + (1-pred_gf) / down_ratio)
print('auc:' + str(metrics.roc_auc_score(y_score=pred_gf, y_true=y_dev.values.ravel())))
print('logloss:' + str(metrics.log_loss(y_pred=pred_gf, y_true=y_dev.values.ravel())))

auc:0.8779125230803223
logloss:0.005200755491462952


# ensemble model

In [33]:
pred_esm = pred_lgb * 0.5 + pred_gl * 0.5

In [34]:
print('auc:' + str(metrics.roc_auc_score(y_score=pred_esm, y_true=y_dev.values.ravel())))
print('logloss:' + str(metrics.log_loss(y_pred=pred_esm, y_true=y_dev.values.ravel())))

auc:0.8793282460376302
logloss:0.004327581966505735


In [35]:
result = pd.DataFrame(pred_esm)
result.to_csv('../prediction/pred_dev_esm.csv', index_label=False)

# FM

In [33]:
yTrain = y_train_sampled.copy()
yTrain[yTrain == 0] = -1

In [34]:
xTrain = sp.csc_matrix(x_train_sampled)
xDev = sp.csc_matrix(x_dev)

In [35]:
fm = als.FMClassification(n_iter=1000, rank=2, l2_reg=80)
fm.fit(xTrain, yTrain)

FMClassification(init_stdev=0.1, l2_reg=80, l2_reg_V=80, l2_reg_w=80,
         n_iter=1000, random_state=123, rank=2)

In [36]:
pred_gf = fm.predict_proba(xDev)
pred_gf = pred_gf / (pred_gf + (1-pred_gf) / down_ratio)
print('auc:' + str(metrics.roc_auc_score(y_score=pred_gf, y_true=y_dev.values.ravel())))
print('logloss:' + str(metrics.log_loss(y_pred=pred_gf, y_true=y_dev.values.ravel())))

auc:0.8560534965696172
logloss:0.005247799741169467


# Test prediction

In [319]:
pred_test = pred_test_gl * 0.5 + pred_test_lgb * 0.5

In [320]:
index = test_data.bidid

In [321]:
pred_test = pd.DataFrame(pred_test, index=index)

In [322]:
pred_test.to_csv('../prediction/pred_test.csv')