In [1]:
import pandas as pd
import numpy as np
import scipy as sp

from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction import FeatureHasher

from collections import namedtuple
import datetime as DT

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

%load_ext Cython

In [2]:
# Keep the train and test frames together in one namedtuple so later cells
# can address them as raw.train / raw.test.
Raw = namedtuple('raw', ['train', 'test'])
raw = Raw(**{
    'train': pd.read_csv('./train.csv.zip', compression='infer'),
    'test': pd.read_csv('./test.csv.zip', compression='infer'),
})

In [3]:
# show a description of features 
# Summarise every feature column of the test set using the training data:
# number of distinct values, most frequent value and the counts of the two
# most frequent values.
descriptions = []
for c in raw.test.columns:
    if c == 'click_id':
        # click_id is the test-set row identifier, not a feature
        continue

    # value_counts is sorted by frequency, most common value first
    v = raw.train[c].value_counts()
    descriptions.append({
        'col': c,
        # dropna=False so a missing value counts as one distinct value
        'unique': raw.train[c].nunique(dropna=False),
        'mode': v.index[0] if len(v) else np.nan,
        'mode_cnt': v.iloc[0] if len(v) else 0,
        # a column with a single distinct value has no runner-up
        'second_mode_cnt': v.iloc[1] if len(v) > 1 else 0,
    })

pd.DataFrame(descriptions)

Unnamed: 0,col,mode,mode_cnt,second_mode_cnt,unique
0,ip,5348,1238734,1171448,277396
1,app,3,33911780,24179003,706
2,device,1,174330052,8105054,3475
3,os,19,44181914,39782808,800
4,channel,280,15065927,8873025,202
5,click_time,2017-11-07 14:00:11,1502,1497,259620


In [4]:
# Preview the first rows of the training frame.
raw.train.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,83230,3,1,13,379,2017-11-06 14:32:21,,0
1,17357,3,1,19,379,2017-11-06 14:33:34,,0
2,35810,3,1,13,379,2017-11-06 14:34:12,,0
3,45745,14,1,13,478,2017-11-06 14:34:52,,0
4,161007,3,1,13,379,2017-11-06 14:35:08,,0


In [5]:
# Add the hour of each click as a zero-padded string feature ('%H') to both
# the train and the test frame.
for frame in raw:
    frame['h_hour'] = pd.to_datetime(frame.click_time).dt.strftime('%H')

## Processing

In [6]:
def minimize_column(c, i):
    ''' Return series with rare values collapsed into -1
        ----------
        Args:
            c(pd.Series) - column data
            i(int) - frequency threshold; a value is kept only when it
                     occurs more than i times in c
        Returns:
            pd.Series - same index as c; values occurring i times or fewer
                        are replaced with -1, missing values stay missing
    '''
    # Number of occurrences of each row's own value. Missing entries map to
    # NaN because value_counts leaves them out.
    counts = c.map(c.value_counts())

    # Keep frequent values and leave missing entries untouched. Testing for
    # missing values explicitly (instead of truthiness) means a legitimate
    # value of 0 goes through the same threshold as every other value.
    keep = counts.gt(i) | c.isna()
    return c.where(keep, -1)

In [19]:
%%cython
''' No longer used
'''
import numpy as np
import scipy as sp
import hashlib


# Column names that build_flookup and build_flookup_hash skip when they
# collect feature values.
cdef tuple COL_EXCLUDES = (
    'click_id',
    'click_time',
    'year_mo_dt',
    'attributed_time', 
    'is_attributed',
)


def build_flookup(dict coldict):
    ''' Number every distinct value of every kept column

        Keys are the first character of the column name followed by
        str(value); ids are handed out consecutively starting at 0.
        Columns named in COL_EXCLUDES are skipped, and a final 'null'
        entry receives the next id after the last value.
    '''
    cdef dict flookup = {}
    cdef int next_id = 0
    cdef str prefix, name

    for name in coldict:
        if name in COL_EXCLUDES:
            continue

        # the first letter of the column name tags the feature
        prefix = name[0]
        for value in np.unique(coldict[name]):
            flookup[prefix + str(value)] = next_id
            next_id += 1

    flookup['null'] = next_id
    return flookup


def build_sparse_matrix(dict coldict, dict flookup, int total_uniques):
    ''' Take the data and feature lookup
        and build a sparse indicator matrix
        ----------
        Args:
            coldict(dict) - column name -> array of values; every array is
                            read up to the length of the first one
            flookup(dict) - feature key -> column index, where a key is the
                            first character of the column name + str(value);
                            keys not present fall back to flookup['null']
            total_uniques(int) - number of columns of the result
        Returns:
            scipy.sparse.csr_matrix of shape (N, total_uniques) with one
            entry of 1 per (row, input column); entries that land on the
            same cell are summed by the csr conversion
    '''
    # set up vars
    cdef str ch, fl, c
    cdef Py_ssize_t i, cnum

    # list(...) because dict.keys() is a view, not a list, on Python 3
    cdef list cols = list(coldict)
    cdef Py_ssize_t M = len(cols)
    cdef Py_ssize_t N = coldict[cols[0]].shape[0] if M else 0
    cdef Py_ssize_t[:] col

    # imported here so the function does not rely on scipy.sparse having
    # been loaded as a side effect of another import
    from scipy.sparse import csr_matrix

    # Entry (i, cnum) lives in slot i*M + cnum, so each row index is simply
    # repeated M times and every stored value is 1.
    row_idx = np.repeat(np.arange(N, dtype=np.intp), M)
    col_idx = np.zeros(N * M, dtype=np.intp)
    data = np.ones(N * M)
    col = col_idx

    # for each column
    for cnum in range(M):
        c = cols[cnum]
        ch = c[0]
        values = coldict[c]

        # loop through data
        for i in range(N):
            fl = ch + str(values[i])
            col[i * M + cnum] = flookup[fl] if fl in flookup else flookup['null']

    return csr_matrix((data, (row_idx, col_idx)), shape=(N, total_uniques))


def build_flookup_hash(dict coldict, int N):
    ''' Build a feature lookup with hashing
        ----------
        Args:
            coldict(dict) - column name -> array of values; columns named
                            in COL_EXCLUDES are skipped
            N(int) - number of hash buckets, must be positive
        Returns:
            dict - feature key (first character of the column name +
                   str(value)) -> bucket in 0..N-1, plus a 'null' entry
                   mapped to N, the first index after the buckets
        Raises:
            ValueError - if N is not positive
    '''
    cdef dict flookup = {}
    cdef str ch, c, key

    if N <= 0:
        raise ValueError('N must be a positive number of hash buckets')

    for c in coldict:
        if c in COL_EXCLUDES:
            continue

        # feature identifier
        ch = c[0]
        for u in np.unique(coldict[c]):
            key = ch + str(u)
            # Hash the encoded key rather than the raw value: md5 only
            # accepts bytes, and including the column prefix keeps equal
            # values from different columns from always sharing a bucket.
            h = int(hashlib.md5(key.encode('utf-8')).hexdigest(), 16)
            flookup[key] = h % N

    flookup['null'] = N
    return flookup


def build_matrix(dict coldict, dict flookup):
    ''' Use feature lookup to build a dense count matrix
        ----------
        Args:
            coldict(dict) - column name -> array of values; every array is
                            read up to the length of the first one
            flookup(dict) - feature key -> column index, where a key is the
                            first character of the column name + str(value);
                            keys not present fall back to flookup['null']
        Returns:
            np.ndarray of float64, shape (N, max column index + 1); cell
            (i, j) counts how many input columns of row i map to index j
    '''
    # list(...) because dict.keys() is a view, not a list, on Python 3
    cdef list cols = list(coldict)
    cdef Py_ssize_t N = coldict[cols[0]].shape[0] if cols else 0
    # Size by the largest index, not by the number of distinct indices:
    # lookups may skip indices, and every index used below must fit.
    cdef Py_ssize_t n_features = (max(flookup.values()) + 1) if flookup else 0
    cdef double[:, :] data = np.zeros((N, n_features))
    cdef Py_ssize_t i, col
    cdef str c, ch, fl

    # loop through columns
    for c in cols:
        ch = c[0]
        values = coldict[c]

        # look through data
        for i in range(N):
            # find hashed column map
            fl = ch + str(values[i])
            col = flookup[fl] if fl in flookup else flookup['null']

            # increment value
            data[i, col] += 1

    # hand back a regular ndarray instead of the typed memoryview
    return np.asarray(data)

In [10]:
# Columns that are not turned into features.
COL_EXCLUDES = (
    'click_id',
    'click_time',
    'year_mo_dt',
    'attributed_time',
    'is_attributed',
)

# Column name -> raw numpy values for every training column that is kept.
# DataFrame.items() is used because iteritems() was removed in pandas 2.0.
coldict = {
    cname: c.values
    for cname, c in raw.train.items()
    if cname not in COL_EXCLUDES
}

# Hash every feature value into one of 100 buckets.
flookup = build_flookup_hash(coldict, 100)
totals = len(flookup)

In [None]:
# Build the matrix for the training columns from the hashed feature lookup.
m = build_matrix(coldict, flookup)

In [None]:
# Hold out 33% of the rows for evaluation, with a fixed seed.
# NOTE(review): X and y are not assigned in any cell visible in this
# notebook — confirm where they come from before running on a fresh kernel.
X_tr, X_tst, y_tr, y_tst = train_test_split(
    X, y, test_size=0.33, random_state=22,
)

## XGB

In [None]:
import xgboost as xgb

In [None]:
# Wrap the train and hold-out splits, with labels, for xgboost.
dtrain = xgb.DMatrix(X_tr, label=y_tr)
dtest = xgb.DMatrix(X_tst, label=y_tst)
# NOTE(review): X_val is not assigned in any cell visible in this notebook —
# confirm where it comes from.
dval = xgb.DMatrix(X_val)

# Ratio of negative to positive training labels; used as 'scale_pos_weight'
# in the xgboost params.
scale_pos_tr = np.sum(y_tr.values==0) / np.sum(y_tr.values==1)

In [None]:
# Booster settings handed to xgb.train; scale_pos_weight is the
# negative/positive label ratio computed from the training split.
params = dict(
    silent=1,
    eval_metric='logloss',
    eta=0.3,
    nthread=16,
    min_child_weight=1,
    colsample_bytree=0.8,
    subsample=0.8,
    seed=2,
    objective='binary:logistic',
    alpha=0.0,
    max_depth=6,
    gamma=0.0,
    booster='gbtree',
    scale_pos_weight=scale_pos_tr,
)

In [None]:
# Report the metric on both splits every 5 rounds. Training runs for at most
# 200 rounds and stops early after 10 rounds without improvement.
evallist = [(dtrain, 'train'), (dtest, 'eval')]
gbdt = xgb.train(
    params,
    dtrain,
    num_boost_round=200,
    evals=evallist,
    early_stopping_rounds=10,
    verbose_eval=5,
)

In [None]:
# Score the hold-out split with the trained booster.
preds = gbdt.predict(dtest)

In [None]:
# Show the shape of the hold-out predictions.
preds.shape

In [None]:
# ROC AUC of the predictions against the hold-out labels.
roc_auc_score(y_tst, preds)

In [None]:
# Score the rows wrapped in dval and pair each score with its click_id.
predictions = gbdt.predict(dval)
output = pd.DataFrame(
    np.column_stack((raw.test.click_id, predictions)),
    columns=['click_id', 'is_attributed'],
)
# the stacked array has a single dtype, so cast the ids back to int32
output['click_id'] = output['click_id'].astype(np.int32)

In [None]:
# Write the submission with a header row and without the index column.
output.to_csv('./submission6.csv', header=True, index=False)

## LightGBM

In [13]:
import lightgbm as lgb

In [14]:
# Wrap the train and hold-out splits, with labels, as LightGBM datasets.
X_gbm_tr = lgb.Dataset(X_tr, label=y_tr)
X_gbm_tst = lgb.Dataset(X_tst, label=y_tst)

In [None]:
# LightGBM settings for a binary classifier evaluated with binary log loss.
params = dict(
    objective='binary',
    boosting='gbdt',
    learning_rate=0.05,
    num_leaves=256,
    min_data_in_leaf=1000,
    max_depth=-1,
    max_bin=255,
    num_boost_round=115,
    bagging_fraction=0.8,
    feature_fraction=0.8,
    bagging_seed=1,
    feature_fraction_seed=1,
    metric=['binary_logloss'],
    nthread=16,
)

# Train on the training dataset, validating on the hold-out dataset and
# printing the metric every 5 rounds.
model = lgb.train(
    params=params,
    train_set=X_gbm_tr,
    valid_sets=[X_gbm_tst],
    early_stopping_rounds=50,
    verbose_eval=5,
)

In [None]:
# Score the hold-out split with the LightGBM model and report its ROC AUC.
preds = model.predict(X_tst)
roc_auc_score(y_tst, preds)

In [16]:
# Score X_val with the LightGBM model and pair each score with its click_id.
predictions = model.predict(X_val)
output = pd.DataFrame(
    np.column_stack((raw.test.click_id, predictions)),
    columns=['click_id', 'is_attributed'],
)
# the stacked array has a single dtype, so cast the ids back to int32
output['click_id'] = output['click_id'].astype(np.int32)

In [None]:
# Write the submission with a header row and without the index column.
output.to_csv('./submission4.csv', header=True, index=False)

## Initial model

In [None]:
# Linear classifier trained with SGD using log loss and an elastic-net penalty.
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss' —
# confirm against the installed version.
lr = SGDClassifier(
    loss='log',
    n_jobs=-1,
    learning_rate='optimal',
    penalty='elasticnet',
    max_iter=5000,
    tol=1e-5,
    alpha = 0.01,
)
# NOTE(review): X and y are not assigned in any cell visible in this
# notebook — confirm where they come from.
lr.fit(X, y)

In [None]:
# Show the fitted classifier's n_iter_ attribute.
lr.n_iter_

In [None]:
# Column name -> raw numpy values for every test column that is kept.
# DataFrame.items() is used because iteritems() was removed in pandas 2.0.
test_coldict = {
    cname: c.values
    for cname, c in raw.test.items()
    if cname not in COL_EXCLUDES
}

sm = build_sparse_matrix(test_coldict, flookup, totals)
# Keep every column of the sparse matrix except the last one.
# NOTE(review): this rebinds X_tst, the name earlier cells use for the
# hold-out split — confirm the reuse is intended.
X_tst = sm[:, :-1]

In [None]:
# Class probabilities from the SGD classifier for the test matrix.
predictions = lr.predict_proba(X_tst)

In [None]:
# Pair each click_id with column 1 of the predicted probabilities.
output = pd.DataFrame(
    np.column_stack((raw.test.click_id, predictions[:, 1])),
    columns=['click_id', 'is_attributed'],
)
# the stacked array has a single dtype, so cast the ids back to int32
output['click_id'] = output['click_id'].astype(np.int32)

In [None]:
# Show the shape of the submission frame.
output.shape

In [None]:
# Write the submission with a header row and without the index column.
output.to_csv('./submission3.csv', header=True, index=False)

In [None]:
# Preview the first rows of the submission frame.
output.head()