In [1]:
import pandas as pd
import numpy as np
import scipy as sp

from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split

from collections import namedtuple
import datetime as DT

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

%load_ext Cython

In [2]:
# Keep the two competition files together in one immutable record so the
# rest of the notebook can address them as raw.train / raw.test.
Raw = namedtuple('raw', ['train', 'test'])
raw = Raw(*(
    pd.read_csv(csv_path, compression='infer')
    for csv_path in ('./train.csv.zip', './test.csv.zip')
))

In [3]:
# Peek at the first rows of the training frame to check how the columns parsed.
raw.train.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,83230,3,1,13,379,2017-11-06 14:32:21,,0
1,17357,3,1,19,379,2017-11-06 14:33:34,,0
2,35810,3,1,13,379,2017-11-06 14:34:12,,0
3,45745,14,1,13,478,2017-11-06 14:34:52,,0
4,161007,3,1,13,379,2017-11-06 14:35:08,,0


## Processing

In [4]:
# Derive the zero-padded hour of day ('00'..'23') from the click timestamp,
# applying the identical transformation to the train and test frames.
for frame in raw:
    click_ts = pd.to_datetime(frame.click_time)
    frame['h_hour'] = click_ts.dt.strftime('%H')

In [5]:
%%cython
import numpy as np
import scipy as sp


# Column names that build_flookup skips when assigning feature indices.
# NOTE(review): 'year_mo_dt' is never created in the visible cells — confirm
# whether it is still needed.
cdef tuple COL_EXCLUDES = (
    'click_id',
    'click_time',
    'year_mo_dt',
    'attributed_time', 
    'is_attributed',
)


def build_flookup(dict coldict):
    ''' Build a feature lookup.

    Every distinct value of every non-excluded column is turned into a key
    made of the first character of the column name followed by ``str(value)``
    and mapped to a contiguous integer index starting at 0.  A final
    ``'null'`` key receives the next free index.

    Parameters
    ----------
    coldict : dict
        Column name -> array of values for that column.  Columns whose name
        is in ``COL_EXCLUDES`` are ignored.

    Returns
    -------
    dict
        Feature key -> column index.  Indices are dense, so
        ``flookup['null'] == len(flookup) - 1`` as long as no feature key
        is itself the string ``'null'``.
    '''
    cdef dict flookup = {}
    cdef int i = 0
    cdef str ch, c, key
    cdef list cols = [ c for c in coldict
                        if c not in COL_EXCLUDES ]

    for c in cols:
        # feature identifier: only the first letter of the column name
        ch = c[0]

        # walk the distinct values of this column
        for u in np.unique(coldict[c]):
            key = ch + str(u)
            # Two columns sharing a first letter can produce the same key.
            # Only hand out a new index for unseen keys so the index space
            # stays dense and never exceeds len(flookup).
            if key not in flookup:
                flookup[key] = i
                i += 1

    # catch-all slot, always the last index
    flookup['null'] = i
    return flookup


def build_sparse_matrix(dict coldict, dict flookup, int total_uniques):
    ''' Take the data and feature lookup
        and build a sparse matrix.

    Each input row yields one entry per column: a 1 in the matrix column
    given by ``flookup[first_letter_of_column + str(value)]``.  Values that
    are missing from ``flookup`` are sent to ``flookup['null']``.

    Parameters
    ----------
    coldict : dict
        Column name -> array of values; the row count is taken from the
        first column's ``shape[0]`` and every column is indexed up to it.
    flookup : dict
        Feature key -> matrix column index (see ``build_flookup``).
    total_uniques : int
        Number of columns of the resulting matrix.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape ``(N, total_uniques)``.  If several columns of the
        same row map to the same matrix column, the CSR conversion sums
        them.
    '''
    # Imported here so the function does not rely on ``scipy.sparse``
    # already having been loaded as an attribute of the ``sp`` alias.
    from scipy import sparse

    # set up vars
    cdef str ch, fl, val
    cdef Py_ssize_t i, cnum, k

    # get length counts (list(...) also works where keys() is a view)
    cdef list cols = list(coldict)
    cdef Py_ssize_t N = coldict[cols[0]].shape[0]
    cdef Py_ssize_t M = len(cols)
    # Py_ssize_t keeps N*M from overflowing a C int on large inputs
    cdef Py_ssize_t smLength = N*M

    # per-column data and key prefixes, looked up once instead of per cell
    cdef list values = [coldict[name] for name in cols]
    cdef list prefixes = [name[0] for name in cols]

    # COO buffers: integer coordinates, floating point payload
    cdef Py_ssize_t[:] row = np.zeros(smLength, dtype=np.intp)
    cdef Py_ssize_t[:] col = np.zeros(smLength, dtype=np.intp)
    cdef double[:] data = np.zeros(smLength)

    # loop through data
    for i in range(N):

        # for each column
        for cnum in range(M):

            # Slot of this (row, column) pair in the flat buffers.  It has
            # to be i*M + cnum: with i + cnum neighbouring rows overwrite
            # each other and most of the buffers stay zero.
            k = i*M + cnum

            # assign row
            row[k] = i

            # assign feature
            ch = prefixes[cnum]
            val = str(values[cnum][i])
            fl = ch + val
            if fl in flookup:
                col[k] = flookup[fl]
            else:
                col[k] = flookup['null']

            # flag with 1
            data[k] = 1

    sm = sparse.csr_matrix(
        (np.asarray(data), (np.asarray(row), np.asarray(col))),
        shape=(N, total_uniques),
    )
    return sm

In [6]:
# Columns that must not be encoded as features (same tuple as in the
# %%cython cell, repeated here because that one is private to the
# compiled module).
COL_EXCLUDES = (
    'click_id',
    'click_time',
    'year_mo_dt',
    'attributed_time',
    'is_attributed',
)

# Column name -> raw numpy values for every feature column of the training
# frame.  DataFrame.items() is used because iteritems() no longer exists in
# current pandas releases.
# NOTE(review): any column not listed in COL_EXCLUDES is encoded, including
# an index-like column if the CSV carries one — confirm that is intended.
coldict = {}
for cname, c in raw.train.items():
    if cname not in COL_EXCLUDES:
        coldict[cname] = c.values

# Fit the feature lookup on the training data only; `totals` is the width
# later passed to build_sparse_matrix.
flookup = build_flookup(coldict)
totals = len(flookup)

In [None]:
# Report how many lookup entries (feature columns, including 'null') were created.
print('Total raw unique values (features): {}'.format(totals))

Total raw unique values (features): 282604


In [None]:
# Encode the training columns as a sparse indicator matrix with `totals` columns.
sm = build_sparse_matrix(coldict, flookup, totals)

In [None]:
# Target vector: the is_attributed column of the training frame.
y = raw.train.is_attributed.values
# Drop the last matrix column, which is the one assigned to the 'null' key.
X = sm[:, :-1]

In [None]:
# Collect the feature columns of the test frame.  DataFrame.items() is used
# because iteritems() no longer exists in current pandas releases.
test_coldict = {}
for cname, c in raw.test.items():
    if cname not in COL_EXCLUDES:
        test_coldict[cname] = c.values

# Encode with the lookup fitted on the training data; values never seen in
# training fall into the 'null' column, which is then dropped so the width
# matches X.
val_sm = build_sparse_matrix(test_coldict, flookup, totals)
X_val = val_sm[:, :-1]

In [None]:
# Function-call form works on both Python 2 and 3 and matches the other
# print call in this notebook.
print(X.shape)

In [None]:
# Hold out 33% of the training rows for validation; the fixed random_state
# makes the split repeatable.
X_tr, X_tst, y_tr, y_tst = train_test_split(X, y, test_size=0.33, random_state=22)

## LightGBM

In [None]:
import lightgbm as lgb

In [None]:
# Wrap the two halves of the split as LightGBM datasets: the first is used
# for fitting, the second as the hold-out set for early stopping.
X_gbm_tr = lgb.Dataset(X_tr, label=y_tr)
X_gbm_tst = lgb.Dataset(X_tst, label=y_tst)

In [None]:
# LightGBM configuration for a binary classifier evaluated on log loss.
params = {
    'objective': 'binary',
    'boosting': 'gbdt',
    'learning_rate': 0.05 ,
    'num_leaves': 256,
    'min_data_in_leaf': 1000,
    'max_depth': -1,
    'max_bin': 255,
    # Upper bound on boosting rounds; early stopping below may end sooner.
    'num_boost_round': 2000,
    # NOTE(review): per the LightGBM parameter docs bagging_fraction only
    # takes effect when bagging_freq is non-zero, and none is set here —
    # confirm whether row subsampling was intended.
    'bagging_fraction': 0.8,
    'feature_fraction': 0.8,
    'bagging_seed': 1,
    'feature_fraction_seed': 1,
    'metric' : ['binary_logloss'],
    'nthread': 16,
}
# Train on the training split, scoring the hold-out split as it goes.
# NOTE(review): newer LightGBM releases replace the early_stopping_rounds /
# verbose_eval keywords with callbacks — confirm against the installed version.
model = lgb.train(
    params,
    X_gbm_tr, 
    valid_sets=[X_gbm_tst], 
    early_stopping_rounds=50,
    verbose_eval=5,
)

In [None]:
# Score the test matrix with the iteration chosen by early stopping.  The
# iteration is passed explicitly because older LightGBM releases otherwise
# predict with every tree that was grown, including the rounds after the
# best one.
predictions = model.predict(X_val, num_iteration=model.best_iteration)

# Assemble the submission column by column so click_id stays an integer
# instead of being round-tripped through a float array.
output = pd.DataFrame(
    {
        'click_id': raw.test.click_id.values.astype(np.int32),
        'is_attributed': predictions,
    },
    columns=['click_id', 'is_attributed'],
)

In [None]:
# Write the LightGBM submission with a header row and without the frame index.
output.to_csv('./submission4.csv', header=True, index=False)

## Initial model

In [None]:
# Logistic regression fitted with SGD and an elastic-net penalty, trained on
# the full training matrix.
# NOTE(review): recent scikit-learn releases spell this loss 'log_loss' —
# confirm against the installed version.
lr = SGDClassifier(
    loss='log',
    n_jobs=-1,
    learning_rate='optimal',
    penalty='elasticnet',
    max_iter=5000,
    tol=1e-5,
    alpha = 0.01,
    # SGD shuffles the data every epoch; pin the seed (same value as the
    # train/validation split) so the fitted model is reproducible.
    random_state=22,
)
lr.fit(X, y)

In [None]:
# Show how many iterations the SGD fit ran, to compare against max_iter.
lr.n_iter_

In [None]:
# Collect the feature columns of the test frame.  DataFrame.items() is used
# because iteritems() no longer exists in current pandas releases.
test_coldict = {}
for cname, c in raw.test.items():
    if cname not in COL_EXCLUDES:
        test_coldict[cname] = c.values

# NOTE: this rebinds `sm` (previously the training matrix) and `X_tst`
# (previously the hold-out split) to the encoded test data.
sm = build_sparse_matrix(test_coldict, flookup, totals)
# Drop the trailing 'null' column so the width matches the matrix the
# model was fitted on.
X_tst = sm[:, :-1]

In [None]:
# Per-class probabilities from the linear model for every encoded test row.
predictions = lr.predict_proba(X_tst)

In [None]:
# Pair each click_id with the second probability column (presumably the
# is_attributed == 1 class, given 0/1 labels) and shape it as a submission.
output = pd.DataFrame(
    np.vstack((raw.test.click_id, predictions[:, 1])).T,
    columns=['click_id','is_attributed'],
)
# vstack promoted the ids to float; cast them back to integers
output['click_id'] = output['click_id'].astype(np.int32)

In [None]:
# Sanity check: one row per test click and two columns.
output.shape

In [None]:
# Write the linear-model submission with a header row and without the frame index.
output.to_csv('./submission3.csv', header=True, index=False)

In [None]:
# Eyeball the first rows of the submission that was just written.
output.head()