In [1]:
import pandas as pd
import numpy as np
import scipy as sp

from sklearn.linear_model import SGDClassifier

from collections import namedtuple
import datetime as DT

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

%load_ext Cython

In [2]:
# Lightweight container that keeps the two data splits side by side.
Raw = namedtuple('raw', ['train', 'test'])

# Load both zipped CSV files; the fields are filled positionally in the
# order declared above (train first, then test).
raw = Raw(
    pd.read_csv('./train.csv.zip', compression='infer'),
    pd.read_csv('./test.csv.zip', compression='infer'),
)

## Initial model

In [3]:
%%cython
import numpy as np
import scipy as sp


# Column names that build_flookup skips when it enumerates features.
# NOTE(review): presumably identifiers, timestamps and the target label --
# confirm against the dataset schema.
cdef tuple COL_EXCLUDES = (
    'click_id',
    'click_time',
    'year_mo_dt',
    'attributed_time', 
    'is_attributed',
)


def build_flookup(dict coldict):
    ''' Assign a consecutive integer index to every distinct value of
        every feature column.

        coldict maps a column name to an array of raw values.  Columns
        listed in COL_EXCLUDES are ignored.

        Keys of the returned dict are the first character of the column
        name followed by str(value); columns whose names start with the
        same character therefore share one key namespace.  A final
        'null' key receives the highest index.
    '''
    cdef dict flookup = {}
    cdef int next_index = 0
    cdef str prefix, name

    for name in coldict:
        if name in COL_EXCLUDES:
            continue

        # feature identifier: first character of the column name
        prefix = name[0]

        # one slot per distinct value seen in this column
        for value in np.unique(coldict[name]):
            flookup[prefix + str(value)] = next_index
            next_index += 1

    # trailing catch-all slot
    flookup['null'] = next_index
    return flookup


def build_sparse_matrix(dict coldict, dict flookup, int total_uniques):
    ''' Take the data and feature lookup 
        and build a sparse indicator matrix

        coldict       -- maps column name -> array of raw values; every
                         array must have the same length N
        flookup       -- maps feature key (first character of the column
                         name + str(value)) -> column index; values with
                         no entry fall back to flookup['null']
        total_uniques -- number of columns of the resulting matrix

        Returns a scipy CSR matrix of shape (N, total_uniques) holding a
        1 at (row, feature index) for every cell of the input.  If two
        cells of one row resolve to the same index (e.g. both fall back
        to 'null') their values are summed by the COO -> CSR conversion.

        Raises IndexError if coldict is empty and KeyError if a fallback
        is needed but flookup has no 'null' entry.
    '''
    # set up vars
    cdef str ch, fl, c
    cdef Py_ssize_t i, cnum, pos

    # get length counts; list(...) because dict.keys() is a view rather
    # than a list on Python 3 and cannot be bound to a `cdef list`
    cdef list cols = list(coldict)
    cdef Py_ssize_t N = coldict[cols[0]].shape[0]
    cdef Py_ssize_t M = len(cols)
    cdef Py_ssize_t smLength = N * M

    # COO buffers: integer indices, and data pre-filled with the 1 flag
    # because every slot is written exactly once below
    cdef Py_ssize_t[:] row = np.zeros(smLength, dtype=np.intp)
    cdef Py_ssize_t[:] col = np.zeros(smLength, dtype=np.intp)
    data = np.ones(smLength, dtype=np.float64)

    # Imported here because a bare `import scipy` does not guarantee that
    # the `scipy.sparse` submodule has been loaded.
    from scipy import sparse

    # for each column (outer loop so the per-column lookups are hoisted)
    for cnum in range(M):
        c = cols[cnum]
        ch = c[0]
        values = coldict[c]

        # loop through data
        for i in range(N):
            # each (row, column) cell owns its own slot; indexing with
            # i + cnum would make neighbouring rows overwrite each other
            pos = i * M + cnum

            # assign row
            row[pos] = i

            # assign feature
            fl = ch + str(values[i])
            if fl in flookup:
                col[pos] = flookup[fl]
            else:
                col[pos] = flookup['null']

    sm = sparse.csr_matrix(
        (data, (np.asarray(row), np.asarray(col))),
        shape=(N, total_uniques),
    )
    return sm

In [4]:
# Columns that are not used as categorical features.  The tuple of the same
# name inside the %%cython cell is a C-level (`cdef`) variable and is not
# visible from regular notebook cells, so it is repeated here.
COL_EXCLUDES = (
    'click_id',
    'click_time',
    'year_mo_dt',
    'attributed_time', 
    'is_attributed',
)

# Map every remaining training column to its raw value array.  Columns are
# walked through `.columns` instead of `DataFrame.iteritems()`, which was
# removed in pandas 2.0.
coldict = {
    cname: raw.train[cname].values
    for cname in raw.train.columns
    if cname not in COL_EXCLUDES
}

flookup = build_flookup(coldict)
# number of feature slots, including the trailing 'null' slot
totals = len(flookup)

In [5]:
# `totals` counts every key in flookup, so it includes the extra 'null' entry
print('Total raw unique values (features): {}'.format(totals))

Total raw unique values (features): 282580


In [6]:
# encode the training columns as a sparse matrix with `totals` columns
sm = build_sparse_matrix(coldict, flookup, totals)

In [7]:
# target labels
y = raw.train.is_attributed.values
# keep every column except the last one, which is the 'null' slot
# (build_flookup gives 'null' the highest index)
X = sm[:, :-1]

In [8]:
# call form of print: valid on both Python 2 and Python 3
print(X.shape)

(184903890, 282579)


In [None]:
# Linear classifier with logistic loss, trained by stochastic gradient descent.
# NOTE(review): scikit-learn >= 1.1 names this loss 'log_loss'; 'log' is kept
# to match the library version this notebook was written against.
lr = SGDClassifier(
    loss='log',
    n_jobs=-1,
    learning_rate='optimal',
    penalty='elasticnet',
    max_iter=5000,
    tol=1e-5,
    # fixed seed so the data shuffling, and hence the fitted model, is repeatable
    random_state=42,
    # alpha = 0.001,
)
lr.fit(X, y)

In [None]:
# number of iterations the fit ran before it stopped
lr.n_iter_

In [None]:
# Map every feature column of the test split to its raw value array.
# Columns are walked through `.columns` instead of `DataFrame.iteritems()`,
# which was removed in pandas 2.0.
test_coldict = {
    cname: raw.test[cname].values
    for cname in raw.test.columns
    if cname not in COL_EXCLUDES
}

# encode with the lookup built from the training data
sm = build_sparse_matrix(test_coldict, flookup, totals)
# drop the last ('null') column so the layout matches the training matrix X
X_tst = sm[:, :-1]

In [None]:
# per-class probabilities for every test row
predictions = lr.predict_proba(X_tst)

In [None]:
# Build the submission frame column by column so each column keeps its own
# dtype.  Stacking the ids and the probabilities into one array forces a
# common dtype, which turns integer click_id values into floats.
output = pd.DataFrame(
    {
        'click_id': raw.test.click_id.values,
        # second probability column, as in the original submission
        'is_attributed': predictions[:, 1],
    },
    columns=['click_id', 'is_attributed'],
)

In [None]:
# write the submission file with a header row and without the index column
output.to_csv('./submission.csv', header=True, index=False)

In [None]:
# sanity check: (rows, columns) of the submission frame
output.shape