In [38]:
import gzip
import pickle

import pandas as pd
import numpy as np
import scipy.sparse as sp

from tqdm import tqdm_notebook as tqdm

In [24]:
# Split the raw gzip dump into four plain-text fold files, keyed by the
# integer id that starts every '|'-separated line.
files = [
    open('data/train_0.txt', 'w'),
    open('data/train_1.txt', 'w'),
    open('data/train_2.txt', 'w'),
    open('data/train_3.txt', 'w'),
]

try:
    # The with-block closes the gzip stream even if a line fails to parse.
    with gzip.open('data/criteo_train.txt.gz', 'r') as train_input:
        for line in tqdm(train_input):
            line = line.decode()
            row_id = int(line.split('|')[0].strip())
            # Integer hashes are not randomised between interpreter runs,
            # so the fold assignment is reproducible.
            fold = hash(row_id) % len(files)
            files[fold].write(line)
finally:
    # close() flushes pending writes; run it even when the loop raised.
    for f in files:
        f.close()

173979748it

In [25]:
from collections import namedtuple
# Record types produced while parsing: LabeledLine is Line plus the two
# extra fields that only labeled rows carry.
_BASE_FIELDS = ('id', 'f0', 'f1', 'idx', 'val')
Line = namedtuple('Line', _BASE_FIELDS)
LabeledLine = namedtuple('LabeledLine', _BASE_FIELDS + ('propensity', 'label'))

def parse_features(s):
    """Parse a whitespace-separated feature string of ``key:value`` tokens.

    The first two tokens must be ``0:<int>`` and ``1:<int>``; every later
    token is ``<int>:<int>``. Tokens may be separated by any run of
    whitespace (the previous ``split(' ')`` failed on repeated spaces).

    Parameters
    ----------
    s : str
        Feature string, e.g. ``'0:3 1:5 2:1 10:7'``.

    Returns
    -------
    tuple
        ``(f0, f1, idx, values)`` where ``f0`` and ``f1`` are the integers of
        the first two tokens, ``idx`` is the list of remaining keys shifted
        down by 2, and ``values`` is the list of their integer values.

    Raises
    ------
    AssertionError
        If the two leading tokens are missing or lack the ``0:`` / ``1:``
        prefixes.
    ValueError
        If a token is not ``<int>:<int>``.
    """
    tokens = s.split()
    assert len(tokens) >= 2, 'expected "0:<int> 1:<int> ...", got %r' % (s,)

    first, second = tokens[0], tokens[1]
    assert first.startswith('0:'), 'first token must start with "0:", got %r' % (first,)
    assert second.startswith('1:'), 'second token must start with "1:", got %r' % (second,)
    f0 = int(first[2:])
    f1 = int(second[2:])

    idx = []
    values = []

    for token in tokens[2:]:
        f, v = token.split(':')
        # Keys 0 and 1 are returned separately, so the rest are re-based to 0.
        idx.append(int(f) - 2)
        values.append(int(v))

    return f0, f1, idx, values

def read_train(fname):
    """Lazily parse a '|'-separated training file into LabeledLine records.

    Every line starts with an integer id. Lines with four fields have the
    layout ``id | l <label> | p <propensity> | f <features>`` and yield one
    record each; lines with any other number of fields (including the
    two-field ones) yield nothing. Blank lines are skipped.

    Parameters
    ----------
    fname : str
        Path to the input file. Names ending in ``.gz`` are read through
        gzip and decoded as UTF-8; anything else is opened as plain text.

    Yields
    ------
    LabeledLine
        ``idx`` is a uint32 array and ``val`` a uint8 array; ``label`` is 0
        for the raw label text ``0.999`` and 1 for ``0.001``.

    Raises
    ------
    ValueError
        If the id is not an integer or the label text is neither ``0.999``
        nor ``0.001``.
    AssertionError
        If the label or propensity field lacks its ``l`` / ``p`` prefix.
    """
    if fname.endswith('.gz'):
        # Text mode replaces the manual bytes.decode; UTF-8 is what
        # bytes.decode used by default.
        stream = gzip.open(fname, 'rt', encoding='utf-8')
    else:
        stream = open(fname, 'r')

    # The with-block closes the file when the generator is exhausted,
    # closed, or garbage-collected; previously the handle was never closed.
    with stream:
        for line in stream:
            if not line.strip():
                continue

            split = line.split('|')
            row_id = int(split[0].strip())

            if len(split) != 4:
                # Only four-field lines carry a label, propensity and features.
                continue

            raw_label = split[1]
            assert raw_label.startswith('l')
            raw_label = raw_label.lstrip('l ').strip()
            if raw_label == '0.999':
                label = 0
            elif raw_label == '0.001':
                label = 1
            else:
                raise ValueError(
                    'unexpected label %r on line with id %d' % (raw_label, row_id))

            raw_propensity = split[2]
            assert raw_propensity.startswith('p')
            propensity = float(raw_propensity.lstrip('p ').strip())

            features = split[3].lstrip('f ').strip()

            f0, f1, idx, val = parse_features(features)
            idx = np.array(idx, dtype=np.uint32)
            val = np.array(val, dtype=np.uint8)
            yield LabeledLine(row_id, f0, f1, idx, val, propensity, label)

In [26]:
import itertools
# Folds 0-2 form the training stream; the readers are lazy generators, so
# nothing is read from disk until it_train is iterated.
train_fold_paths = ('data/train_0.txt', 'data/train_1.txt', 'data/train_2.txt')
it0, it1, it2 = map(read_train, train_fold_paths)
it_train = itertools.chain(it0, it1, it2)


In [27]:
# Drain the training iterator into a list of records, with a progress bar.
df_train = list(tqdm(it_train))




In [35]:
# Turn the collected list of records into a DataFrame. The name is rebound,
# so re-running this cell feeds the DataFrame itself back into the constructor.
df_train = pd.DataFrame(df_train)

In [39]:
# Cache the parsed training frame so the slow text parsing can be skipped.
with open('tmp/df_train.bin', 'wb') as fout:
    pickle.dump(df_train, fout)

In [28]:
# Fold 3 is held out as the validation stream (read lazily).
it_val = read_train('data/train_3.txt')

In [29]:
# Drain the validation iterator into a list of records, with a progress bar.
df_val = list(tqdm(it_val))

In [40]:
# Turn the collected list of records into a DataFrame. The name is rebound,
# so re-running this cell feeds the DataFrame itself back into the constructor.
df_val = pd.DataFrame(df_val)

In [41]:
# Cache the parsed validation frame so the slow text parsing can be skipped.
with open('tmp/df_val.bin', 'wb') as fout:
    pickle.dump(df_val, fout)

Read the cached data frames and convert them to arrays

In [1]:
import pickle

import pandas as pd
import numpy as np
import scipy.sparse as sp

In [2]:
# Reload the cached training frame.
# NOTE: pickle.load executes arbitrary code; only use it on files this
# notebook wrote itself.
with open('tmp/df_train.bin', 'rb') as fin:
    df_train = pickle.load(fin)

In [2]:
# Reload the cached validation frame.
# NOTE: pickle.load executes arbitrary code; only use it on files this
# notebook wrote itself.
with open('tmp/df_val.bin', 'rb') as fin:
    df_val = pickle.load(fin)

In [3]:
def to_csr(cols, vals, shape=74000):
    """Assemble per-row column indices and values into a CSR matrix.

    Parameters
    ----------
    cols : sequence of array-like
        One entry per row: the column indices stored in that row.
    vals : sequence of array-like
        One entry per row: the values matching ``cols`` position by position.
    shape : int, optional
        Number of columns of the result (the row count is ``len(cols)``).

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape ``(len(cols), shape)`` holding uint8 values.

    Raises
    ------
    ValueError
        If ``cols`` and ``vals`` do not have the same number of rows.
    """
    n_rows = len(cols)
    if len(vals) != n_rows:
        raise ValueError(
            'cols and vals must have the same number of rows, got %d and %d'
            % (n_rows, len(vals)))

    # Row pointers are accumulated in int64 explicitly, so the running total
    # does not depend on the platform's default integer width.
    indptr = np.zeros(n_rows + 1, dtype=np.int64)

    if n_rows:
        indptr[1:] = np.cumsum([len(c) for c in cols], dtype=np.int64)
        columns = np.concatenate(cols).astype('uint32')
        values = np.concatenate(vals).astype('uint8')
    else:
        # np.concatenate rejects an empty list; build the empty arrays directly.
        columns = np.zeros(0, dtype='uint32')
        values = np.zeros(0, dtype='uint8')

    return sp.csr_matrix((values, columns, indptr), shape=(n_rows, shape))

In [4]:
# Build the sparse feature matrices from the per-row index/value arrays.
X_train = to_csr(df_train['idx'].tolist(), df_train['val'].tolist())
X_val = to_csr(df_val['idx'].tolist(), df_val['val'].tolist())

In [5]:
# Persist both sparse matrices uncompressed.
for npz_path, matrix in (
    ('tmp/X_train_sparse.npz', X_train),
    ('tmp/X_val_sparse.npz', X_val),
):
    sp.save_npz(npz_path, matrix, compressed=False)

In [9]:
# Binary targets as compact uint8 vectors.
y_train = df_train['label'].values.astype(np.uint8)
y_val = df_val['label'].values.astype(np.uint8)

In [6]:
# Validation propensities as float32. The misspelled name is kept because
# the file saved from it later in this notebook uses the same spelling.
prospensity_val = df_val['propensity'].astype(np.float32)

In [15]:
# The two leading features (f0, f1) as a dense uint16 block.
# NOTE(review): the uint16 cast assumes both fit in 0..65535 — confirm.
X_num_train = df_train.loc[:, ['f0', 'f1']].values.astype(np.uint16)
X_num_val = df_val.loc[:, ['f0', 'f1']].values.astype(np.uint16)

In [18]:
# Persist targets and dense features next to the sparse matrices.
for npy_path, array in (
    ('tmp/y_train.npy', y_train),
    ('tmp/y_val.npy', y_val),
    ('tmp/X_num_train.npy', X_num_train),
    ('tmp/X_num_val.npy', X_num_val),
):
    np.save(npy_path, array)

In [7]:
# Persist the validation propensities (file name keeps the variable's spelling).
np.save('tmp/prospensity_val.npy', prospensity_val)

Restart the kernel here: the cells below only need the arrays cached in tmp/

In [1]:
import pickle

import pandas as pd
import numpy as np
import scipy.sparse as sp

In [2]:
# Reload the cached sparse feature matrices.
X_train, X_val = (
    sp.load_npz(npz_path)
    for npz_path in ('tmp/X_train_sparse.npz', 'tmp/X_val_sparse.npz')
)

In [3]:
# Reload the cached targets and dense features.
y_train, y_val = np.load('tmp/y_train.npy'), np.load('tmp/y_val.npy')

X_num_train, X_num_val = np.load('tmp/X_num_train.npy'), np.load('tmp/X_num_val.npy')

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from time import time

In [5]:
# Per column: the number of training rows with a positive entry. The sparse
# sum comes back as a 1 x n_cols matrix, hence the [0] to get a flat vector.
cnt = np.asarray((X_train > 0).sum(axis=0))[0]
# Keep only columns that are present in at least 10 training rows.
mask = cnt >= 10

In [6]:
# Share of columns that survive the frequency filter (mask is cnt >= 10).
mask.mean()

0.53309459459459463

In [7]:
# Apply the same column filter to both splits so their columns stay aligned.
X_train_f, X_val_f = X_train[:, mask], X_val[:, mask]

In [8]:
# Display the reprs (shape, dtype, stored elements) of the filtered matrices.
X_train_f, X_val_f

(<10631142x39449 sparse matrix of type '<class 'numpy.uint8'>'
 	with 261790690 stored elements in Compressed Sparse Row format>,
 <3544334x39449 sparse matrix of type '<class 'numpy.uint8'>'
 	with 87299061 stored elements in Compressed Sparse Row format>)

In [9]:
# Experiment log, truncated on open; the training loop writes to it and the
# final cell of the notebook closes it.
log = open('exp.log', 'w')

In [10]:
# Sweep the inverse regularisation strength C of an L1-penalised logistic
# regression and report validation AUC per setting.
for C in [0.05, 0.1, 0.5, 1]:
    t0 = time()

    # The solver is pinned because the L1 penalty is not supported by every
    # solver; relying on the library default makes this cell fail on
    # scikit-learn releases whose default solver is lbfgs.
    lr = LogisticRegression(penalty='l1', C=C, solver='liblinear', random_state=1)
    lr.fit(X_train_f, y_train)

    y_pred = lr.decision_function(X_val_f)
    auc = roc_auc_score(y_val, y_pred)

    # Format once so the console and the log file report the same timing
    # (fit plus scoring).
    message = 'C=%s, took %.3fs, auc=%.3f' % (C, time() - t0, auc)
    print(message)
    log.write(message + '\n')
    log.flush()

C=0.05, took 7730.881s, auc=0.731
C=0.1, took 7685.568s, auc=0.733
C=0.5, took 10003.983s, auc=0.734
C=1, took 9697.831s, auc=0.733


In [11]:
;

''

In [12]:
# Flush any buffered text and release the experiment log handle.
log.flush()
log.close()