In [1]:
%%time
import warnings; warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from keras.layers import Input, Dense, BatchNormalization
from keras.models import Model
from keras import optimizers, activations, losses


# Dataset dimensions: customers (index i), products (index j), observed weeks (index t).
NUM_CUSTS, NUM_PRODS, NUM_WEEKS = 2000, 40, 49

# Number of preceding weeks of purchase history fed to the model for each sample.
WINDOW_SIZE = 4

# Width of one feature row:
#   NUM_PRODS   customer purchase-count vector
# + NUM_PRODS   one-hot product id
# + 3           product popularity, discount, advertised flag
# + WINDOW_SIZE purchase history of this product by this customer
ROW_INP_LEN = 2 * NUM_PRODS + 3 + WINDOW_SIZE

Using TensorFlow backend.


CPU times: user 1.05 s, sys: 253 ms, total: 1.3 s
Wall time: 1.11 s


In [2]:
%%time
def load_train_data(random_state=123, valid_size=0.1):
    """
    Load train/test data from disk and perform preprocessing to prepare it for classifiers.

    Reads './train.csv' (columns used: i, j, t, price, advertised) and
    './promotion_schedule.csv' (columns used: j, discount, advertised).

    Every feature row describes one (customer i, product j, week t) triple:
    - [0:NUM_PRODS]              log1p purchase counts of customer i per product
    - [NUM_PRODS:NUM_PRODS*2]    one-hot encoding of product j
    - [NUM_PRODS*2]              log1p of the total number of purchases of product j
    - [NUM_PRODS*2+1]            discount of product j in week t
    - [NUM_PRODS*2+2]            advertised flag of product j in week t
    - [NUM_PRODS*2+3:]           purchases of product j by customer i in the WINDOW_SIZE previous weeks

    Labels have two columns: [purchased, not purchased].

    Parameters:
    - random_state: seed forwarded to train_test_split
    - valid_size: size of the validation split, forwarded to train_test_split as `test_size`

    Returns:
    - X_train, y_train, X_val, y_val, X_test
    """
    def cast(x):
        # Convert a single-element selection (Series/array/scalar) to float.
        # Anything that is empty, ambiguous (more than one value) or
        # non-numeric maps to 0. The size check is explicit so we do not depend
        # on float(Series), which pandas has deprecated; with a blanket except
        # that deprecation would silently turn every feature into 0.
        values = np.asarray(x).ravel()
        if values.size != 1:
            return 0.0
        try:
            return float(values[0])
        except (TypeError, ValueError):
            return 0.0

    X = np.zeros((NUM_CUSTS, NUM_PRODS, NUM_WEEKS, ROW_INP_LEN))
    X_test = np.zeros((NUM_CUSTS, NUM_PRODS, 1, ROW_INP_LEN))
    y = np.zeros((NUM_CUSTS, NUM_WEEKS, NUM_PRODS))

    df = pd.read_csv('./train.csv')

    # Add "discount" column.
    # org_price[j] is the non-advertised price of product j; this relies on
    # every product having exactly one distinct non-advertised price.
    org_price = df.loc[(df['advertised'] == 0), ['j', 'price']].drop_duplicates().sort_values('j')['price'].values
    df['discount'] = 1 - df['price'].values / org_price[df['j'].astype(int).values]

    # Pre-computed boolean masks for each customer / product / week
    cond_i = [df['i'] == i for i in range(NUM_CUSTS)]
    cond_j = [df['j'] == j for j in range(NUM_PRODS)]
    cond_t = [df['t'] == t for t in range(NUM_WEEKS)]

    # Describe each product by a value: how many times prod_j has been purchased
    # ex. prod_0 = 3251, prod_6 = 16190
    # np.log1p: for normalization
    prod_val = [np.log1p(len(df[cond_j[j]])) for j in range(NUM_PRODS)]

    # Represent each user by a vector: how many times user_i purchased prod_j
    # ex. user_0 = [2 2 1 ... 0 0 4], user_2 = [0 2 0 ... 0 0 5]
    # gives a sort of unique identity and relates similar users to each other
    user_vec = [[np.log1p(len(df[cond_i[i] & cond_j[j]]))
                 for j in range(NUM_PRODS)] for i in range(NUM_CUSTS)]

    # Discount history (0 when the week has no single well-defined value,
    # e.g. the product was not purchased at all that week)
    disc_his = [[cast(df.loc[cond_j[j] & cond_t[t], 'discount'].drop_duplicates())
                 for t in range(NUM_WEEKS)] for j in range(NUM_PRODS)]

    # Advertise history (same fallback to 0 as above)
    advr_his = [[cast(df.loc[cond_j[j] & cond_t[t], 'advertised'].drop_duplicates())
                 for t in range(NUM_WEEKS)] for j in range(NUM_PRODS)]

    # Fill in y with output-labels
    for i in range(NUM_CUSTS):
        for t in range(NUM_WEEKS):
            # Label all the products purchased by user_i in week_t
            y[i, t, df.loc[cond_i[i] & cond_t[t], 'j'].values] = 1

    # Fill in the input matrix X (weeks before WINDOW_SIZE lack a full history
    # window and are dropped by the reshape below)
    for i in range(NUM_CUSTS):
        for j in range(NUM_PRODS):
            for t in range(WINDOW_SIZE, NUM_WEEKS):
                # [0:NUM_PRODS]: user_i vector
                X[i, j, t, :NUM_PRODS] = user_vec[i]
                # [NUM_PRODS:NUM_PRODS*2]: prod_j vector "one hot vector"
                X[i, j, t, NUM_PRODS+j] = 1
                # [NUM_PRODS*2]: prod_j value
                X[i, j, t, NUM_PRODS*2] = prod_val[j]
                # [NUM_PRODS*2+1]: prod_j discount in week_t
                X[i, j, t, NUM_PRODS*2+1] = disc_his[j][t]
                # [NUM_PRODS*2+2]: is prod_j advertised in week_t
                X[i, j, t, NUM_PRODS*2+2] = advr_his[j][t]
                # [NUM_PRODS*2+3:]: user_i purchase history of prod_j over the previous WINDOW_SIZE weeks
                X[i, j, t, NUM_PRODS*2+3:] = y[i, t-WINDOW_SIZE:t, j]

    # Fill in the test input matrix X_test
    tps = pd.read_csv('./promotion_schedule.csv')
    cond_j_test = [tps['j'] == j for j in range(NUM_PRODS)]
    # The promotion schedule depends only on the product, so look it up once
    # per product instead of once per (customer, product) pair.
    test_disc = [cast(tps.loc[cond_j_test[j], 'discount']) for j in range(NUM_PRODS)]
    test_advr = [cast(tps.loc[cond_j_test[j], 'advertised']) for j in range(NUM_PRODS)]
    for i in range(NUM_CUSTS):
        for j in range(NUM_PRODS):
            X_test[i, j, 0, :NUM_PRODS] = user_vec[i]
            X_test[i, j, 0, NUM_PRODS+j] = 1
            X_test[i, j, 0, NUM_PRODS*2] = prod_val[j]
            X_test[i, j, 0, NUM_PRODS*2+1] = test_disc[j]
            X_test[i, j, 0, NUM_PRODS*2+2] = test_advr[j]
            # History window for the test week = the last WINDOW_SIZE observed weeks
            X_test[i, j, 0, NUM_PRODS*2+3:] = y[i, -WINDOW_SIZE:, j]

    # Reshape X, X_test and y into flat (sample, feature) form.
    # y is transposed to (customer, product, week) so its row order matches X.
    X = X[:, :, WINDOW_SIZE:].reshape(-1, ROW_INP_LEN)
    X_test = X_test.reshape(-1, ROW_INP_LEN)
    y = y[:, WINDOW_SIZE:].transpose((0, 2, 1)).reshape(-1, 1)
    # Two label columns: column 0 = purchased, column 1 = not purchased
    y = np.concatenate((y.astype(np.int8), (y == 0).astype(np.int8)), axis=1)

    # Split X and y to train and valid sets.
    # train_test_split has no `valid_size` parameter; the hold-out size is `test_size`.
    X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=random_state, test_size=valid_size)

    return X_train, y_train, X_val, y_val, X_test


# Build the train/validation/test arrays once; the later cells read these
# names as notebook-level globals (get_model uses X_train.shape[1]).
X_train, y_train, X_val, y_val, X_test = load_train_data()

CPU times: user 2min 14s, sys: 2.48 s, total: 2min 17s
Wall time: 2min 26s


In [3]:
%%time
def get_model():
    """
    Build and compile the feed-forward classifier.

    The input width is taken from the notebook-level X_train. The network is
    Dense(128) -> BatchNormalization -> Dense(32) -> Dense(8) with ELU
    activations of decreasing alpha, followed by a 2-unit softmax output.
    It is compiled with binary cross-entropy and Adam.

    Returns:
    - the compiled keras Model
    """
    def elu_with(alpha):
        # ELU activation bound to a fixed alpha
        return lambda x: activations.elu(x, alpha=alpha)

    features = Input(shape=[X_train.shape[1]])

    normalize = BatchNormalization()
    hidden = normalize(Dense(128, activation=elu_with(0.8))(features))
    hidden = Dense(32, activation=elu_with(0.2))(hidden)
    hidden = Dense(8, activation=elu_with(0.05))(hidden)

    probabilities = Dense(2, activation=activations.softmax)(hidden)

    net = Model([features], probabilities)
    net.compile(
        loss=losses.binary_crossentropy,
        optimizer=optimizers.Adam(lr=3e-3, beta_1=0.92, beta_2=0.94),
    )

    return net


model = get_model()
# Train one epoch at a time so validation ROC-AUC can be reported after each,
# doubling the batch size every round (256, 512, ..., 131072).
#
# NOTE: the previous call passed class_weight="balenced". Keras only accepts a
# dict {class_index: weight} for class_weight; the misspelled string was
# silently ignored by old Keras and is rejected by newer versions. It is
# removed here, which keeps the training that was actually performed. To weight
# classes for real, pass e.g. class_weight={0: w_purchased, 1: w_not_purchased}.
for i in range(10):
    model.fit(X_train, y_train, epochs=1, batch_size=2**(8+i), verbose=0)
    print("[EPOCH {}, ROC-AUC Valid] {}".format(i, roc_auc_score(y_val, model.predict(X_val, batch_size=512))))

[EPOCH 0, ROC-AUC Valid] 0.865060010432247
[EPOCH 1, ROC-AUC Valid] 0.8843904142782171
[EPOCH 2, ROC-AUC Valid] 0.8942961787784827
[EPOCH 3, ROC-AUC Valid] 0.8956277427320496
[EPOCH 4, ROC-AUC Valid] 0.8965357004066832
[EPOCH 5, ROC-AUC Valid] 0.8979278411671864
[EPOCH 6, ROC-AUC Valid] 0.8980160118058149
[EPOCH 7, ROC-AUC Valid] 0.8986029549049459
[EPOCH 8, ROC-AUC Valid] 0.8987987514061534
[EPOCH 9, ROC-AUC Valid] 0.8986597412090079
CPU times: user 4min 51s, sys: 29.9 s, total: 5min 21s
Wall time: 3min 45s


In [4]:
%%time
# Column 0 of the model output corresponds to the "purchased" label column.
y_test = model.predict(X_test, batch_size=512)[:, 0]

# X_test rows are ordered customer-major, so flat row r belongs to
# customer r // NUM_PRODS and product r % NUM_PRODS.
submission = pd.DataFrame(
    [[row // NUM_PRODS, row % NUM_PRODS, y_test[row]] for row in range(NUM_CUSTS * NUM_PRODS)],
    columns=['user_id', 'product_id', 'prediction'],
)
submission.to_csv("week_50_predictions.csv", index=False, float_format='%.17f')

CPU times: user 760 ms, sys: 39.1 ms, total: 799 ms
Wall time: 743 ms
