In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Dense,Embedding,LSTM,Dropout,Bidirectional,Input,BatchNormalization,TimeDistributed,CuDNNGRU
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping,ModelCheckpoint,ReduceLROnPlateau
from keras.layers.recurrent import LSTM, GRU, SimpleRNN
#from keras_contrib.layers import CRF

Using TensorFlow backend.


In [4]:
    # Load data
    train = pd.read_csv("train.csv")
    test = pd.read_csv("test.csv")
    
    # Create labels: every challenge a user took after sequence position 10.
    # Selecting with .loc and renaming without inplace=True yields a fresh
    # frame instead of mutating a slice of `train`.
    label = (
        train.loc[train.challenge_sequence > 10, ['user_id', 'challenge']]
        .rename(columns={'challenge': 'label'})
    )
    
    # Treat the sequence of challenges (positions <= 10) as a space-separated text
    df = train[train.challenge_sequence <= 10].groupby('user_id').challenge.aggregate(lambda x: ' '.join(x)).reset_index()
    
    # Merge labels; user_id is the only column the two frames share
    df = df.merge(label, on='user_id')
    
    # Validation split for early stopping
    df_train, df_validation = train_test_split(df.sample(frac=1,random_state=123), test_size=0.05, random_state=123)
    # Explicit copies so the column assignments below do not write into a
    # slice of `df` (avoids SettingWithCopyWarning).
    df_train = df_train.copy()
    df_validation = df_validation.copy()
    
    # Load all the challenges
    challenges = pd.read_csv('challenge_data.csv')
    
    # Encode challenges as integer class ids
    encoder = LabelEncoder()
    encoder.fit(challenges['challenge_ID'])
    df_train['brand_id_encoded'] = encoder.transform(df_train.label)
    df_validation['brand_id_encoded'] = encoder.transform(df_validation.label)
    
    # Tokenize text (vocabulary is learned from the training split only)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(df_train['challenge'])
    
    # Constants
    # Tokenizer ids start at 1 (0 is the padding value), so the largest id is
    # len(word_index). Embedding(input_dim=NB_WORDS) needs max id + 1 rows,
    # otherwise the last token indexes past the embedding table.
    # NOTE: weight files saved from models built with the previous (smaller)
    # NB_WORDS no longer match the Embedding shape and must be regenerated.
    NB_WORDS = len(tokenizer.word_index) + 1
    MAX_SEQUENCE_LENGTH = 10
    N_CATEGORIES = challenges.shape[0]
    
    # Create sequences
    sequences_train = tokenizer.texts_to_sequences(df_train['challenge'])
    sequences_validation = tokenizer.texts_to_sequences(df_validation['challenge'])
    
    # Pad sequences (pad and truncate at the end, fixed length)
    x_train = pad_sequences(sequences_train, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
    x_validation = pad_sequences(sequences_validation, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
    
    # Set Labels
    y_train = df_train['brand_id_encoded'].values
    y_validation = df_validation['brand_id_encoded'].values

    # Test preprocessing
    def padding(text):
        """Convert texts to fixed-length id sequences.

        Each text is tokenized with the fitted module-level `tokenizer`, then
        padded / truncated at the end to MAX_SEQUENCE_LENGTH.
        """
        token_ids = tokenizer.texts_to_sequences(text)
        return pad_sequences(
            token_ids,
            maxlen=MAX_SEQUENCE_LENGTH,
            padding='post',
            truncating='post',
        )

    # One space-joined challenge string per test user (positions <= 10 only)
    test_text = (
        test[test.challenge_sequence <= 10]
        .groupby('user_id')
        .challenge
        .aggregate(lambda seq: ' '.join(seq))
        .reset_index()
    )
    x_test = padding(test_text.challenge)

In [9]:
    # Model callbacks
    path = 'best_model_weights'

    # Stops training after 50 epochs without val_loss improvement
    es_callback = EarlyStopping(patience=50, monitor="val_loss")

    # Saves only the weights of the best val_loss epoch to '<path>.hdf5'
    mc_callback = ModelCheckpoint(
        '{}.hdf5'.format(path),
        monitor='val_loss',
        verbose=0,
        save_best_only=True,
        save_weights_only=True,
        mode='auto',
        period=1,
    )

    # Halves the learning rate after 5 epochs without val_accuracy
    # improvement, never going below 0.001
    lr_callback = ReduceLROnPlateau(
        monitor='val_accuracy',
        factor=0.5,
        patience=5,
        min_lr=0.001,
        verbose=1,
    )

    # Only the LR scheduler is registered; es_callback and mc_callback are
    # defined above but not passed to fit().
    callbacks = [lr_callback]

In [48]:
    # NN architecture
    def get_model(path='', lr=0.005, dim=256):
        """Build and compile the bidirectional-LSTM next-challenge classifier.

        Architecture: Embedding -> BatchNormalization -> Bidirectional(LSTM)
        -> Dropout -> Dense softmax over N_CATEGORIES.

        Args:
            path: optional weights file; when non-empty the weights are loaded
                into the freshly built model before compiling.
            lr: learning rate for the Adam optimizer.
            dim: size of both the embedding vectors and the LSTM hidden state.
                Given a default because a parameter without a default may not
                follow defaulted ones (the previous signature was a
                SyntaxError); callers passing `dim=` are unaffected.

        Returns:
            The compiled keras Model.
        """
        inp = Input(shape=(MAX_SEQUENCE_LENGTH, ))
        x = Embedding(NB_WORDS, dim)(inp)
        x = BatchNormalization()(x)
        x = Bidirectional(LSTM(dim, dropout=0.1, recurrent_dropout=0.1))(x)
        x = Dropout(0.5)(x)
        x = Dense(N_CATEGORIES, activation="softmax")(x)
        model = Model(inputs=inp, outputs=x)

        # Restore previously saved weights only when a file was requested
        if path:
            model.load_weights(path)
        model.compile(
            loss='sparse_categorical_crossentropy',
            optimizer=Adam(lr=lr),
            metrics=['accuracy'],
        )
        return model
    
    # Initialize the LSTM models: hidden sizes 256, 512 and 128 (built in that order)
    model0, model2, model4 = [get_model(dim=size) for size in (256, 512, 128)]

In [49]:
# Train the 256-unit LSTM model, evaluating on the validation split each epoch
model0.fit(
    x=x_train,
    y=y_train,
    batch_size=1024,
    epochs=100,
    callbacks=callbacks,
    validation_data=(x_validation, y_validation),
)

# Load best weights (left disabled)
#model = get_model('{}.hdf5'.format(path))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 198166 samples, validate on 10430 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100

Epoch 00008: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100

Epoch 00013: ReduceLROnPlateau reducing learning rate to 0.0012499999720603228.
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100

Epoch 00018: ReduceLROnPlateau reducing learning rate to 0.001.
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100

Epoch 00023: ReduceLROnPlateau reducing learning rate to 0.001.
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100

Epoch 00028: ReduceLROnPlateau reducing learning rate to 0.001.
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100

Epoch 00033: ReduceLROnPlateau reducing learning rate to 0.001.
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100

Epoch 00038: ReduceLROnPlateau reducing l

<keras.callbacks.callbacks.History at 0x7f518f426080>

In [50]:
# Restore the saved LSTM weights; each file name carries the hidden size
# of the model it is loaded into (256 / 512 / 128).
for lstm_model, weights_file in (
    (model0, 'LSTM_256_100epoch.hdf5'),
    (model2, 'LSTM_512_100epoch.hdf5'),
    (model4, 'LSTM_128_100epoch.hdf5'),
):
    lstm_model.load_weights(weights_file)

In [28]:
    # Class probabilities from the 256-unit LSTM, then the 3 highest-scoring
    # class ids per row, best first (ascending argsort, last 3, flipped)
    pred0 = model0.predict(x=x_test, batch_size=2048)
    pred = np.fliplr(np.argsort(pred0, axis=1)[:, -3:])

In [36]:
    # Class probabilities from the 512-unit LSTM
    pred2 = model2.predict(x=x_test, batch_size=2048)

In [52]:
    # Class probabilities from the 128-unit LSTM and its top-3 class ids, best first
    pred4 = model4.predict(x=x_test, batch_size=2048)
    pred = np.fliplr(np.argsort(pred4, axis=1)[:, -3:])

In [40]:
def get_model1(path='', lr=0.005, dim=256):
    """Build and compile the bidirectional-GRU next-challenge classifier.

    Architecture: Embedding -> BatchNormalization -> Bidirectional(GRU)
    -> Dropout -> Dense softmax over N_CATEGORIES.

    Args:
        path: optional weights file; when non-empty the weights are loaded
            into the freshly built model before compiling.
        lr: learning rate for the Adam optimizer.
        dim: size of both the embedding vectors and the GRU hidden state.
            Given a default because a parameter without a default may not
            follow defaulted ones (the previous signature was a
            SyntaxError); callers passing `dim=` are unaffected.

    Returns:
        The compiled keras Model.
    """
    inp = Input(shape=(MAX_SEQUENCE_LENGTH, ))
    x = Embedding(NB_WORDS, dim)(inp)
    x = BatchNormalization()(x)
    x = Bidirectional(GRU(dim))(x)
    x = Dropout(0.5)(x)
    x = Dense(N_CATEGORIES, activation="softmax")(x)
    model = Model(inputs=inp, outputs=x)

    # Restore previously saved weights only when a file was requested
    if path:
        model.load_weights(path)
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=Adam(lr=lr),
        metrics=['accuracy'],
    )
    return model

# GRU models with hidden sizes 256, 512 and 128 (built in that order)
model1, model3, model5 = [get_model1(dim=size) for size in (256, 512, 128)]

In [41]:
# Train the 256-unit GRU model, evaluating on the validation split each epoch
model1.fit(
    x=x_train,
    y=y_train,
    batch_size=1024,
    epochs=100,
    callbacks=callbacks,
    validation_data=(x_validation, y_validation),
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 198166 samples, validate on 10430 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100

Epoch 00008: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100

Epoch 00013: ReduceLROnPlateau reducing learning rate to 0.0012499999720603228.
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100

Epoch 00018: ReduceLROnPlateau reducing learning rate to 0.001.
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100

Epoch 00023: ReduceLROnPlateau reducing learning rate to 0.001.
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100

Epoch 00028: ReduceLROnPlateau reducing learning rate to 0.001.
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100

Epoch 00033: ReduceLROnPlateau reducing learning rate to 0.001.
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100

Epoch 00038: ReduceLROnPlateau reducing l

<keras.callbacks.callbacks.History at 0x7f51fc3a9630>

In [42]:
# Restore saved GRU weights. Each model must load the file trained with its
# own hidden size: model1 is dim=256, model3 dim=512, model5 dim=128. Loading
# the 512-unit file into all three fails on a layer-shape mismatch.
# NOTE(review): the 256/128 file names follow the LSTM_<dim>_100epoch.hdf5
# naming used for the LSTM models - confirm these files exist on disk.
model1.load_weights('GRU_256_100epoch.hdf5')
model3.load_weights('GRU_512_100epoch.hdf5')
model5.load_weights('GRU_128_100epoch.hdf5')

In [43]:
# Class probabilities from the 512-unit GRU and its top-3 class ids, best first
pred3 = model3.predict(x=x_test, batch_size=2048)
pred = np.fliplr(np.argsort(pred3, axis=1)[:, -3:])

In [19]:
# Class probabilities from the 256-unit GRU and its top-3 class ids, best first
pred1 = model1.predict(x=x_test, batch_size=2048)
pred = np.fliplr(np.argsort(pred1, axis=1)[:, -3:])

In [None]:
# Class probabilities from the 128-unit GRU
pred5 = model5.predict(x_test,batch_size=2048)
# Top-3 class ids per row, best first. Ranks pred5: the previous code ranked
# pred3 here, so this cell reported the 512-unit model's picks.
pred = pred5.argsort(axis=1)[:,-3:][:,::-1]

In [54]:
# Ensemble: unweighted mean of the six models' class-probability matrices
# (summed left to right starting from pred0, then divided by the model count)
ensemble_members = (pred0, pred1, pred2, pred3, pred4, pred5)
overall_pred = sum(ensemble_members[1:], ensemble_members[0]) / len(ensemble_members)

In [55]:
# Top-3 class ids per row from the averaged probabilities, best first
pred = np.fliplr(np.argsort(overall_pred, axis=1)[:, -3:])

In [57]:
    # Write Predictions
    # One frame per rank i: the i-th best class of every test user is written
    # under the key '<user_id>_<11 + i>' (i.e. suffixes 11, 12, 13).
    df_list = []
    for i in range(3):
        # .copy() gives an independent frame, so the column assignments below
        # do not write into a slice of test_text (no SettingWithCopyWarning).
        test_11 = test_text[['user_id']].copy()
        test_11['user_sequence'] = test_11.user_id.astype(str) + '_' + str(i + 11)
        # Map encoded class ids back to the original challenge ids
        test_11['challenge'] = encoder.inverse_transform(pred[:, i])
        df_list.append(test_11[['user_sequence', 'challenge']])
    pd.concat(df_list).to_csv('Ensemble_512_256_128.csv', index=False)