In [3]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import feather
import pickle

In [4]:
# Load the pickled (all_topics, selected_topics) pair from disk.
# NOTE(review): pickle.load can execute arbitrary code; only use a trusted topics.bin.
with open('topics.bin', 'rb') as f:
    all_topics, selected_topics = pickle.load(f)

In [5]:
# Read the train / validation / test splits from their feather files.
df_train = feather.read_dataframe('df_train.feather')
df_val = feather.read_dataframe('df_val.feather')
df_test = feather.read_dataframe('df_test.feather')

In [4]:
# Stack the train and validation splits into one frame with a fresh 0..n-1 index;
# ignore_index=True renumbers the rows directly instead of a separate reset_index pass.
df_all = pd.concat([df_train, df_val], ignore_index=True)
# Drop the per-split frames now that df_all holds their rows.
# NOTE: after this, the cell cannot be re-run without reloading the splits first.
del df_train, df_val

In [5]:
def get_y(df, topics):
    """Build a binary label-indicator matrix for the rows of ``df``.

    Parameters
    ----------
    df : object exposing ``len()`` and an iterable ``topics`` attribute
        Each ``topics`` entry is a comma-separated string of topic names.
        Entries that are not strings (e.g. None/NaN for a missing value)
        are treated as having no topics.
    topics : sequence of str
        Topic names that define the columns of the result, in order.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array of shape ``(len(df), len(topics))`` where
        ``y[i, j] == 1`` iff row ``i`` lists ``topics[j]``. Topic names
        that are not in ``topics`` are ignored.
    """
    topic_idx = {t: i for (i, t) in enumerate(topics)}
    y = np.zeros((len(df), len(topics)), dtype='uint8')

    # The loop variable is deliberately not called `topics`, so the parameter
    # is not shadowed while iterating.
    for row, row_topics in enumerate(df.topics):
        if not isinstance(row_topics, str):
            # Missing value: leave the row as all zeros instead of crashing on .split.
            continue
        for t in row_topics.split(','):
            col = topic_idx.get(t)
            if col is not None:
                y[row, col] = 1
    return y

# Label matrix for the combined train+validation rows, one column per selected topic.
y_train = get_y(df_all, selected_topics)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import log_loss, f1_score

from sklearn.cross_validation import KFold

# 3-fold shuffled splitter over the rows of df_all with a fixed seed; the training
# loop below iterates `cv` directly to obtain (train_idx, val_idx) pairs.
# NOTE(review): sklearn.cross_validation and this KFold(n=..., n_folds=...) signature
# are the legacy API (removed in scikit-learn 0.20); newer versions need
# sklearn.model_selection.KFold(n_splits=...) together with .split(X).
cv = KFold(n=len(df_all), n_folds=3, shuffle=True, random_state=1)



In [7]:
# TF-IDF over word 1- to 3-grams with English stop words removed and min_df=10.
vec = TfidfVectorizer(stop_words='english', ngram_range=(1, 3), min_df=10)

# The vocabulary is fitted on the train+validation text only; the test text is
# transformed with that fitted vocabulary.
X_train = vec.fit_transform(df_all.body)
X_test = vec.transform(df_test.body)

In [8]:
# Project the TF-IDF matrix onto 180 SVD components (seeded for reproducibility).
svd = TruncatedSVD(n_components=180, random_state=1)

# Fit on the training matrix, apply to both; results are cast to float32.
# Note: X_train / X_test are rebound here, so this cell is not idempotent on re-run.
X_train = svd.fit_transform(X_train).astype('float32')
X_test = svd.transform(X_test).astype('float32')

In [9]:
# Standardise the SVD features; copy=False asks the scaler to work in place.
scaler = StandardScaler(with_mean=True, copy=False)
# Scaling statistics come from the training matrix and are reused for the test matrix.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import SGD, Adam
from keras.regularizers import l1, l2
from keras.callbacks import EarlyStopping
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import PReLU, ELU, LeakyReLU
from keras.callbacks import EarlyStopping, Callback

Using Theano backend.


In [17]:
class WatchlistCallback(Callback):
    """Keras callback that prints hold-out metrics at the end of every epoch.

    Parameters
    ----------
    watchlist : tuple
        ``(X, y)`` pair: the features to predict on and the matching
        ground-truth labels to score against.
    """

    def __init__(self, watchlist):
        # super() must name *this* class so that Callback.__init__ runs;
        # super(Callback, self) would start the lookup after Callback and skip it.
        super(WatchlistCallback, self).__init__()
        self.X, self.y = watchlist

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the watchlist data and print log-loss and micro-F1.

        ``logs`` is accepted for Keras API compatibility and is not used; it
        defaults to None rather than a shared mutable dict.
        """
        y_pred = self.model.predict(self.X, verbose=0)

        print("epoch no %d" % (epoch), end=', ')
        # NOTE(review): log_loss receives 2-D label / prediction matrices here;
        # confirm that sklearn scores them the way this multi-label setup intends.
        print('logloss=%.4f' % log_loss(self.y, y_pred), end=', ')

        # F1 is computed on predictions binarised at a fixed 0.5 threshold.
        f = f1_score(self.y, y_pred >= 0.5, average='micro')
        print('f1=%0.4f' % f)

In [12]:
# Network input width = number of feature columns; output width = number of label columns.
input_dim = X_train.shape[1]
output_dim = y_train.shape[1]

In [13]:
def nnet():
    """Create and compile the feed-forward multi-label classifier.

    Architecture: two hidden blocks of Dense(350) -> PReLU -> Dropout(0.2)
    -> BatchNormalization, followed by a Dense layer with ``output_dim``
    units and a sigmoid activation. Layer sizes are read from the
    module-level ``input_dim`` and ``output_dim``.

    Returns
    -------
    A compiled Keras ``Sequential`` model using binary cross-entropy loss
    and the Adam optimizer (lr=0.001).
    """
    hidden_units = 350
    drop_rate = 0.2

    net = Sequential()

    # Both hidden blocks are identical apart from the declared input width.
    for in_dim in (input_dim, hidden_units):
        net.add(Dense(input_dim=in_dim, units=hidden_units,
                      kernel_initializer='glorot_uniform'))
        net.add(PReLU())
        net.add(Dropout(drop_rate))
        net.add(BatchNormalization())

    # Output head: one sigmoid unit per label.
    net.add(Dense(units=output_dim, kernel_initializer='glorot_uniform'))
    net.add(Activation('sigmoid'))

    net.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.001))
    return net

In [18]:
from time import time

In [25]:
# Cross-validated training: one freshly built network per fold.
# Collected per fold, in fold order:
#   train_preds - predictions on that fold's held-out rows
#   test_preds  - predictions on the full test matrix
#   models      - the fitted model itself
train_preds = []
test_preds = []
models = []

# Every fold trains for this fixed number of epochs; the only callback passed to
# fit() is the watchlist reporter, so nothing stops training early.
epochs = 600

for i, (train_idx, val_idx) in enumerate(cv):
    t0 = time()
    print('fold %d...' % i)
    model = nnet()
    
    # Report metrics on this fold's held-out rows after every epoch.
    watchlist = WatchlistCallback(watchlist=(X_train[val_idx], y_train[val_idx]))
    model.fit(X_train[train_idx], y_train[train_idx], epochs=epochs, 
              batch_size=50000, callbacks=[watchlist], verbose=0)
    models.append(model)

    # Out-of-fold predictions for the held-out rows, plus this fold's test predictions.
    y_pred = model.predict(X_train[val_idx])
    y_pred_test = model.predict(X_test)

    train_preds.append(y_pred)
    test_preds.append(y_pred_test)

    print('fold %d, took %.3fs' % (i, time() - t0))
    print()

fold 0...
epoch no 0, logloss=7.0342, f1=0.0258
epoch no 1, logloss=6.9478, f1=0.0314
epoch no 2, logloss=6.8749, f1=0.0351
epoch no 3, logloss=6.8116, f1=0.0382
epoch no 4, logloss=6.7556, f1=0.0416
epoch no 5, logloss=6.7070, f1=0.0460
epoch no 6, logloss=6.6673, f1=0.0522
epoch no 7, logloss=6.6359, f1=0.0609
epoch no 8, logloss=6.6050, f1=0.0764
epoch no 9, logloss=6.5728, f1=0.1046
epoch no 10, logloss=6.5374, f1=0.1570
epoch no 11, logloss=6.5005, f1=0.2480
epoch no 12, logloss=6.4620, f1=0.3771
epoch no 13, logloss=6.4184, f1=0.5169
epoch no 14, logloss=6.3655, f1=0.5915
epoch no 15, logloss=6.3124, f1=0.5915
epoch no 16, logloss=6.2595, f1=0.5626
epoch no 17, logloss=6.2086, f1=0.5230
epoch no 18, logloss=6.1654, f1=0.4815
epoch no 19, logloss=6.1184, f1=0.4348
epoch no 20, logloss=6.0788, f1=0.3933
epoch no 21, logloss=6.0161, f1=0.3412
epoch no 22, logloss=5.9864, f1=0.3127
epoch no 23, logloss=5.9503, f1=0.2869
epoch no 24, logloss=5.9021, f1=0.2556
epoch no 25, logloss=5.86

In [28]:
# Reassemble the per-fold held-out predictions into one out-of-fold matrix that is
# aligned row-for-row with y_train.
nn_oof_train_pred = np.zeros(y_train.shape, dtype='float32')

# Relies on iterating `cv` again yielding the same val_idx per fold as during
# training, and on train_preds being in fold order.
for i, (train_idx, val_idx) in enumerate(cv):
    nn_oof_train_pred[val_idx] = train_preds[i]

In [30]:
# Sanity check: displays the out-of-fold matrix shape (allocated to match y_train).
nn_oof_train_pred.shape

(177350, 139)

In [33]:
# Sweep the decision threshold from 0 to 1 in steps of 0.05 and print the micro-F1
# of the out-of-fold predictions binarised at each threshold.
for t in np.linspace(0, 1, 21):
    f = f1_score(y_train, nn_oof_train_pred >= t, average='micro')
    print('t=%0.2f, f1=%0.4f' % (t, f))

t=0.00, f1=0.0205
t=0.05, f1=0.6686
t=0.10, f1=0.7299
t=0.15, f1=0.7545
t=0.20, f1=0.7662
t=0.25, f1=0.7708
t=0.30, f1=0.7710
t=0.35, f1=0.7683
t=0.40, f1=0.7632
t=0.45, f1=0.7563
t=0.50, f1=0.7479
t=0.55, f1=0.7374
t=0.60, f1=0.7249
t=0.65, f1=0.7097
t=0.70, f1=0.6921
t=0.75, f1=0.6705
t=0.80, f1=0.6438
t=0.85, f1=0.6082
t=0.90, f1=0.5574
t=0.95, f1=0.4718
t=1.00, f1=0.0022


In [27]:
# Average the per-fold test predictions element-wise into a single test matrix.
# Expects test_preds to still be the list filled by the training loop.
nn_test_pred = np.mean(test_preds, axis=0)

In [34]:
# Persist the (out-of-fold train predictions, averaged test predictions) pair.
with open('nn_preds_all.bin', 'wb') as f:
    pickle.dump((nn_oof_train_pred, nn_test_pred), f)

In [2]:
# Reload the saved prediction pair, so the cells below can run without retraining.
# NOTE(review): pickle.load can execute arbitrary code; only load a file you wrote yourself.
with open('nn_preds_all.bin', 'rb') as f:
    nn_oof_train_pred, nn_test_pred = pickle.load(f)

In [6]:
# Binarise the averaged test predictions at 0.3 (the threshold with the highest
# micro-F1 in the sweep output above) and map each selected topic to its 0/1 column.
# NOTE(review): this rebinds `test_preds`, which earlier held the list of per-fold
# test predictions; re-running the averaging cell after this one would break.
test_preds = dict(zip(selected_topics, (nn_test_pred >= 0.3).astype('uint8').T))

In [7]:
# Assemble the submission frame: an 'id' column taken from df_test['key'], followed by
# one 0/1 column per entry of all_topics. Topics that have no entry in test_preds
# get an all-zero column.
all_zeros = np.zeros(len(df_test), dtype='uint8')

# Gather every column first and build the frame in a single constructor call;
# inserting many columns one at a time into an existing DataFrame is slow and
# leaves it fragmented.
column_order = ['id']
final_columns = {'id': df_test['key']}

for t in all_topics:
    if t not in final_columns:
        column_order.append(t)
    final_columns[t] = test_preds.get(t, all_zeros)

# Pass the column order explicitly so it does not depend on dict ordering.
df_final_pred = pd.DataFrame(final_columns, columns=column_order)

In [10]:
# Write the submission table to nn.csv without the DataFrame index column.
df_final_pred.to_csv('nn.csv', index=False)