In [2]:
import gc

import pandas as pd
import numpy as np
from tqdm import tqdm
import feather
import pickle

from sklearn.metrics import f1_score

from sklearn.cross_validation import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD



In [3]:
with open('topics.bin', 'rb') as f:
    all_topics, selected_topics = pickle.load(f)

In [4]:
df_train = feather.read_dataframe('df_train.feather')
df_val = feather.read_dataframe('df_val.feather')
df_test = feather.read_dataframe('df_test.feather')

In [5]:
df_all = pd.concat([df_train, df_val]).reset_index(drop=1)
del df_train, df_val

In [6]:
def get_y(df, topics):
    topic_idx = {t: i for (i, t) in enumerate(topics)}
    y = np.zeros((len(df), len(topics)), dtype='uint8')

    for idx, topics in enumerate(df.topics):
        for t in topics.split(','):
            if t in topic_idx:
                y[idx, topic_idx[t]] = 1
    return y

y_train = get_y(df_all, selected_topics)

In [7]:
cv = KFold(n=len(df_all), n_folds=3, shuffle=True, random_state=1)

In [8]:
fold = np.zeros(len(df_all), dtype='uint8')

for i, (_, val_idx) in enumerate(cv):
    fold[val_idx] = i

In [19]:
vec = TfidfVectorizer(stop_words='english', ngram_range=(1, 3), min_df=10)

X_train = vec.fit_transform(df_all.body)
X_test = vec.transform(df_test.body)

In [20]:
svd = TruncatedSVD(n_components=10, random_state=1)

X_train = svd.fit_transform(X_train).astype('float32')
X_test = svd.transform(X_test).astype('float32')

with open('svd_train_test_10.bin', 'wb') as f:
    pickle.dump((X_train, X_test), f)

with open('svd_train_test_10.bin', 'rb') as f:
    X_train, X_test = pickle.load(f)

In [12]:
del df_all
del vec, svd
gc.collect()

74

In [13]:
with open('knn3_preds_all.bin', 'rb') as f:
    knn_train, knn_test = pickle.load(f)

In [14]:
with open('nn_preds_all.bin', 'rb') as f:
    nn_train, nn_test = pickle.load(f)

In [15]:
with open('smv_models_pred.bin', 'rb') as f:
    _, svm_train_dict, svm_test_dict = pickle.load(f)
    del _

In [16]:
svm_train = np.zeros_like(knn_train)
svm_test = np.zeros_like(knn_test)

for i, c in enumerate(selected_topics):
    svm_train[:, i] = svm_train_dict[c]
    svm_test[:, i] = svm_test_dict[c]

In [17]:
del svm_train_dict, svm_test_dict
gc.collect()

158

In [207]:
from scipy.stats import rankdata

In [18]:
nn_train.shape

(177350, 139)

In [206]:
nn_train.shape

(177350, 139)

In [217]:
nn_ranks = []
knn_ranks = []
svm_ranks =[]

for i in tqdm(range(nn_train.shape[0])):
    nn = rankdata(nn_train[i])
    nn_ranks.append(nn.astype('uint8'))

    knn = rankdata(knn_train[i])
    knn_ranks.append(knn.astype('uint8'))

    svm = rankdata(svm_train[i])
    svm_ranks.append(svm.astype('uint8'))

100%|██████████| 177350/177350 [00:46<00:00, 3797.63it/s]


In [221]:
nn_ranks = np.array(nn_ranks)
knn_ranks = np.array(knn_ranks)
svm_ranks = np.array(knn_ranks)

In [218]:
nn_ranks_test = []
knn_ranks_test = []
svm_ranks_test =[]

for i in tqdm(range(nn_test.shape[0])):
    nn = rankdata(nn_test[i])
    nn_ranks_test.append(nn.astype('uint8'))

    knn = rankdata(knn_test[i])
    knn_ranks_test.append(knn.astype('uint8'))

    svm = rankdata(svm_test[i])
    svm_ranks_test.append(svm.astype('uint8'))

100%|██████████| 7581/7581 [00:02<00:00, 3702.11it/s]


In [222]:
nn_ranks_test = np.array(nn_ranks_test)
knn_ranks_test = np.array(knn_ranks_test)
svm_ranks_test = np.array(knn_ranks_test)

In [19]:
nn_train_flat = nn_train.flatten(order='C')
nn_test_flat = nn_test.flatten(order='C')
knn_train_flat = knn_train.flatten(order='C')
knn_test_flat = knn_test.flatten(order='C')
svm_train_flat = svm_train.flatten(order='C')
svm_test_flat = svm_test.flatten(order='C')

y = y_train.flatten(order='C')

In [223]:
nn_train_rank_flat = nn_ranks.flatten(order='C')
nn_test_rank_flat = nn_ranks_test.flatten(order='C')
knn_train_rank_flat = knn_ranks.flatten(order='C')
knn_test_rank_flat = knn_ranks_test.flatten(order='C')
svm_train_rank_flat = svm_ranks.flatten(order='C')
svm_test_rank_flat = svm_ranks_test.flatten(order='C')

In [114]:
num_classes = 139

In [20]:
idx = np.arange(0, num_classes, dtype='float32')
idx_train = np.repeat(idx, nn_train.shape[0])
idx_test = np.repeat(idx, nn_test.shape[0])

In [224]:
meta_train = np.array([nn_train_flat, knn_train_flat, svm_train_flat, 
                       nn_train_rank_flat, knn_train_rank_flat, svm_train_rank_flat,
                       idx_train]).T
meta_test = np.array([nn_test_flat, knn_test_flat, svm_test_flat,
                      nn_test_rank_flat, knn_test_rank_flat, svm_test_rank_flat,
                      idx_test]).T

In [22]:
X_train_rep = np.repeat(X_train, num_classes, axis=0)
X_test_rep = np.repeat(X_test, num_classes, axis=0)

In [225]:
X_train = np.hstack([X_train_rep, meta_train])
X_test = np.hstack([X_test_rep, meta_test])

In [28]:
fold_rep = np.repeat(fold, num_classes)

In [226]:
X_train_part = X_train[fold_rep != 0]
X_val = X_train[fold_rep == 0]

y_train = y[fold_rep != 0]
y_val = y[fold_rep == 0]

In [86]:
import xgboost as xgb

In [87]:
groups_train = np.repeat(num_classes, X_train_part.shape[0] // num_classes)
groups_val = np.repeat(num_classes, X_val.shape[0] // num_classes)

In [227]:
features = ['svd_%d' % i for i in range(10)] + \
           ['nn', 'knn', 'svm', 'nn_rank', 'knn_rank', 'svm_rank', 'label_id']

In [229]:
dtrain = xgb.DMatrix(X_train_part, y_train, feature_names=features)
dtrain.set_group(groups_train)

In [230]:
dval = xgb.DMatrix(X_val, y_val, feature_names=features)
dval.set_group(groups_val)

In [249]:
def f1_eval_metric(y_pred, dmatrix):
    y_true = dmatrix.get_label()
    n, = y_true.shape 

    y_true = y_true.reshape(n // num_classes, -1)
    y_pred = y_pred.reshape(n // num_classes, -1)

    f1 = f1_score(y_true, y_pred >= 0.5, average='micro')
    #f1 = f1_score(y_true, y_pred >= 0, average='micro')

    return 'f1_micro', f1

In [253]:
xgb_pars = {
    'eta': 0.3,
    'gamma': 0.0,
    'max_depth': 3,
    'min_child_weight': 1,
    'max_delta_step': 0,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'colsample_bylevel': 1,
    'lambda': 1,
    'alpha': 0,
    'tree_method': 'approx',
    #'objective': 'rank:pairwise',
    'objective': 'binary:logistic',
    #'eval_metric': 'auc',
    'nthread': 12,
    'seed': 42,
    'silent': 1
}
n_rounds = 100

In [254]:
watchlist = [(dtrain, 'train'), (dval, 'val')]

In [255]:
model = xgb.train(xgb_pars, dtrain, num_boost_round=125, verbose_eval=1,
                  evals=watchlist, feval=f1_eval_metric)

[0]	train-error:0.004986	val-error:0.005013	train-f1_micro:0.758129	val-f1_micro:0.756893
[1]	train-error:0.004847	val-error:0.004844	train-f1_micro:0.748555	val-f1_micro:0.749102
[2]	train-error:0.004435	val-error:0.004433	train-f1_micro:0.7651	val-f1_micro:0.765087
[3]	train-error:0.004269	val-error:0.004274	train-f1_micro:0.777693	val-f1_micro:0.777275
[4]	train-error:0.004161	val-error:0.004168	train-f1_micro:0.784009	val-f1_micro:0.783571
[5]	train-error:0.00406	val-error:0.004072	train-f1_micro:0.791551	val-f1_micro:0.790935
[6]	train-error:0.004009	val-error:0.004013	train-f1_micro:0.794399	val-f1_micro:0.794191
[7]	train-error:0.003986	val-error:0.004	train-f1_micro:0.795844	val-f1_micro:0.79509
[8]	train-error:0.003982	val-error:0.003999	train-f1_micro:0.794043	val-f1_micro:0.793148
[9]	train-error:0.00395	val-error:0.003972	train-f1_micro:0.796351	val-f1_micro:0.795193
[10]	train-error:0.003952	val-error:0.003973	train-f1_micro:0.797145	val-f1_micro:0.795996
[11]	train-error:

In [256]:
scores = model.get_score(importance_type='gain')
sorted(scores.items(), key=lambda x: -x[1])

[('nn_rank', 3936.461176666668),
 ('svm', 3297.40996963768),
 ('knn', 2275.5096084259258),
 ('nn', 1577.8046514388493),
 ('knn_rank', 769.506557662338),
 ('svm_rank', 140.05032555555556),
 ('svd_4', 30.836141071428578),
 ('svd_7', 22.373825312500003),
 ('svd_0', 20.932332127659574),
 ('svd_3', 14.446979285714287),
 ('svd_8', 11.907598620689654),
 ('svd_1', 11.8876937037037),
 ('svd_5', 11.863790789473688),
 ('svd_6', 11.425773809523808),
 ('svd_2', 10.448287423076923),
 ('svd_9', 9.287284583333333),
 ('label_id', 6.874789285714286)]

In [261]:
scores = model.get_fscore()
sorted(scores.items(), key=lambda x: -x[1])

[('nn', 139),
 ('svm', 138),
 ('knn', 108),
 ('nn_rank', 81),
 ('knn_rank', 77),
 ('svd_0', 47),
 ('svd_5', 38),
 ('svd_7', 32),
 ('svd_8', 29),
 ('svd_4', 28),
 ('svd_3', 28),
 ('svd_1', 27),
 ('svd_2', 26),
 ('svd_9', 24),
 ('svd_6', 21),
 ('svm_rank', 18),
 ('label_id', 14)]

In [257]:
y_pred = model.predict(dval)
n, = y_pred.shape
y_pred = y_pred.reshape(n // num_classes, -1)

In [258]:
y_val_reshaped = y_val.reshape(n // num_classes, -1)

In [260]:
for t in np.linspace(0, 1, 21):
    f = f1_score(y_val_reshaped, y_pred >= t, average='micro')
    print('t=%0.2f, f1=%0.4f' % (t, f))

t=0.00, f1=0.0205
t=0.05, f1=0.6414
t=0.10, f1=0.7210
t=0.15, f1=0.7605
t=0.20, f1=0.7817
t=0.25, f1=0.7947
t=0.30, f1=0.8028
t=0.35, f1=0.8067
t=0.40, f1=0.8081
t=0.45, f1=0.8081
t=0.50, f1=0.8049
t=0.55, f1=0.7993
t=0.60, f1=0.7911
t=0.65, f1=0.7806
t=0.70, f1=0.7656
t=0.75, f1=0.7465
t=0.80, f1=0.7199
t=0.85, f1=0.6839
t=0.90, f1=0.6310
t=0.95, f1=0.5381
t=1.00, f1=0.0000


  'precision', 'predicted', average, warn_for)


In [245]:
for t in np.linspace(3, 5, 21):
    f = f1_score(y_val_reshaped, y_pred >= t, average='micro')
    print('t=%0.2f, f1=%0.4f' % (t, f))

t=3.00, f1=0.7661
t=3.10, f1=0.7727
t=3.20, f1=0.7790
t=3.30, f1=0.7850
t=3.40, f1=0.7901
t=3.50, f1=0.7943
t=3.60, f1=0.7982
t=3.70, f1=0.8016
t=3.80, f1=0.8040
t=3.90, f1=0.8060
t=4.00, f1=0.8069
t=4.10, f1=0.8069
t=4.20, f1=0.8062
t=4.30, f1=0.8052
t=4.40, f1=0.8030
t=4.50, f1=0.8005
t=4.60, f1=0.7967
t=4.70, f1=0.7926
t=4.80, f1=0.7875
t=4.90, f1=0.7817
t=5.00, f1=0.7749


In [238]:
for t in np.linspace(2, 4, 21):
    f = f1_score(y_val_reshaped, y_pred >= t, average='micro')
    print('t=%0.2f, f1=%0.4f' % (t, f))

t=2.00, f1=0.7225
t=2.10, f1=0.7342
t=2.20, f1=0.7455
t=2.30, f1=0.7557
t=2.40, f1=0.7654
t=2.50, f1=0.7749
t=2.60, f1=0.7830
t=2.70, f1=0.7900
t=2.80, f1=0.7959
t=2.90, f1=0.8014
t=3.00, f1=0.8049
t=3.10, f1=0.8067
t=3.20, f1=0.8065
t=3.30, f1=0.8043
t=3.40, f1=0.8009
t=3.50, f1=0.7945
t=3.60, f1=0.7869
t=3.70, f1=0.7778
t=3.80, f1=0.7660
t=3.90, f1=0.7524
t=4.00, f1=0.7372


In [262]:
dtest = xgb.DMatrix(X_test, feature_names=features)

In [263]:
y_pred = model.predict(dtest)
n, = y_pred.shape
y_pred = y_pred.reshape(n // num_classes, -1)

In [264]:
test_preds = dict(zip(selected_topics, (y_pred >= 0.4).astype('uint8').T))

In [265]:
all_zeros = np.zeros(len(df_test), dtype='uint8')

df_final_pred = pd.DataFrame()
df_final_pred['id'] = df_test['key']

for t in all_topics:
    if t in test_preds:
        df_final_pred[t] = test_preds[t]
    else:
        df_final_pred[t] = all_zeros

In [266]:
df_final_pred.to_csv('xgb3.csv', index=False)