## Link with GitHub project folder

In [2]:
!nvidia-smi

Wed Nov 20 01:16:45 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 430.50       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P8    30W / 149W |      0MiB / 11441MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [0]:
!git clone https://github.com/acmilannesta/Bert-embedding
!pip install keras-bert
# !git clone https://github.com/acmilannesta/eda_nlp

Cloning into 'Bert-embedding'...
remote: Enumerating objects: 147, done.[K
remote: Counting objects: 100% (147/147), done.[K
remote: Compressing objects: 100% (145/145), done.[K
remote: Total 147 (delta 68), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (147/147), 11.52 MiB | 5.76 MiB/s, done.
Resolving deltas: 100% (68/68), done.
Collecting keras-bert
  Downloading https://files.pythonhosted.org/packages/df/fe/bf46de1ef9d1395cd735d8df5402f5d837ef82cfd348a252ad8f32feeaef/keras-bert-0.80.0.tar.gz
Collecting keras-transformer>=0.30.0
  Downloading https://files.pythonhosted.org/packages/0a/57/496b1eab888171b0801a0a44d3245a7874b8d1cc04c1fbfdbb5e3327fc7a/keras-transformer-0.31.0.tar.gz
Collecting keras-pos-embd>=0.10.0
  Downloading https://files.pythonhosted.org/packages/09/70/b63ed8fc660da2bb6ae29b9895401c628da5740c048c190b5d7107cadd02/keras-pos-embd-0.11.0.tar.gz
Collecting keras-multi-head>=0.22.0
  Downloading https://files.pythonhosted.org/packages/40/3e/d0a64bb2ac5

In [0]:
import json
import numpy as np
import pandas as pd
from random import choice
import re, os, gc
import codecs
import boto3
from keras.layers import *
from keras.losses import sparse_categorical_crossentropy
from keras.models import Model, load_model
import keras.backend as K
from keras.optimizers import Adam, SGD
from keras.callbacks import Callback, LearningRateScheduler
import tensorflow as tf
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold, train_test_split
from sklearn.metrics import f1_score
from tqdm import tqdm
from functools import partial
from keras_bert import load_trained_model_from_checkpoint, Tokenizer, AdamWarmup, calc_train_steps, get_custom_objects, get_model
import warnings
warnings.filterwarnings("ignore")
os.environ['AWS_SHARED_CREDENTIALS_FILE'] = 'AWS.txt'
s3 = boto3.Session(profile_name='default').client('s3')

Using TensorFlow backend.


In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Load Dataset
1. Add event weight
2. Reassign event code (0-47) for all
3. Add binary indicators for top 5 words by each event type


In [0]:
train = pd.read_csv('Bert-embedding/CDC/train.csv')
test = pd.read_csv('Bert-embedding/CDC/test.csv')

# Event weight: share of training rows that carry each event code.
# The frame is built with explicit column names (rename_axis / reset_index(name=...))
# instead of renaming whatever column DataFrame(value_counts()) happens to produce,
# because that column name differs between pandas versions.
wt = (train.event.value_counts() / len(train)).rename_axis('event').reset_index(name='weight')
train = train.merge(wt, how='left', on='event')

# Reassign event code: sorted unique event codes -> contiguous class index 0..K-1.
train['event_idx'] = train.event.map({y: x for x, y in enumerate(np.sort(train.event.unique()))})

# Assign weight frequency bucket: 1 for events below 1% of rows, 2 below 5%, 3 otherwise.
train['wt_freq'] = np.where(train.weight < 0.01, 1, np.where(train.weight < 0.05, 2, 3))

## EDA: Word frequencies by Event Type

In [0]:
# Word clouds for the 20 most frequent events
from collections import Counter
from itertools import groupby
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# One concatenated text per event, ordered from most to least frequent event.
train['text_grouped'] = train.groupby('event')['text'].transform(lambda x: ' '.join(x))
x = train[['event', 'text_grouped', 'weight']].drop_duplicates('event').sort_values('weight', ascending=False)
x.reset_index(drop=True, inplace=True)

f, ax = plt.subplots(10, 2, figsize=(30,30))

# Never index past the number of distinct events actually present.
n_panels = min(20, len(x))

for seq in range(n_panels):
    words = x.loc[seq, 'text_grouped'].split(' ')
    # Count words longer than 3 characters, excluding 'WORK'.
    # Keys are kept in sorted order, as the previous groupby(sorted(...)) produced them.
    word_counts = Counter(w for w in words if len(w) > 3 and w != 'WORK')
    counts_dict = dict(sorted(word_counts.items()))

    wordcloud = WordCloud(height=200, width=200, margin=0, collocations=False).generate_from_frequencies(counts_dict)

    panel = ax[seq//2, seq%2]
    panel.imshow(wordcloud, interpolation='bilinear')
    panel.set_title('event:'+str(x.loc[seq, 'event']), fontsize=16, color='white')
    panel.set_axis_off()
    panel.margins(x=0, y=0)

# Hide any grid cells that received no word cloud.
for panel in ax.ravel()[n_panels:]:
    panel.set_axis_off()

# Lay the grid out once, after all panels are drawn (was re-run on every iteration).
plt.tight_layout(w_pad=0.025)


# BERT

## Download BERT checkpoint and dictionary

In [0]:
!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
!unzip uncased_L-12_H-768_A-12.zip

In [0]:
!wget https://storage.googleapis.com/bert_models/2019_05_30/wwm_uncased_L-24_H-1024_A-16.zip
!unzip wwm_uncased_L-24_H-1024_A-16.zip

In [0]:
!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip
!unzip uncased_L-24_H-1024_A-16.zip

--2019-11-19 19:42:55--  https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 108.177.97.128, 2404:6800:4008:c03::80
Connecting to storage.googleapis.com (storage.googleapis.com)|108.177.97.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1247797031 (1.2G) [application/zip]
Saving to: ‘uncased_L-24_H-1024_A-16.zip’


2019-11-19 19:43:09 (87.0 MB/s) - ‘uncased_L-24_H-1024_A-16.zip’ saved [1247797031/1247797031]

Archive:  uncased_L-24_H-1024_A-16.zip
   creating: uncased_L-24_H-1024_A-16/
  inflating: uncased_L-24_H-1024_A-16/bert_model.ckpt.meta  
  inflating: uncased_L-24_H-1024_A-16/bert_model.ckpt.data-00000-of-00001  

## Parameter setting

In [0]:
# Sequence length handed to the BERT checkpoint loader as seq_len in model_build.
MAXLEN = 142 #@param {type:'slider', min:50, max:300, step:1}
# Default batch size of data_generator; also used by model_build to size the LR schedule.
BATCH_SIZE = 16 #@param {type:'slider', min:8, max:32, step:8}
# Number of fine-tuning epochs (fit_generator and the LR schedule in model_build).
NUM_EPOCHS = 3
# Width of the softmax output layer.
NUM_CLASSES = 48
# Learning rate and minimum learning rate passed to AdamWarmup.
LR = 4e-5
MIN_LR = 0
# OUTPUT_TRAIN = 'train_bert_ipredcv15_oof.csv'
# OUTPUT_TEST = 'test_base_cv15.npy'
# Directory name of the unzipped checkpoint; vocab.txt, bert_config.json and
# bert_model.ckpt are read from it.
# NOTE(review): this rebinds `choice`, shadowing `from random import choice`
# from the imports cell.
choice = 'uncased_L-24_H-1024_A-16' #@param ['uncased_L-12_H-768_A-12', 'wwm_uncased_L-24_H-1024_A-16', 'uncased_L-24_H-1024_A-16']


## Tokenize train and validation set

In [0]:
# Build the vocabulary from the checkpoint's vocab.txt: every stripped line is a
# token and receives the current dictionary size as its id.
token_dict = {}
with codecs.open(os.path.join(choice, 'vocab.txt'), 'r', 'utf8') as vocab_file:
    for raw_line in vocab_file:
        token_dict[raw_line.strip()] = len(token_dict)

# keras_bert tokenizer backed by that vocabulary.
tokenizer = Tokenizer(token_dict)

# token_dict1 = {}
# with codecs.open(dict_path1, 'r', 'utf8') as reader:
#     for line in reader:
#         token = line.strip()
#         token_dict1[token] = len(token_dict1)
# tokenizer1 = Tokenizer(token_dict1)

def convert_data(data_df, branch='training'):
    """Tokenize the `text` column and min-max scale the `age` / `sex` columns.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Must contain `text`, `age` and `sex`; `event_idx` is also required
        when ``branch == 'training'``. The caller's frame is not modified.
    branch : str, default 'training'
        'training' additionally returns the targets; any other value returns
        features only.

    Returns
    -------
    tuple
        ``(indices, targets, aux)`` when ``branch == 'training'``, otherwise
        ``(indices, aux)``. `indices` is a list holding the first element
        (`ids`) of ``tokenizer.encode(text)`` for every row, `targets` is the
        `event_idx` column as an array and `aux` is an array with one row per
        sample and two columns (scaled age, scaled sex).

    Notes
    -----
    Scaling uses the minimum and maximum of the frame that is passed in, so
    different frames (train / validation / test) are scaled with their own
    statistics.
    """
    data_df = data_df.reset_index(drop=True)

    # Iterate over the column directly instead of a label lookup per row;
    # only the token ids are kept, the segment ids are discarded.
    indices = [tokenizer.encode(text)[0] for text in tqdm(data_df['text'])]

    aux_cols = data_df[['age', 'sex']]
    low = aux_cols.min()
    # A constant column has zero range; dividing by 1 instead maps it to 0.0
    # rather than filling the feature with NaN.
    span = (aux_cols.max() - low).replace(0, 1)
    aux = (aux_cols - low) / span

    if branch == 'training':
        targets = data_df['event_idx']
        return indices, np.array(targets), np.array(aux)
    return indices, np.array(aux)

## Data Generator

In [0]:
def seq_padding(X, padding=0):
    """Right-pad every sequence in `X` with `padding` up to the longest length.

    Returns a NumPy array built from the padded sequences; sequences that are
    already of maximal length are used as they are.
    """
    longest = max([len(seq) for seq in X])
    padded = []
    for seq in X:
        shortfall = longest - len(seq)
        if shortfall > 0:
            padded.append(np.concatenate([seq, [padding] * shortfall]))
        else:
            padded.append(seq)
    return np.array(padded)

class data_generator:
    """Endless batch iterator over pre-tokenized samples.

    `data` is a list of tuples. For branch 'test' each tuple is
    ``(token_ids, aux)`` and batches are yielded as ``[X1, X2, aux]``; for any
    other branch each tuple is ``(token_ids, label, aux)`` and batches are
    yielded as ``([X1, X2, aux], Y)``. Only branch 'train' shuffles the data
    (in place) before every pass.
    """

    def __init__(self, data, batch_size=BATCH_SIZE, branch='train'):
        self.data = data
        self.batch_size = batch_size
        self.branch = branch
        # Batches per pass, counting a trailing partial batch.
        whole, leftover = divmod(len(self.data), self.batch_size)
        self.steps = whole + 1 if leftover != 0 else whole

    def __len__(self):
        return self.steps

    def __iter__(self):
        while True:
            if self.branch == 'train':
                np.random.shuffle(self.data)
            for step in range(self.steps):
                start = step * self.batch_size
                batch = self.data[start:start + self.batch_size]
                # Token ids padded to the longest sequence of this batch.
                X1 = seq_padding([sample[0] for sample in batch])
                # Segment ids: all zeros, same shape as the token ids.
                X2 = np.zeros_like(X1)
                if self.branch != 'test':
                    Y = np.array([sample[1] for sample in batch])
                    aux = np.array([sample[2] for sample in batch])
                    yield [X1, X2, aux], Y
                else:
                    aux = np.array([sample[1] for sample in batch])
                    yield [X1, X2, aux]

## Model Assemble

In [0]:
def model_build(len_train):
    """Assemble and compile the BERT-based classifier.

    The checkpoint named by the module-level `choice` is loaded with all
    weights trainable. The first position of its output is concatenated with
    the two auxiliary features and fed to a softmax layer of NUM_CLASSES units.

    Parameters
    ----------
    len_train : int
        Forwarded to calc_train_steps together with BATCH_SIZE and NUM_EPOCHS
        to derive the decay and warm-up steps of AdamWarmup.

    Returns
    -------
    The compiled Keras model taking ``[token_ids, segment_ids, aux]``.
    """
    # Module-level settings are only read here, so no `global` declarations are needed.
    bert_model = load_trained_model_from_checkpoint(
        os.path.join(choice, 'bert_config.json'),
        os.path.join(choice, 'bert_model.ckpt'),
        seq_len=MAXLEN,
        trainable=True,
    )

    x1_in = Input(shape=(None,))
    x2_in = Input(shape=(None,))
    aux_in = Input(shape=(2, ))

    sequence_output = bert_model([x1_in, x2_in])
    # Keep only the vector at position 0 of every sequence.
    first_position = Lambda(lambda seq: seq[:, 0])(sequence_output)

    features = concatenate([first_position, aux_in])
    outputs = Dense(NUM_CLASSES, activation='softmax')(features)
    model = Model([x1_in, x2_in, aux_in], outputs)

    decay_steps, warmup_steps = calc_train_steps(len_train, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS)
    optimizer = AdamWarmup(
        decay_steps=decay_steps,
        warmup_steps=warmup_steps,
        lr=LR,
        min_lr=MIN_LR,
    )
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=optimizer,
        metrics=['sparse_categorical_accuracy'],
    )

    del bert_model
    gc.collect()
    return model

## Batchwise evaluation callback

In [0]:
class IntervalPrediction(Callback):
    """Callback that blends test-set predictions taken at fixed epoch indices.

    Parameters
    ----------
    test_data : generator wrapper
        Object exposing ``__iter__()`` and ``len()`` (e.g. data_generator with
        branch 'test').
    pred : numpy.ndarray
        Accumulator updated in place with ``+=``, so the caller's array sees
        the additions.
    pred1 : object
        Stored on the instance; at epoch index 3 the attribute is rebound to
        the latest prediction divided by `nsplits`.
        NOTE(review): rebinding the attribute does not update the object the
        caller passed in — confirm whether an in-place copy was intended.
    nsplits : int
        Divisor applied to every prediction before it is accumulated.
    fold : int
        Fold label used in the uploaded file name.
    """

    def __init__(self, test_data, pred, pred1, nsplits, fold):
        # Was `super(Callback, self).__init__()`, which starts the lookup
        # *after* Callback and therefore never ran Callback.__init__.
        super(IntervalPrediction, self).__init__()
        self.test_data = test_data
        self.pred = pred
        self.nsplits = nsplits
        self.fold = fold
        self.pred1 = pred1

    def _predict_test(self):
        """Run the model over one full pass of the test generator."""
        return self.model.predict_generator(self.test_data.__iter__(), len(self.test_data), verbose=1)

    def _save_and_upload(self, epoch):
        """Save the model locally and upload it to S3 under a fold/epoch-specific key."""
        model_file = 'model-oof-'+str(self.fold)+'-'+str(epoch+1)+'.h5'
        self.model.save('model.h5')
        s3.upload_file('model.h5', 'acmilannesta', 'ipred/'+model_file)

    def on_epoch_end(self, epoch, logs=None):
        # NOTE(review): if `epoch` is zero-based, a run with NUM_EPOCHS = 3
        # never reaches the `epoch == 3` branch — confirm the intended indices.
        if epoch == 2:
            self.pred += self._predict_test() * 0.3 / self.nsplits
            self._save_and_upload(epoch)
        if epoch == 3:
            tmp = self._predict_test() / self.nsplits
            self.pred += tmp * 0.7
            self.pred1 = tmp
            self._save_and_upload(epoch)

In [0]:
test_indices, test_aux = convert_data(test, branch='test')
pred = np.zeros((len(test), NUM_CLASSES))
# pred = np.load(OUTPUT_TEST)
kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=0)
# Splits are stratified on the event-frequency bucket (wt_freq).
idx = list(kf.split(train, train.wt_freq))

# Only the split at position 5 is trained in this run; enumerate(..., 6) labels it fold 6.
for i, (tr_idx, val_idx) in enumerate(idx[5:6], 6):
    print('\nFold - {:}\n'.format(i))
    # kf.split yields positional indices, so rows are selected by position
    # (.iloc) rather than by index label (.loc).
    tr, val = train.iloc[tr_idx], train.iloc[val_idx]
    tr_x, tr_y, tr_aux = convert_data(tr)
    val_x, val_y, val_aux = convert_data(val)
    model = model_build(len_train=len(tr_x))
    train_D = data_generator(list(zip(tr_x, tr_y, tr_aux)))
    valid_D = data_generator(list(zip(val_x, val_y, val_aux)), branch='valid')
    test_D = data_generator(list(zip(test_indices, test_aux)), branch='test')
    # ipred = IntervalPrediction(test_data=test_D, pred=pred, nsplits=kf.get_n_splits(), fold=i, pred1=pred1)
    model.fit_generator(
        train_D.__iter__(),
        steps_per_epoch=len(train_D),
        epochs=NUM_EPOCHS,
        # callbacks = [ipred]
    )
    # Out-of-fold predictions and their weighted F1 on the held-out part.
    oof_pred = model.predict_generator(valid_D.__iter__(), len(valid_D), verbose=1)
    # train_aug.loc[val_idx, 'oof_pred'] = np.argmax(oof_pred, 1)
    print('oof - {:} f1_score - {:.4f}'.format(i, f1_score(val_y, np.argmax(oof_pred, 1), average='weighted')))

    # pred += model.predict_generator(test_D.__iter__(), len(test_D), verbose=1) / kf.get_n_splits()
    # np.save(OUTPUT_TEST, pred)
    # s3.upload_file(Filename=OUTPUT_TEST, Bucket='acmilannesta', Key='base/'+OUTPUT_TEST)

    # Persist the fold model locally, then upload it under a fold-specific key.
    model_file = 'model-oof-'+str(i)+'.h5'
    model.save('model.h5')
    s3.upload_file(Filename='model.h5', Bucket='acmilannesta', Key='large_uncased/'+model_file)

    del model
    gc.collect()



100%|██████████| 75864/75864 [00:12<00:00, 5874.77it/s]
  0%|          | 572/123163 [00:00<00:21, 5719.58it/s]


Fold - 6



100%|██████████| 123163/123163 [00:21<00:00, 5618.09it/s]
100%|██████████| 30793/30793 [00:05<00:00, 5841.40it/s]






Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.









Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 1/3
Epoch 2/3

In [0]:
test_D = data_generator(list(zip(test_indices, test_aux)), branch='test')
pred_md = np.zeros((len(test), NUM_CLASSES))
# Fold checkpoints to average.
fold_ids = range(2, 9)
for i in fold_ids:
    print('Fold - {:} - Prediction'.format(i))
    load_file = 'base/model-oof-' + str(i) + '.h5'
    s3.download_file(Bucket='acmilannesta', Key=load_file, Filename='model.h5')
    loaded = load_model('model.h5', custom_objects=get_custom_objects())
    # Average over the models actually loaded here. The previous divisor,
    # kf.get_n_splits(), counts every split of the repeated CV object and
    # also made this cell depend on `kf` from the training cell.
    pred_md += loaded.predict_generator(test_D.__iter__(), len(test_D), verbose=1) / len(fold_ids)
    del loaded
    # Remove the temporary checkpoint before the next download (replaces `!rm`).
    os.remove('model.h5')
    gc.collect()


In [0]:
# Rebuild the architecture and restore weights from the local checkpoint file.
model = model_build(len(test_D))
# The file name must be a string; `model.h5` was an attribute lookup on the model object.
# NOTE(review): the preceding ensemble cell deletes 'model.h5' after each fold,
# so the checkpoint has to be present again before this cell runs.
model.load_weights('model.h5')

In [0]:
# Take the arg-max class index per test row, map it back to the original event
# code (inverse of the event -> event_idx mapping) and write the submission file.
# NOTE(review): `pred` is the array created in the CV cell, whose `pred += ...`
# update is commented out, while the fold-averaged predictions are accumulated
# in `pred_md` — confirm which array is intended here.
test['event'] = np.argmax(pred, 1)
test.event = test.event.map({x: y for x, y in enumerate(np.sort(train.event.unique()))})
test.to_csv('solution.csv', index=False)
test.head()

Unnamed: 0,text,sex,age,event
0,54 Y O F PUNCTURE WOUND OF FIINGER RE ATTACHIN...,2,54,55
1,22 YOM CONTUSION TO LT LOWER LEG S P MVC HIT B...,1,22,24
2,20 YOM PT WORKS IN A QUARRY WAS ATTEMPTING TO...,1,20,71
3,38 YOF WAS WALKING AT WORK TWISTED HER LT ANKL...,2,38,73
4,44 YOM C O LOW BACK PAIN AFTER LIFTING A BOX A...,1,44,71


In [0]:
# fit on whole training set and make preds on testing set
train_x, train_y, train_aux = convert_data(train)
model = model_build(len(train_x))
train_D = data_generator(list(zip(train_x, train_y, train_aux)))
model.fit_generator(    
    train_D.__iter__(),
    steps_per_epoch=len(train_D),
    epochs=NUM_EPOCHS
    )

test_indices, test_aux = convert_data(test, branch='test')
test_D = data_generator(list(zip(test_indices, test_aux)), branch='test')
pred = model.predict_generator(test_D.__iter__(), len(test_D), verbose=1)
test['bert_uncased'] = np.argmax(pred, 1)
test.to_csv('drive/My Drive/CDC Model/oof/test_bert_uncased_whole.csv', index=False)

100%|██████████| 75864/75864 [00:17<00:00, 4390.70it/s]


In [0]:
model.save('drive/My Drive/CDC Model/M18.h5')

In [0]:
# test_D = data_generator(list(zip(test_indices, test_aux)), branch='test')
# pred = model.predict_generator(test_D.__iter__(), len(test_D), verbose=1)
test['event'] = np.argmax(pred, 1)
# Map the class index just assigned back to the original event code. The
# previous version read a `bert_clinic` column that is never created in this
# notebook (the full-training cell writes `bert_uncased`).
test['event'] = test.event.map({x:y for x, y in enumerate(np.sort(train.event.unique()))})
# Leave helper prediction columns out of the submission; errors='ignore' keeps
# the cell runnable whichever of them exists.
test.drop(columns=['bert_clinic', 'bert_uncased'], errors='ignore').to_csv('solution.csv', index=False)

## Pseudo labeling

In [0]:
# Replace `test` with a saved solution file (presumably an earlier submission —
# confirm) so its `event` column can serve as pseudo-labels.
test = pd.read_csv('/content/drive/My Drive/CDC Model/solution_m14.csv')
# Convert event codes to the contiguous class index derived from the training events.
test['event_idx'] = test.event.map({y:x for x, y in enumerate(np.sort(train.event.unique()))})
# test['event_idx'] = test.bert_uncased
test.head()

In [0]:
# Split the labelled data 80/20, then split the 80% part again; `val_pseudo`
# is the set that gets tokenized as validation data below.
tr, val = train_test_split(train, test_size=0.2, random_state=0)
tr_pseudo, val_pseudo = train_test_split(tr, test_size = 0.2, random_state = 1)
# Training frame = inner training split + pseudo-labelled test rows + the outer hold-out `val`.
tr_pseudo = pd.concat([tr_pseudo, test, val])
# pseudo = pd.concat([train[['text', 'age', 'sex', 'event_idx']], test[['text', 'age', 'sex', 'event_idx']]], 0)

tr_pseudo_x, tr_pseudo_y, tr_pseudo_aux = convert_data(tr_pseudo)
val_pseudo_x, val_pseudo_y, val_pseudo_aux = convert_data(val_pseudo)
# Any branch value other than 'training' makes convert_data return (indices, aux) only.
pseudo_test_x, pseudo_test_aux = convert_data(test, branch='testing')


100%|██████████| 75864/75864 [00:18<00:00, 4066.48it/s]


In [0]:
train_D = data_generator(list(zip(tr_pseudo_x, tr_pseudo_y, tr_pseudo_aux)))
test_D = data_generator(list(zip(pseudo_test_x, pseudo_test_aux)), branch='test')
valid_D = data_generator(list(zip(val_pseudo_x, val_pseudo_y, val_pseudo_aux)), branch='valid')
# IntervalEvaluation takes a `weight` argument that was missing from this call;
# None scores the validation F1 without sample weights.
# NOTE(review): the class is defined further down in this notebook, so that
# cell has to be executed before this one.
ival = IntervalEvaluation(validation_data=valid_D, label=val_pseudo_y, weight=None, interval = len(train_D))
# model_build expects the number of training examples and applies BATCH_SIZE
# itself through calc_train_steps. Passing len(train_D) (the number of batches)
# shortened the warm-up/decay schedule by roughly a factor of BATCH_SIZE.
# NOTE(review): model_build sizes the schedule for NUM_EPOCHS epochs while this
# run trains for epochs=2.
model = model_build(len(tr_pseudo_x))
model.fit_generator(
    train_D.__iter__(),
    steps_per_epoch=len(train_D),
    epochs=2,
    callbacks = [ival]
)
# Predict the test set, map class indices back to event codes and write the submission.
pred = model.predict_generator(test_D.__iter__(), len(test_D), verbose=1)
test['event'] = np.argmax(pred, 1)
test['event'] = test.event.map({x:y for x, y in enumerate(np.sort(train.event.unique()))})
test[['text', 'sex', 'age', 'event']].to_csv('solution.csv', index=False)

# xlnet

In [0]:
!wget https://storage.googleapis.com/xlnet/released_models/cased_L-12_H-768_A-12.zip
!unzip cased_L-12_H-768_A-12.zip
# !wget https://storage.googleapis.com/xlnet/released_models/cased_L-24_H-1024_A-16.zip
# !unzip cased_L-24_H-1024_A-16.zip

--2019-11-02 22:39:22--  https://storage.googleapis.com/xlnet/released_models/cased_L-12_H-768_A-12.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 108.177.97.128, 2404:6800:4008:c07::80
Connecting to storage.googleapis.com (storage.googleapis.com)|108.177.97.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 433638019 (414M) [application/zip]
Saving to: ‘cased_L-12_H-768_A-12.zip’


2019-11-02 22:39:34 (40.3 MB/s) - ‘cased_L-12_H-768_A-12.zip’ saved [433638019/433638019]

Archive:  cased_L-12_H-768_A-12.zip
   creating: xlnet_cased_L-12_H-768_A-12/
  inflating: xlnet_cased_L-12_H-768_A-12/xlnet_model.ckpt.index  
  inflating: xlnet_cased_L-12_H-768_A-12/xlnet_model.ckpt.data-00000-of-00001  
  inflating: xlnet_cased_L-12_H-768_A-12/spiece.model  
  inflating: xlnet_cased_L-12_H-768_A-12/xlnet_model.ckpt.meta  
  inflating: xlnet_cased_L-12_H-768_A-12/xlnet_config.json  


In [0]:
!pip install keras_xlnet
import os
from keras_xlnet import Tokenizer, load_trained_model_from_checkpoint, ATTENTION_TYPE_BI, ATTENTION_TYPE_UNI


## Parameter Setting

In [0]:
# Directory of the unzipped XLNet archive; spiece.model, xlnet_config.json and
# xlnet_model.ckpt are read from it.
checkpoint_path = 'xlnet_cased_L-12_H-768_A-12' 
# Passed to the XLNet loader as memory_len and emitted per sample by the generators.
MEMLEN=512
# NOTE(review): rebinds the BATCH_SIZE already set in the BERT parameter cell (same value, 16).
BATCH_SIZE=16

## Tokenize train and validation set

In [0]:
!pip install transformers
from transformers import XLNetTokenizer

In [0]:
t = Tokenizer(os.path.join(checkpoint_path, 'spiece.model'))
def convert_data(data_df):
    """Tokenize `text` and min-max scale `age` / `sex` for the XLNet pipeline.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Must contain `text`, `age`, `sex` and `event_idx`. The caller's frame
        is not modified.

    Returns
    -------
    tuple
        ``(indices, targets, aux)``: a list with the result of
        ``tokenizer.encode(text)`` per row, the `event_idx` column as an
        array, and an array with one row per sample and two columns.

    Notes
    -----
    This definition replaces the earlier BERT `convert_data` in the notebook
    namespace and has no `branch` parameter.
    """
    # Re-index a copy instead of resetting the caller's index in place, in
    # line with the BERT version of this helper.
    data_df = data_df.reset_index(drop=True)

    # NOTE(review): this reads the module-level `tokenizer`, whereas the line
    # just above this function creates the XLNet tokenizer as `t` — confirm
    # which object is meant to be used here.
    indices = [tokenizer.encode(text) for text in tqdm(data_df['text'])]

    targets = data_df['event_idx']

    aux_cols = data_df[['age', 'sex']]
    low = aux_cols.min()
    # A constant column has zero range; dividing by 1 instead maps it to 0.0
    # rather than producing NaN.
    span = (aux_cols.max() - low).replace(0, 1)
    aux = (aux_cols - low) / span

    return indices, np.array(targets), np.array(aux)

tr, val = train_test_split(train, test_size=0.2, random_state=0)
tr_x, tr_y, tr_aux = convert_data(tr)
val_x, val_y, val_aux = convert_data(val)

## Data generator

In [0]:
tokenizer.decode(5)

'<pad>'

In [0]:
t.SYM_PAD

5

In [0]:
tokenizer.encode(train.loc[0, 'text'], add_special_tokens=True)

In [0]:
def seq_padding(X, padding=0):
  """Right-pad each sequence in `X` with `padding` to the longest length and stack them."""
  target_len = max([len(seq) for seq in X])

  def _pad(seq):
    # Sequences already at full length are passed through untouched.
    missing = target_len - len(seq)
    if missing > 0:
      return np.concatenate([seq, [padding] * missing])
    return seq

  return np.array([_pad(seq) for seq in X])

def seq_seg(X):
  """Build the segment input for a batch of token-id sequences.

  Each row holds `tokenizer.SYM_UNK` for all positions but the last, which is
  `tokenizer.SYM_EOS`; rows shorter than the longest sequence in `X` are
  right-padded with `tokenizer.SYM_SEP`.
  """
  longest = max([len(tokens) for tokens in X])
  rows = []
  for tokens in X:
    row = [tokenizer.SYM_UNK] * (len(tokens) - 1) + [tokenizer.SYM_EOS]
    gap = longest - len(row)
    if gap > 0:
      row = np.concatenate([row, [tokenizer.SYM_SEP] * gap])
    rows.append(row)
  return np.array(rows)

def seq_mask(X):
  """Build a mask-style input for a batch of token-id sequences.

  Each row holds `tokenizer.SYM_UNK` for every real position and is
  right-padded with `tokenizer.SYM_BOS` up to the longest sequence in `X`.
  """
  longest = max([len(tokens) for tokens in X])
  rows = []
  for tokens in X:
    row = [tokenizer.SYM_UNK] * len(tokens)
    gap = longest - len(row)
    if gap > 0:
      row = np.concatenate([row, [tokenizer.SYM_BOS] * gap])
    rows.append(row)
  return np.array(rows)


class data_generator:
  """Endless batch iterator for the XLNet model.

  `data` is a list of ``(token_ids, label, aux)`` tuples. Every batch is
  yielded as ``([X1, X2, X3, aux], Y)`` where X1 are padded token ids, X2 the
  segment rows from seq_seg and X3 the memory length repeated per sample.
  Only branch 'train' shuffles the data (in place) before each pass.

  NOTE(review): this class replaces the BERT `data_generator` defined earlier
  in the notebook. X1 is padded with seq_padding's default value 0 — confirm
  that 0 is the intended padding id for this tokenizer.
  """

  def __init__(self, data, batch_size=BATCH_SIZE, memlen=MEMLEN, branch='train'):
    self.data = data
    self.batch_size = batch_size
    self.memlen = memlen
    self.branch = branch
    # Batches per pass, counting a trailing partial batch.
    whole, leftover = divmod(len(self.data), self.batch_size)
    self.steps = whole + 1 if leftover != 0 else whole

  def __len__(self):
    return self.steps

  def __iter__(self):
    while True:
      if self.branch == 'train':
        np.random.shuffle(self.data)
      for step in range(self.steps):
        start = step * self.batch_size
        batch = self.data[start:start + self.batch_size]
        token_rows = [sample[0] for sample in batch]
        X1 = seq_padding(token_rows)
        # segments
        X2 = seq_seg(token_rows)
        # memories
        X3 = np.array([self.memlen] * len(batch))
        Y = np.array([sample[1] for sample in batch])
        aux = np.array([sample[2] for sample in batch])
        yield [X1, X2, X3, aux], Y

## Model Assemble

In [0]:
# Load the pre-trained XLNet checkpoint from `checkpoint_path`.
# NOTE(review): target_len=142 repeats the literal value of MAXLEN from the
# BERT parameter cell; the trailing "#128" looks like a previously used value.
xlnet_model = load_trained_model_from_checkpoint(
    config_path=os.path.join(checkpoint_path, 'xlnet_config.json'),
    checkpoint_path=os.path.join(checkpoint_path, 'xlnet_model.ckpt'),
    batch_size=BATCH_SIZE, #16
    memory_len=MEMLEN, #512
    target_len=142, #128
    in_train_phase=False,
    attention_type=ATTENTION_TYPE_BI,
)














In [0]:
# Inputs, in the order the XLNet data generators yield them:
# token ids, segment ids, memory length, and the two auxiliary features.
x1_in = Input(shape=(None,))
x2_in = Input(shape=(None,))
x3_in = Input(shape=(1,))
# x4_in = Input(shape=(None,))
aux_in = Input(shape=(2, ))

x = xlnet_model([x1_in, x2_in, x3_in])
# Keep only the vector at position 0 of every sequence.
# NOTE(review): confirm that position 0 is the intended summary position for XLNet.
x = Lambda(lambda x: x[:, 0])(x)
x = concatenate([x, aux_in])
# Size the softmax from NUM_CLASSES (was the literal 48) so it stays in step
# with the class count used by the BERT model.
p = Dense(NUM_CLASSES, activation='softmax')(x)

model = Model([x1_in, x2_in, x3_in, aux_in], p)

# Schedule for 2 epochs, matching the epochs=2 used when this model is fitted.
decay_steps, warmup_steps = calc_train_steps(
    len(tr_x),
    batch_size=BATCH_SIZE,
    epochs=2
)

model.compile(
    loss='sparse_categorical_crossentropy',
    # optimizer=Adam(1e-4),
    optimizer=AdamWarmup(decay_steps=decay_steps, warmup_steps=warmup_steps, learning_rate=1e-4, min_lr=1e-6),
    metrics= ['sparse_categorical_accuracy']
)





## Batchwise evaluation callback

In [0]:
class IntervalEvaluation(Callback):
    """Callback that prints the weighted F1 on a validation generator every `interval` batches.

    Parameters
    ----------
    validation_data : generator wrapper
        Object exposing ``__iter__()`` and ``len()`` (e.g. data_generator with
        branch 'valid').
    label : array-like
        True class indices, in the order the generator yields the samples.
    weight : array-like or None, default None
        Passed to f1_score as `sample_weight`; None scores without sample
        weights. The default makes calls that omit `weight` valid.
    interval : int, default 3000
        Evaluate whenever the running batch counter is a multiple of this value.
    """

    def __init__(self, validation_data, label, weight=None, interval=3000):
        # Run the base-class initialiser (it was commented out before), then
        # set our own attributes so they are not overwritten by it.
        super(IntervalEvaluation, self).__init__()
        self.seen = 0
        self.interval = interval
        self.validation_data = validation_data
        self.label = label
        self.weight = weight

    def on_batch_end(self, batch, logs=None):
        # Avoid a shared mutable default argument.
        logs = logs or {}
        self.seen += logs.get('num_steps', 1)
        if self.seen % self.interval == 0:
            y_pred = self.model.predict_generator(self.validation_data.__iter__(), len(self.validation_data))
            score = f1_score(self.label, np.argmax(y_pred, 1), average='weighted', sample_weight=self.weight)
            print(" - interval evaluation - batch: {:d} - score: {:.4f}".format(self.seen, score))

In [0]:
train_D = data_generator(list(zip(tr_x, tr_y, tr_aux)))
valid_D = data_generator(list(zip(val_x, val_y, val_aux)), branch='valid')
# interval=len(train_D) triggers the evaluation once per pass over the training batches.
# NOTE(review): `val_wt` is not defined anywhere in the visible notebook — it
# has to be created (presumably per-row validation weights) before this cell can run.
ival = IntervalEvaluation(validation_data=valid_D, label=val_y, weight=val_wt, interval = len(train_D))
model.fit_generator(
    train_D.__iter__(),
    steps_per_epoch=len(train_D),
    epochs=2,
    # validation_data=valid_D.__iter__(),
    # validation_steps=len(valid_D),
    callbacks = [ival]
)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Epoch 1/2
Epoch 2/2
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Epoch 1/2
Epoch 2/2
 - interval evaluation - batch: 15396 - score: 0.9122


<keras.callbacks.History at 0x7fabe5c79c88>

<keras.callbacks.History at 0x7fabe5c79c88>

In [0]:
y_pred = model.predict_generator(valid_D.__iter__(), len(valid_D))
f1_score(val_y, np.argmax(y_pred, 1), average='weighted', sample_weight=val_wt)


0.8694244178850293

In [0]:
# Drop the model reference and trigger garbage collection.
# NOTE(review): the cells that follow (`model.save(...)` and the test-set
# prediction) still reference `model`; executed in notebook order they raise
# NameError after this cell.
del model
gc.collect()

In [0]:
model.save('drive/My Drive/CDC Model/xlnet_base.h5')

## Prediction on test set

In [0]:
# Tokenize every test text and append the SYM_SEP and SYM_CLS symbols to each sequence.
# NOTE(review): the training-side convert_data of this section does not append
# these two symbols — confirm that the difference is intended.
test_indices = []
for i in tqdm(range(len(test))):
    ids = tokenizer.encode(test.loc[i, 'text'])
    ids.extend([tokenizer.SYM_SEP, tokenizer.SYM_CLS])
    test_indices.append(ids)
# Min-max scale age and sex using the test frame's own minimum and maximum.
test_aux = np.array(test[['age', 'sex']].apply(lambda x: (x - min(x)) / (max(x)-min(x))))

class test_generator:
  """Endless, unshuffled batch iterator over unlabeled XLNet inputs.

  `data` is a list of ``(token_ids, aux)`` tuples; every batch is yielded as
  ``[X1, X2, X3, aux]`` (padded token ids, segment rows, repeated memory
  length, auxiliary features).
  """

  def __init__(self, data, batch_size=BATCH_SIZE, memlen=MEMLEN):
    self.data = data
    self.batch_size = batch_size
    self.memlen = memlen
    # Batches per pass, counting a trailing partial batch.
    whole, leftover = divmod(len(self.data), self.batch_size)
    self.steps = whole + 1 if leftover != 0 else whole

  def __len__(self):
    return self.steps

  def __iter__(self):
    while True:
      for step in range(self.steps):
        start = step * self.batch_size
        batch = self.data[start:start + self.batch_size]
        token_rows = [sample[0] for sample in batch]
        X1 = seq_padding(token_rows)
        # segments
        X2 = seq_seg(token_rows)
        # memories
        X3 = np.array([self.memlen] * len(batch))
        aux = np.array([sample[1] for sample in batch])
        yield [X1, X2, X3, aux]

100%|██████████| 75864/75864 [00:08<00:00, 8644.29it/s]


In [0]:
# Predict the test set, take the arg-max class index per row and map it back
# to the original event code (inverse of the event -> event_idx mapping).
test_D = test_generator(list(zip(test_indices, test_aux)))
pred = model.predict_generator(test_D.__iter__(), len(test_D), verbose=1)
test['event'] = np.argmax(pred, 1)
test['event'] = test.event.map({x:y for x, y in enumerate(np.sort(train.event.unique()))})



In [0]:
test.to_csv('solution.csv', index=False)