## Link with github project folder

In [0]:
!nvidia-smi

In [0]:
!git clone https://github.com/acmilannesta/Bert-embedding

In [0]:
!pip install keras_bert
from keras_bert import AdamWarmup, calc_train_steps

In [2]:
from keras.preprocessing.text import Tokenizer
from keras.optimizers import Adam
from keras.layers import *
from keras.models import Model, load_model
from keras.callbacks import Callback, EarlyStopping
import pandas as pd
import numpy as np
import gc
import codecs
from tqdm import tqdm
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import f1_score

Using TensorFlow backend.


In [55]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Load Dataset

In [0]:
train = pd.read_csv('Bert-embedding/CDC/train.csv')
test = pd.read_csv('Bert-embedding/CDC/test.csv')
# Event weight
wt = pd.DataFrame(train.event.value_counts()/len(train)).rename(columns={'event':'weight'})
wt['event'] = wt.index
train = train.merge(wt, how='left', on='event')
# Reassign eventcode
train['event_idx'] = train.event.map({y:x for x, y in enumerate(np.sort(train.event.unique()))})
# Assign weight freqency
train['wt_freq'] = np.where(train.weight<0.01, 1, np.where(train.weight<0.05, 2, 3))

In [0]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tqdm(train.text.tolist()+test.text.tolist()))

def convert_data(data_df, branch='training'):
    # data_df.sample(frac=1, random_state=0)
    data_df.reset_index(drop=True, inplace=True)
    global tokenizer
    indices = tokenizer.texts_to_sequences(tqdm(data_df.text.tolist()))
    aux = data_df[['age', 'sex']].apply(lambda x: (x - min(x)) / (max(x)-min(x)))
    if branch=='training':
        targets = data_df['event_idx'] 
        return indices, np.array(targets), np.array(aux)
    else:
        return indices, np.array(aux)

100%|██████████| 229820/229820 [00:05<00:00, 44644.72it/s]


## Parameter setting

In [0]:
BATCH_SIZE = 512
NUM_EPOCHS = 100
NUM_CLASSES = 48

In [0]:
import spacy

In [0]:
!python -m spacy download en_core_web_lg

Collecting en_core_web_lg==2.1.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.1.0/en_core_web_lg-2.1.0.tar.gz (826.9MB)
[K     |████████████████████████████████| 826.9MB 11.7MB/s 
[?25hBuilding wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.1.0-cp36-none-any.whl size=828255076 sha256=50793a38744453673e4b5cb7b117dbbdc6331dfb7f4e18a42f19fb4e4fab8576
  Stored in directory: /tmp/pip-ephem-wheel-cache-w4xb9r_t/wheels/b4/d7/70/426d313a459f82ed5e06cc36a50e2bb2f0ec5cb31d8e0bdf09
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.1.0
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [0]:
import en_core_web_lg


In [6]:
text_list = pd.concat([train.text, test.text])

nlp = en_core_web_lg.load(disable=['parser','ner','tagger'])
nlp.vocab.add_flag(lambda s: s.lower() in spacy.lang.en.stop_words.STOP_WORDS, spacy.attrs.IS_STOP)
word_dict = {}
word_index = 1
lemma_dict = {}
docs = nlp.pipe(text_list, n_threads = 2)
word_sequences = []
for doc in tqdm(docs):
    word_seq = []
    for token in doc:
        if (token.text not in word_dict) and (token.pos_ is not "PUNCT"):
            word_dict[token.text] = word_index
            word_index += 1
            lemma_dict[token.text] = token.lemma_
        if token.pos_ is not "PUNCT":
            word_seq.append(word_dict[token.text])
    word_sequences.append(word_seq)
del docs
gc.collect()



229820it [01:53, 2020.10it/s]


0

In [0]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
from nltk.stem.lancaster import LancasterStemmer
lc = LancasterStemmer()
from nltk.stem import SnowballStemmer
sb = SnowballStemmer("english")

In [0]:
def load_embedding(EMBEDDING_FILE, word_dict, lemma_dict):
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE))
    embed_size = 300
    nb_words = len(word_dict)+1
    embedding_matrix = np.zeros((nb_words, embed_size), dtype=np.float32)
    unknown_vector = np.zeros((embed_size,), dtype=np.float32) - 1.
    print(unknown_vector[:5])
    for key in tqdm(word_dict):
        word = key
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[word_dict[key]] = embedding_vector
            continue
        word = key.lower()
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[word_dict[key]] = embedding_vector
            continue
        word = key.upper()
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[word_dict[key]] = embedding_vector
            continue
        word = key.capitalize()
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[word_dict[key]] = embedding_vector
            continue
        word = ps.stem(key)
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[word_dict[key]] = embedding_vector
            continue
        word = lc.stem(key)
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[word_dict[key]] = embedding_vector
            continue
        word = sb.stem(key)
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[word_dict[key]] = embedding_vector
            continue
        word = lemma_dict[key]
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[word_dict[key]] = embedding_vector
            continue
        if len(key) > 1:
            word = correction(key)
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[word_dict[key]] = embedding_vector
                continue
        embedding_matrix[word_dict[key]] = unknown_vector                    
    return embedding_matrix, nb_words 

In [9]:
embedding_matrix_glove, nb_words = load_embedding('/content/drive/My Drive/CDC Model/embedding/glove.840B.300d.txt', word_dict, lemma_dict)

  0%|          | 0/44724 [00:00<?, ?it/s]

[-1. -1. -1. -1. -1.]





NameError: ignored

In [0]:
np.save('drive/My Drive/CDC Model/embedding/embedding_matrix_wide', embedding_matrix)

In [0]:
# np.save('drive/My Drive/embedding matrix', embedding_matrix)
embedding_matrix = np.load('drive/My Drive/CDC Model/embedding matrix.npy')

## Model Assemble

In [0]:
def model_build():
    global embedding_matrix
    global NUM_CLASSES
    
    words = Input(shape=(None, ))
    aux = Input(shape=(2, ))

    x = Embedding(*embedding_matrix.shape, weights=[embedding_matrix], trainable=False)(words)
    x = SpatialDropout1D(0.3)(x)
    x = Bidirectional(LSTM(128, return_sequences=True))(x)
    x = Bidirectional(LSTM(128, return_sequences=True))(x)

    hidden = concatenate([GlobalMaxPooling1D()(x), GlobalAveragePooling1D()(x)])
    hidden = add([hidden, Dense(512, activation='relu')(hidden)])
    hidden = add([hidden, Dense(512, activation='relu')(hidden)])
    hidden = concatenate([hidden, aux])
    result = Dense(NUM_CLASSES, activation='softmax')(hidden)

    # decay_steps, warmup_steps = calc_train_steps(
    # len(tr),
    # batch_size=BATCH_SIZE,
    # epochs=EPOCHS
    # )

    model = Model(inputs=[words, aux], outputs=result)
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=Adam(lr=1e-4),
        metrics= ['sparse_categorical_accuracy']
    )
    return model

<tf.Tensor 'add_2/add:0' shape=(?, 512) dtype=float32>

In [0]:
del model
gc.collect()

975

## Data generator

In [0]:
def seq_padding(X, padding=0):
    L = [len(x) for x in X]
    ML = max(L)
    return np.array([np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x for x in X])

class data_generator:
    def __init__(self, data, batch_size=BATCH_SIZE, branch='train'):
        self.data = data
        self.batch_size = batch_size
        self.branch = branch
        self.steps = len(self.data) // self.batch_size
        if len(self.data) % self.batch_size != 0:
            self.steps += 1

    def __len__(self):
        return self.steps

    def __iter__(self):
        while True:
            if self.branch == 'train':
                np.random.shuffle(self.data)
            for i in range(self.steps):
                d = self.data[i * self.batch_size: (i + 1) * self.batch_size]
                X1 = seq_padding([x[0] for x in d])
                if self.branch != 'test':
                    Y = np.array([x[1] for x in d])
                    aux = np.array([x[2] for x in d])
                    yield [X1, aux], Y
                else:
                    aux = np.array([x[1] for x in d])
                    yield [X1, aux]

## Interval evaluation and model training

In [0]:
class IntervalEvaluation(Callback):
    def __init__(self, validation_data, label, score=0, maxscore=0, count=0, interval=3000, patience=3, savethreshold=0.84):
        super(Callback, self).__init__()
        self.seen = 0
        self.interval = interval
        self.validation_data = validation_data
        self.label = label
        self.score = score
        self.maxscore = maxscore
        self.count = count
        self.patience = patience
        self.savethreshold = savethreshold
    def on_batch_end(self, batch, logs={}):
        self.seen += logs.get('num_steps', 1)
        if self.seen % self.interval == 0:
            y_pred = self.model.predict_generator(self.validation_data.__iter__(), len(self.validation_data))
            score = f1_score(self.label, np.argmax(y_pred, 1), average='weighted')
            print(" - batch: {:d} - score: {:.4f}".format(self.seen, score))
            if self.maxscore>=0.85:
                self.patience=2
            if (score < self.score):
                self.score = score
                self.count+=1
                if self.count==self.patience:
                    self.model.stop_training=True
            elif score > self.maxscore:
                self.score = score
                self.maxscore = score
                self.count = 0
                if self.maxscore > self.savethreshold:
                    self.model.save('lstm_model.h5')
            else:
                self.score = score
                self.count = 0

In [0]:
np.save('drive/My Drive/CDC Model/oof/test_lstm_oof.npy', pred)

In [0]:
# test_x, test_aux = convert_data(test, branch='test')
kf = StratifiedKFold(n_splits=5, random_state=0)
pred = np.zeros((len(test), NUM_CLASSES))
idx = [x for x in kf.split(train, train.wt_freq)]
for i, (tr_idx, val_idx) in enumerate(idx[3:]):
    print('Fold - {:}'.format(i+1))
    tr, val = train.loc[tr_idx], train.loc[val_idx]
    tr_x, tr_y, tr_aux = convert_data(tr)
    val_x, val_y, val_aux = convert_data(val)
    model = model_build()
    train_D = data_generator(list(zip(tr_x, tr_y, tr_aux)))
    valid_D = data_generator(list(zip(val_x, val_y, val_aux)), branch='valid')

    ival = IntervalEvaluation(
        validation_data=valid_D, 
        label=val_y, 
        interval = len(train_D)
        )
    model.fit_generator(
        train_D.__iter__(),
        steps_per_epoch=len(train_D),
        epochs=NUM_EPOCHS,
        callbacks = [ival]
    )
    model = load_model('lstm_model.h5')
    # oof_pred = model.predict_generator(valid_D.__iter__(), len(valid_D), verbose=1)
    # train.loc[val_idx, 'lstm'] = np.argmax(oof_pred, 1)
    # print('oof - {:} f1_score - {:.4f}'.format(i+1, f1_score(val_y, np.argmax(oof_pred, 1), average='weighted')))
    test_D = data_generator(list(zip(test_x, test_aux)), branch='test')
    pred += model.predict_generator(test_D.__iter__(), len(test_D), verbose=1) / kf.get_n_splits()
    del model
    gc.collect()
    # train.to_csv('drive/My Drive/CDC Model/oof/train_lstm_oof.csv', index=False)
test['lstm'] = np.argmax(pred, 1)
test.to_csv('drive/My Drive/CDC Model/oof/test_lstm_oof.csv', index=False)

  4%|▍         | 4625/123166 [00:00<00:02, 46248.61it/s]

Fold - 1


100%|██████████| 123166/123166 [00:02<00:00, 49902.97it/s]
100%|██████████| 30790/30790 [00:00<00:00, 52230.69it/s]


Epoch 1/100


  'precision', 'predicted', average, warn_for)


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100

## Predict on testing set

In [0]:
class test_generator:
    def __init__(self, data, batch_size=BATCH_SIZE, branch='train'):
        self.data = data
        self.batch_size = batch_size
        self.branch = branch
        self.steps = len(self.data) // self.batch_size
        if len(self.data) % self.batch_size != 0:
            self.steps += 1

    def __len__(self):
        return self.steps

    def __iter__(self):
        while True:
            for i in range(self.steps):
                d = self.data[i * self.batch_size: (i + 1) * self.batch_size]
                X1 = seq_padding([x[0] for x in d])
                aux = np.array([x[1] for x in d])
                yield [X1, aux]

In [0]:
# predmodel = load_model('lstm_model_0.8947.h5', custom_objects={'AdamWarmup': AdamWarmup})
test_D = test_generator(list(zip(test_text, test[['age', 'sex']].values)))
pred_lstm = predmodel.predict_generator(test_D.__iter__(), len(test_D), verbose=1)
test['event'] = np.argmax(pred_lstm, 1)
test['event'] = test.event.map({x:y for x, y in enumerate(np.sort(train.event.unique()))})



In [0]:
test.to_csv('solution.csv', index=False)