## Question classification using LSTM networks and word embedding

### 1. Importing Modules

In [1]:
import gc
import sys
import pandas as pd
import numpy as np
import tensorflow as tf
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Setting random seed
seed = 512
tf.random.set_seed(seed)
np.random.seed(seed)

In [3]:
# 3 vector files
GLOVE_FILE = 'embeddings/glove.840B.300d/glove.840B.300d.txt'
PARAGRAM_FILE = 'embeddings/paragram_300_sl999/paragram_300_sl999.txt'
WIKI_NEWS_FILE = 'embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'

In [4]:
def get_coefficients(word, *arr): 
    return word, np.asarray(arr, dtype='float32')

def get_no_lines(file_name): 
    return sum(1 for _ in open(file_name, encoding="utf8", errors='ignore'))

def load_vector(file_name): 
    return dict(get_coefficients(*o.split(" ")) for o in tqdm(open(file_name, encoding="utf8", errors='ignore'), total=get_no_lines(file_name)) if len(o) > 100)

### 2. Loading Data

In [5]:
# loading the 2 csv files
reddit_df = pd.read_csv('reddit.csv')
reddit_df.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [6]:
negative, positive = np.bincount(reddit_df['target'])
total = negative + positive

### 3. Text preprocessing and Word vectorization
Preprocessing words and converting words into vectors so that can be given as input to neural network.

In [7]:
import re
tqdm().pandas()

puncts = [",",".",'"',":",")","(","-","!","?","|",";","'","$","&","/","[","]",">","%","=","#","*","+","\\","•","~","@","£","·","_","{","}","©","^","®","`","<","→","°","€","™","›","♥","←","×","§","″","′","█","…","“","★","”","–","●","►","−","¢","¬","░","¡","¶","↑","±","¿","▾","═","¦","║","―","¥","▓","—","‹","─","▒","：","⊕","▼","▪","†","■","’","▀","¨","▄","♫","☆","¯","♦","¤","▲","¸","⋅","‘","∞","∙","）","↓","、","│","（","»","，","♪","╩","╚","・","╦","╣","╔","╗","▬","❤","≤","‡","√","◄","━","⇒","▶","≥","╝","♡","◊","。","✈","≡","☺","✔","↵","≈","✓","♣","☎","℃","◦","└","‟","～","！","○","◆","№","♠","▌","✿","▸","⁄","□","❖","✦","．","÷","｜","┃","／","￥","╠","↩","✭","▐","☼","☻","┐","├","«","∼","┌","℉","☮","฿","≦","♬","✧","〉","－","⌂","✖","･","◕","※","‖","◀","‰","\x97","↺","∆","┘","┬","╬","،","⌘","⊂","＞","〈","⎙","？","☠","⇐","▫","∗","∈","≠","♀","♔","˚","℗","┗","＊","┼","❀","＆","∩","♂","‿","∑","‣","➜","┛","⇓","☯","⊖","☀","┳","；","∇","⇑","✰","◇","♯","☞","´","↔","┏","｡","◘","∂","✌","♭","┣","┴","┓","✨","\xa0","˜","❥","┫","℠","✒","［","∫","\x93","≧","］","\x94","∀","♛","\x96","∨","◎","↻","⇩","＜","≫","✩","✪","♕","؟","₤","☛","╮","␊","＋","┈","％","╋","▽","⇨","┻","⊗","￡","।","▂","✯","▇","＿","➤","✞","＝","▷","△","◙","▅","✝","∧","␉","☭","┊","╯","☾","➔","∴","\x92","▃","↳","＾","׳","➢","╭","➡","＠","⊙","☢","˝","∏","„","∥","❝","☐","▆","╱","⋙","๏","☁","⇔","▔","\x91","➚","◡","╰","\x85","♢","˙","۞","✘","✮","☑","⋆","ⓘ","❒","☣","✉","⌊","➠","∣","❑","◢","ⓒ","\x80","〒","∕","▮","⦿","✫","✚","⋯","♩","☂","❞","‗","܂","☜","‾","✜","╲","∘","⟩","＼","⟨","·","✗","♚","∅","ⓔ","◣","͡","‛","❦","◠","✄","❄","∃","␣","≪","｢","≅","◯","☽","∎","｣","❧","̅","ⓐ","↘","⚓","▣","˘","∪","⇢","✍","⊥","＃","⎯","↠","۩","☰","◥","⊆","✽","⚡","↪","❁","☹","◼","☃","◤","❏","ⓢ","⊱","➝","̣","✡","∠","｀","▴","┤","∝","♏","ⓐ","✎",";","␤","＇","❣","✂","✤","ⓞ","☪","✴","⌒","˛","♒","＄","✶","▻","ⓔ","◌","◈","❚","❂","￦","◉","╜","̃","✱","╖","❉","ⓡ","↗","ⓣ","♻","➽","׀","✲","✬","☉","▉","≒","☥","⌐","♨","✕","ⓝ","⊰","❘","＂","⇧","̵","➪","▁","▏","⊃","ⓛ","‚","♰","́","✏","⏑","̶","ⓢ","⩾","￠","❍","≃","⋰","♋","､","̂","❋","✳","ⓤ","╤","▕","⌣","✸","℮","⁺","▨","╨","ⓥ","♈","❃","☝","✻","⊇","≻","♘","♞","◂","✟","⌠","✠","☚","✥","❊","ⓒ","⌈","❅","ⓡ","♧","ⓞ","▭","❱","ⓣ","∟","☕","♺","∵","⍝","ⓑ","✵","✣","٭","♆","ⓘ","∶","⚜","◞","்","✹","➥","↕","̳","∷","✋","➧","∋","̿","ͧ","┅","⥤","⬆","⋱","☄","↖","⋮","۔","♌","ⓛ","╕","♓","❯","♍","▋","✺","⭐","✾","♊","➣","▿","ⓑ","♉","⏠","◾","▹","⩽","↦","╥","⍵","⌋","։","➨","∮","⇥","ⓗ","ⓓ","⁻","⎝","⌥","⌉","◔","◑","✼","♎","♐","╪","⊚","☒","⇤","ⓜ","⎠","◐","⚠","╞","◗","⎕","ⓨ","☟","ⓟ","♟","❈","↬","ⓓ","◻","♮","❙","♤","∉","؛","⁂","ⓝ","־","♑","╫","╓","╳","⬅","☔","☸","┄","╧","׃","⎢","❆","⋄","⚫","̏","☏","➞","͂","␙","ⓤ","◟","̊","⚐","✙","↙","̾","℘","✷","⍺","❌","⊢","▵","✅","ⓖ","☨","▰","╡","ⓜ","☤","∽","╘","˹","↨","♙","⬇","♱","⌡","⠀","╛","❕","┉","ⓟ","̀","♖","ⓚ","┆","⎜","◜","⚾","⤴","✇","╟","⎛","☩","➲","➟","ⓥ","ⓗ","⏝","◃","╢","↯","✆","˃","⍴","❇","⚽","╒","̸","♜","☓","➳","⇄","☬","⚑","✐","⌃","◅","▢","❐","∊","☈","॥","⎮","▩","ு","⊹","‵","␔","☊","➸","̌","☿","⇉","⊳","╙","ⓦ","⇣","｛","̄","↝","⎟","▍","❗","״","΄","▞","◁","⛄","⇝","⎪","♁","⇠","☇","✊","ி","｝","⭕","➘","⁀","☙","❛","❓","⟲","⇀","≲","ⓕ","⎥","\u06dd","ͤ","₋","̱","̎","♝","≳","▙","➭","܀","ⓖ","⇛","▊","⇗","̷","⇱","℅","ⓧ","⚛","̐","̕","⇌","␀","≌","ⓦ","⊤","̓","☦","ⓕ","▜","➙","ⓨ","⌨","◮","☷","◍","ⓚ","≔","⏩","⍳","℞","┋","˻","▚","≺","ْ","▟","➻","̪","⏪","̉","⎞","┇","⍟","⇪","▎","⇦","␝","⤷","≖","⟶","♗","̴","♄","ͨ","̈","❜","̡","▛","✁","➩","ா","˂","↥","⏎","⎷","̲","➖","↲","⩵","̗","❢","≎","⚔","⇇","̑","⊿","̖","☍","➹","⥊","⁁","✢"];
contraction_dict = {"We'd": "We had", "That'd": "That had", "AREN'T": "Are not", "HADN'T": "Had not", "Could've": "Could have", "LeT's": "Let us", "How'll": "How will", "They'll": "They will", "DOESN'T": "Does not", "HE'S": "He has", "O'Clock": "Of the clock", "Who'll": "Who will", "What'S": "What is", "Ain't": "Am not", "WEREN'T": "Were not", "Y'all": "You all", "Y'ALL": "You all", "Here's": "Here is", "It'd": "It had", "Should've": "Should have", "I'M": "I am", "ISN'T": "Is not", "Would've": "Would have", "He'll": "He will", "DON'T": "Do not", "She'd": "She had", "WOULDN'T": "Would not", "She'll": "She will", "IT's": "It is", "There'd": "There had", "It'll": "It will", "You'll": "You will", "He'd": "He had", "What'll": "What will", "Ma'am": "Madam", "CAN'T": "Can not", "THAT'S": "That is", "You've": "You have", "She's": "She is", "Weren't": "Were not", "They've": "They have", "Couldn't": "Could not", "When's": "When is", "Haven't": "Have not", "We'll": "We will", "That's": "That is", "We're": "We are", "They're": "They' are", "You'd": "You would", "How'd": "How did", "What're": "What are", "Hasn't": "Has not", "Wasn't": "Was not", "Won't": "Will not", "There's": "There is", "Didn't": "Did not", "Doesn't": "Does not", "You're": "You are", "He's": "He is", "SO's": "So is", "We've": "We have", "Who's": "Who is", "Wouldn't": "Would not", "Why's": "Why is", "WHO's": "Who is", "Let's": "Let us", "How's": "How is", "Can't": "Can not", "Where's": "Where is", "They'd": "They had", "Don't": "Do not", "Shouldn't":"Should not", "Aren't":"Are not", "ain't": "is not", "What's": "What is", "It's": "It is", "Isn't":"Is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have" }

def add_tag(text):
     #replacing sentences 'http' or 'www' with [url]
    if 'http' in text or 'www' in text:
        text = re.sub('(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-?=%.]+', '[url]', text)
    
    #replacing formulas with [formuala]
    if '[math]' in text:
        text = re.sub('\[math\].*?math\]', '[formula]', text) 

    return text


def remove_contractions(text):
    specials = ["’", "‘", "´", "`"]
    for s in specials:
        text = text.replace(s, "'")
    
    text = ' '.join([contraction_dict[t] if t in contraction_dict else t for t in text.split(" ")])
    return text


def remove_punct(x):
    for punct in puncts:
        if punct in x:
            x = x.replace(punct, f' {punct} ')
    return x

def data_cleaning(x):
    x = add_tag(x)
    x = remove_contractions(x)
    x = remove_punct(x)
    return x

# doing preprocessing on reddit data
reddit_df['preprocessed_question_text'] = reddit_df['question_text'].progress_map(lambda x: data_cleaning(x))

0it [00:00, ?it/s]

  0%|          | 0/1048575 [00:00<?, ?it/s]

In [10]:
import keras
import spacy
import en_core_web_sm

nlp = en_core_web_sm.load()

# creating dict for vocab_freq, word2index and lemma_dict
vocab_freq = {}
word2index = {}
lemma_dict = {}

sentences = reddit_df["preprocessed_question_text"]
word_sequences = []

for doc in tqdm(sentences):
    word_seq = []
    for token in nlp(doc):
        if token.is_punct or token.is_space:
            continue
        try:
            vocab_freq[token.text] += 1
        except KeyError:
            vocab_freq[token.text] = 1
        if token.text not in word2index:
            word2index[token.text] = len(vocab_freq)
            lemma_dict[token.text] = token.lemma_
        word_seq.append(word2index[token.text])
    word_sequences.append(word_seq)

vocab_size = len(word2index)

print('Found %s unique tokens.' % len(word2index))

Found 210084 unique tokens.


In [12]:
MAX_SENTENCE_LENGTH = 100

max_que_len = len(max(word_sequences, key=len))
print("Max question length in data: ", max_que_len)

X_train = word_sequences[:len(reddit_df)]
X_train = keras.preprocessing.sequence.pad_sequences(X_train, maxlen=MAX_SENTENCE_LENGTH)
print('Data tensor shape:', X_train.shape)

X_test_data = word_sequences[len(reddit_df):]
X_test_data = keras.preprocessing.sequence.pad_sequences(X_test_data, maxlen=MAX_SENTENCE_LENGTH)

y_train = reddit_df['target']

del reddit_df
gc.collect()

Max question length in data:  145
Data tensor shape: (1048575, 100)


28

In [13]:
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk.stem.lancaster import LancasterStemmer
 
EMBEDDING_DIM = 300

ps = PorterStemmer()
lc = LancasterStemmer()
sb = SnowballStemmer('english')
lm = WordNetLemmatizer()

def correction(word): return list((known([word]) or known(edits1(word)) or [word]))[0]

def known(words): return set(w for w in words if w in word2index)

def edits1(word):
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:])        for i in range(len(word) + 1)]
    deletes = [L + R[1:]                  for L, R in splits if R]  
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]  
    replaces = [L + c + R[1:]             for L, R in splits if R for c in letters]
    inserts = [L + c + R                  for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def load_embedding(word2vec):
    oov_count = 0
    vocab_count = 0
    embedding_weights = np.zeros((vocab_size+1, EMBEDDING_DIM))
    unknown_vector = np.zeros((EMBEDDING_DIM,), dtype=np.float32) - 1.
    unknown_words = {}

    for key, i in tqdm(word2index.items()):
        word = key
        if word in word2vec:
            vocab_count += vocab_freq[key]
            embedding_weights[i] = word2vec[word]
            continue
        #Lower
        word = key.lower()         
        if word in word2vec:
            vocab_count += vocab_freq[key]
            embedding_weights[i] = word2vec[word]
            continue
        #Upper
        word = key.upper()         
        if word in word2vec:
            vocab_count += vocab_freq[key]
            embedding_weights[i] = word2vec[word]
            continue
        #Capitalize
        word = key.capitalize()     
        if word in word2vec:
            vocab_count += vocab_freq[key]
            embedding_weights[i-1] = word2vec[word]
            continue
        #PorterStemmer
        word = ps.stem(key)        
        if word in word2vec:
            vocab_count += vocab_freq[key]
            embedding_weights[i] = word2vec[word]
            continue
        #LancasterStemmer
        word = lc.stem(key)        
        if word in word2vec:
            vocab_count += vocab_freq[key]
            embedding_weights[i] = word2vec[word]
            continue
        #SnowballStemmer
        word = sb.stem(key)        
        if word in word2vec:
            vocab_count += vocab_freq[key]
            embedding_weights[i] = word2vec[word]
            continue
        #Lemmanization
        word = lemma_dict[key]     
        if word in word2vec: 
            vocab_count += vocab_freq[key]
            embedding_weights[i] = word2vec[word]
            continue
        if len(key) > 1:
            word = correction(key)
            if word in word2vec: 
                vocab_count += vocab_freq[key]
                embedding_weights[i] = word2vec[word]
                continue

        try:
            unknown_words[key] += 1
        except KeyError:
            unknown_words[key] = 1
            
        embedding_weights[i] = unknown_vector
        oov_count += vocab_freq[key]

    print('Top 5 Null word embeddings: ')
    print(list(unknown_words.items())[:5])
    print('\n')
    print('Word embeddings which are null: %d' % np.sum(np.sum(embedding_weights, axis=1) == -1 * EMBEDDING_DIM))
    print('Null word embeddings percentage: %.2f%%' % (100 * oov_count / vocab_count))
    
    return embedding_weights

In [15]:
print('loading paragram_vec')
paragram_vec = load_vector(PARAGRAM_FILE)
paragram_weights = load_embedding(paragram_vec)
del paragram_vec
gc.collect()

print('loading glove_vec')
glove_vec = load_vector(GLOVE_FILE)
glove_weights = load_embedding(glove_vec)
del glove_vec
gc.collect()

print('loading fasttext_vec')
fasttext_vec = load_vector(WIKI_NEWS_FILE) 
fasttext_weights = load_embedding(fasttext_vec)
del fasttext_vec
gc.collect()

loading paragram_vec


  0%|          | 0/1703756 [00:00<?, ?it/s]

  0%|          | 0/210084 [00:00<?, ?it/s]

Top 5 Null word embeddings: 
[('calead', 1), ('nanodegree', 1), ('4AFSB', 1), ('Anizara', 1), ('Redmi', 1)]


Word embeddings which are null: 33587
Null word embeddings percentage: 0.33%
loading glove_vec


  0%|          | 0/2196017 [00:00<?, ?it/s]

  0%|          | 0/210084 [00:00<?, ?it/s]

Top 5 Null word embeddings: 
[('calead', 1), ('Tepelene', 1), ('nanodegree', 1), ('4AFSB', 1), ('Anizara', 1)]


Word embeddings which are null: 34780
Null word embeddings percentage: 0.34%
loading fasttext_vec


  0%|          | 0/999995 [00:00<?, ?it/s]

  0%|          | 0/210084 [00:00<?, ?it/s]

Top 5 Null word embeddings: 
[('montra', 1), ('Mcleodganj', 1), ('calead', 1), ('isovolumetric', 1), ('nanodegree', 1)]


Word embeddings which are null: 44706
Null word embeddings percentage: 0.45%


0

### 4.  Model Training and Evaluation  
- There has 3 layers in the model. 
- The 1st layer is Embedding layer that turns the X_train data(now that's the word indexes of vocabulary) into EMBEDDING_DIM dimensional vectors. 
- The 2nd layer is a bidirectinal LSTM that is well-suited to process data base on time-series. 
- The 3rd layer is output layer with a sigmoid activation function. 
- We had set the bias_initializer parameter for the output layer due to the imbalance in the dataset.
- We used metrics f1-score for binary classification model.

In [16]:
import tensorflow_addons as tfa
from keras import backend as K
from keras.layers import *
from keras.models import *
from keras.initializers import Constant

tf.config.run_functions_eagerly(True)

def build_model(units):
    output_bias = Constant(np.log([positive/negative]))
    
    x_in = Input(shape=(MAX_SENTENCE_LENGTH,))
    glove_embedding = Embedding(len(glove_weights), EMBEDDING_DIM, input_length=MAX_SENTENCE_LENGTH, weights=[glove_weights], trainable=False)(x_in)
#     paragram_embedding = Embedding(len(paragram_weights), EMBEDDING_DIM, input_length=MAX_SENTENCE_LENGTH, weights=[paragram_weights], trainable=False)(x_in)
#     fasttext_embedding = Embedding(len(fasttext_weights), EMBEDDING_DIM, input_length=MAX_SENTENCE_LENGTH, weights=[fasttext_weights], trainable=False)(x_in)
#     x = Concatenate()([glove_embedding, paragram_embedding, fasttext_embedding])
    
    x = SpatialDropout1D(0.2)(glove_embedding)
    lstm = Bidirectional(tf.keras.layers.LSTM(units, return_sequences=True))(x)
    gru = Bidirectional(GRU(units, return_sequences=True))(lstm)
    
    x = Concatenate()([lstm, gru])
    x = GlobalAveragePooling1D()(x)
    x_out = Dense(1, activation='sigmoid', bias_initializer=output_bias)(x)
    
    model = Model(inputs=x_in, outputs=x_out)
    model.compile(optimizer='adam', loss='binary_crossentropy')
    
    return model

In [17]:
def plot_history(history):
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])   
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.title('model loss')
    plt.legend(['train', 'validation'])
    plt.show()

def f1_eval(y_true, y_pred):
    args = np.argsort(y_pred)
    tp = y_true.sum()
    fs = (tp - np.cumsum(y_true[args[:-1]])) / np.arange(y_true.shape[0] + tp - 1, tp, -1)
    res_idx = np.argmax(fs)
    return 2 * fs[res_idx], (y_pred[args[res_idx]] + y_pred[args[res_idx + 1]]) / 2

reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=1)

early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', verbose=1, patience=1, mode='min', restore_best_weights=True)

checkpoint = keras.callbacks.ModelCheckpoint('lg_model.h5', monitor='val_loss', save_best_only=True, mode='min')

weight_for_0 = (1 / negative) * (total) / 2.0 
weight_for_1 = (1 / positive) * (total) / 2.0

class_weight = {0: weight_for_0, 1: weight_for_1}

In [18]:
import tensorflow as tf

strategy = None

try:
    if len(tf.config.list_physical_devices('GPU')) > 0:
        strategy = tf.distribute.MirroredStrategy()
        print('Using GPU') 
    else:
        strategy = tf.distribute.get_strategy()
        print('Using CPU')
except ValueError:
    strategy = tf.distribute.get_strategy()
    print('Using CPU')

Using CPU


In [19]:
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from IPython.display import Image
from keras.utils import plot_model
import numpy as np

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

best_f1score = 0

with strategy.scope():
    model = build_model(64)
    model.summary()
    
    plot_model(model, 'lg_model.png', show_shapes=True, show_layer_names=True)
    
    for index, (train_index, valid_index) in enumerate(kfold.split(X_train, y_train)):
        if index > 1:
            break
        X_trainn, X_val, Y_trainn, Y_val = X_train[train_index], X_train[valid_index], y_train[train_index], y_train[valid_index]
        history = model.fit(X_trainn, Y_trainn, epochs=5, batch_size=128, validation_data=(X_val, Y_val), callbacks=[reduce_lr, checkpoint], class_weight=class_weight)
        Y_pred = model.predict(X_val)
        f1, threshold = f1_eval(Y_val.to_numpy(), np.squeeze(Y_pred))
        best_f1score = max(best_f1score, f1)
        print('Optimal F1: {:.4f} at threshold: {:.4f}\n'.format(f1, threshold))

print('Best f1-score: ', best_f1score)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 100)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 100, 300)     63025500    input_1[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d (SpatialDropo (None, 100, 300)     0           embedding[0][0]                  
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 100, 128)     186880      spatial_dropout1d[0][0]          
______________________________________________________________________________________________



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Optimal F1: 0.6789 at threshold: 0.8921

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Optimal F1: 0.7067 at threshold: 0.8920

Best f1-score:  0.70666066294898
