In [1]:
import numpy as np
import pandas as pd
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, Dropout, add, concatenate
from keras.layers import CuDNNLSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import LearningRateScheduler
import time
import pickle
import re
from tqdm._tqdm_notebook import tqdm_notebook as tqdm
tqdm.pandas()

Using TensorFlow backend.


In [2]:
CRAWL_EMBEDDING_PATH = '../input/pickled-crawl300d2m-for-kernel-competitions/crawl-300d-2M.pkl'
GLOVE_EMBEDDING_PATH = '../input/pickled-glove840b300d-for-10sec-loading/glove.840B.300d.pkl'

In [3]:
train = pd.read_csv('../input/innoplexus-online-hiring-hackathon/train_F3WbcTw.csv',low_memory=True)
test = pd.read_csv('../input/innoplexus-online-hiring-hackathon/test_tOlRoBf.csv',low_memory=True)

In [4]:
train.shape

(5279, 4)

In [5]:
train.head()

Unnamed: 0,unique_hash,text,drug,sentiment
0,2e180be4c9214c1f5ab51fd8cc32bc80c9f612e0,Autoimmune diseases tend to come in clusters. ...,gilenya,2
1,9eba8f80e7e20f3a2f48685530748fbfa95943e4,I can completely understand why you’d want to ...,gilenya,2
2,fe809672251f6bd0d986e00380f48d047c7e7b76,Interesting that it only targets S1P-1/5 recep...,fingolimod,2
3,bd22104dfa9ec80db4099523e03fae7a52735eb6,"Very interesting, grand merci. Now I wonder wh...",ocrevus,2
4,b227688381f9b25e5b65109dd00f7f895e838249,"Hi everybody, My latest MRI results for Brain ...",gilenya,1


In [6]:
train['text'] = train['text'] + "This observation is for " + train['drug']
test['text'] = test['text'] + "This observation is for " + test['drug']

In [7]:
import re
def pre_process(text):
    new_text =re.sub('[0-9]', '', text)
    new_text = re.sub(r"\u200b","",new_text)
    new_text = re.sub(r"\.+",".",new_text)
    new_text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '',new_text, flags=re.MULTILINE)
    new_text = re.sub("'", "", new_text)
    new_text = re.sub(r'↑', '', new_text)
    new_text = re.sub("\t", "", new_text)
    new_text = re.sub("\xa0", "", new_text)
    new_text = re.sub("\(|\)|\[|\]", "", new_text)
    new_text = re.sub("\n", "", new_text)
    new_text = re.sub("\.", "", new_text)
    new_text = re.sub("\,", " ", new_text)
    new_text = re.sub("[/%]", " ", new_text)
    new_text = re.sub('[/%:;]', '', new_text)
    new_text = re.sub(' +', ' ', new_text)
    return new_text

In [8]:
# remove URL's from train and test
for index, row in train['text'].iteritems():
    train['text'][index] = pre_process(row)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [9]:
for index, row in test['text'].iteritems():
    test['text'][index] = pre_process(row)

In [10]:
# remove URL's from train and test
train['text'] = train['text'].apply(lambda x: re.sub(r'http\S+', '', x))

test['text'] = test['text'].apply(lambda x: re.sub(r'http\S+', '', x))

In [11]:
# remove numbers
train['text'] = train['text'].str.replace("[0-9]", " ")
test['text'] = test['text'].str.replace("[0-9]", " ")

In [12]:
# Adjusting the load_embeddings function, to now handle the pickled dict.

def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')


def load_embeddings(path):
    with open(path,'rb') as f:
        emb_arr = pickle.load(f)
    return emb_arr

def build_matrix(word_index, path):
    embedding_index = load_embeddings(path)
    embedding_matrix = np.zeros((len(word_index) + 1, 300))
    unknown_words = []
    
    for word, i in word_index.items():
        try:
            embedding_matrix[i] = embedding_index[word]
        except KeyError:
            unknown_words.append(word)
    return embedding_matrix, unknown_words

In [13]:
import operator 

def check_coverage(vocab,embeddings_index):
    a = {}
    oov = {}
    k = 0
    i = 0
    for word in tqdm(vocab):
        try:
            a[word] = embeddings_index[word]
            k += vocab[word]
        except:

            oov[word] = vocab[word]
            i += vocab[word]
            pass

    print('Found embeddings for {:.2%} of vocab'.format(len(a) / len(vocab)))
    print('Found embeddings for  {:.2%} of all text'.format(k / (k + i)))
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

def build_vocab(sentences, verbose =  True):
    """
    :param sentences: list of list of words
    :return: dictionary of words and their count
    """
    vocab = {}
    for sentence in tqdm(sentences, disable = (not verbose)):
        for word in sentence:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab

In [14]:
# Lets load the embeddings 

tic = time.time()
glove_embeddings = load_embeddings(GLOVE_EMBEDDING_PATH)
print(f'loaded {len(glove_embeddings)} word vectors in {time.time()-tic}s')

loaded 2196008 word vectors in 9.310883522033691s


In [15]:
# Lets check how many words we got covered 

vocab = build_vocab(list(train['text'].apply(lambda x:x.split())))
oov = check_coverage(vocab,glove_embeddings)
oov[:20]

HBox(children=(IntProgress(value=0, max=5279), HTML(value='')))




HBox(children=(IntProgress(value=0, max=63529), HTML(value='')))


Found embeddings for 67.00% of vocab
Found embeddings for  96.42% of all text


[('I’m', 1415),
 ('Ocrevus', 1034),
 ('gilenya', 821),
 ('it’s', 763),
 ('ocrevus', 751),
 ('I’ve', 661),
 ('don’t', 653),
 ('Entyvio', 549),
 ('Opdivo', 501),
 ('Keytruda', 500),
 ('Ocrelizumab', 425),
 ('It’s', 412),
 ('entyvio', 405),
 ('didn’t', 378),
 ('Crohn’s', 348),
 ('pembrolizumab', 346),
 ('Tagrisso', 330),
 ('nivolumab', 323),
 ('Tecfidera', 322),
 ('keytruda', 272)]

In [16]:
# Handle Won't 

replaceWords1 = { "won't":"will not","$&@*#":"in most profane vulgar shitty terms","#$&@*#":"shitty",
 "can't":"cannot","aren't": 'are not',
 "Aren't": 'Are not',
 "AREN'T": 'ARE NOT',
 "C'est": "C'est",
 "C'mon": "C'mon",
 "c'mon": "c'mon",
 "can't": 'cannot',
 "Can't": 'Cannot',
 "CAN'T": 'CANNOT',
 "con't": 'continued',
 "cont'd": 'continued',
 "could've": 'could have',
 "couldn't": 'could not',
 "Couldn't": 'Could not',
 "didn't": 'did not',
 "Didn't": 'Did not',
 "DIDN'T": 'DID NOT',
 "don't": 'do not',
 "Don't": 'Do not',
 "DON'T": 'DO NOT',
 "doesn't": 'does not',
 "Doesn't": 'Does not',
 "else's": 'else',
 "gov's": 'government',
 "Gov's": 'government',
 "gov't": 'government',
 "Gov't": 'government',
 "govt's": 'government',
 "gov'ts": 'governments',
 "hadn't": 'had not',
 "hasn't": 'has not',
 "Hasn't": 'Has not',
 "haven't": 'have not',
 "Haven't": 'Have not',
 "he's": 'he is',
 "He's": 'He is',
 "he'll": 'he will',
 "He'll": 'He will',
 "he'd": 'he would',
 "He'd": 'He would',
 "Here's": 'Here is',
 "here's": 'here is',
 "I'm": 'I am',
 "i'm": 'i am',
 "I'M": 'I am',
 "I've": 'I have',
 "i've": 'i have',
 "I'll": 'I will',
 "i'll": 'i will',
 "I'd": 'I would',
 "i'd": 'i would',
 "ain't": 'is not',
 "isn't": 'is not',
 "Isn't": 'Is not',
 "ISN'T": 'IS NOT',
 "it's": 'it is',
 "It's": 'It is',
 "IT'S": 'IT IS',
 "I's": 'It is',
 "i's": 'it is',
 "it'll": 'it will',
 "It'll": 'It will',
 "it'd": 'it would',
 "It'd": 'It would',
 "Let's": "Let's",
 "let's": 'let us',
 "ma'am": 'madam',
 "Ma'am": "Madam",
 "she's": 'she is',
 "She's": 'She is',
 "she'll": 'she will',
 "She'll": 'She will',
 "she'd": 'she would',
 "She'd": 'She would',
 "shouldn't": 'should not',
 "that's": 'that is',
 "That's": 'That is',
 "THAT'S": 'THAT IS',
 "THAT's": 'THAT IS',
 "that'll": 'that will',
 "That'll": 'That will',
 "there's": 'there is',
 "There's": 'There is',
 "there'll": 'there will',
 "There'll": 'There will',
 "there'd": 'there would',
 "they're": 'they are',
 "They're": 'They are',
 "they've": 'they have',
 "They've": 'They Have',
 "they'll": 'they will',
 "They'll": 'They will',
 "they'd": 'they would',
 "They'd": 'They would',
 "wasn't": 'was not',
 "we're": 'we are',
 "We're": 'We are',
 "we've": 'we have',
 "We've": 'We have',
 "we'll": 'we will',
 "We'll": 'We will',
 "we'd": 'we would',
 "We'd": 'We would',
 "What'll": 'What will',
 "weren't": 'were not',
 "Weren't": 'Were not',
 "what's": 'what is',
 "What's": 'What is',
 "When's": 'When is',
 "Where's": 'Where is',
 "where's": 'where is',
 "Where'd": 'Where would',
 "who're": 'who are',
 "who've": 'who have',
 "who's": 'who is',
 "Who's": 'Who is',
 "who'll": 'who will',
 "who'd": 'Who would',
 "Who'd": 'Who would',
 "won't": 'will not',
 "Won't": 'will not',
 "WON'T": 'WILL NOT',
 "would've": 'would have',
 "wouldn't": 'would not',
 "Wouldn't": 'Would not',
 "would't": 'would not',
 "Would't": 'Would not',
 "y'all": 'you all',
 "Y'all": 'You all',
 "you're": 'you are',
 "You're": 'You are',
 "YOU'RE": 'YOU ARE',
 "you've": 'you have',
 "You've": 'You have',
 "y'know": 'you know',
 "Y'know": 'You know',
 "ya'll": 'you will',
 "you'll": 'you will',
 "You'll": 'You will',
 "you'd": 'you would',
 "You'd": 'You would',
 "Y'got": 'You got',
 'cause': 'because',
 "had'nt": 'had not',
 "Had'nt": 'Had not',
 "how'd": 'how did',
 "how'd'y": 'how do you',
 "how'll": 'how will',
 "how's": 'how is',
 "I'd've": 'I would have',
 "I'll've": 'I will have',
 "i'd've": 'i would have',
 "i'll've": 'i will have',
 "it'd've": 'it would have',
 "it'll've": 'it will have',
 "mayn't": 'may not',
 "might've": 'might have',
 "mightn't": 'might not',
 "mightn't've": 'might not have',
 "must've": 'must have',
 "mustn't": 'must not',
 "mustn't've": 'must not have',
 "needn't": 'need not',
 "needn't've": 'need not have',
 "o'clock": 'of the clock',
 "oughtn't": 'ought not',
 "oughtn't've": 'ought not have',
 "shan't": 'shall not',
 "sha'n't": 'shall not',
 "shan't've": 'shall not have',
 "she'd've": 'she would have',
 "she'll've": 'she will have',
 "should've": 'should have',
 "shouldn't've": 'should not have',
 "so've": 'so have',
 "so's": 'so as',
 "this's": 'this is',
 "that'd": 'that would',
 "that'd've": 'that would have',
 "there'd've": 'there would have',
 "they'd've": 'they would have',
 "they'll've": 'they will have',
 "to've": 'to have',
 "we'd've": 'we would have',
 "we'll've": 'we will have',
 "what'll": 'what will',
 "what'll've": 'what will have',
 "what're": 'what are',
 "what've": 'what have',
 "when's": 'when is',
 "when've": 'when have',
 "where'd": 'where did',
 "where've": 'where have',
 "who'll've": 'who will have',
 "why's": 'why is',
 "why've": 'why have',
 "will've": 'will have',
 "won't've": 'will not have',
 "wouldn't've": 'would not have',
 "y'all'd": 'you all would',
 "y'all'd've": 'you all would have',
 "y'all're": 'you all are',
 "y'all've": 'you all have',
 "you'd've": 'you would have',
 "you'll've": 'you will have',
'bebecause':'be because',
'I’m':'I am',
              'it’s':'it is',
                 'I’ve':'I have',
                 'don’t':'do not',
                'However':'but',
                 'It’s':'It is',
                 'didn’t':'did not',
                 'can’t':'can not',
                 'that’s':'that is',
'doesn’t':'does not',
'I’d':'I had',
'isn’t':'is not',
'wasn’t':'was not'
                
                }

def wordreplace(tweet,replaceWords):
    for key in replaceWords:
        tweet = tweet.replace(key,replaceWords[key])
    return tweet

for index, row in train['text'].iteritems():
    train['text'][index] = wordreplace(row,replaceWords1)
    
for index, row in test['text'].iteritems():
    test['text'][index] = wordreplace(row,replaceWords1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [17]:
# Lets check how many words we got covered 

vocab = build_vocab(list(train['text'].apply(lambda x:x.split())))
oov = check_coverage(vocab,glove_embeddings)
oov[:20]

HBox(children=(IntProgress(value=0, max=5279), HTML(value='')))




HBox(children=(IntProgress(value=0, max=63505), HTML(value='')))


Found embeddings for 67.02% of vocab
Found embeddings for  96.73% of all text


[('Ocrevus', 1034),
 ('gilenya', 821),
 ('ocrevus', 751),
 ('Entyvio', 549),
 ('Opdivo', 501),
 ('Keytruda', 500),
 ('Ocrelizumab', 425),
 ('entyvio', 405),
 ('Crohn’s', 348),
 ('pembrolizumab', 346),
 ('Tagrisso', 330),
 ('nivolumab', 323),
 ('Tecfidera', 322),
 ('keytruda', 272),
 ('opdivo', 270),
 ('PD-', 234),
 ('tagrisso', 233),
 ('osimertinib', 206),
 ('stelara', 202),
 ('Bebecause', 186)]

In [18]:
import string
latin_similar = "’'‘ÆÐƎƏƐƔĲŊŒẞÞǷȜæðǝəɛɣĳŋœĸſßþƿȝĄƁÇĐƊĘĦĮƘŁØƠŞȘŢȚŦŲƯY̨Ƴąɓçđɗęħįƙłøơşșţțŧųưy̨ƴÁÀÂÄǍĂĀÃÅǺĄÆǼǢƁĆĊĈČÇĎḌĐƊÐÉÈĖÊËĚĔĒĘẸƎƏƐĠĜǦĞĢƔáàâäǎăāãåǻąæǽǣɓćċĉčçďḍđɗðéèėêëěĕēęẹǝəɛġĝǧğģɣĤḤĦIÍÌİÎÏǏĬĪĨĮỊĲĴĶƘĹĻŁĽĿʼNŃN̈ŇÑŅŊÓÒÔÖǑŎŌÕŐỌØǾƠŒĥḥħıíìiîïǐĭīĩįịĳĵķƙĸĺļłľŀŉńn̈ňñņŋóòôöǒŏōõőọøǿơœŔŘŖŚŜŠŞȘṢẞŤŢṬŦÞÚÙÛÜǓŬŪŨŰŮŲỤƯẂẀŴẄǷÝỲŶŸȲỸƳŹŻŽẒŕřŗſśŝšşșṣßťţṭŧþúùûüǔŭūũűůųụưẃẁŵẅƿýỳŷÿȳỹƴźżžẓ"
white_list = string.ascii_letters + string.digits + latin_similar + ' '
white_list += "'"

In [19]:
glove_chars = ''.join([c for c in tqdm(glove_embeddings) if len(c) == 1])
glove_symbols = ''.join([c for c in glove_chars if not c in white_list])
glove_symbols

HBox(children=(IntProgress(value=0, max=2196008), HTML(value='')))




',.":)(-!?|;$&/[]>%=#*+\\•~@£·_{}©^®`<→°€™›♥←×§″′█½…“★”–●►−¢²¬░¡¶↑±¿▾═¦║―¥▓—‹─▒：¼⊕▼▪†■▀¨▄♫☆¯♦¤▲¸¾⋅∞∙）↓、│（»，♪╩╚³・╦╣╔╗▬❤¹≤‡√◄━⇒▶º≥╝♡◊。✈≡☺✔↵≈✓♣☎℃◦└‟～！○◆№♠▌✿▸⁄□❖✦．÷｜┃／￥╠↩✭▐☼µ☻┐├«∼┌℉☮฿≦♬✧〉－⌂✖･◕※‖◀‰\x97↺∆┘┬╬،⌘⊂ª＞〈⎙Å？☠⇐▫∗∈≠♀ƒ♔˚℗┗＊┼❀＆∩♂‿∑‣➜┛⇓☯⊖☀┳；∇⇑✰◇♯☞´↔┏｡◘∂✌♭┣┴┓✨ˈ˜❥┫℠✒［∫\x93≧］\x94∀♛\x96∨◎ˑ↻⅓⇩＜≫✩ˆ✪♕؟₤☛╮␊＋┈ɡ％╋▽⇨┻⊗￡।▂✯▇＿➤₂✞＝▷△◙▅✝ﾟ∧␉☭┊╯☾➔∴\x92▃↳＾׳➢╭➡＠⊙☢˝⅛∏„①๑∥❝☐▆╱⋙๏☁⇔▔\x91②➚◡╰٠♢˙۞✘✮☑⋆ℓⓘ❒☣✉⌊➠∣❑⅔◢ⓒ\x80〒∕▮⦿✫✚⋯♩☂ˌ❞‗܂☜‾✜╲∘⟩＼⟨·⅜✗♚∅ⓔ◣͡‛❦⑨③◠✄❄１∃␣≪｢≅◯☽２∎｣⁰❧̅ǡⒶ↘⚓▣˘∪⇢✍⊥＃⅝⎯↠۩☰◥⊆✽ﬁ⚡↪ở❁☹◼☃◤❏ⓢ⊱α➝̣✡∠｀▴┤Ȃ∝♏ⓐ✎;３④␤＇❣⅞✂✤ⓞ☪✴⌒˛♒＄ɪ✶▻Ⓔ◌◈۲Ʈ❚ʿ❂￦◉╜̃ν✱╖❉₃ⓡℝ٤↗❶ʡ۰ˇⓣ♻➽۶₁ʃ׀✲ʤ✬☉▉≒☥⌐♨✕ⓝ⊰❘＂⇧̵➪４▁β۱▏⊃ⓛ‚♰́✏⏑̶٩Ⓢー⩾日￠❍≃⋰♋ɿ､̂❋✳ⓤ╤▕⌣✸℮⁺▨⑤╨Ⓥ♈❃☝５✻⊇≻♘♞◂７✟⌠✠☚✥❊ƂⒸ⌈❅Ⓡ♧Ⓞɑλ۵▭❱Ⓣ∟☕♺∵⍝ⓑɔ✵✣ℤ年ℕ٭♆Ⓘⅆ∶⚜◞்✹Ǥȡ➥ᴥ↕ɂ̳∷✋➧∋̿ͧʘ┅⥤⬆ǀμ₄⋱ʔ☄↖⋮۔♌Ⓛ╕♓ـ⁴❯♍▋✺⭐６✾♊➣▿Ⓑ♉Ａ⏠◾▹⑥⩽в↦╥⍵⌋։➨и∮⇥ⓗⒹ⁻ʊ⎝⌥⌉◔◑ǂ✼♎ℂ♐╪ɨ⊚☒⇤θВⓜ⎠Ｏ◐ǰ⚠╞ﬂ◗⎕ⓨ☟Ｉⓟ♟❈↬ⓓ◻♮❙а♤∉؛⁂例Ⓝ־♑╫╓╳⬅☔πɒɹ߂☸ɐʻ┄╧ʌ׃８ʒ⎢❆⋄⚫̏☏➞͂␙Ⓤ◟Ƥʕ̊Ȥ⚐✙は↙̾ωΔ℘ﾞ✷⑦φ⍺❌⊢▵✅ｗ９ⓖ☨▰ʹ╡Ⓜ☤∽╘˹↨ȿ♙⬇♱⌡Ω⠀╛❕┉Ⓟ̀Ǩ♖ⓚ┆⑧⎜ǹ◜⚾⤴✇╟⎛☩➲➟ⓥⒽ⏝◃０₀╢月↯✆˃⍴❇⚽╒Ｃɻɤ̸♜☓Ｔ➳⇄γ☬⚑✐⁵δȭ⌃◅▢ｓȸ❐∊☈ⅇℜ॥σ⎮ȣ▩のτεＳு⊹‵␔☊➸̌☿⇉➊⊳╙⁶ⓦ⇣｛̄↝⎟ℳ▍❗ℑＭɾｍ״Γ΄▞◁⛄⇝⎪ˤ♁ｖ⇠☇✊位ℒạி｝๐⭕➘Ｂ❺ɸˡ⁀⑩ｃ⅕Ƽ۳☙❛₆ƪ❓⟲Ʒ⇀≲Ｐ❷١ⓕ⎥Ｄс\u06ddǥͤ₋̱̎♝≳▙Ｒʹ➭ℰ܀ʺȫⒼ⇛ˉ▊❸号⇗̷

In [20]:
jigsaw_chars = build_vocab(list(train["text"]))
jigsaw_symbols = ''.join([c for c in jigsaw_chars if not c in white_list])
jigsaw_symbols

HBox(children=(IntProgress(value=0, max=5279), HTML(value='')))




'–-"?“”@!_…β»≥<=*&μ+|α—°>\x92\x97\x91®\x96#\xad👍💜\x80\x99~😉−🙂$≤€™―😛▼►©⩾\x93˜‒¿·«‹›‐^£\u200d±😂×½\x94🤙🏻¦•µω}🙁´☹️☺¼‰ˆ🙋😱😑💁😠🙏💰😳³💔🤷♀≼≽😀🙄😫😊🙃🤣χ😟ργ😖\x7fδ🏾🐻💪🏼😎✌†‡§¶κ\u200e😜🤔😕�{∞🎉😬„■Φ‑❤\\\ue103🤘☝🏽ζ⁄′`́😩😁\u202a\u202cΔ″¾¥\ufeff😒²\u3000😇😯ﬁ🤞😏\u200f\x9d😧💋÷🎷'

In [21]:
# Basically we can delete all symbols we have no embeddings for:

symbols_to_delete = ''.join([c for c in jigsaw_symbols if not c in glove_symbols])
symbols_to_delete

'\xad👍💜\x99😉🙂😛‒‐\u200d😂🤙🏻🙁️🙋😱😑💁😠🙏💰😳💔🤷≼≽😀🙄😫😊🙃🤣χ😟😖\x7f🏾🐻💪🏼😎κ\u200e😜🤔😕�🎉😬‑\ue103🤘🏽ζ😩😁\u202a\u202c\ufeff😒\u3000😇😯🤞😏\u200f\x9d😧💋🎷'

In [22]:
# The symbols we want to keep we need to isolate from our words. So lets setup a list of those to isolate.

symbols_to_isolate = ''.join([c for c in jigsaw_symbols if c in glove_symbols])
symbols_to_isolate

'–-"?“”@!_…β»≥<=*&μ+|α—°>\x92\x97\x91®\x96#\x80~−$≤€™―▼►©⩾\x93˜¿·«‹›^£±×½\x94¦•µω}´☹☺¼‰ˆ³♀ργδ✌†‡§¶{∞„■Φ❤\\☝⁄′`́Δ″¾¥²ﬁ÷'

In [23]:
# Note : Next comes the next trick. Instead of using an inefficient loop of replace we use translate. 
# I find the syntax a bit weird, but the improvement in speed is worth the worse readablity. 

isolate_dict = {ord(c):f' {c} ' for c in symbols_to_isolate}
remove_dict = {ord(c):f'' for c in symbols_to_delete}


def handle_punctuation(x):
    x = x.translate(remove_dict)
    x = x.translate(isolate_dict)
    return x

In [24]:
#So lets apply that function to our text and reasses the coverage

train['text'] = train['text'].progress_apply(lambda x:handle_punctuation(x))
test['text'] = test['text'].progress_apply(lambda x:handle_punctuation(x))

HBox(children=(IntProgress(value=0, max=5279), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2924), HTML(value='')))




In [25]:
vocab = build_vocab(list(train['text'].apply(lambda x:x.split())))
oov = check_coverage(vocab,glove_embeddings)
oov[:20]

HBox(children=(IntProgress(value=0, max=5279), HTML(value='')))




HBox(children=(IntProgress(value=0, max=50721), HTML(value='')))


Found embeddings for 82.30% of vocab
Found embeddings for  98.21% of all text


[('Ocrevus', 1087),
 ('gilenya', 831),
 ('ocrevus', 772),
 ('Entyvio', 554),
 ('Opdivo', 526),
 ('Keytruda', 522),
 ('Ocrelizumab', 444),
 ('entyvio', 406),
 ('pembrolizumab', 375),
 ('Crohn’s', 351),
 ('Tagrisso', 347),
 ('Tecfidera', 338),
 ('nivolumab', 333),
 ('keytruda', 277),
 ('opdivo', 272),
 ('tagrisso', 237),
 ('osimertinib', 210),
 ('stelara', 203),
 ('Bebecause', 191),
 ('haven’t', 150)]

In [26]:
from nltk.tokenize.treebank import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()

In [27]:
def handle_contractions(x):
    x = tokenizer.tokenize(x)
    x = ' '.join(x)
    return x

In [28]:
train['text'] = train['text'].progress_apply(lambda x:handle_contractions(x))
test['text'] = test['text'].progress_apply(lambda x:handle_contractions(x))


HBox(children=(IntProgress(value=0, max=5279), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2924), HTML(value='')))




In [29]:
vocab = build_vocab(list(train['text'].apply(lambda x:x.split())),verbose=False)
oov = check_coverage(vocab,glove_embeddings)
oov[:20]

HBox(children=(IntProgress(value=0, max=49675), HTML(value='')))


Found embeddings for 84.22% of vocab
Found embeddings for  98.51% of all text


[('Ocrevus', 1095),
 ('gilenya', 831),
 ('ocrevus', 772),
 ('Entyvio', 554),
 ('Opdivo', 530),
 ('Keytruda', 529),
 ('Ocrelizumab', 445),
 ('entyvio', 406),
 ('pembrolizumab', 376),
 ('Tagrisso', 347),
 ('Tecfidera', 339),
 ('nivolumab', 333),
 ('keytruda', 277),
 ('opdivo', 272),
 ('tagrisso', 237),
 ('osimertinib', 210),
 ('stelara', 203),
 ('Bebecause', 191),
 ('Nivolumab', 139),
 ('vedolizumab', 138)]

In [30]:
def fix_quote(x):
    x = [x_[1:] if x_.startswith("'") else x_ for x_ in x]
    x = ' '.join(x)
    return x

In [31]:
train['text'] = train['text'].progress_apply(lambda x:fix_quote(x.split()))
test['text'] = test['text'].progress_apply(lambda x:fix_quote(x.split()))

HBox(children=(IntProgress(value=0, max=5279), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2924), HTML(value='')))




In [32]:
vocab = build_vocab(list(train['text'].apply(lambda x:x.split())),verbose=False)
oov = check_coverage(vocab,glove_embeddings)
oov[:10]

HBox(children=(IntProgress(value=0, max=49675), HTML(value='')))


Found embeddings for 84.22% of vocab
Found embeddings for  98.51% of all text


[('Ocrevus', 1095),
 ('gilenya', 831),
 ('ocrevus', 772),
 ('Entyvio', 554),
 ('Opdivo', 530),
 ('Keytruda', 529),
 ('Ocrelizumab', 445),
 ('entyvio', 406),
 ('pembrolizumab', 376),
 ('Tagrisso', 347)]

## CRAWL embeddings

In [33]:
tic = time.time()
crawl_embeddings = load_embeddings(CRAWL_EMBEDDING_PATH)
print(f'loaded {len(glove_embeddings)} word vectors in {time.time()-tic}s')

loaded 2196008 word vectors in 8.331852436065674s


In [34]:
vocab = build_vocab(list(train['text'].apply(lambda x:x.split())))
oov = check_coverage(vocab,crawl_embeddings)
oov[:20]

HBox(children=(IntProgress(value=0, max=5279), HTML(value='')))




HBox(children=(IntProgress(value=0, max=49675), HTML(value='')))


Found embeddings for 84.80% of vocab
Found embeddings for  98.64% of all text


[('``', 2887),
 ('_', 875),
 ('gilenya', 831),
 ('ocrevus', 772),
 ('Entyvio', 554),
 ('entyvio', 406),
 ('alimta', 389),
 ('tarceva', 315),
 ('keytruda', 277),
 ('opdivo', 272),
 ('tagrisso', 237),
 ('stelara', 203),
 ('Bebecause', 191),
 ('OCREVUS', 114),
 ('lemtrada', 112),
 ('tysabri', 102),
 ('siponimod', 99),
 ('Pancolitis', 97),
 ('nsclc', 94),
 ('rowasa', 93)]

In [35]:
punctuation = '_`'

train['text'] = train['text'].apply(lambda x: ''.join(ch for ch in x if ch not in set(punctuation)))
test['text'] = test['text'].apply(lambda x: ''.join(ch for ch in x if ch not in set(punctuation)))

In [36]:
#Lets check the embeddings now

vocab = build_vocab(list(train['text'].apply(lambda x:x.split())))
oov = check_coverage(vocab,crawl_embeddings)
oov[:20]

HBox(children=(IntProgress(value=0, max=5279), HTML(value='')))




HBox(children=(IntProgress(value=0, max=49672), HTML(value='')))


Found embeddings for 84.81% of vocab
Found embeddings for  98.84% of all text


[('gilenya', 831),
 ('ocrevus', 772),
 ('Entyvio', 554),
 ('entyvio', 406),
 ('alimta', 389),
 ('tarceva', 315),
 ('keytruda', 277),
 ('opdivo', 272),
 ('tagrisso', 237),
 ('stelara', 203),
 ('Bebecause', 191),
 ('OCREVUS', 114),
 ('lemtrada', 112),
 ('tysabri', 102),
 ('siponimod', 99),
 ('Pancolitis', 97),
 ('nsclc', 94),
 ('rowasa', 93),
 ('Mavenclad', 93),
 ('abraxane', 85)]

In [37]:
X = train['text']
y = train['sentiment']
test_pred = test['text']

In [38]:
NUM_MODELS = 2
LSTM_UNITS = 200
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
MAX_LEN = 1500
max_features = 500000

BATCH_SIZE = 90
EPOCHS = 8

In [39]:
# Its really important that you intitialize the keras tokenizer correctly. Per default it does lower case and removes a lot of symbols. We want neither of that!

tokenizer = text.Tokenizer(num_words = max_features, filters='',lower=False)

In [40]:
tokenizer.fit_on_texts(list(X) + list(test_pred))

In [41]:
crawl_matrix, unknown_words_crawl = build_matrix(tokenizer.word_index, CRAWL_EMBEDDING_PATH)
print('n unknown words (crawl): ', len(unknown_words_crawl))

glove_matrix, unknown_words_glove = build_matrix(tokenizer.word_index, GLOVE_EMBEDDING_PATH)
print('n unknown words (glove): ', len(unknown_words_glove))

max_features = max_features or len(tokenizer.word_index) + 1
max_features

embedding_matrix = np.concatenate([crawl_matrix, glove_matrix], axis=-1)
embedding_matrix.shape

import gc
del crawl_matrix
del glove_matrix
gc.collect()

n unknown words (crawl):  9863
n unknown words (glove):  10193


0

In [42]:
X = tokenizer.texts_to_sequences(X)
test_pred = tokenizer.texts_to_sequences(test_pred)

In [43]:
X = sequence.pad_sequences(X, maxlen=MAX_LEN)
test_pred = sequence.pad_sequences(test_pred, maxlen=MAX_LEN)

In [44]:
checkpoint_predictions = []
weights = []

In [45]:
# Check F1 score

from keras import backend as K

def recall_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

def precision_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [46]:
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, Dropout, add, concatenate,Flatten,Lambda
from keras.layers import CuDNNLSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D,PReLU,LSTM
from keras.preprocessing import text, sequence
from keras.callbacks import LearningRateScheduler
from keras.models import Sequential
from keras.preprocessing import text, sequence
from keras import regularizers
import keras
import tensorflow as tf
import keras.backend as K
from sklearn.model_selection import train_test_split
from keras.engine.topology import Layer
import tensorflow_hub as hub
from keras.layers.normalization import BatchNormalization

In [47]:
X_train , X_val, y_train  , y_val = train_test_split(X , 
                                                     y , 
                                                     stratify = y.values , 
                                                     train_size = 0.8,
                                                     random_state = 100)

In [48]:
from keras.callbacks import EarlyStopping 
es = EarlyStopping(monitor='val_loss', mode ='min' ,verbose =1,patience=5)

In [49]:
def build_model(embedding_matrix, num_aux_targets):
    words = Input(shape=(MAX_LEN,),name = 'input')
    x = Embedding(*embedding_matrix.shape, weights=[embedding_matrix], trainable=False)(words) #Finds word embeddings for each word
    x = SpatialDropout1D(0.3)(x) #This version performs the same function as Dropout, however it drops entire 1D feature maps instead of individual elements
    x = Bidirectional(CuDNNLSTM(LSTM_UNITS, return_sequences=True))(x)
    x = Bidirectional(CuDNNLSTM(LSTM_UNITS, return_sequences=True))(x)
    x = Bidirectional(CuDNNLSTM(LSTM_UNITS, return_sequences=True))(x)
    hidden = concatenate([
        GlobalMaxPooling1D()(x), 
        GlobalAveragePooling1D()(x),#layer returns a fixed-length output vector for each example by averaging over the sequence dimension. This allows the model to handle input 
        #of variable length in the simplest way possible.
    ])
    hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)]) #This fixed-length output vector is piped through a fully-connected (Dense) layer with x hidden units.
    hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)])
    result = Dense(3, activation='softmax')(hidden)
    model = Model(inputs=words, outputs= result)
    model.compile(loss='sparse_categorical_crossentropy',metrics = ['accuracy',f1_m], optimizer='adam')
    
    return model

In [50]:
len(X_train[1])

1500

In [51]:
embedding_matrix.shape

(58357, 600)

In [52]:
from sklearn.utils import class_weight

class_weights = class_weight.compute_class_weight('balanced',
                                                 np.unique(y_train),
                                                 y_train)

In [53]:
for model_idx in range(NUM_MODELS):
    model = build_model(embedding_matrix,1)
    for global_epoch in range(EPOCHS):
        model.fit(
            X_train,
            y_train,
            validation_data = (X_val, y_val),
            batch_size=BATCH_SIZE,
            epochs=100,
            verbose=2,
            class_weight=class_weights,
            callbacks=[
                LearningRateScheduler(lambda epoch: 1e-3 * (0.4 ** global_epoch)),
                es
            ]
        )
        checkpoint_predictions.append(model.predict(test_pred))
        weights.append(2 ** global_epoch)

Train on 4223 samples, validate on 1056 samples
Epoch 1/100
 - 51s - loss: 0.7791 - acc: 0.7144 - f1_m: 1.1241 - val_loss: 0.7633 - val_acc: 0.7244 - val_f1_m: 1.2607
Epoch 2/100
 - 48s - loss: 0.7198 - acc: 0.7239 - f1_m: 1.1150 - val_loss: 0.7272 - val_acc: 0.7244 - val_f1_m: 1.0057
Epoch 3/100
 - 48s - loss: 0.6919 - acc: 0.7289 - f1_m: 1.1222 - val_loss: 0.7238 - val_acc: 0.7263 - val_f1_m: 1.0172
Epoch 4/100
 - 48s - loss: 0.6617 - acc: 0.7390 - f1_m: 1.1559 - val_loss: 0.7374 - val_acc: 0.7102 - val_f1_m: 1.2339
Epoch 5/100
 - 48s - loss: 0.6263 - acc: 0.7457 - f1_m: 1.1518 - val_loss: 0.7038 - val_acc: 0.7339 - val_f1_m: 1.2128
Epoch 6/100
 - 48s - loss: 0.6020 - acc: 0.7561 - f1_m: 1.1291 - val_loss: 0.7498 - val_acc: 0.7405 - val_f1_m: 1.0793
Epoch 7/100
 - 48s - loss: 0.5521 - acc: 0.7776 - f1_m: 1.1145 - val_loss: 0.7455 - val_acc: 0.7074 - val_f1_m: 1.1464
Epoch 8/100
 - 48s - loss: 0.5155 - acc: 0.7895 - f1_m: 1.1066 - val_loss: 0.7313 - val_acc: 0.7358 - val_f1_m: 1.1390


In [54]:
predictions_x = np.average(checkpoint_predictions, weights=weights, axis=0)

In [55]:
pred_avg =np.argmax(predictions_x,axis=1) 

In [56]:
pred_avg

array([1, 1, 2, ..., 2, 2, 2])

In [57]:
predictions=model.predict(test_pred)

In [58]:
prediction_round = np.argmax(predictions,axis=1)

In [59]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_val, batch_size=64, verbose=1)
y_pred_bool = np.argmax(y_pred, axis=1)

print(classification_report(y_val, y_pred_bool))

              precision    recall  f1-score   support

           0       0.35      0.24      0.28       123
           1       0.43      0.33      0.38       168
           2       0.78      0.87      0.82       765

    accuracy                           0.71      1056
   macro avg       0.52      0.48      0.49      1056
weighted avg       0.68      0.71      0.69      1056



In [60]:
sub = pd.read_csv("../input/innoplexus-online-hiring-hackathon/sample_submission_i5xnIZD.csv")

In [61]:
sub.head()

Unnamed: 0,unique_hash,sentiment
0,9e9a8166b84114aca147bf409f6f956635034c08,0
1,e747e6822c867571afe7b907b51f0f2ca67b0e1a,0
2,50b6d851bcff4f35afe354937949e9948975adf7,0
3,7f82ec2176ae6ab0b5d20b5ffc767ac829f384ae,0
4,8b37d169dee5bdae27060949242fb54feb6a7f7f,0


In [62]:
sub['sentiment'] = pred_avg

In [63]:
sub.to_csv("Glove_Sub2.csv",index=False)