 This is the inference kernel for BERT+LSTM ensemble. Runs in Kaggle kernel in <2 hrs.

In [1]:
import sys
package_dir = "../input/ppbert/pytorch-pretrained-bert/pytorch-pretrained-BERT"
sys.path.append(package_dir)

In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import random
import torch.utils.data
import numpy as np
import pandas as pd
from scipy.stats import rankdata

from tqdm import tqdm, tqdm_notebook
import os, glob, gc
import warnings
from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification, BertAdam, BertConfig
from pytorch_pretrained_bert.modeling import BertPreTrainedModel, BertModel
from torch.nn import functional as F

from unicodedata import category, name, normalize
import re
from multiprocessing import Pool
from string import ascii_uppercase
import unicodedata as ud

warnings.filterwarnings(action='once')
device = torch.device('cuda')

In [3]:
def seed_everything(seed=0):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()

In [4]:
MAX_SEQUENCE_LENGTH = 256
BERT_LARGE_BATCH_SIZE = 64
BERT_BASE_BATCH_SIZE = 128
LARGE_BIN_PATH = '../input/large-bin-all/'
BASE_BIN_PATH = '../input/base-bin-all/'
LARGE_CONT_PATH = '../input/large-cont-all/'
BASE_CONT_PATH = '../input/base-cont-all/'
MODELS_TO_SKIP = ['../input/large-cont-all/1_bert_pytorch_14_NewLoss_all_0.bin',
                  '../input/large-cont-all/1_bert_pytorch_14_NewLoss_all_1.bin',
                  '../input/large-bin-all/1_bert_pytorch_15_binary_all_0.bin',
                  '../input/large-bin-all/1_bert_pytorch_15_binary_all_1.bin']

# !!!!!  REPLACE (if not full test)!!!
test_df = pd.read_csv("../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv")
test_df['comment_text'] = test_df['comment_text'].astype(str) 

**Preprocessing for bert with binary target**

In [5]:
def convert_lines_bin(example, max_seq_length,tokenizer):
    max_seq_length -=2
    all_tokens = []
    longer = 0
    lens = []
    for text in example:
        tokens_a = tokenizer.tokenize(text)
        if len(tokens_a)>max_seq_length:
            tokens_a = tokens_a[:max_seq_length]
            longer += 1
        lens.append(len(tokens_a))
        one_token = tokenizer.convert_tokens_to_ids(["[CLS]"]+tokens_a+["[SEP]"])+[0] * (max_seq_length - len(tokens_a))
        all_tokens.append(one_token)
    return np.array(all_tokens), lens

**Preprocessing for bert with continoues target**

In [6]:
isolate_words = ['trump', 'clinton', 'penis', 'fuck', 'hillary']
def isolate(text):
    for word in isolate_words:
        if word in text:
            text = text.replace(word, ' '+word+' ')
    return text


mis_spell_mapping = {'hwhat':'what',
                      'whwhat': 'what', 'howhat':'how that',
                      'whybis':'Why is', 
                      "howddo":"How do", 'howeber':'However', 'showh':'show',
                      'fcuk':'fuck', 'fuuck':'fuck','fuuuck':'fuck'}

def correct_misspel(text):
    for error in mis_spell_mapping:
        if error in text:
            text = text.replace(error, mis_spell_mapping[error])        
    return text


def remove_diacritics(s):
    return ''.join(c for c in normalize('NFKD', s.replace('ø', 'o').replace('Ø', 'O').replace('⁻', '-').replace('₋', '-'))
                  if category(c) != 'Mn')




letters = (x for x in ascii_uppercase if x not in ('Q', 'X'))
mapping = {ord(ud.lookup('LATIN LETTER SMALL CAPITAL ' + x)): x for x in letters}


def convert_line_cont(text, max_seq_length, tokenizer):
    text = ' '.join(re.findall(r'[A-Z]?[^A-Z\s]+|[A-Z]+', text))
    text = re.sub(r'[A-Za-z]+://[A-Za-z0-9-_]+.[A-Za-z0-9-_:%&;\?#/.=]+', ' https ', text)
    text = text.lower()
    text = correct_misspel(isolate(remove_diacritics(text)))
    tokens_a = tokenizer.tokenize(text)
    if len(tokens_a) > max_seq_length:
        tokens_a = tokens_a[:max_seq_length]
    return tokenizer.convert_tokens_to_ids(["[CLS]"] + tokens_a + ["[SEP]"]) + [0] * (
                max_seq_length - len(tokens_a)), len(tokens_a)


def convert_lines_cont(examples, max_seq_length, tokenizer):
    all_tokens = []
    max_seq_length -= 2
    examples = examples.str.translate(mapping)
            
    with Pool(2) as p:
        all_tokens = p.starmap(convert_line_cont, [(text, max_seq_length, tokenizer) for text in examples])
    
    all_tokens, lens = map(list, zip(*all_tokens))
    return np.array(all_tokens), lens

**Inference functions**

In [7]:
def trim_tensors(tsrs):
    max_len = max(torch.sum((torch.sum((tsrs != 0), dim=0) > 0)).item(), 1)
    return tsrs[:, :max_len]


def bert_model_inference(bert_config, model_path, test_loader, batch_size):
    model = BertForSequenceClassification(bert_config, num_labels=7)
    model.to(device)
    
    tmp_state = torch.load(model_path, map_location=device)
    model.load_state_dict(tmp_state)
    del tmp_state
    torch.cuda.empty_cache()
    gc.collect()
    
    for param in model.parameters():
        param.requires_grad = False
    model = model.eval()
    
    test_preds = np.zeros((len(X_test)))
    
    tk0 = tqdm(test_loader)
    for i, (x_batch,) in enumerate(tk0):
        x_batch = trim_tensors(x_batch)
        pred = model(x_batch.to(device), 
                     attention_mask=(x_batch > 0).to(device))
        test_preds[i * batch_size:(i + 1) * batch_size] = pred[:, 0].detach().cpu().squeeze().numpy()

    test_pred = torch.sigmoid(torch.tensor(test_preds)).numpy().ravel()
    
    del model
    return test_pred


def folds_inference(path, test_dataset):
    batch_size = BERT_LARGE_BATCH_SIZE if 'large' in path else BERT_BASE_BATCH_SIZE
    test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)
    
    bert_config = BertConfig(path + 'bert_config.json')
    models = glob.glob(path + '*bert*.bin*')
    test_pred = []
    
    for model_path in models:
        if model_path in MODELS_TO_SKIP:
            continue
        print('Inference:',model_path)
        
        test_pred.append(bert_model_inference(bert_config, model_path, test_loader, batch_size))
    test_pred = np.average(test_pred, axis=0)
    
    return test_pred

In [8]:
import pickle
from nltk.tokenize.treebank import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()

import torch.nn as nn
from torch.utils import data
from keras.preprocessing import text, sequence

CRAWL_EMBEDDING_PATH = '../input/pickled-crawl300d2m-for-kernel-competitions/crawl-300d-2M.pkl'
GLOVE_EMBEDDING_PATH = '../input/pickled-glove840b300d-for-10sec-loading/glove.840B.300d.pkl'


LSTM_UNITS = 128
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
MAX_LEN = 220

lstm_models = []
TRAIN_MAX_FEAURES = 410047

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
Using TensorFlow backend.
  return f(*args, **kwds)
  _config = json.load(open(_config_path))


**LSTM preprocessing**

In [9]:
%%time

def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')

def load_embeddings(path):
    with open(path,'rb') as f:
        emb_arr = pickle.load(f)
    return emb_arr

def build_matrix(word_index, path):
    embedding_index = load_embeddings(path)
    embedding_matrix = np.zeros((max_features + 1, 300))
    unknown_words = []
    
    for word, i in word_index.items():
        if i <= max_features:
            try:
                embedding_matrix[i] = embedding_index[word]
            except KeyError:
                try:
                    embedding_matrix[i] = embedding_index[word.lower()]
                except KeyError:
                    try:
                        embedding_matrix[i] = embedding_index[word.title()]
                    except KeyError:
                        unknown_words.append(word)
                        try:
                            embedding_matrix[i] = embedding_index[word.capitalize()]
                        except KeyError:
                            unknown_words.append(word)
    return embedding_matrix, unknown_words

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

class SpatialDropout(nn.Dropout2d):
    def forward(self, x):
        x = x.unsqueeze(2)    # (N, T, 1, K)
        x = x.permute(0, 3, 2, 1)  # (N, K, 1, T)
        x = super(SpatialDropout, self).forward(x)  # (N, K, 1, T), some features are masked
        x = x.permute(0, 3, 2, 1)  # (N, T, 1, K)
        x = x.squeeze(2)  # (N, T, K)
        return x
    
symbols_to_isolate = '.,?!-;*"…:—()%#$&_/@＼・ω+=”“[]^–>\\°<~•≠™ˈʊɒ∞§{}·τα❤☺ɡ|¢→̶`❥━┣┫┗Ｏ►★©―ɪ✔®\x96\x92●£♥➤´¹☕≈÷♡◐║▬′ɔː€۩۞†μ✒➥═☆ˌ◄½ʻπδηλσερνʃ✬ＳＵＰＥＲＩＴ☻±♍µº¾✓◾؟．⬅℅»Вав❣⋅¿¬♫ＣＭβ█▓▒░⇒⭐›¡₂₃❧▰▔◞▀▂▃▄▅▆▇↙γ̄″☹➡«φ⅓„✋：¥̲̅́∙‛◇✏▷❓❗¶˚˙）сиʿ✨。ɑ\x80◕！％¯−ﬂﬁ₁²ʌ¼⁴⁄₄⌠♭✘╪▶☭✭♪☔☠♂☃☎✈✌✰❆☙○‣⚓年∎ℒ▪▙☏⅛ｃａｓǀ℮¸ｗ‚∼‖ℳ❄←☼⋆ʒ⊂、⅔¨͡๏⚾⚽Φ×θ￦？（℃⏩☮⚠月✊❌⭕▸■⇌☐☑⚡☄ǫ╭∩╮，例＞ʕɐ̣Δ₀✞┈╱╲▏▕┃╰▊▋╯┳┊≥☒↑☝ɹ✅☛♩☞ＡＪＢ◔◡↓♀⬆̱ℏ\x91⠀ˤ╚↺⇤∏✾◦♬³の｜／∵∴√Ω¤☜▲↳▫‿⬇✧ｏｖｍ－２０８＇‰≤∕ˆ⚜☁'
symbols_to_delete = '\n🍕\r🐵😑\xa0\ue014\t\uf818\uf04a\xad😢🐶️\uf0e0😜😎👊\u200b\u200e😁عدويهصقأناخلىبمغر😍💖💵Е👎😀😂\u202a\u202c🔥😄🏻💥ᴍʏʀᴇɴᴅᴏᴀᴋʜᴜʟᴛᴄᴘʙғᴊᴡɢ😋👏שלוםבי😱‼\x81エンジ故障\u2009🚌ᴵ͞🌟😊😳😧🙀😐😕\u200f👍😮😃😘אעכח💩💯⛽🚄🏼ஜ😖ᴠ🚲‐😟😈💪🙏🎯🌹😇💔😡\x7f👌ἐὶήιὲκἀίῃἴξ🙄Ｈ😠\ufeff\u2028😉😤⛺🙂\u3000تحكسة👮💙فزط😏🍾🎉😞\u2008🏾😅😭👻😥😔😓🏽🎆🍻🍽🎶🌺🤔😪\x08‑🐰🐇🐱🙆😨🙃💕𝘊𝘦𝘳𝘢𝘵𝘰𝘤𝘺𝘴𝘪𝘧𝘮𝘣💗💚地獄谷улкнПоАН🐾🐕😆ה🔗🚽歌舞伎🙈😴🏿🤗🇺🇸мυтѕ⤵🏆🎃😩\u200a🌠🐟💫💰💎эпрд\x95🖐🙅⛲🍰🤐👆🙌\u2002💛🙁👀🙊🙉\u2004ˢᵒʳʸᴼᴷᴺʷᵗʰᵉᵘ\x13🚬🤓\ue602😵άοόςέὸתמדףנרךצט😒͝🆕👅👥👄🔄🔤👉👤👶👲🔛🎓\uf0b7\uf04c\x9f\x10成都😣⏺😌🤑🌏😯ех😲Ἰᾶὁ💞🚓🔔📚🏀👐\u202d💤🍇\ue613小土豆🏡❔⁉\u202f👠》कर्मा🇹🇼🌸蔡英文🌞🎲レクサス😛外国人关系Сб💋💀🎄💜🤢َِьыгя不是\x9c\x9d🗑\u2005💃📣👿༼つ༽😰ḷЗз▱ц￼🤣卖温哥华议会下降你失去所有的钱加拿大坏税骗子🐝ツ🎅\x85🍺آإشء🎵🌎͟ἔ油别克🤡🤥😬🤧й\u2003🚀🤴ʲшчИОРФДЯМюж😝🖑ὐύύ特殊作戦群щ💨圆明园קℐ🏈😺🌍⏏ệ🍔🐮🍁🍆🍑🌮🌯🤦\u200d𝓒𝓲𝓿𝓵안영하세요ЖљКћ🍀😫🤤ῦ我出生在了可以说普通话汉语好极🎼🕺🍸🥂🗽🎇🎊🆘🤠👩🖒🚪天一家⚲\u2006⚭⚆⬭⬯⏖新✀╌🇫🇷🇩🇪🇮🇬🇧😷🇨🇦ХШ🌐\x1f杀鸡给猴看ʁ𝗪𝗵𝗲𝗻𝘆𝗼𝘂𝗿𝗮𝗹𝗶𝘇𝗯𝘁𝗰𝘀𝘅𝗽𝘄𝗱📺ϖ\u2000үսᴦᎥһͺ\u2007հ\u2001ɩｙｅ൦ｌƽｈ𝐓𝐡𝐞𝐫𝐮𝐝𝐚𝐃𝐜𝐩𝐭𝐢𝐨𝐧Ƅᴨןᑯ໐ΤᏧ௦Іᴑ܁𝐬𝐰𝐲𝐛𝐦𝐯𝐑𝐙𝐣𝐇𝐂𝐘𝟎ԜТᗞ౦〔Ꭻ𝐳𝐔𝐱𝟔𝟓𝐅🐋ﬃ💘💓ё𝘥𝘯𝘶💐🌋🌄🌅𝙬𝙖𝙨𝙤𝙣𝙡𝙮𝙘𝙠𝙚𝙙𝙜𝙧𝙥𝙩𝙪𝙗𝙞𝙝𝙛👺🐷ℋ𝐀𝐥𝐪🚶𝙢Ἱ🤘ͦ💸ج패티Ｗ𝙇ᵻ👂👃ɜ🎫\uf0a7БУі🚢🚂ગુજરાતીῆ🏃𝓬𝓻𝓴𝓮𝓽𝓼☘﴾̯﴿₽\ue807𝑻𝒆𝒍𝒕𝒉𝒓𝒖𝒂𝒏𝒅𝒔𝒎𝒗𝒊👽😙\u200cЛ‒🎾👹⎌🏒⛸公寓养宠物吗🏄🐀🚑🤷操美𝒑𝒚𝒐𝑴🤙🐒欢迎来到阿拉斯ספ𝙫🐈𝒌𝙊𝙭𝙆𝙋𝙍𝘼𝙅ﷻ🦄巨收赢得白鬼愤怒要买额ẽ🚗🐳𝟏𝐟𝟖𝟑𝟕𝒄𝟗𝐠𝙄𝙃👇锟斤拷𝗢𝟳𝟱𝟬⦁マルハニチロ株式社⛷한국어ㄸㅓ니͜ʖ𝘿𝙔₵𝒩ℯ𝒾𝓁𝒶𝓉𝓇𝓊𝓃𝓈𝓅ℴ𝒻𝒽𝓀𝓌𝒸𝓎𝙏ζ𝙟𝘃𝗺𝟮𝟭𝟯𝟲👋🦊多伦🐽🎻🎹⛓🏹🍷🦆为和中友谊祝贺与其想象对法如直接问用自己猜本传教士没积唯认识基督徒曾经让相信耶稣复活死怪他但当们聊些政治题时候战胜因圣把全堂结婚孩恐惧且栗谓这样还♾🎸🤕🤒⛑🎁批判检讨🏝🦁🙋😶쥐스탱트뤼도석유가격인상이경제황을렵게만들지않록잘관리해야합다캐나에서대마초와화약금의품런성분갈때는반드시허된사용🔫👁凸ὰ💲🗯𝙈Ἄ𝒇𝒈𝒘𝒃𝑬𝑶𝕾𝖙𝖗𝖆𝖎𝖌𝖍𝖕𝖊𝖔𝖑𝖉𝖓𝖐𝖜𝖞𝖚𝖇𝕿𝖘𝖄𝖛𝖒𝖋𝖂𝕴𝖟𝖈𝕸👑🚿💡知彼百\uf005𝙀𝒛𝑲𝑳𝑾𝒋𝟒😦𝙒𝘾𝘽🏐𝘩𝘨ὼṑ𝑱𝑹𝑫𝑵𝑪🇰🇵👾ᓇᒧᔭᐃᐧᐦᑳᐨᓃᓂᑲᐸᑭᑎᓀᐣ🐄🎈🔨🐎🤞🐸💟🎰🌝🛳点击查版🍭𝑥𝑦𝑧ＮＧ👣\uf020っ🏉ф💭🎥Ξ🐴👨🤳🦍\x0b🍩𝑯𝒒😗𝟐🏂👳🍗🕉🐲چی𝑮𝗕𝗴🍒ꜥⲣⲏ🐑⏰鉄リ事件ї💊「」\uf203\uf09a\uf222\ue608\uf202\uf099\uf469\ue607\uf410\ue600燻製シ虚偽屁理屈Г𝑩𝑰𝒀𝑺🌤𝗳𝗜𝗙𝗦𝗧🍊ὺἈἡχῖΛ⤏🇳𝒙ψՁմեռայինրւդձ冬至ὀ𝒁🔹🤚🍎𝑷🐂💅𝘬𝘱𝘸𝘷𝘐𝘭𝘓𝘖𝘹𝘲𝘫کΒώ💢ΜΟΝΑΕ🇱♲𝝈↴💒⊘Ȼ🚴🖕🖤🥘📍👈➕🚫🎨🌑🐻𝐎𝐍𝐊𝑭🤖🎎😼🕷ｇｒｎｔｉｄｕｆｂｋ𝟰🇴🇭🇻🇲𝗞𝗭𝗘𝗤👼📉🍟🍦🌈🔭《🐊🐍\uf10aლڡ🐦\U0001f92f\U0001f92a🐡💳ἱ🙇𝗸𝗟𝗠𝗷🥜さようなら🔼'
isolate_dict = {ord(c):f' {c} ' for c in symbols_to_isolate}
remove_dict = {ord(c):f'' for c in symbols_to_delete}

punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'
small_caps_mapping = { 
"ᴀ": "a", "ʙ": "b", "ᴄ": "c", "ᴅ": "d", "ᴇ": "e", "ғ": "f", "ɢ": "g", "ʜ": "h", "ɪ": "i", 
"ᴊ": "j", "ᴋ": "k", "ʟ": "l", "ᴍ": "m", "ɴ": "n", "ᴏ": "o", "ᴘ": "p", "ǫ": "q", "ʀ": "r", 
"s": "s", "ᴛ": "t", "ᴜ": "u", "ᴠ": "v", "ᴡ": "w", "x": "x", "ʏ": "y", "ᴢ": "z"}
contraction_mapping = {
"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", 
"didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", 
"he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  
"I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": 
"i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", 
"it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", 
"mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", 
"mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have",
"o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", 
"sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", 
"she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", 
"shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's":"this is","that'd": "that would", 
"that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", 
"here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", 
"they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", 
"we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", 
"what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", 
"when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have",
"who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", 
"won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", 
"y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
"you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have",
"trump's": "trump is", "obama's": "obama is", "canada's": "canada is", "today's": "today is"}
specail_signs = { "…": "...", "₂": "2"}
specials = ["’", "‘", "´", "`"]

def handle_punctuation(x):
    x = x.translate(remove_dict)
    x = x.translate(isolate_dict)
    return x

def handle_contractions(x):
    x = tokenizer.tokenize(x)
    return x

def fix_quote(x):
    x = [x_[1:] if x_.startswith("'") else x_ for x_ in x]
    x = ' '.join(x)
    return x

def preprocess(x):
    x = handle_punctuation(x)
    x = handle_contractions(x)
    x = fix_quote(x)
    x = ' '.join(re.findall(r'[A-Z]?[^A-Z\s]+|[A-Z]+', x))
    x = re.sub(r'[A-Za-z]+://[A-Za-z0-9-_]+.[A-Za-z0-9-_:%&;\?#/.=]+', ' https ', x)
    x = correct_misspel(isolate(remove_diacritics(x)))
    return x

x_test = test_df['comment_text'].apply(lambda x:preprocess(x))

max_features = 410047
tokenizer = text.Tokenizer(num_words = max_features, filters='',lower=False)
tokenizer.fit_on_texts(list(x_test))

crawl_matrix, unknown_words_crawl = build_matrix(tokenizer.word_index, CRAWL_EMBEDDING_PATH)
glove_matrix, unknown_words_glove = build_matrix(tokenizer.word_index, GLOVE_EMBEDDING_PATH)

max_features = max_features or len(tokenizer.word_index) + 1

embedding_matrix = np.concatenate([crawl_matrix, glove_matrix], axis=-1)
del crawl_matrix
del glove_matrix

CPU times: user 1min 1s, sys: 5.36 s, total: 1min 7s
Wall time: 1min 7s


In [10]:
class NeuralNet(nn.Module):
    def __init__(self, embedding_matrix, num_aux_targets):
        super(NeuralNet, self).__init__()
        embed_size = embedding_matrix.shape[1]
        
        self.embedding = nn.Embedding(max_features, embed_size)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        self.embedding_dropout = SpatialDropout(0.3)
        
        self.lstm1 = nn.LSTM(embed_size, LSTM_UNITS, bidirectional=True, batch_first=True)
        self.lstm2 = nn.LSTM(LSTM_UNITS * 2, LSTM_UNITS, bidirectional=True, batch_first=True)
    
        self.linear1 = nn.Linear(DENSE_HIDDEN_UNITS, DENSE_HIDDEN_UNITS)
        self.linear2 = nn.Linear(DENSE_HIDDEN_UNITS, DENSE_HIDDEN_UNITS)
        
        self.linear_out = nn.Linear(DENSE_HIDDEN_UNITS, 1)
        self.linear_aux_out = nn.Linear(DENSE_HIDDEN_UNITS, num_aux_targets)
        
    def forward(self, x, lengths=None):
        h_embedding = self.embedding(x.long())
        h_embedding = self.embedding_dropout(h_embedding)
        
        h_lstm1, _ = self.lstm1(h_embedding)
        h_lstm2, _ = self.lstm2(h_lstm1)
        
        avg_pool = torch.mean(h_lstm2, 1)
        max_pool, _ = torch.max(h_lstm2, 1)
        
        h_conc = torch.cat((max_pool, avg_pool), 1)
        h_conc_linear1  = F.relu(self.linear1(h_conc))
        h_conc_linear2  = F.relu(self.linear2(h_conc))
        
        hidden = h_conc + h_conc_linear1 + h_conc_linear2
        
        result = self.linear_out(hidden)
        aux_result = self.linear_aux_out(hidden)
        out = torch.cat([result, aux_result], 1)
        return out

def lstm_predict(path, batch_size, test_size):
    model = NeuralNet(embedding_matrix, 6)
    model.to(device)
    tmp_state = torch.load(path, map_location=device)
    
    for name, param in model.named_parameters():
        if 'embedding' not in name:
            param.data = tmp_state[name]  
            
    del tmp_state
    torch.cuda.empty_cache()
    gc.collect()        
    
    model = model.eval()
    
    test_preds = np.zeros(test_size)
    
    for i, x_batch in enumerate(test_loader):
        with torch.no_grad():
            X = x_batch[0].cuda()
            y_pred = sigmoid(model(X).detach().cpu().numpy()[:,0])
            test_preds[i * batch_size:(i+1) * batch_size] = y_pred
               
    return test_preds
            

In [11]:
%%time
batch_size = 1024
x_test = tokenizer.texts_to_sequences(x_test)
x_test_padded = torch.from_numpy(sequence.pad_sequences(x_test, maxlen=220))
test_loader = data.DataLoader(data.TensorDataset(x_test_padded), batch_size=batch_size, shuffle=False)

CPU times: user 4.87 s, sys: 28 ms, total: 4.9 s
Wall time: 4.92 s


In [12]:
%%time
lstm_bin_preds = []
lstm_bin_models = glob.glob('../input/lstm-bin/*.bin')

for model_path in lstm_bin_models:
    print('Inference:', model_path)
    lstm_bin_preds.append(lstm_predict(model_path, batch_size, len(x_test)))
    
lstm_bin_preds = np.average(lstm_bin_preds, axis=0)

Inference: ../input/lstm-bin/lstm_fold3_epoch4.bin
Inference: ../input/lstm-bin/lstm_fold0_epoch4.bin
Inference: ../input/lstm-bin/lstm_fold1_epoch4.bin
Inference: ../input/lstm-bin/lstm_fold2_epoch4.bin
CPU times: user 27.2 s, sys: 30.4 s, total: 57.6 s
Wall time: 1min


In [13]:
lstm_bin_preds[:5]

array([1.14304568e-02, 6.59104826e-05, 6.56928268e-03, 2.41121446e-03,
       9.88708407e-01])

In [14]:
%%time
lstm_cont_preds = []
lstm_cont_models = glob.glob('../input/lstm-cont/*.bin')

for model_path in lstm_cont_models:
    print('Inference:', model_path)
    lstm_cont_preds.append(lstm_predict(model_path, batch_size, len(x_test)))
    
lstm_cont_preds = np.average(lstm_cont_preds, axis=0)

Inference: ../input/lstm-cont/lstm_fold3_epoch4.bin
Inference: ../input/lstm-cont/lstm_fold0_epoch4.bin
Inference: ../input/lstm-cont/lstm_fold1_epoch4.bin
Inference: ../input/lstm-cont/lstm_fold2_epoch4.bin
CPU times: user 24 s, sys: 29.6 s, total: 53.6 s
Wall time: 53.7 s


In [15]:
lstm_cont_preds[:5]

array([0.03682656, 0.00694891, 0.04743195, 0.03628699, 0.76618168])

In [16]:
del embedding_matrix, x_test, x_test_padded, test_loader
gc.collect()

0

**Binarized BERTs inference**

In [17]:
tokenizer = BertTokenizer.from_pretrained(LARGE_BIN_PATH, cache_dir=None,do_lower_case=True)
X_test, lens = convert_lines_bin(test_df["comment_text"].fillna("DUMMY_VALUE"), MAX_SEQUENCE_LENGTH, tokenizer)

sorted_idx = np.argsort(lens)
X_test = X_test[sorted_idx]

test = torch.utils.data.TensorDataset(torch.tensor(X_test, dtype=torch.long))

In [18]:
test_pred = folds_inference(LARGE_BIN_PATH, test)
large_bin_preds = np.zeros((len(X_test)))
large_bin_preds[sorted_idx] = test_pred

test_pred = folds_inference(BASE_BIN_PATH, test)
base_bin_preds = np.zeros((len(X_test)))
base_bin_preds[sorted_idx] = test_pred

Inference: ../input/large-bin-all/1_bert_pytorch_15_binary_all_3.bin


100%|██████████| 1521/1521 [11:38<00:00,  1.50s/it]


Inference: ../input/large-bin-all/1_bert_pytorch_15_binary_all_2.bin


100%|██████████| 1521/1521 [11:38<00:00,  1.50s/it]


Inference: ../input/base-bin-all/1_bert_pytorch_15_binary_all_base_2.bin


100%|██████████| 761/761 [03:35<00:00,  1.19it/s]


Inference: ../input/base-bin-all/1_bert_pytorch_15_binary_all_base_3.bin


100%|██████████| 761/761 [03:36<00:00,  1.21it/s]


Inference: ../input/base-bin-all/1_bert_pytorch_15_binary_all_base_1.bin


100%|██████████| 761/761 [03:36<00:00,  1.20it/s]


Inference: ../input/base-bin-all/1_bert_pytorch_15_binary_all_base_0.bin


100%|██████████| 761/761 [03:36<00:00,  1.20it/s]


In [19]:
large_bin_preds[:5]

array([3.65564321e-04, 6.13849207e-04, 4.49121751e-03, 4.17019566e-04,
       9.91272908e-01])

In [20]:
base_bin_preds[:5]

array([6.50057844e-04, 3.42046558e-04, 5.34829549e-03, 1.02787744e-03,
       9.89213815e-01])

**Continous BERTs inference**

In [21]:
#tokenizer = BertTokenizer.from_pretrained(LARGE_CONT_PATH, cache_dir=None,do_lower_case=True)
X_test, lens = convert_lines_cont(test_df["comment_text"].fillna("DUMMY_VALUE"), MAX_SEQUENCE_LENGTH, tokenizer)

sorted_idx = np.argsort(lens)
X_test = X_test[sorted_idx]

test = torch.utils.data.TensorDataset(torch.tensor(X_test, dtype=torch.long))

In [22]:
test_pred = folds_inference(LARGE_CONT_PATH, test)
large_cont_preds = np.zeros((len(X_test)))
large_cont_preds[sorted_idx] = test_pred

test_pred = folds_inference(BASE_CONT_PATH, test)
base_cont_preds = np.zeros((len(X_test)))
base_cont_preds[sorted_idx] = test_pred

Inference: ../input/large-cont-all/1_bert_pytorch_14_NewLoss_all_3.bin


100%|██████████| 1521/1521 [11:30<00:00,  1.49s/it]


Inference: ../input/large-cont-all/1_bert_pytorch_14_NewLoss_all_2.bin


100%|██████████| 1521/1521 [11:30<00:00,  1.49s/it]


Inference: ../input/base-cont-all/1_bert_pytorch_14_NewLoss_all_base_2.bin


100%|██████████| 761/761 [03:34<00:00,  1.21it/s]


Inference: ../input/base-cont-all/1_bert_pytorch_14_NewLoss_all_base_1.bin


100%|██████████| 761/761 [03:33<00:00,  1.16it/s]


Inference: ../input/base-cont-all/1_bert_pytorch_14_NewLoss_all_base_0.bin


100%|██████████| 761/761 [03:33<00:00,  1.20it/s]


Inference: ../input/base-cont-all/1_bert_pytorch_14_NewLoss_all_base_3.bin


100%|██████████| 761/761 [03:33<00:00,  1.22it/s]


In [23]:
del test_pred, test, X_test, lens, test
gc.collect()

NameError: name 'test' is not defined

In [24]:
large_cont_preds[:5]

array([0.00915421, 0.00407942, 0.0651625 , 0.01436458, 0.74449154])

In [25]:
base_cont_preds[:5]

array([0.01757531, 0.00528343, 0.06148341, 0.02968971, 0.71651137])

In [26]:
#final_pred = (large_bin_preds + base_bin_preds + large_cont_preds + base_cont_preds + lstm_bin_preds + lstm_cont_preds)/6
cont_preds = 4 * large_cont_preds + 4 * base_cont_preds + lstm_cont_preds
bin_preds = 4 * large_bin_preds + 4 * base_bin_preds + lstm_bin_preds
final_pred = rankdata(cont_preds) + rankdata(bin_preds)
final_pred = final_pred/np.max(final_pred)

submission = pd.DataFrame.from_dict({
    'id': test_df['id'],
    'prediction': final_pred
})
submission.to_csv('submission.csv', index=False)

submission.head()

Unnamed: 0,id,prediction
0,7000000,0.475741
1,7000001,0.140457
2,7000002,0.647026
3,7000003,0.473249
4,7000004,0.983892
