In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import logging
import pandas as pd
import numpy as np
import gc
import os
import re
import torch
from torch import nn
from torch.autograd import Variable
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import f1_score
from torch import optim
import torchtext
import random
from gensim.models import KeyedVectors
from tqdm import tqdm
from tqdm import tqdm_notebook
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Register tqdm with pandas so that Series gain `.progress_apply`, which the
# preprocessing cells below use to show a progress bar.
tqdm.pandas()

In [None]:
# Load the competition train/test CSVs. Later cells read the `question_text`
# column from both frames and the `target` column from the training frame.
df_train= pd.read_csv('../input/quora-insincere-questions-classification/train.csv')
df_test= pd.read_csv('../input/quora-insincere-questions-classification/test.csv')

In [None]:
def get_coefs(word, *arr):
    """Turn one split embedding line into a ``(word, vector)`` pair.

    Args:
        word: First field of the line (the token).
        *arr: Remaining fields, each convertible to a float.

    Returns:
        Tuple of ``word`` and a float32 NumPy array holding at most the
        first 300 values of ``arr``.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector[:300]
def load_embedding(file):
    """Read a text-format embedding file into a ``{word: vector}`` dict.

    Each line is split on single spaces and handed to ``get_coefs``. For the
    wiki-news file, lines of 100 characters or fewer are skipped, which drops
    its short header line. Every other file is decoded as latin-1.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the dict is built; the original left it open until garbage collection.

    Args:
        file: Path to the embedding text file.

    Returns:
        dict mapping each word to the array returned by ``get_coefs``.
    """
    wiki_news_path = '../input/quora-insincere-questions-classification/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
    if file == wiki_news_path:
        # Explicit UTF-8 so the result does not depend on the platform default.
        with open(file, encoding='utf-8') as handle:
            return dict(get_coefs(*line.split(" ")) for line in handle if len(line) > 100)
    with open(file, encoding='latin') as handle:
        return dict(get_coefs(*line.split(" ")) for line in handle)

In [None]:
# Location of the binary GoogleNews word2vec vectors. No visible cell passes
# this path to a loader.
newspath = '../input/quora-insincere-questions-classification/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
#googlenews = KeyedVectors.load_word2vec_format(newspath, binary=True)
# gensim helpers used by load_word2vec below.
from gensim import utils
# Row count of the matrix built by load_word2vec; word indices at or above
# this value are skipped there.
# NOTE(review): the model classes default to max_features=215224 — confirm
# which vocabulary size is intended.
max_features = 95000

def load_word2vec(fname, encoding='utf8', unicode_errors='strict',datatype=np.float32, word_index=None):
    """Build an embedding matrix from a binary word2vec file.

    The file is streamed record by record (a space-terminated word followed
    by ``vector_size`` packed values). Only words present in ``word_index``
    with an index below the module-level ``max_features`` are copied into the
    matrix. All other rows keep their random-normal initialisation.

    Args:
        fname: Path to the binary word2vec file.
        encoding: Encoding used to decode the header and the words.
        unicode_errors: Error policy passed to the word decoder.
        datatype: NumPy dtype of the packed vector values.
        word_index: Mapping ``word -> row index``. Required despite the
            ``None`` default, which is kept for signature compatibility.

    Returns:
        ``numpy.ndarray`` of shape ``(max_features, vector_size)``.

    Raises:
        TypeError: If ``word_index`` is not supplied.
        EOFError: If the file ends before all announced records are read.
    """
    if word_index is None:
        # Fail up front, not part-way through reading the file.
        raise TypeError("load_word2vec requires word_index (a word -> row index mapping)")
    emb_mean,emb_std = -0.0051106834, 0.18445626
    # NOTE(review): newer gensim releases expose utils.open instead of
    # utils.smart_open — confirm against the installed version. Either way
    # the file is opened in binary mode.
    opener = getattr(utils, 'open', None) or utils.smart_open
    with opener(fname, 'rb') as fin:
        header = utils.to_unicode(fin.readline(), encoding=encoding)
        vocab_size, vector_size = (int(x) for x in header.split())
        binary_len = np.dtype(datatype).itemsize * vector_size
        # Width follows the file header instead of a hard-coded 300.
        embedding_matrix = np.random.normal(emb_mean, emb_std, (max_features, vector_size))
        for _ in tqdm(range(vocab_size)):
            # mixed text and binary: read text first, then binary
            chars = []
            while True:
                ch = fin.read(1)
                if ch == b' ':
                    break
                if ch == b'':
                    raise EOFError("unexpected end of input")
                if ch != b'\n':
                    chars.append(ch)
            word = utils.to_unicode(b''.join(chars), encoding=encoding, errors=unicode_errors)
            raw = fin.read(binary_len)
            if len(raw) != binary_len:
                raise EOFError("unexpected end of input")
            i = word_index.get(word)
            if i is None or i >= max_features:
                continue
            # frombuffer replaces np.fromstring, whose binary mode is deprecated.
            embedding_matrix[i] = np.frombuffer(raw, dtype=datatype)
    return embedding_matrix

In [None]:
# Parse the GloVe 840B text vectors into a {word: vector} dict (latin-1 branch
# of load_embedding).
glove = load_embedding('../input/quora-insincere-questions-classification/embeddings/glove.840B.300d/glove.840B.300d.txt')

In [None]:
# Parse the Paragram text vectors into a {word: vector} dict (latin-1 branch
# of load_embedding).
paragram = load_embedding('../input/quora-insincere-questions-classification/embeddings/paragram_300_sl999/paragram_300_sl999.txt')

In [None]:
# Characters that clean_text surrounds with spaces so each becomes its own
# whitespace-separated token. The ASCII apostrophe is included.
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

In [None]:
def clean_text(x):
    """Pad every character listed in ``puncts`` with a space on each side.

    Args:
        x: Value to clean; it is converted with ``str`` first.

    Returns:
        The string with each occurrence of a ``puncts`` entry rewritten as
        ``' <entry> '``.
    """
    text = str(x)
    for symbol in puncts:
        # Splitting on the symbol and re-joining with the padded symbol is
        # the same as replacing every occurrence.
        text = (' %s ' % symbol).join(text.split(symbol))
    return text

def clean_numbers(x):
    """Mask runs of digits with ``#`` placeholders.

    Runs of five or more digits become ``#####``. Then, in order, remaining
    groups of four, three and two digits become the same number of ``#``
    characters. Single digits are left alone.

    Args:
        x: Input string.

    Returns:
        The masked string.
    """
    masks = (
        ('[0-9]{5,}', '#####'),
        ('[0-9]{4}', '####'),
        ('[0-9]{3}', '###'),
        ('[0-9]{2}', '##'),
    )
    for pattern, placeholder in masks:
        x = re.sub(pattern, placeholder, x)
    return x

In [None]:
# Contraction -> expansion table used by replace_typical_misspell.
# Keys are lowercase and use the ASCII apostrophe.
# Fixes relative to the previous literal:
#   * "we'll" expanded to " will" (the pronoun was lost); it is now "we will".
#   * "i'd" and "didn't" were listed twice. A dict literal keeps only the last
#     value, so the shadowed entries are removed. "i'd" keeps the value that
#     was effective before ("I had").
mispell_dict = {
    "aren't": "are not",
    "can't": "cannot",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "i'd": "I had",
    "i'll": "I will",
    "i'm": "I am",
    "isn't": "is not",
    "it's": "it is",
    "it'll": "it will",
    "i've": "I have",
    "let's": "let us",
    "mightn't": "might not",
    "mustn't": "must not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "shouldn't": "should not",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "we'd": "we would",
    "we're": "we are",
    "weren't": "were not",
    "we've": "we have",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where's": "where is",
    "who'd": "who would",
    "who'll": "who will",
    "who're": "who are",
    "who's": "who is",
    "who've": "who have",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have",
    "'re": " are",
    "wasn't": "was not",
    "we'll": "we will",
    "tryin'": "trying",
}

In [None]:
# Misspelled leading word(s) -> correction, consumed by correct_first_word.
# The previous literal listed "wha" and "whic" twice (with identical values);
# the redundant entries are removed. Contents and key order are unchanged.
first_word_mispell_dict = {
    'whta': 'what', 'howdo': 'how do', 'Whatare': 'what are', 'howcan': 'how can', 'howmuch': 'how much',
    'howmany': 'how many', 'whydo': 'why do', 'doi': 'do i', 'howdoes': 'how does', "whst": 'what',
    'shoupd': 'should', 'whats': 'what is', "im": "i am", "whatis": "what is", "iam": "i am", "wat": "what",
    "wht": "what", "whts": "what is", "whtwh": "what", "whtat": "what", "whtlat": "what", "dueto to": "due to",
    "dose": "does", "wha": "what", 'hw': "how", "its": "it is", "whay": "what", "ho": "how", "whart": "what",
    "woe": "wow", "wt": "what", "ive": "i have", "wich": "which", "whic": "which", "whys": "why",
    "doe": "does", "wjy": "why", "wgat": "what", "hiw": "how", "howto": "how to", "lets": "let us", "haw": "how",
    "witch": "which", "wy": "why", "girlfriend": "girl friend", "hows": "how is", "whyis": "why is", "whois": "who is",
    "dont": "do not", "hat": "what", "whos": "who is", "whydoes": "why does", "hy": "why", "w? hy": "why",
    "ehat": "what", "whate": "what", "whai": "what", "whichis": "which is", "whi": "which", "isit": "is it", "ca": "can",
    "wwhat": "what", "wil": "will", "wath": "what", "plz": "please", "ww": "how", "hou": "how", "whch": "which",
    "ihave": "i have", "cn": "can", "doesnt": "does not", "shoul": "should", "whatdo": "what do", "isnt": "is not",
    "whare": "what are", "whick": "which", "whatdoes": "what does", "hwo": "how", "howdid": "how did", "why dose": "why does",
}

In [None]:
def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re

# Lookup table and compiled pattern shared by replace_typical_misspell.
mispellings, mispellings_re = _get_mispell(mispell_dict)
def replace_typical_misspell(text):
    """Replace every substring matched by ``mispellings_re``.

    Each match is swapped for the value stored under the matched text in
    ``mispellings``.

    Args:
        text: Input string.

    Returns:
        The string with all matches substituted.
    """
    return mispellings_re.sub(lambda hit: mispellings[hit.group(0)], text)

def correct_first_word(x, mispell_map=None):
    """Fix a known misspelling at the very start of a question.

    The first key ``k`` for which ``x`` starts with ``k + " "`` is replaced
    by its correction, and only that leading occurrence is touched. The
    previous implementation used ``str.replace``, which rewrote every
    occurrence of ``k + " "`` in the text, including inside other words
    (for example ``"ho "`` inside ``"who "``).

    Args:
        x: Question text.
        mispell_map: Optional mapping of misspelled prefix -> correction.
            Defaults to the module-level ``first_word_mispell_dict``.

    Returns:
        The corrected string, or ``x`` unchanged when no key matches.
    """
    # NOTE(review): matching is case-sensitive and most keys are lowercase,
    # so capitalised sentence starts are not corrected — confirm this is
    # intended.
    if mispell_map is None:
        mispell_map = first_word_mispell_dict
    for key, fixed in mispell_map.items():
        if x.startswith(key + " "):
            # Keep everything after the matched prefix, separator included.
            return fixed + x[len(key):]
    return x

In [None]:
# Create the working column `vocab` from the raw question text, correcting a
# known misspelling of the leading word where one matches.
df_train["vocab"] = df_train["question_text"].progress_apply(lambda x : correct_first_word(x))
df_test["vocab"] = df_test["question_text"].progress_apply(lambda x : correct_first_word(x))

In [None]:
# Expand contractions BEFORE padding punctuation. clean_text surrounds every
# "'" with spaces ("don't" -> "don ' t"), after which no key of mispell_dict
# can match. Running replace_typical_misspell only after this step therefore
# never changed anything.
df_train["vocab"] = df_train["vocab"].progress_apply(lambda x : clean_text(replace_typical_misspell(str(x))))
df_test["vocab"] = df_test["vocab"].progress_apply(lambda x : clean_text(replace_typical_misspell(str(x))))

In [None]:
# Mask digit runs of length >= 2 with '#' placeholders (see clean_numbers).
df_train["vocab"] = df_train["vocab"].progress_apply(lambda x : clean_numbers(x))
df_test["vocab"] = df_test["vocab"].progress_apply(lambda x : clean_numbers(x))

In [None]:
# Apply the contraction table to the working column.
# NOTE(review): clean_text has already run on this column and pads every "'"
# with spaces. Every key of mispell_dict has a letter directly next to the
# apostrophe, so no key can match at this stage.
df_train["vocab"] = df_train["vocab"].progress_apply(lambda x : replace_typical_misspell(x))
df_test["vocab"] = df_test["vocab"].progress_apply(lambda x : replace_typical_misspell(x))

In [None]:
# Cleaned text for both splits. `data` stacks train and test so the tokenizer
# fitted below sees both.
trainX = df_train["vocab"]
testX = df_test["vocab"]
data = pd.concat((trainX,testX),axis=0)

In [None]:
# Keras tokenizer with 'xxunk' as the out-of-vocabulary token and an empty
# `filters` string, so the tokenizer itself strips no characters.
tokenizer = Tokenizer(oov_token = 'xxunk',filters='')

In [None]:
# Build the vocabulary from the combined train + test text.
tokenizer.fit_on_texts(list(data))

In [None]:
# Replace each training text with its list of integer token ids.
trainX = tokenizer.texts_to_sequences(trainX)

In [None]:
# Replace each test text with its list of integer token ids.
testX = tokenizer.texts_to_sequences(testX)

In [None]:
# Pad/truncate every training sequence to a fixed length of 800 tokens.
# NOTE(review): 800 is a magic number that also appears as the `maxlen`
# default of the model classes — consider a shared constant.
trainX = pad_sequences(trainX, maxlen=800)

In [None]:
# Pad/truncate every test sequence to the same fixed length as the training set.
testX = pad_sequences(testX, maxlen=800)

In [None]:
# Training labels as a NumPy array.
trainy = df_train["target"].values

In [None]:
def make_embedding_matrix(embedding_index, word_index, embedding_mean, embedding_std,a=0):
    """Build a row-per-token embedding matrix for a tokenizer vocabulary.

    Rows are initialised from ``N(embedding_mean, embedding_std)``. Each word
    found in ``embedding_index`` then overwrites its row, trying the word
    as-is first and its ``capitalize()`` form second.

    The matrix has ``max(word_index.values()) + 1`` rows, so the largest
    index is always addressable. The previous sizing, ``len(word_index)``,
    was one row short for 1-based indices such as those of the Keras
    ``Tokenizer``. The embedding width is read from a single vector; the
    earlier ``np.stack`` of all vectors is no longer needed.

    Args:
        embedding_index: Mapping ``word -> 1-D vector``.
        word_index: Mapping ``word -> integer row index``.
        embedding_mean: Mean of the random initialisation.
        embedding_std: Standard deviation of the random initialisation.
        a: Unused; kept for signature compatibility.

    Returns:
        ``numpy.ndarray`` of shape ``(max_index + 1, embedding_size)``.

    Raises:
        ValueError: If ``embedding_index`` is empty.
    """
    if not embedding_index:
        raise ValueError("embedding_index is empty; cannot infer the embedding size")
    embedding_size = len(next(iter(embedding_index.values())))
    row_count = max(word_index.values(), default=-1) + 1
    embedding_matrix = np.random.normal(embedding_mean, embedding_std, (row_count, embedding_size))

    for word, i in word_index.items():
        embedding_vector = embedding_index.get(word)
        if embedding_vector is None:
            # Fall back to the capitalised spelling of the word.
            embedding_vector = embedding_index.get(word.capitalize())
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [None]:
# GloVe matrix aligned to the tokenizer vocabulary.
# NOTE(review): the mean/std literals are presumably precomputed statistics of
# the GloVe values — confirm their origin.
glove_mat = make_embedding_matrix(glove, tokenizer.word_index, -0.05838499, 0.48782197)

In [None]:
# Paragram matrix aligned to the tokenizer vocabulary.
# NOTE(review): the mean/std literals are presumably precomputed statistics of
# the Paragram values — confirm their origin.
paragram_mat = make_embedding_matrix(paragram, tokenizer.word_index, -0.0053247833, 0.49346462)

In [None]:
# Join the two matrices along the feature axis, so each token row holds its
# GloVe vector followed by its Paragram vector.
embedding_matrix = np.concatenate((glove_mat,paragram_mat),axis=1)

In [None]:
# Number of embedding sets concatenated into embedding_matrix. The model
# classes multiply embedding_size by this value to get their input width.
num_embeddings= 2

In [None]:
# Test token ids as a long tensor (the dtype nn.Embedding indexes with).
x_test = torch.tensor(testX,dtype=torch.long)

In [None]:
# Mini-batch size shared by every DataLoader in this notebook.
batch_size = 128

In [None]:
# Unlabelled test set, iterated in stored order (shuffle=False).
test = torch.utils.data.TensorDataset(x_test)
test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)

In [None]:
# Hold out 10% of the training data for validation. A fixed random_state makes
# the split reproducible across kernel restarts, and stratifying on the label
# keeps the class ratio the same in both parts.
train_x, valid_x, train_y, valid_y = train_test_split(
    trainX, trainy, test_size=0.1, random_state=42, stratify=trainy
)

In [None]:
# Zero-filled prediction buffers, one slot per validation / test sample.
# NOTE(review): no visible cell writes to or reads from these arrays.
valid_preds = np.zeros(len(valid_y))
test_preds = np.zeros(len(testX))

In [None]:
# Train/validation token ids as long tensors.
x_train = torch.tensor(train_x,dtype=torch.long)
x_valid = torch.tensor(valid_x,dtype=torch.long)

In [None]:
# Labels as float32 column vectors of shape (n, 1), matching the model's
# single-logit output for BCEWithLogitsLoss.
y_train = torch.tensor(train_y.reshape(len(train_y), 1), dtype=torch.float32)
y_valid = torch.tensor(valid_y.reshape(len(valid_y), 1), dtype=torch.float32)

In [None]:
# Paired (token ids, label) datasets for training and validation.
train_ds = torch.utils.data.TensorDataset(x_train, y_train)
valid_ds = torch.utils.data.TensorDataset(x_valid, y_valid)

# Training batches are reshuffled on every pass; validation keeps stored order.
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = torch.utils.data.DataLoader(valid_ds, batch_size=batch_size, shuffle=False)

In [None]:
class EMBEDDING_DROPOUT(nn.Module):
    """Frozen pretrained embedding lookup followed by spatial dropout.

    The embedding table is built directly from ``embedding_matrix``, so its
    shape always follows the matrix. The previous version first allocated a
    ``max_features x (embedding_size * num_embeddings)`` table and then
    threw it away.

    Dropout is applied per embedding channel: a dropped feature is zeroed for
    every time step of a sample. The previous ``unsqueeze(0)`` put the batch
    axis in ``Dropout2d``'s channel position, so whole samples were zeroed.
    Its dim-less ``squeeze`` also collapsed the batch axis when a batch held
    a single sample.

    Args:
        embedding_matrix: 2-D array-like ``(vocab_size, embedding_dim)`` of
            pretrained weights.
        max_features: Unused; kept for signature compatibility.
        embedding_size: Unused; kept for signature compatibility.
    """

    def __init__(self, embedding_matrix, max_features = 215224, embedding_size = 300):
        super(EMBEDDING_DROPOUT,self).__init__()
        weights = torch.tensor(embedding_matrix, dtype=torch.float32)
        # freeze=True sets requires_grad=False on the weight.
        self.embedding = nn.Embedding.from_pretrained(weights, freeze=True)

        self.embedding_drop = nn.Dropout2d(0.1)

    def forward(self,x):
        """Embed ``x`` of shape (batch, seq) into (batch, seq, embedding_dim)."""
        h_embedding = self.embedding(x)
        # Dropout2d zeroes whole channels (dim 1). Move the embedding axis
        # there and add a dummy trailing axis to form a 4-D input.
        channels_first = h_embedding.permute(0, 2, 1).unsqueeze(-1)
        dropped = self.embedding_drop(channels_first)
        return dropped.squeeze(-1).permute(0, 2, 1)

In [None]:
class LSTM_GRU(nn.Module):
    """Bidirectional LSTM feeding a bidirectional GRU (both batch-first).

    Args:
        embedding_size: Width of one embedding set. The LSTM input width is
            this value times the module-level ``num_embeddings``.
        hidden_size: Hidden units per direction for both recurrent layers.
    """

    def __init__ (self, embedding_size= 300, hidden_size= 128):
        super(LSTM_GRU, self).__init__()
        lstm_input_width = embedding_size * num_embeddings
        # The GRU takes the concatenated forward/backward LSTM states.
        gru_input_width = hidden_size * 2
        self.lstm = nn.LSTM(lstm_input_width, hidden_size, bidirectional=True, batch_first=True)
        self.gru = nn.GRU(gru_input_width, hidden_size, bidirectional=True, batch_first=True)

    def forward(self, x):
        """Return ``(gru_output, lstm_output)`` — GRU first, LSTM second."""
        lstm_out = self.lstm(x)[0]
        gru_out = self.gru(lstm_out)[0]
        return gru_out, lstm_out

In [None]:
class LINEAR_LAYER(nn.Module):
    """Classification head: Linear -> Dropout(0.15) -> BatchNorm1d -> Linear(1).

    Args:
        embedding_size: Unused; kept for signature compatibility.
        intermediate_layer: Width of the hidden projection.
        maxlen: Unused; kept for signature compatibility.
        hidden_size: Width of the incoming feature vector.
    """

    def __init__(self,embedding_size= 300,intermediate_layer=64,maxlen=800,hidden_size=128):
        super(LINEAR_LAYER, self).__init__()
        self.linear = nn.Linear(in_features=hidden_size, out_features=intermediate_layer)
        self.dropout = nn.Dropout(p=0.15)
        self.bn = nn.BatchNorm1d(num_features=intermediate_layer)
        self.output = nn.Linear(in_features=intermediate_layer, out_features=1)

    def forward(self,x):
        """Map features of shape (batch, hidden_size) to one logit per sample."""
        hidden = self.linear(x)
        hidden = self.dropout(hidden)
        hidden = self.bn(hidden)
        return self.output(hidden)

In [None]:
class NeuralNet(nn.Module):
    """Embedding -> BiLSTM/BiGRU stem -> pooled features -> linear head.

    Args:
        embedding_matrix: Pretrained weights passed to ``EMBEDDING_DROPOUT``.
        max_features: Passed through to ``EMBEDDING_DROPOUT``.
        embedding_size: Width of one embedding set.
        maxlen: Passed through to ``LINEAR_LAYER``.
        hidden_size: Hidden units per direction of the recurrent layers.
    """

    def __init__(self, embedding_matrix,max_features=215224,embedding_size=300,maxlen=800, hidden_size=128):
        super(NeuralNet, self).__init__()
        self.embedding = EMBEDDING_DROPOUT(embedding_matrix, max_features, embedding_size)
        self.stem = LSTM_GRU(embedding_size, hidden_size)
        # Head input width: 4 pooled vectors, each 2 * hidden_size wide.
        head_width = hidden_size * 4 * 2
        self.regressor = LINEAR_LAYER(embedding_size=embedding_size, maxlen=maxlen, hidden_size=head_width)

    def forward(self,x):
        """Return one logit per sample for a batch of token-id sequences."""
        embedded = self.embedding(x)
        # LSTM_GRU.forward returns the GRU output first and the LSTM output
        # second; the names below follow that order.
        gru_seq, lstm_seq = self.stem(embedded)

        # Max- and mean-pool each sequence over the time axis (dim 1).
        gru_max = gru_seq.max(dim=1)[0]
        gru_avg = gru_seq.mean(dim=1)
        lstm_max = lstm_seq.max(dim=1)[0]
        lstm_avg = lstm_seq.mean(dim=1)

        pooled = torch.cat((gru_max, gru_avg, lstm_max, lstm_avg), dim=1)
        return self.regressor(pooled)

In [None]:
# Build the network from the concatenated embedding matrix and move it to the
# default CUDA device (a GPU is required).
model = NeuralNet(embedding_matrix).cuda()

In [None]:
# Binary cross-entropy on raw logits, averaged over the batch.
loss_fn = torch.nn.BCEWithLogitsLoss(reduction="mean")
# Adam with its default hyper-parameters over all model parameters.
optimizer = torch.optim.Adam(model.parameters())
# NOTE(review): step_size is not referenced by any visible cell.
step_size = 300

# NOTE(review): the training loop below never calls scheduler.step(), so this
# schedule has no effect. A gamma of 0.01 would cut the learning rate 100x per step.
scheduler = optim.lr_scheduler.ExponentialLR(optimizer,0.01)

In [None]:
# Train for 10 epochs, reporting mean train and validation loss per epoch.
for epoch in range(10):
    # '%' binds tighter than '+': the old 'Epoch: %d'%epoch+1 formatted the
    # string first and then tried to add 1 to it, raising TypeError.
    print('Epoch: %d' % (epoch + 1))

    # ---- training pass ----
    iteration = 0
    running_loss = 0.0
    model.train()
    for inputs, targets in tqdm(train_dl):
        inputs = inputs.cuda()
        targets = targets.cuda()
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_fn(outputs, targets)
        loss.backward()

        optimizer.step()

        running_loss += loss.item()
        iteration += 1

    tr_loss = running_loss / iteration
    print('Train: Loss: %.6f'%tr_loss)

    # ---- validation pass (no gradients, dropout/batch-norm in eval mode) ----
    iteration = 0
    running_loss = 0.0
    model.eval()

    with torch.no_grad():
        for inputs, targets in tqdm(valid_dl):
            inputs = inputs.cuda()
            targets = targets.cuda()
            outputs = model(inputs)
            running_loss += loss_fn(outputs, targets).item()
            iteration += 1

        va_loss = running_loss / iteration
        print('Valid: Loss: %.6f'%va_loss)

In [None]:
# Score the training set so that row i of `outputs` belongs to row i of
# y_train. Fixes relative to the previous cell:
#   * iterate train_ds in stored order (train_dl shuffles, which scrambled the
#     predictions relative to the labels);
#   * no dummy zero row at the start, so the lengths match;
#   * inputs go to the model's default CUDA device, not device index 2;
#   * logits become probabilities via sigmoid, since the threshold search
#     scans 0.1-0.5;
#   * results are moved to the CPU before NumPy touches them.
model.eval()
train_eval_dl = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=False)
batch_probs = []
with torch.no_grad():
    for inputs, _ in tqdm(train_eval_dl):
        logits = model(inputs.cuda())
        batch_probs.append(torch.sigmoid(logits).cpu().numpy())
outputs = np.concatenate(batch_probs, axis=0)

In [None]:
def bestThresshold(y_train,train_preds):
    """Scan cut-offs from 0.10 to 0.50 (step 0.01) for the best F1 score.

    Args:
        y_train: Ground-truth labels.
        train_preds: Predicted scores; values strictly above the cut-off
            count as positive.

    Returns:
        Tuple ``(best_threshold, best_f1)``. Both stay 0 when no cut-off
        gives an F1 above zero.
    """
    best_threshold = 0
    best_f1 = 0
    scores = np.array(train_preds)
    for candidate in tqdm(np.arange(0.1, 0.501, 0.01)):
        candidate_f1 = f1_score(y_train, scores > candidate)
        if candidate_f1 > best_f1:
            best_threshold, best_f1 = candidate, candidate_f1
    print('best threshold is {:.4f} with F1 score: {:.4f}'.format(best_threshold, best_f1))
    return best_threshold, best_f1

# Pick the F1-maximising cut-off for `outputs` against the training labels;
# the achieved F1 value is discarded.
delta, _ = bestThresshold(y_train,outputs)