In [1]:
import pandas as pd
import numpy as np

In [2]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
import string

In [3]:
from collections import Counter
import pickle
import csv

In [160]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, Dataset
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [121]:
from sklearn.model_selection import train_test_split
import random

In [122]:
# Seed every random number generator the notebook draws from so a fresh
# Restart & Run All gives the same numbers.
torch.manual_seed(1337)
random.seed(1)
# NumPy's global generator feeds np.random.normal (random rows of the embedding
# matrix) and is what scikit-learn falls back to when no random_state is given.
np.random.seed(1337)

In [5]:
# Load the spaCy model registered under the 'en' shortcut.
# NOTE(review): `spacy_en` is not referenced by any other visible cell.
spacy_en = spacy.load('en')
# Bare English language object; spacy_tokenizer calls it on each sentence.
parser = English()
# String of ASCII punctuation characters that spacy_tokenizer filters against.
punctuations = string.punctuation

In [6]:
# Negation words (with and without apostrophes). Any of these that also appear
# in spaCy's STOP_WORDS are removed from the stop-word set built in the next
# cell, so the tokenizer does not discard them.
NEGATE = ["aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt", "doesnt",
 "ain't", "aren't", "can't", "couldn't", "daren't", "didn't", "doesn't",
 "dont", "hadnt", "hasnt", "havent", "isnt", "mightnt", "mustnt", "neither",
 "don't", "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't",
 "neednt", "needn't", "never", "none", "nope", "nor", "not", "nothing", "nowhere","no",
 "oughtnt", "shant", "shouldnt", "uhuh", "wasnt", "werent",
 "oughtn't", "shan't", "shouldn't", "uh-uh", "wasn't", "weren't",
 "without", "wont", "wouldnt", "won't", "wouldn't", "rarely", "seldom", "despite"]

In [7]:
# Stop-word set used by the tokenizer: spaCy's English stop words minus every
# negation word, so negations are kept in the token stream.
stopwords = {word for word in STOP_WORDS if word not in NEGATE}
        

In [8]:
def spacy_tokenizer(sentence):
    """Tokenize ``sentence`` into normalized tokens.

    Each token is replaced by its lowercased, stripped lemma, except when the
    lemma is the "-PRON-" placeholder, in which case the lowercased surface
    form is used. Tokens found in ``stopwords`` or contained in the
    ``punctuations`` string are dropped.

    Returns:
        list of str: the surviving tokens, in sentence order.
    """
    kept = []
    for token in parser(sentence):
        if token.lemma_ == "-PRON-":
            text = token.lower_
        else:
            text = token.lemma_.lower().strip()
        # `punctuations` is a str, so this is a substring test (it also
        # rejects the empty string).
        if text in stopwords or text in punctuations:
            continue
        kept.append(text)
    return kept

In [36]:
# Count how often each token occurs across all training comments.
# NOTE: relies on `train_data`, which is only loaded by a later cell
# (pd.read_csv); on a fresh kernel that cell has to run before this one.
word_counter = Counter(
    token
    for comment in train_data.comment_text
    for token in spacy_tokenizer(comment)
)

In [37]:
# Persist the token counts so they can be reloaded without re-tokenizing.
# The context manager closes (and flushes) the file even if dump() raises.
with open('word_counter.pkl', 'wb') as counter_file:
    pickle.dump(word_counter, counter_file)

In [9]:
train_data = pd.read_csv('../data/train.csv')

In [10]:
train_data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [11]:
(train_data['comment_text'] == ' ').any()

False

In [12]:
train_data.isnull().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

In [13]:
# Replace missing comments with a placeholder token. Assign the result back
# instead of using inplace=True on a column selection: the in-place form acts
# on an intermediate object and is not guaranteed to update `train_data`.
train_data['comment_text'] = train_data['comment_text'].fillna(value = '_na_')

In [14]:
# Number of most frequent tokens kept in the vocabulary (index 0 is reserved
# for '<pad>' on top of these).
vocab_size = 1000
# Width of each word vector; matches the 50-d GloVe file loaded below.
embed_size = 50

In [15]:
EMBEDDING_FILE = '../data/glove.twitter.27B/glove.twitter.27B.50d.txt'

In [16]:
# Read the GloVe text file as a space-separated table: column 0 is the token,
# the remaining columns are its vector. QUOTE_NONE keeps quote characters as
# ordinary data.
# NOTE(review): default NA parsing is still active, so a token spelled like an
# NA marker is presumably what shows up as the single null below and is then
# removed by dropna(). keep_default_na=False would retain it — confirm before
# changing, since the outputs recorded below would shift.
embeddings_df = pd.read_table(EMBEDDING_FILE, sep=' ', header=None, quoting=csv.QUOTE_NONE)

In [17]:
embeddings_df[0].isnull().sum()

1

In [18]:
embeddings_df.dropna(inplace=True)

In [19]:
embeddings_df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
0,<user>,0.78704,0.72151,0.29148,-0.056527,0.31683,0.47172,0.023461,0.69568,0.20782,...,-2.2162,-0.42537,0.46157,0.88626,-0.22014,0.025599,-0.38615,0.080107,-0.075323,-0.61461
1,.,0.68661,-1.0772,0.011114,-0.24075,-0.3422,0.64456,0.54957,0.30411,-0.54682,...,-1.3495,0.23258,0.25383,-0.10226,0.65824,0.16015,0.20959,-0.067516,-0.51952,-0.34922
2,:,0.98483,0.19784,0.28403,0.35406,0.2438,0.42519,-0.050784,0.48965,0.18231,...,-2.882,-0.30393,0.047981,1.0937,0.4184,-0.68958,-0.45923,0.23368,-0.30628,-0.093607
3,rt,0.68243,0.73589,0.017529,-0.13763,0.36029,0.51704,0.1954,0.75219,0.43029,...,-2.749,-0.55106,0.040409,0.29164,-0.28792,-0.02274,-0.40295,0.14771,0.080503,-0.68115
4,",",0.13744,-1.0151,-0.50491,0.26983,-0.75571,1.1014,0.077018,-0.15144,0.061658,...,-2.0934,0.60543,-0.33726,0.19313,-0.042527,-0.19012,-0.23469,0.21259,-0.19424,-0.93832
5,<repeat>,0.80074,-0.83871,0.1916,-0.42322,-0.11962,0.57089,0.55983,0.72052,-0.31354,...,-1.4854,-0.083002,0.24177,0.043614,0.58303,0.099758,0.30535,-0.19752,-0.39449,-0.036257
6,<hashtag>,1.0018,-0.76141,0.11812,-1.3743,-0.41678,0.48956,0.31381,-0.1128,0.22117,...,-2.2086,-0.64906,-0.044567,0.17027,1.0102,0.40608,0.73295,-0.39454,-0.58679,-0.42343
7,<number>,0.026779,0.080507,0.054467,-0.6626,-0.11954,0.34985,-0.38114,0.12158,-0.39642,...,-3.0616,0.6587,-0.75637,0.15585,0.66404,-1.4051,-0.30053,-1.3671,-0.4988,0.29598
8,<url>,0.55283,-0.57581,-0.76596,-1.1371,0.22059,-0.19504,-0.14078,0.13109,0.049547,...,-1.9521,-0.12149,0.018784,0.92985,0.50244,0.15481,0.32515,-0.5597,-1.4389,-0.10366
9,!,0.90566,-0.71792,-0.19574,-0.80743,-0.024903,0.31071,0.89485,0.63035,-0.33863,...,-1.3769,-0.29166,0.10895,0.61422,0.18414,0.15971,0.071934,0.001123,0.028188,0.30385


In [20]:
embeddings_df.shape

(1193513, 51)

In [21]:
# Column 0 holds the tokens; every other column is one vector component.
keys = embeddings_df[0]
# `.values` replaces DataFrame.as_matrix(), which was deprecated and later
# removed from pandas; it returns the same 2-D NumPy array.
values = embeddings_df.drop(0, axis=1).values

In [22]:
embeddings = dict(zip(keys, values))

In [23]:
len(embeddings)

1193513

In [24]:
# Global mean / std over every GloVe component; used to draw random vectors
# for vocabulary words that have no pretrained embedding.
# np.stack needs a real sequence: a dict view is rejected by current NumPy.
all_embs = np.stack(list(embeddings.values()))
emb_mean,emb_std = all_embs.mean(), all_embs.std()
emb_mean,emb_std

(0.043995290164890316, 0.73192440745167187)

In [25]:
# Reload the cached token counts written by the pickle.dump cell.
# NOTE: unpickling can execute arbitrary code; only load files produced by
# this notebook. The context manager guarantees the handle is closed.
with open('word_counter.pkl', 'rb') as counter_file:
    word_counter = pickle.load(counter_file)

In [26]:
vocabulary = [pair[0] for pair in word_counter.most_common(vocab_size)]

In [27]:
# NOTE: despite the name, this is the number of GloVe entries (tokens), not
# the width of a vector; it only serves as an upper bound on the row count of
# the embedding matrix built in the next cell.
glove_dim = len(embeddings)

In [103]:
# Row 0 belongs to '<pad>'; vocabulary word number i lives in row i + 1.
nb_words = min(vocab_size+1, glove_dim)
# Start from random vectors matching the GloVe statistics so words without a
# pretrained embedding still get a plausible vector.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for i, word in enumerate(vocabulary):
    row = i + 1
    # Guard on the row that is actually written and on the real matrix height
    # (the previous check compared `i` with vocab_size + 1 and could never
    # prevent an out-of-range write).
    if row >= nb_words:
        break
    embedding_vector = embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[row] = embedding_vector

In [105]:
embedding_matrix[0]

array([-0.90863209, -1.83054233,  0.20075362, -0.09392044, -0.61499785,
       -0.08694068, -0.53790446, -0.69105299,  0.24632631,  1.114145  ,
        0.20596222,  0.1127703 ,  0.1055719 , -0.49976979,  0.16715639,
       -0.61481833,  0.5863114 , -0.59645605, -0.90389832, -0.22466315,
       -0.40626646, -0.65643533,  0.32985416, -0.09072985,  1.4024442 ,
        0.79602253, -1.16494341, -0.12527208, -0.07377362,  0.02483354,
        0.60662275,  0.07979284,  1.02550161, -0.02989132,  0.38192396,
       -1.15426754, -0.14828249,  1.46536808,  0.11470578,  0.80675531,
        0.05579536,  0.89502589, -0.87188143,  0.84338568,  1.14738712,
        0.06312929, -0.85381121,  0.14439408, -0.15484467, -0.46886253])

In [100]:
# Two-way lookup between vocabulary words and integer ids. Ids start at 1
# because 0 is reserved for the padding token.
idx_to_word = {}
word_to_idx = {}
for position, token in enumerate(vocabulary, start=1):
    idx_to_word[position] = token
    word_to_idx[token] = position
idx_to_word[0] = '<pad>'
word_to_idx['<pad>'] = 0

In [101]:
def make_seq(df, tokenizer=None, vocab=None):
    """Convert the comments in ``df`` to sequences of vocabulary ids.

    Args:
        df: DataFrame with a ``comment_text`` column; every column from the
            third one onward is treated as a label column.
        tokenizer: callable mapping a string to a list of tokens. Defaults to
            the notebook's ``spacy_tokenizer``.
        vocab: mapping from token to integer id. Defaults to the notebook's
            ``word_to_idx``.

    Returns:
        tuple: ``(seq_all, target)`` where ``seq_all`` is a list of id lists
        (out-of-vocabulary tokens are skipped) and ``target`` is a NumPy array
        with the label columns of the same rows. Rows whose sequence would be
        empty are dropped from both, so the two stay aligned.
    """
    if tokenizer is None:
        tokenizer = spacy_tokenizer
    if vocab is None:
        vocab = word_to_idx

    seq_all = []
    kept_rows = []
    for text in df["comment_text"]:
        seq = []
        for tok in tokenizer(text):
            tok_idx = vocab.get(tok)
            if tok_idx is not None:
                seq.append(tok_idx)
        kept_rows.append(len(seq) > 0)
        if seq:
            seq_all.append(seq)

    # Select rows by position, not by index label: comparing positions with
    # df.index only lines up when the frame has a default RangeIndex.
    keep_positions = np.flatnonzero(np.array(kept_rows, dtype=bool))
    # `.values` replaces the removed DataFrame.as_matrix().
    target = df.iloc[keep_positions, 2:].values
    return seq_all, target

In [102]:
all_seq,target = make_seq(train_data)

In [106]:
max_len = 100

In [107]:
def pad_sequences(vectorized_seqs, seq_lengths):
    max_seq_len = min(seq_lengths.max(),max_len)
    seq_tensor = torch.zeros((len(vectorized_seqs), max_seq_len)).long()
    for idx, (seq, seqlen) in enumerate(zip(vectorized_seqs, seq_lengths)):
        min_seq_len = min(seqlen,max_len)
        seq_tensor[idx, :min_seq_len] = torch.LongTensor(seq[:min_seq_len])
    return seq_tensor

In [108]:
# Lengths are capped at max_len because pad_sequences truncates to that width;
# pack_padded_sequence rejects a length larger than the padded dimension.
all_seq_lengths = torch.LongTensor([min(len(s), max_len) for s in all_seq])

In [109]:
all_seq_pad = pad_sequences(all_seq,all_seq_lengths)

In [162]:
# 80/20 split, stratified on label column 3 ('threat' in the column order of
# train.csv). The frames keep their default integer index, which later cells
# use to look up the matching sequence lengths.
# random_state pins the shuffle so the split is identical on every run.
x_train,x_val,y_train,y_val = train_test_split(
    pd.DataFrame(all_seq_pad.numpy()),
    pd.DataFrame(target),
    test_size = 0.2,
    stratify = target[:,3],
    random_state = 1337,
)

In [135]:
y_train.sum()

27607

In [178]:
# The split frames were built from a DataFrame with a default integer index,
# so their index values are row positions into all_seq_pad / all_seq_lengths.
train_ids = torch.from_numpy(x_train.index.values).long()
val_ids = torch.from_numpy(x_val.index.values).long()

In [179]:
# Pick out the sequence length of every row that landed in each split.
train_seq_lengths = all_seq_lengths[train_ids]
val_seq_lengths = all_seq_lengths[val_ids]

In [142]:
# Share of positives for label column 5 that ended up in the training split.
# y_train is a DataFrame, so positional access has to go through .iloc
# (plain y_train[:, 5] raises on a DataFrame).
y_train.iloc[:, 5].sum()/target[:,5].sum()

0.82583454281567492

In [147]:
batch_size=64

In [161]:
class ToxicDataset(Dataset):
    """Dataset yielding ``(sequence, labels, length)`` triples.

    The three tensors must have the same size along dimension 0; sample ``i``
    is row ``i`` of each of them.
    """

    def __init__(self,data_tensor,target_tensor,length_tensor):
        n_rows = data_tensor.size(0)
        assert n_rows == target_tensor.size(0) == length_tensor.size(0)
        self.data_tensor = data_tensor
        self.target_tensor = target_tensor
        self.length_tensor = length_tensor

    def __getitem__(self,index):
        """Return row ``index`` of the data, target and length tensors."""
        columns = (self.data_tensor, self.target_tensor, self.length_tensor)
        return tuple(column[index] for column in columns)

    def __len__(self):
        """Number of samples (rows of the data tensor)."""
        return self.data_tensor.size(0)

In [180]:
# Training split as LongTensors, wrapped in a shuffling DataLoader.
x_train_t, y_train_t = (
    torch.from_numpy(frame.values).long() for frame in (x_train, y_train)
)
train_dataset = ToxicDataset(x_train_t, y_train_t, train_seq_lengths)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
)

In [182]:
# Validation split as LongTensors; no shuffling and twice the training batch
# size.
x_val_t, y_val_t = (
    torch.from_numpy(frame.values).long() for frame in (x_val, y_val)
)
val_dataset = ToxicDataset(x_val_t, y_val_t, val_seq_lengths)
val_dataloader = DataLoader(
    val_dataset,
    batch_size=2*batch_size,
    shuffle=False,
    num_workers=2,
)

In [149]:
embed_dim = 50

In [152]:
embed_nn = nn.Embedding(vocab_size+1, embedding_dim=embed_dim)

In [153]:
embed_nn.weight.data.copy_(torch.from_numpy(embedding_matrix))


-9.0863e-01 -1.8305e+00  2.0075e-01  ...   1.4439e-01 -1.5484e-01 -4.6886e-01
 4.9427e-01  1.3234e-01 -2.3199e-02  ...   2.4152e-01 -2.7658e-01 -3.7987e-01
 1.2900e+00  5.2437e-02 -4.3540e-01  ...   4.4992e-02 -9.1239e-01 -1.0489e+00
                ...                   ⋱                   ...                
-2.7085e-01 -9.7770e-01 -9.7691e-01  ...   7.1800e-01  4.4413e-01 -4.4221e-01
 1.1607e-01  7.2430e-01 -5.0700e-01  ...   3.4732e-01 -1.0340e+00 -6.5862e-01
 1.1510e+00  8.3473e-02 -2.0625e-01  ...   1.2821e+00  5.4254e-02 -1.1215e-02
[torch.FloatTensor of size 1001x50]

In [154]:
# Turn off gradient tracking for the embedding weights so the vectors copied
# in from embedding_matrix stay fixed.
embed_nn.weight.requires_grad = False

In [156]:
epochs = 1

In [183]:
class LSTMBasicNet(nn.Module):
    """Bidirectional LSTM classifier over padded sequences of token ids.

    Args:
        input_size: width of the vectors produced by ``embed``.
        output_size: width of the hidden fully connected layer.
        hidden_size: LSTM hidden units per direction.
        num_layers: number of stacked LSTM layers.
        num_classes: number of independent sigmoid outputs.
        embed: embedding module mapping token ids to vectors.
    """

    def __init__(self,input_size, output_size,hidden_size,num_layers,num_classes,embed):
        super(LSTMBasicNet,self).__init__()
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.num_layers = num_layers
        self.embed = embed
        self.lstm = nn.LSTM(input_size=self.input_size,hidden_size=hidden_size,num_layers = num_layers,bidirectional = True, batch_first = True, dropout = 0.1)
        # Both directions are concatenated, hence hidden_size * 2 inputs.
        self.fc1 = nn.Linear(in_features=hidden_size*2,out_features=output_size)
        self.fc2 = nn.Linear(in_features=output_size,out_features=num_classes)

    def forward(self,x,lengths):
        """Score a batch of padded sequences.

        Args:
            x: LongTensor ``(batch, seq_len)`` of token ids, right-padded.
            lengths: number of real tokens per row (tensor or sequence), in
                any order. Values larger than ``seq_len`` are clamped.

        Returns:
            Tensor ``(batch, num_classes)`` of sigmoid scores, in the same row
            order as ``x``.
        """
        if not torch.is_tensor(lengths):
            lengths = torch.LongTensor(list(lengths))
        # A length can never exceed the padded width that is actually present.
        lengths = lengths.long().cpu().clamp(max=x.size(1))

        # pack_padded_sequence expects rows ordered by decreasing length;
        # shuffled batches are not, so sort here and undo it at the end.
        sorted_lengths, order = lengths.sort(0, descending=True)
        _, restore = order.sort(0)
        order = order.to(x.device)
        restore = restore.to(x.device)

        embed_out = self.embed(x.index_select(0, order))
        packed_input = pack_padded_sequence(embed_out, sorted_lengths.tolist(), batch_first=True)
        # No explicit (h0, c0): nn.LSTM starts from zeros on the input's device.
        packed_output, _ = self.lstm(packed_input)
        # pad_packed_sequence returns (padded_output, lengths).
        lstm_out, _ = pad_packed_sequence(packed_output, batch_first=True)
        lstm_out = torch.tanh(lstm_out)

        # Read each sequence at its own last real step; index -1 of the padded
        # tensor is padding for every row shorter than the longest one.
        last_step = (sorted_lengths - 1).to(lstm_out.device)
        last_step = last_step.view(-1, 1, 1).expand(-1, 1, lstm_out.size(2))
        last_out = lstm_out.gather(1, last_step).squeeze(1)
        last_out = last_out.index_select(0, restore)

        # training=self.training makes model.eval() switch this dropout off.
        fc1_out = F.dropout(F.relu(self.fc1(last_out)), p=0.1, training=self.training)
        out = torch.sigmoid(self.fc2(fc1_out))

        return out
        
        
        
        

In [184]:
# Two-layer bidirectional LSTM with 64 hidden units per direction, a 64-unit
# dense layer and six outputs, fed by the embedding layer built above.
model = LSTMBasicNet(
    input_size=embed_size,
    output_size=64,
    hidden_size=64,
    num_layers=2,
    num_classes=6,
    embed=embed_nn,
)