**SETTING EVERYTHING UP (FOR GOOGLE COLAB)**

In [1]:
from google.colab import drive
drive.mount("/content/gdrive")

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [2]:
%%bash 
curl -O https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip
wget https://nlp.stanford.edu/data/glove.840B.300d.zip

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [3]:
%%bash
unzip crawl-300d-2M.vec.zip 
unzip glove.840B.300d.zip

Archive:  crawl-300d-2M.vec.zip
  inflating: crawl-300d-2M.vec       
Archive:  glove.840B.300d.zip
  inflating: glove.840B.300d.txt     


In [0]:
%%bash
rm crawl-300d-2M.vec.zip
rm glove.840B.300d.zip

In [5]:
!ls

crawl-300d-2M.vec  gdrive  glove.840B.300d.txt	sample_data


**Modelling**

In [6]:
import numpy as np
import pandas as pd
import os
import time
import gc
import random
import re
import string
from collections import Counter
from tqdm._tqdm_notebook import tqdm_notebook as tqdm
from keras.preprocessing import text, sequence
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn import functional as F
from torch.optim import Adam
from torch.optim.lr_scheduler import LambdaLR
import pickle
import os.path

Using TensorFlow backend.


In [0]:
CRAWL_EMBEDDING_PATH = 'crawl-300d-2M.vec'
GLOVE_EMBEDDING_PATH = 'glove.840B.300d.txt'
NUM_WORDS = 150000

In [0]:
path_data = F"/content/gdrive/My Drive/pytorch/train_original.csv" 
train = pd.read_csv(path_data)

In [0]:
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have" }

In [0]:
flatten = lambda l: [item for sublist in l for item in sublist]

def clean_special_chars(data):
    '''
    Credit goes to https://www.kaggle.com/gpreda/jigsaw-fast-compact-solution
    '''
    punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'
    def func(text, punct):
        for p in punct:
            text = text.replace(p, ' ')
        return text

    return func(data, punct)


def clean_contractions(text, mapping):
    specials = ["’", "‘", "´", "`"]
    for s in specials:
        text = text.replace(s, "'")

    for key in mapping.keys():
      if key in text:
        text = text.replace(key, mapping[key])

    text = [word.replace("'s", "") if word.endswith("'s") else word for word in text.split()]
    text = ' '.join(text)
    return text


def replace_repeating_chars(text):
    for char in string.ascii_lowercase:
      pattern = char + '{2,}'
      text = re.sub(pattern, char, text)
    return text


def preprocess(data):
  data = data.lower()
  data = clean_contractions(data, contraction_mapping)
  data = clean_special_chars(data)
  #data = replace_repeating_chars(data)
  return data

In [0]:
analysis = False
if analysis:
  comments = train['comment_text'].apply(lambda x: preprocess(x))
  pretokenizer = text.Tokenizer()
  pretokenizer.fit_on_texts(list(comments))
  reverse_word_map = dict(map(reversed, pretokenizer.word_index.items()))
  pre_tokenized = pretokenizer.texts_to_sequences(comments)
  pre_tokenized = flatten(pre_tokenized)
  print('Total number of words: {}'.format(len(pre_tokenized)))
  print('Total number of unique words: {}'.format(len(set(pre_tokenized))))
  not_alpha = [word for word in pre_tokenized if (not reverse_word_map[word].isalpha() and not reverse_word_map[word].isdigit())]
  print('Total number of non-alpha words: {}'.format(len(not_alpha)))
  print('Total number of unique non-alpha words: {}'.format(len(set(not_alpha))))
  not_alpha = [reverse_word_map[word] for word in not_alpha]
  counter_not_alpha = Counter(not_alpha)
  counter_alpha = Counter([reverse_word_map[word] for word in pre_tokenized if reverse_word_map[word].isalpha()])
  inv_counter_alpha = counter_alpha.most_common()
  inv_counter_alpha.reverse()
  counter = Counter([reverse_word_map[word] for word in pre_tokenized])
  once = sum([1 for k,v in counter_alpha.most_common() if v == 1])
  twice = sum([1 for k,v in counter_alpha.most_common() if v == 2])
  print('Number of once words: {}'.format(once))
  print('Number of twice words: {}'.format(twice))


In [0]:
def bucketing(data_x, data_y, n_buckets=32):
  data_x = [(data_x[i], i) for i in range(len(data_x))]
  data_x.sort(key = lambda x: len(x[0]))
  data_x = list(filter(lambda x: len(x[0]) != 0, data_x))
  data_x = np.array_split(data_x, n_buckets)
  for bucket in data_x:
    np.random.shuffle(bucket)
  np.random.shuffle(data_x)
  data_x = flatten(data_x)
  data_x = [x.tolist() for x in data_x]
  data_x, permutation = zip(*data_x)
  data_y = data_y.iloc[np.array(permutation)]
  data_y = data_y.reset_index(drop=True)
  return data_x, data_y

def slicing_index(x):
    non_zero_mask = x != 0
    mask_max_values, mask_max_indices = torch.max(non_zero_mask, dim=1)
    return int(min(mask_max_indices))

In [0]:
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')

def load_embeddings(path):
    with open(path) as f:
        return dict(get_coefs(*line.strip().split(' ')) for line in f)

def build_matrix(word_index, path):
    embedding_index = load_embeddings(path)
    embedding_matrix = np.zeros((len(word_index) + 1, 300))
    unknown_words = []
    
    for word, i in word_index.items():
        try:
            embedding_matrix[i] = embedding_index[word]
        except KeyError:
            unknown_words.append(word)
    return embedding_matrix, unknown_words

In [0]:
train['target'] = np.where(train['target'] >= 0.5, 1, 0)
train['comment_text'] = train['comment_text'].apply(lambda x: preprocess(x))

In [0]:
RACE, RELIGION, SEXUALITY, GENDER, DISABILITY = 'race', 'religion', 'sexuality', 'gender', 'disability'

categories_tox = ['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat', 'sexual_explicit']

categories_iden = {'asian': RACE, 'black': RACE, 'latino': RACE, 'white': RACE, 'other_race_or_ethnicity': RACE,         
                   'atheist': RELIGION, 'buddhist': RELIGION, 'christian': RELIGION, 'hindu': RELIGION, 
                   'jewish': RELIGION, 'muslim': RELIGION, 'other_religion': RELIGION,
                   'bisexual': SEXUALITY, 'heterosexual': SEXUALITY, 'homosexual_gay_or_lesbian': SEXUALITY,
                   'other_sexual_orientation': SEXUALITY,
                   'female': GENDER, 'male': GENDER, 'other_gender': GENDER, 'transgender': GENDER, 
                   'intellectual_or_learning_disability': DISABILITY, 'physical_disability': DISABILITY,
                   'psychiatric_or_mental_illness': DISABILITY, 'other_disability': DISABILITY}


In [0]:
y_train_toxicity = train[categories_tox]
x_train_toxicity = train['comment_text']

In [17]:
train_iden = train[train['identity_annotator_count'] > 0]
print('Number of entries with identity count: {}'.format(len(train_iden)))

Number of entries with identity count: 405130


In [18]:
identities = [RACE, RELIGION, SEXUALITY, GENDER, DISABILITY]
for identity in identities:
  train_iden[identity] = train_iden[[k for k,v in categories_iden.items() if v == identity]].max(axis = 1)
  train_iden[identity] = np.where(train_iden[identity] >= 0.5, 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [0]:
y_train_identity = train_iden[identities]
x_train_identity  = train_iden['comment_text']

In [20]:
#https://github.com/keras-team/keras/issues/8092

path_tok = F"/content/gdrive/My Drive/pytorch/tokenizer_best.pickle" 

if os.path.isfile(path_tok):
  print("Unpickling")
  with open(path_tok, 'rb') as f:
    tokenizer = pickle.load(f)
else:
  tokenizer = text.Tokenizer(oov_token='_UNK_', num_words=NUM_WORDS)
  tokenizer.fit_on_texts(list(x_train_toxicity))
  tokenizer.word_index = {e:i for e,i in tokenizer.word_index.items() if i <= NUM_WORDS}
  with open(path_tok, 'wb') as f:
    pickle.dump(tokenizer, f)


Unpickling


In [21]:
max_features = None or len(tokenizer.word_index)
print('max_features: {}'.format(max_features))

max_features: 150000


In [0]:
def get_dataloaders(x_train, y_train, tokenizer, max_len=250, batch_size=512, test_size=0.2):
  x_train = tokenizer.texts_to_sequences(x_train)
  x_train = sequence.pad_sequences(x_train, maxlen=max_len)
  train_x, validation_x, train_y, validation_y = train_test_split(x_train, y_train, random_state=29, test_size=test_size)

  train_x = torch.tensor(train_x, dtype=torch.long).cuda()
  validation_x = torch.tensor(validation_x, dtype=torch.long).cuda()
  train_y = torch.tensor(np.array(train_y), dtype=torch.float32).cuda()
  validation_y = torch.tensor(np.array(validation_y), dtype=torch.float32).cuda() 

  train_data = TensorDataset(train_x, train_y)
  train_sampler = RandomSampler(train_data)
  train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

  validation_data = TensorDataset(validation_x, validation_y)
  validation_sampler = SequentialSampler(validation_data)
  validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

  print("Len of training data / Num of batches: {} / {}".format(len(train_data), len(train_dataloader)))
  print("Len of validation data / Num of batches: {} / {}".format(len(validation_data), len(validation_dataloader)))
  print()

  return train_dataloader, validation_dataloader

In [23]:
train_dataloader_tox, validation_dataloader_tox = get_dataloaders(x_train_toxicity, y_train_toxicity, tokenizer)
train_dataloader_iden, validation_dataloader_iden = get_dataloaders(x_train_identity, y_train_identity, tokenizer)

Len of training data / Num of batches: 1443899 / 2821
Len of validation data / Num of batches: 360975 / 706

Len of training data / Num of batches: 324104 / 634
Len of validation data / Num of batches: 81026 / 159



In [24]:
crawl_matrix, unknown_words_crawl = build_matrix(tokenizer.word_index, CRAWL_EMBEDDING_PATH)
print('unknown words (crawl): ', len(unknown_words_crawl))

glove_matrix, unknown_words_glove = build_matrix(tokenizer.word_index, GLOVE_EMBEDDING_PATH)
print('unknown words (glove): ', len(unknown_words_glove))

embed_matrix = np.true_divide(np.add(crawl_matrix, glove_matrix), 2)
print(embed_matrix.shape)

unknown words (crawl):  41655
unknown words (glove):  39320
(150001, 300)


In [0]:
def get_roc_auc(labels, preds, auc=True):

  if auc:
    labels = np.array(labels)
    size = len(labels.shape)
    if size == 2:
      roc_auc_macro = roc_auc_score(labels, preds, average='macro')
      roc_auc_weighted = roc_auc_score(labels, preds, average='weighted')
      print("ROC-AUC macro: {}".format(roc_auc_macro))
      print("ROC-AUC weighted: {}".format(roc_auc_weighted)) 
      roc_auc = (roc_auc_macro, roc_auc_weighted) 
    else:
      roc_auc = roc_auc_score(labels, preds)
      print("ROC-AUC: {}".format(roc_auc))
  else:
    roc_auc = 'NA'

  return roc_auc

def slice_batch(batch):
  i = slicing_index(batch)
  return batch[:, i:]


def train(model, train_dataloader, validation_dataloader, path_model, lr=0.01, 
          epochs=10, slicing=True, auc=True, auc_on_all=True):

  optimizer = Adam(model.parameters(), lr=lr)
  scheduler = LambdaLR(optimizer, lambda epoch: 0.6 ** epoch)
  criterion = torch.nn.BCEWithLogitsLoss()
  sigmoid = nn.Sigmoid()
  
  best_loss = 100

  for i in range(epochs):
      
    print('Epoch: {}'.format(i+1))
    
    # training
    epoch_train_loss = 0
    scheduler.step()
    model.train()
  
    for batch_x, batch_y in tqdm(train_dataloader):
                
      if slicing: batch_x = slice_batch(batch_x)
      
      optimizer.zero_grad()   
      logits, _ = model(batch_x)
      loss = criterion(logits, batch_y)     
      epoch_train_loss += loss.item()   
      loss.backward()
      optimizer.step()
      
    print("Train loss: {}".format(epoch_train_loss/len(train_dataloader)))
  
    # evaluation
    model.eval()
    epoch_test_loss, preds_full, labels_full = 0, [], []
    
    for batch_x, batch_y in validation_dataloader:

      if slicing: batch_x = slice_batch(batch_x)

      with torch.no_grad():
        logits, _  = model(batch_x)

      loss = criterion(logits, batch_y)
      epoch_test_loss += loss.item()

      preds = sigmoid(logits)

      if not auc_on_all: 
        preds = preds[:, 0]
        batch_y = batch_y[:, 0]

      preds = preds.detach().cpu().numpy()
      batch_y = batch_y.to('cpu').numpy()

      preds_full += preds.tolist()
      labels_full += batch_y.tolist()


    roc_auc = get_roc_auc(labels_full, preds_full, auc=auc)
        
    current_loss = epoch_test_loss/len(validation_dataloader)
    print("Validation loss: {}".format(current_loss))    
          
    if current_loss < best_loss:
      best_loss = current_loss       
      state = {'state_dict': model.state_dict(),
              'loss': current_loss,
              'roc_auc': roc_auc}
      torch.save(state, path_model)
      print('Saving model')
      print()

  return model

In [0]:
class SpatialDropout(nn.Dropout2d):
    def forward(self, x):
        x = x.unsqueeze(2)   
        x = x.permute(0, 3, 2, 1)  
        x = super(SpatialDropout, self).forward(x)  
        x = x.permute(0, 3, 2, 1)  
        x = x.squeeze(2)  
        return x

In [0]:
class IdentityLSTM(nn.Module):
  
    def __init__(self, max_features, n_targets=5, embed_size=300, units=16, num_layers=2):
      
        super(IdentityLSTM, self).__init__()
        
        self.embedding = nn.Embedding(max_features+1, embed_size)
        self.embedding_dropout = SpatialDropout(0.2)
        
        self.lstm = nn.LSTM(embed_size, units, bidirectional=True, dropout=0.2,
                            num_layers=num_layers, batch_first=True)
    
        self.linear = nn.Linear(units*4, units)     
        self.linear_out = nn.Linear(units, n_targets)
        
        
    def forward(self, x):
      
        m_embed = self.embedding(x)  
        m_embed = self.embedding_dropout(m_embed)
        
        m_lstm, _ = self.lstm(m_embed)

        avg_pool = torch.mean(m_lstm, 1) # torch.Size([1, units*2])
        max_pool, _ = torch.max(m_lstm, 1) # torch.Size([1, units*2])
        m_conc = torch.cat((max_pool, avg_pool), 1) # torch.Size([1, units*4])
        m_conc_linear  = F.relu(self.linear(m_conc))    
        out = self.linear_out(m_conc_linear)   
        return out, 0

In [0]:
class ToxicLSTM(nn.Module):
  
    def __init__(self, max_features, n_targets=6, embed_size=300, units=64, num_layers=2):
      
        super(ToxicLSTM, self).__init__()

        self.identity_lstm = IdentityLSTM(max_features)
        
        self.lstm = nn.LSTM(embed_size, units, bidirectional=True, dropout=0.2,
                            num_layers=num_layers, batch_first=True)
    
        self.linear = nn.Linear(units * 4, units * 2)
        self.linear_out = nn.Linear(units * 2 + 5, 1)
        self.linear_aux_out = nn.Linear(units * 2 + 5, n_targets)
        
        
    def forward(self, x):

        identities, _ = self.identity_lstm(x)
      
        m_embed = self.identity_lstm.embedding(x)  
        m_embed = self.identity_lstm.embedding_dropout(m_embed)  
        m_lstm, _ = self.lstm(m_embed)

        avg_pool = torch.mean(m_lstm, 1) # torch.Size([1, units*2])
        max_pool, _ = torch.max(m_lstm, 1) # torch.Size([1, units*2])
        m_conc = torch.cat(( max_pool, avg_pool), 1) # torch.Size([1, units*4+5])
   
        m_linear = torch.cat((identities, self.linear(m_conc)), 1)
        m_linear = torch.relu(m_linear)
        #hidden = m_conc + m_conc_linear
        
        result = self.linear_out(m_linear)
        aux_result = self.linear_aux_out(m_linear)
        out = torch.cat([result, aux_result], 1)      

        return out, identities

**MODEL FOR IDENTITY ATTRIBUTION**

In [0]:
path_model_iden = F"/content/gdrive/My Drive/pytorch/model_lstm_identity.pt" 
model_identity = IdentityLSTM(max_features)

In [55]:
pretrained_identity = True

if pretrained_identity:
  state_dict = torch.load(path_model_iden)
  print('ROC-AUC score(s): {}'.format(state_dict['roc_auc']))
  print('Validation Loss: {}'.format(state_dict['loss']))
  model_identity.load_state_dict(state_dict['state_dict'])
  model_identity.cuda()
  model_identity.eval()

ROC-AUC score(s): (0.9966615977371251, 0.9961783216946134)
Validation Loss: 0.03529140219654677


In [60]:
model_identity

IdentityLSTM(
  (embedding): Embedding(150001, 300)
  (embedding_dropout): SpatialDropout(p=0.2)
  (lstm): LSTM(300, 16, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (linear): Linear(in_features=64, out_features=16, bias=True)
  (linear_out): Linear(in_features=16, out_features=5, bias=True)
)

In [61]:
model_test = True
if model_test:
  model_identity.eval()
  sigmoid = nn.Sigmoid()
  x,y = next(iter(train_dataloader_iden))
  x = slice_batch(x)
  print('Batch dim after slicing: {}'. format(x.size()))
  out, _ = model_identity(x) 
  out = sigmoid(out)
  print('Output dim: {}'. format(out.size()))

Batch dim after slicing: torch.Size([512, 186])
Output dim: torch.Size([512, 5])


In [0]:
if not pretrained_identity:
  model_identity.embedding.weight = nn.Parameter(torch.tensor(embed_matrix, dtype=torch.float32))
  model_identity.embedding.weight.requires_grad = False
  model_identity.cuda()

In [0]:
if not pretrained_identity:
  gc.collect()
  model_identity = train(model_identity, train_dataloader_iden, validation_dataloader_iden, path_model_iden, 
                         lr=0.01, epochs=10, slicing=True, auc=True, auc_on_all=True)

**MODEL FOR TOXICITY SCORES**

In [0]:
path_model_tox = F"/content/gdrive/My Drive/pytorch/model_lstm_toxicity.pt" 
model_toxicity = ToxicLSTM(max_features)

In [65]:
pretrained_toxicity = True

if pretrained_toxicity:
  state_dict = torch.load(path_model_tox)
  print('ROC-AUC score(s): {}'.format(state_dict['roc_auc']))
  print('Validation Loss: {}'.format(state_dict['loss']))
  model_toxicity.load_state_dict(state_dict['state_dict'])
  model_toxicity.cuda()
  model_toxicity.eval()

ROC-AUC score(s): 0.9680310941580701
Validation Loss: 0.07392344654936121


In [67]:
model_test = True
if model_test:
  model_toxicity.eval()
  sigmoid = nn.Sigmoid()
  x,y = next(iter(train_dataloader_tox))
  x = slice_batch(x)
  print('Batch dim after slicing: {}'. format(x.size()))
  toxicity, identities = model_toxicity(x) 
  toxicity = sigmoid(toxicity)
  identities = sigmoid(identities)
  print('Toxicity output dim: {}'. format(toxicity.size()))
  print('Identities output dim: {}'. format(identities.size()))

Batch dim after slicing: torch.Size([512, 186])
Toxicity output dim: torch.Size([512, 7])
Identities output dim: torch.Size([512, 5])


In [0]:
if not pretrained_toxicity:
  model_toxicity.identity_lstm = model_identity
  for param in model_toxicity.identity_lstm.parameters():
    param.requires_grad = False 
  model_toxicity.cuda()

In [0]:
if not pretrained_toxicity:
  gc.collect()
  train(model_toxicity, train_dataloader_tox, validation_dataloader_tox, path_model_tox, 
        lr=0.01, epochs=10, slicing=True, auc=True, auc_on_all=False)

**Credits**

Extremely grateful to the authors of the following kernels:

1.   https://www.kaggle.com/bminixhofer/simple-lstm-pytorch-version/
2.   https://www.kaggle.com/theoviel/improve-your-score-with-some-text-preprocessing

