## MAD - Unidad 3 - Toxic Comments Clasification
###  JSON to Torch
#### Integrantes:
- C. Cárdenas
- A. Morales
- M.J. Núñez

## Descripción de la data

Se le proporciona una gran cantidad de comentarios de Wikipedia que han sido etiquetados por evaluadores humanos por su comportamiento tóxico. Los tipos de toxicidad son:


    toxic (tóxico)
    severe_toxic (muy tóxico)
    obscene (obsceno)
    threat (amenasa)
    insult (insulto)
    identity_hate (odio)

- `train.csv`: el conjunto de entrenamiento, contiene comentarios con sus etiquetas binarias
- `test.csv`: el conjunto de prueba, debe predecir las probabilidades de toxicidad para estos comentarios. Para disuadir el etiquetado manual, el conjunto de prueba contiene algunos comentarios que no están incluidos en la puntuación.
- `sample_submission.csv`: un archivo de envío de muestra en el formato correcto
- `test_labels.csv`: etiquetas para los datos de prueba; el valor de $-1$ indica que no se usó para calificar; (Nota: ¡archivo agregado después del cierre de la competencia!)

### Import libraries


In [71]:
import pandas as pd
import torch
import spacy
import random
#import json
import torchtext
from torchtext import data
from torchtext import datasets
#from sklearn.model_selection import train_test_split

In [8]:
!pip install torch==1.5.1

Collecting torch==1.5.1
[?25l  Downloading https://files.pythonhosted.org/packages/a4/cf/007b6de316c9f3d4cb315a60c308342cc299e464167f5ebc369e93b5e23a/torch-1.5.1-cp37-cp37m-manylinux1_x86_64.whl (753.2MB)
[K     |████████████████████████████████| 753.2MB 87kB/s s eta 0:00:01    |█▋                              | 37.1MB 21.4MB/s eta 0:00:34     |███████▍                        | 173.4MB 21.2MB/s eta 0:00:28     |████████▉                       | 208.8MB 15.6MB/s eta 0:00:35     |█████████▌                      | 223.4MB 15.6MB/s eta 0:00:34     |██████████                      | 235.2MB 20.2MB/s eta 0:00:26     |███████████▌                    | 271.5MB 25.4MB/s eta 0:00:19     |████████████                    | 283.6MB 21.4MB/s eta 0:00:22     |████████████▍                   | 292.6MB 21.4MB/s eta 0:00:22     |█████████████▎                  | 312.1MB 23.4MB/s eta 0:00:19     |█████████████▋                  | 321.1MB 23.4MB/s eta 0:00:19     |████████████████████▋           | 485.8

In [2]:
print(torch.__version__,spacy.__version__,torchtext.__version__)

1.5.1 2.3.0 0.6.0


### Tokenizar

Se definen los _Fields_

In [3]:
TEXT = data.Field(tokenize='spacy', batch_first = True)
TOXIC = data.LabelField(dtype = torch.float)

In [4]:
fields = [(None, None),(None, None),('comment_text', TEXT),(None, None),(None, None),(None, None),(None, None),(None, None),(None, None),(None, None),(None, None),('toxicity', TOXIC)]

Se leen los CSV para tokenizarlos con Torchtext.data

In [5]:
#import random
import numpy as np

SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

train_data, valid_data, test_data = data.TabularDataset.splits(
                                        path = '../challenge_nlp/data',
                                        train = 'train_data.csv',
                                        validation= 'valid_data.csv',
                                        test = 'test_data.csv',
                                        format = 'csv',
                                        fields = fields,
                                        skip_header = True
)

#train_data, valid_data = train_data.split(random_state = random.seed(SEED))

In [183]:
for i in range(50):
    print(vars(valid_data[i])['comment_text'].__len__())

33
883
69
69
54
73
582
21
143
64
45
67
170
6
29
34
27
172
37
120
66
32
25
113
120
36
80
11
108
40
190
47
24
42
25
33
377
52
20
6
105
75
12
23
62
10
5
74
67
30


In [24]:
vars(test_data[12])

<torchtext.data.example.Example at 0x7fde50d35790>

In [6]:
print(vars(test_data[12]))

{'comment_text': ['stuff', 'keilor', 'downs', 'u', ' ', 'suck'], 'toxicity': '1'}


### Dataloaders

In [7]:
MAX_VOCAB_SIZE = 25000

TEXT.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)

TOXIC.build_vocab(train_data)

In [8]:
BATCH_SIZE = 32

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device,
    #sort_key=None,
    sort_key=lambda x:len(x.toxicity),
    sort_within_batch=False)

In [39]:
print('Train:')
for batch in train_iterator:
    print(batch)
    break
    #largo = batch.comment_text[0].__len__()
    #if largo < 10:
    #    print(largo)
    #print(batch.comment_text[j])


 

Train:

[torchtext.data.batch.Batch of size 32]
	[.comment_text]:[torch.cuda.LongTensor of size 32x329 (GPU 0)]
	[.toxicity]:[torch.cuda.FloatTensor of size 32 (GPU 0)]


### CNN 1D

In [9]:
import torch.nn as nn
import torch.nn.functional as F

class CNN1d(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.convs = nn.ModuleList([
                                    nn.Conv1d(in_channels = embedding_dim, 
                                              out_channels = n_filters, 
                                              kernel_size = fs)
                                    for fs in filter_sizes
                                    ])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        
        #text = [batch size, sent len]
        
        embedded = self.embedding(text)
                
        #embedded = [batch size, sent len, emb dim]
        
        embedded = embedded.permute(0, 2, 1)
        
        #embedded = [batch size, emb dim, sent len]
        
        conved = [F.relu(conv(embedded)) for conv in self.convs]
            
        #conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
        
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat(pooled, dim = 1))
        
        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)

In [10]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
N_FILTERS = 100
FILTER_SIZES = [3]
OUTPUT_DIM = 1
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = CNN1d(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)

In [11]:
pretrained_embeddings = TEXT.vocab.vectors
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data.copy_(pretrained_embeddings)
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [12]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

In [13]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [116]:
a = 0.499
b = 0.5001

print(torch.round(torch.tensor(a)))
print(torch.round(torch.tensor(b)))

tensor(0.)
tensor(1.)


In [14]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        predictions = model(batch.comment_text).squeeze(1)
        
        loss = criterion(predictions, batch.toxicity)
        
        acc = binary_accuracy(predictions, batch.toxicity)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [15]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:
           
            predictions = model(batch.comment_text).squeeze(1)
            
            loss = criterion(predictions, batch.toxicity)
            
            acc = binary_accuracy(predictions, batch.toxicity)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [16]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [48]:
#print(evaluate(model, valid_iterator, criterion))
for batch in valid_iterator:
    print(batch)
    break


[torchtext.data.batch.Batch of size 32]
	[.comment_text]:[torch.cuda.LongTensor of size 32x989 (GPU 0)]
	[.toxicity]:[torch.cuda.FloatTensor of size 32 (GPU 0)]


In [49]:
N_EPOCHS = 10

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        nombre = './Models/challenge-model-CNN'+'_ep'+str(epoch+1)+'.pt'
        torch.save({'epoca': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'Valid_loss': best_valid_loss}, nombre)
        #torch.save(model.state_dict(), nombre)
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 40s
	Train Loss: 0.138 | Train Acc: 95.15%
	 Val. Loss: 0.113 |  Val. Acc: 95.88%
Epoch: 02 | Epoch Time: 0m 40s
	Train Loss: 0.101 | Train Acc: 96.31%
	 Val. Loss: 0.104 |  Val. Acc: 96.21%
Epoch: 03 | Epoch Time: 0m 39s
	Train Loss: 0.086 | Train Acc: 96.84%
	 Val. Loss: 0.109 |  Val. Acc: 96.18%
Epoch: 04 | Epoch Time: 0m 40s
	Train Loss: 0.073 | Train Acc: 97.28%
	 Val. Loss: 0.128 |  Val. Acc: 95.99%
Epoch: 05 | Epoch Time: 0m 40s
	Train Loss: 0.061 | Train Acc: 97.75%
	 Val. Loss: 0.134 |  Val. Acc: 95.96%
Epoch: 06 | Epoch Time: 0m 40s
	Train Loss: 0.052 | Train Acc: 98.15%
	 Val. Loss: 0.154 |  Val. Acc: 95.73%
Epoch: 07 | Epoch Time: 0m 40s
	Train Loss: 0.045 | Train Acc: 98.45%
	 Val. Loss: 0.167 |  Val. Acc: 95.73%
Epoch: 08 | Epoch Time: 0m 38s
	Train Loss: 0.038 | Train Acc: 98.72%
	 Val. Loss: 0.198 |  Val. Acc: 95.53%
Epoch: 09 | Epoch Time: 0m 38s
	Train Loss: 0.033 | Train Acc: 98.88%
	 Val. Loss: 0.230 |  Val. Acc: 95.51%
Epoch: 10 | Epoch T

Cargar mejor modelo

In [17]:
mejorcito = CNN1d(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)


In [18]:
pretrained_embeddings = TEXT.vocab.vectors
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

mejorcito.embedding.weight.data.copy_(pretrained_embeddings)
mejorcito.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
mejorcito.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [19]:
nombre = './Models/challenge-model-CNN'+'_ep'+str(2)+'.pt'
mejorcito.load_state_dict(torch.load(nombre)['model_state_dict'])

<All keys matched successfully>

In [20]:
from sklearn.metrics import f1_score,confusion_matrix, classification_report

In [21]:
#mejorcito = mejorcito.cuda()
prediction_test = []
labels_test=[]
for batch in test_iterator:
    labels_test.append(batch.toxicity.cpu().detach().numpy())
    predictions = mejorcito(batch.comment_text.cpu()).squeeze(1)
    rounded_preds = torch.round(torch.sigmoid(predictions))
    prediction_test.append(rounded_preds.detach().numpy())
    

y_true = np.concatenate(labels_test)
y_pred = np.concatenate(prediction_test)

In [22]:
display(y_pred,y_true)

array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

In [23]:
cm = confusion_matrix(y_true, y_pred)
display(cm)

print(classification_report(y_true, y_pred))

array([[28123,   544],
       [  817,  2429]])

              precision    recall  f1-score   support

         0.0       0.97      0.98      0.98     28667
         1.0       0.82      0.75      0.78      3246

    accuracy                           0.96     31913
   macro avg       0.89      0.86      0.88     31913
weighted avg       0.96      0.96      0.96     31913



Implementamos un CSV con comentarios y etiquetas manuales par aextender la generalización.

In [26]:
campos =[('comment_text',TEXT),('toxicity',TOXIC)]

In [130]:
manual_data = data.TabularDataset(
                                        path = '../challenge_nlp/data/manual_data.csv',
                                        format = 'csv',
                                        fields = campos,
                                        skip_header = True
)

In [94]:
vars(manual_data[9])

{'comment_text': ['you', 'are', 'a', 'very', 'nice', 'person'],
 'toxicity': '0'}

In [131]:
train_iterator, valid_iterator, manual_iterator = data.BucketIterator.splits(
    (train_data, valid_data, manual_data), 
    batch_size = 1, 
    device = device,
    #sort_key=None,
    sort_key=lambda x:len(x.toxicity),
    sort_within_batch=False)

In [132]:
prediction_test = []
predictions_sig=[]
labels_test=[]
for batch in manual_iterator:
    labels_test.append(batch.toxicity.cpu().detach().numpy())
    predictions = mejorcito(batch.comment_text.cpu()).squeeze(1)
    pred_sig = torch.sigmoid(predictions)
    predictions_sig.append(pred_sig.detach().numpy())
    rounded_preds = torch.round(torch.sigmoid(predictions))
    prediction_test.append(rounded_preds.detach().numpy())
    

y_true = np.concatenate(labels_test)
y_pred = np.concatenate(prediction_test)
y_noroud = np.concatenate(predictions_sig)


In [133]:
cm = confusion_matrix(y_true, y_pred)
display(cm)

print(classification_report(y_true, y_pred))

array([[6, 0],
       [3, 5]])

              precision    recall  f1-score   support

         0.0       0.67      1.00      0.80         6
         1.0       1.00      0.62      0.77         8

    accuracy                           0.79        14
   macro avg       0.83      0.81      0.78        14
weighted avg       0.86      0.79      0.78        14



In [134]:
print(y_true,y_pred)

[0. 1. 1. 1. 0. 0. 0. 1. 1. 0. 1. 1. 1. 0.] [0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0.]


In [135]:
manual_df = pd.read_csv('../challenge_nlp/data/manual_data.csv')
manual_df['pred'] = [int(i) for i in y_pred]
manual_df['no round'] = y_noroud
manual_df

Unnamed: 0,comment_text,toxicity,pred,no round
0,i love you,0,0,0.372464
1,hey man you suck,1,1,0.991344
2,i love you but you have to shout your mouth,1,1,0.603856
3,delete your useless account,1,1,0.697962
4,have a great day,0,0,0.214486
5,this a fake information i do not believe it,0,0,0.225871
6,valdivia is such a nice city,0,0,0.089271
7,you look like a horse,1,0,0.326474
8,have you no brain?!,1,0,0.442662
9,you are a very nice person,0,0,0.358732
