# Text classification: We are going to develop a model able to classify text into 5 categories.

## Choose directories

In [None]:
"""

data_path : path of the .csv file used as the training set

models_folder : folder where the model checkpoints produced during training are stored.

N_EPOCHS : number of training iterations (epochs)

"""
# CSV file that the EDA cell loads with pandas (indexed by its "Tweet_ID" column).
data_path = "../input/genderbased-violence-tweet-classification/Gender-Based Violence Tweet Classification Challenge/Train.csv"
# Prefix of the checkpoint files written by the training loop. The loop
# concatenates it directly with the file name, so keep the trailing slash.
models_folder = "./"
# Number of passes over the training set performed by the training loop.
N_EPOCHS = 5

## Import

In [1]:
import os
import torch
from torchtext import data
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random

## Reproducibility

In [2]:
SEED = 2021


def reproducibility(seed=2021):
    """Seed the random number generators used by the notebook.

    Seeds Python's ``random`` module, NumPy, PyTorch and the PyTorch CUDA
    generator with ``seed``, exports ``seed`` as ``PYTHONHASHSEED`` and
    switches cuDNN to deterministic, non-benchmarking mode.

    Args:
        seed: Integer seed shared by all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Every seeding function takes the seed as its only argument.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for seed_fn in seeders:
        seed_fn(seed)

    # Ask cuDNN for deterministic kernels and disable its auto-tuner.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


reproducibility(SEED)

## EDA

In [3]:
# Load the labelled tweets; the "Tweet_ID" column becomes the row index.
donnees = pd.read_csv(data_path, index_col="Tweet_ID")

# Bar chart of the number of tweets in each category of the "type" column.
sns.set_style("dark")
sns.set_context("notebook")
plt.figure(figsize=(8, 5))
plt.subplot(111)
sns.countplot(data=donnees, x="type")
# Category names are long, so print them vertically.
plt.xticks(rotation=90)
# No legend: the plot has no labelled artists (no hue), so asking for a
# legend would only produce a warning and an empty box.
plt.show()

In [4]:
# Number of tweets in each category of the "type" column; the bare name on
# the last line makes the notebook display the resulting Series.
num_data_per_class = donnees["type"].value_counts()
num_data_per_class

In [5]:
# Share of each category, in percent of all rows, derived from the counts
# computed in the previous cell.
total_rows = len(donnees)
percentage_data_per_class = 100 * num_data_per_class / total_rows
percentage_data_per_class

In [6]:
# Inverse of the per-category percentage: the rarer the category, the larger the value.
1 / percentage_data_per_class

## Resampling

In [7]:
# Category names exactly as they are spelled in the "type" column.
labels = [
    "sexual_violence",
    "Physical_violence",
    "emotional_violence",
    "economic_violence",
    "Harmful_Traditional_practice",
]

In [9]:
import math

# Fraction of every category that goes to the training split.
percent = 0.7

# Stratified split: each category is shuffled once and then cut in two, so a
# tweet ends up in exactly one of the splits. Sampling the two splits
# independently from the same rows lets validation tweets leak into the
# training set.
train_parts = []
valid_parts = []
for t in labels:
    shuffled = donnees.loc[donnees["type"] == t, :].sample(frac=1)
    n_train = math.ceil(percent * len(shuffled))
    train_parts.append(shuffled.iloc[:n_train])
    valid_parts.append(shuffled.iloc[n_train:])

new_train_data = pd.concat(train_parts, axis=0)
new_valid_data = pd.concat(valid_parts, axis=0)

# Persist both splits so that torchtext can read them back as CSV files.
new_train_data.to_csv("./new_train.csv")
new_valid_data.to_csv("./new_valid.csv")

## Preprocessing

In [10]:
# Field objects describe how each CSV column is preprocessed.
# `Field` holds the preprocessing applied to the tweet text and
# `LabelField` the preprocessing applied to the label.
LABEL = data.LabelField()
TEXT = data.Field(tokenize='spacy', batch_first=True, include_lengths=True)

# One (name, field) pair per CSV column, in file order; the Tweet_ID column
# is paired with None and therefore not loaded.
fields = [('Tweet_ID', None), ('tweet', TEXT), ('type', LABEL)]

# Load the two CSV files written by the split cell with identical settings.
train_data, valid_data = (
    data.TabularDataset(path=csv_path, format='csv', fields=fields, skip_header=True)
    for csv_path in ('./new_train.csv', './new_valid.csv')
)

# Show the first preprocessed training example.
first_example = train_data.examples[0]
print(vars(first_example))

## Build the batch iterators for the training and validation sets

In [14]:
# Number of examples per batch.
BATCH_SIZE = 64

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Options shared by both iterators: examples are ordered by tweet length and
# every batch is sorted internally.
iterator_options = dict(
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.tweet),
    sort_within_batch=True,
    device=device,
)

train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data),
    **iterator_options)

## Preparing input and output sequences

In [None]:
# Build the label vocabulary and the text vocabulary from the training set.
# Text tokens need at least 3 occurrences and get GloVe (6B, 100d) vectors.
LABEL.build_vocab(train_data)
TEXT.build_vocab(train_data, min_freq=3, vectors="glove.6B.100d")

# Vocabulary sizes (unique text tokens / unique labels).
text_vocab_size = len(TEXT.vocab)
label_vocab_size = len(LABEL.vocab)
print("Size of TEXT vocabulary:", text_vocab_size)
print("Size of LABEL vocabulary:", label_vocab_size)

# The ten most frequent tokens with their counts.
top_tokens = TEXT.vocab.freqs.most_common(10)
print("Commonly used words", top_tokens)

# Token -> index mapping.
print("Word dictionary", TEXT.vocab.stoi.items())

## Build the model

In [18]:
import torch.nn as nn

class classifier(nn.Module):
    """LSTM text classifier: embedding -> (bi)LSTM -> linear -> sigmoid.

    Args:
        vocab_size: Number of rows of the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state (per direction).
        output_dim: Number of output scores per example.
        n_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM reads the sequence in both directions.
        dropout: Dropout probability between stacked LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout):
        super().__init__()

        # Remembered so that forward() picks the matching final states.
        self.bidirectional = bool(bidirectional)
        num_directions = 2 if self.bidirectional else 1

        # embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # lstm layer; dropout only acts between stacked layers, so it is
        # switched off for a single layer (PyTorch warns otherwise)
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers=n_layers,
                            bidirectional=self.bidirectional,
                            dropout=dropout if n_layers > 1 else 0.0,
                            batch_first=True)

        # dense layer, sized for one final hidden state per direction
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)

        # activation function
        self.act = nn.Sigmoid()

    def forward(self, text, text_lengths):
        """Score a batch of padded token-index sequences.

        Args:
            text: Token indices, shape [batch size, sent len].
            text_lengths: True length of every sequence, in decreasing order
                (required by ``pack_padded_sequence`` with its default
                ``enforce_sorted=True``).

        Returns:
            Tensor of shape [batch size, output dim] with values in (0, 1).
        """
        # embedded = [batch size, sent len, emb dim]
        embedded = self.embedding(text)

        # pack_padded_sequence requires the lengths on the CPU, whatever
        # device the batch itself lives on
        lengths = torch.as_tensor(text_lengths, dtype=torch.int64).cpu()
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths, batch_first=True)

        # hidden = [num layers * num directions, batch size, hid dim]
        # (batch_first only affects the input/output sequences, not the states)
        _, (hidden, _) = self.lstm(packed_embedded)

        if self.bidirectional:
            # last layer: final forward state and final backward state
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # last layer: single final state
            hidden = hidden[-1, :, :]

        # hidden = [batch size, hid dim * num directions]
        dense_outputs = self.fc(hidden)

        # squash every score independently into (0, 1)
        return self.act(dense_outputs)

In [20]:
#define hyperparameters
size_of_vocab = len(TEXT.vocab)   # one embedding row per vocabulary entry
embedding_dim = 100               # matches the 100-d GloVe vectors requested for TEXT
num_hidden_nodes = 32
num_output_nodes = 5              # one output score per category
num_layers = 2
bidirection = True
dropout = 0.2

#instantiate the model (the `bidirection` setting above is actually honoured)
model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes, num_output_nodes, num_layers,
                   bidirectional = bidirection, dropout = dropout)

In [21]:
# Print the module tree of the model built in the previous cell.
print(model)

#No. of trianable parameters
def count_parameters(model):
    """Return the total number of elements over all trainable parameters.

    Only parameters with ``requires_grad`` set are counted.
    """
    total = 0
    for param in model.parameters():
        if not param.requires_grad:
            continue
        total += param.numel()
    return total
    
n_trainable = count_parameters(model)
print(f'The model has {n_trainable:,} trainable parameters')

# Overwrite the embedding table in place with the vectors attached to the
# TEXT vocabulary.
pretrained_embeddings = TEXT.vocab.vectors
embedding_table = model.embedding.weight.data
embedding_table.copy_(pretrained_embeddings)

print(pretrained_embeddings.shape)

In [44]:
import torch.optim as optim

# One loss weight per output score.
# NOTE(review): the values look like the inverse class percentages from the
# EDA section — confirm, and confirm they follow the label-vocabulary order.
class_weights = torch.tensor([0.012145, 0.066683, 0.609063, 1.827189, 2.109043])

# Weighted binary cross-entropy on the sigmoid outputs, optimised with Adam
# (default settings).
criterion = nn.BCELoss(weight=class_weights)
optimizer = optim.Adam(model.parameters())

#define metric
def binary_accuracy(preds, y):
    """Return the fraction of entries whose rounded prediction equals the target.

    Args:
        preds: Tensor of probabilities in [0, 1].
        y: Tensor of 0/1 targets with the same shape as ``preds``.

    Returns:
        0-d tensor in [0, 1].
    """
    #round predictions to the closest integer
    rounded_preds = torch.round(preds)

    correct = (rounded_preds == y).float()
    # Average over every entry. Dividing the number of correct entries by
    # the number of rows over-counts 2-D input by a factor of the number of
    # columns.
    return correct.mean()
    
# Move the model parameters and the loss (including its weight tensor) to
# the selected device: the GPU when available, otherwise the CPU.
model = model.to(device)
criterion = criterion.to(device)

In [45]:
def train(model, iterator, optimizer, criterion):
    """Run one training epoch.

    Args:
        model: Module called as ``model(text, text_lengths)`` and returning
            one row of scores per example.
        iterator: Iterable of batches exposing ``batch.tweet`` (a
            ``(text, text_lengths)`` pair) and ``batch.type`` (class indices).
        optimizer: Optimizer updating the parameters of ``model``.
        criterion: Loss called as ``criterion(predictions, one_hot_target)``.

    Returns:
        ``(loss, accuracy)``, each averaged over the batches of the epoch.
    """
    #initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    #set the model in training phase
    model.train()

    for batch in iterator:

        #reset the gradients before every batch
        optimizer.zero_grad()

        #retrieve text and no. of words
        text, text_lengths = batch.tweet
        #predictions = [batch size, number of classes]
        predictions = model(text, text_lengths)

        #one-hot encode the labels directly on the device of the predictions;
        #the width is read from the predictions instead of a global
        target = torch.nn.functional.one_hot(
            batch.type, num_classes=predictions.shape[1]
        ).to(device=predictions.device, dtype=predictions.dtype)

        #compute the loss
        loss = criterion(predictions, target)

        #compute the binary accuracy
        acc = binary_accuracy(predictions, target)

        #backpropagate the loss and compute the gradients
        loss.backward()

        #update the weights
        optimizer.step()

        #loss and accuracy
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [48]:
def evaluate(model, iterator, criterion):
    """Compute loss and accuracy over ``iterator`` without updating the model.

    Args:
        model: Module called as ``model(text, text_lengths)`` and returning
            one row of scores per example.
        iterator: Iterable of batches exposing ``batch.tweet`` (a
            ``(text, text_lengths)`` pair) and ``batch.type`` (class indices).
        criterion: Loss called as ``criterion(predictions, one_hot_target)``.

    Returns:
        ``(loss, accuracy)``, each averaged over the batches.
    """
    #initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    #deactivating dropout layers
    model.eval()

    #deactivates autograd
    with torch.no_grad():

        for batch in iterator:

            #retrieve text and no. of words
            text, text_lengths = batch.tweet

            #predictions = [batch size, number of classes]
            predictions = model(text, text_lengths)

            #one-hot encode the labels directly on the device of the
            #predictions; the width is read from the predictions instead of
            #a global
            target = torch.nn.functional.one_hot(
                batch.type, num_classes=predictions.shape[1]
            ).to(device=predictions.device, dtype=predictions.dtype)

            #compute loss and accuracy
            loss = criterion(predictions, target)
            acc = binary_accuracy(predictions, target)

            #keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [50]:
best_valid_loss = float('inf')

# One row, one column per epoch; the plotting cell reads row 0.
train_loss = np.zeros((1, N_EPOCHS))
valid_loss = np.zeros((1, N_EPOCHS))
train_acc = np.zeros((1, N_EPOCHS))
valid_acc = np.zeros((1, N_EPOCHS))

for epoch in range(N_EPOCHS):

    #train the model
    train_loss[0, epoch], train_acc[0, epoch] = train(model, train_iterator, optimizer, criterion)

    #evaluate the model
    valid_loss[0, epoch], valid_acc[0, epoch] = evaluate(model, valid_iterator, criterion)

    #save the model only when the validation loss improves, so the
    #highest-numbered checkpoint on disk is always the best one so far
    if valid_loss[0, epoch] < best_valid_loss:
        best_valid_loss = valid_loss[0, epoch]
        # os.path.join also works when models_folder has no trailing slash
        torch.save(model.state_dict(),
                   os.path.join(models_folder, 'saved_weights('+str(epoch)+').pt'))

    # The epoch index is a counter, not a percentage.
    print(f"EPOCH {epoch}")
    print(f'\tTrain Loss: {train_loss[0, epoch]:.3f} | Train Acc: {train_acc[0, epoch]*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss[0, epoch]:.3f} |  Val. Acc: {valid_acc[0, epoch]*100:.2f}%')


In [26]:
# Plot the complexity graph: training and validation loss per epoch.
fig, ax = plt.subplots()
for curve, curve_name in ((train_loss, "train"), (valid_loss, "validation")):
    # The histories are stored as a single row, hence the [0, :] slice.
    ax.plot(curve[0, :], label=curve_name)
# Axis labels and title of the axes.
ax.set_xlabel('epoch')
ax.set_ylabel('error')
ax.set_title('Complexity graph')
# Show a legend built from the curve labels.
ax.legend()
# Display the figure.
plt.show()

In [28]:
#load weights
# The training loop writes 'saved_weights(<epoch>).pt' only for epochs that
# improved the validation loss, so the file of the last epoch may not exist.
# Pick the most recently written checkpoint instead; candidates are listed
# from the highest epoch down so that a timestamp tie favours the later one.
checkpoints = [
    os.path.join(models_folder, 'saved_weights('+str(epoch_id)+').pt')
    for epoch_id in reversed(range(N_EPOCHS))
]
checkpoints = [candidate for candidate in checkpoints if os.path.isfile(candidate)]
if not checkpoints:
    raise FileNotFoundError(
        "No 'saved_weights(<epoch>).pt' checkpoint found in " + repr(models_folder)
    )
path = max(checkpoints, key=os.path.getmtime)
# map_location lets weights saved on a GPU be restored on a CPU-only machine.
model.load_state_dict(torch.load(path, map_location=device));
model.eval();

#inference
import spacy
# English tokenizer.
# NOTE(review): the 'en' shortcut is only accepted by older spaCy releases —
# confirm against the installed version.
nlp = spacy.load('en')

# Human-readable class names, indexed by the position of the highest score.
# NOTE(review): assumes this order matches the label-vocabulary order — verify.
CLASS = ["Sexual violence", "Physical violence", "Emotional Violence",
         "Economic Violence", "Harmful traditional practice"]

def predict(model, sentence):
    """Classify one sentence and return its class name.

    Args:
        model: Module called as ``model(token_indices, lengths)``; put it in
            evaluation mode before calling.
        sentence: Raw text to classify.

    Returns:
        The entry of ``CLASS`` at the position of the highest score.

    Raises:
        ValueError: If ``sentence`` contains no token.
    """
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]  #tokenize the sentence
    if not tokenized:
        # A zero-length sequence cannot be packed for the LSTM; fail with a
        # clear message rather than deep inside the model.
        raise ValueError("Cannot classify an empty sentence.")

    indexed = [TEXT.vocab.stoi[t] for t in tokenized]          #convert to integer sequence

    # The sequence length stays on the CPU; the token indices go to the
    # device of the model.
    length_tensor = torch.LongTensor([len(indexed)]).to('cpu')
    my_tensor = torch.LongTensor(indexed).to(device).reshape(1, -1)  #batch of one sentence

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        prediction = model(my_tensor, length_tensor)

    # Position of the highest score as a plain Python int.
    best = prediction.argmax().item()

    return CLASS[best]

In [36]:
# Classify an example sentence; the notebook displays the returned class name.
predict(model, "he humiliate me in front of my children")