In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt

from copy import deepcopy
import time
from sklearn.model_selection import train_test_split
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader

import torch

from sklearn.metrics import accuracy_score

In [None]:
# Download the small Spanish spaCy pipeline (es_core_news_sm) via spaCy's CLI.
!python -m spacy download es_core_news_sm

In [None]:
# Raw dataset as provided by the Kaggle input mount; kept in one named
# constant so the location is easy to spot and change.
DATA_PATH = "/kaggle/input/spanish-lang-suicide-tendency-texts/data_raw.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Dataset label string -> integer class id used for training.
label_index = {
    'suicida': 1,
    'no_suicida': 0,
}

# Integer class id -> human-readable name used when printing predictions.
index_label = {
    1: 'suicida',
    0: 'no suicida',
}

In [None]:
# Number of missing values in each column of the raw frame.
missing_per_column = df.isna().sum(axis=0)
missing_per_column

In [None]:
# Class balance of the labels, drawn as a pie chart with percentage annotations.
freqs = df['class'].value_counts()

fig, ax = plt.subplots()
ax.pie(freqs, labels=freqs.index, autopct='%0.2f%%')
plt.show()

In [None]:
# Encode the string labels as integers using label_index
# ('suicida' -> 1, 'no_suicida' -> 0).
#
# The mapping is only applied while string labels are still present in the
# column. Without this guard, running the cell a second time would look up the
# already-encoded integers in label_index, find nothing, and silently replace
# every label with NaN.
if any(label in label_index for label in df['class']):
    df['class'] = df['class'].map(label_index)

In [None]:
# Fixed seed so both splits are repeatable.
SPLIT_SEED = 42

# First cut: 75% of the rows for training, 25% held out.
train, late = train_test_split(df.values, test_size=0.25, random_state=SPLIT_SEED)

# Second cut: the held-out rows become 99% validation / 1% test.
# NOTE(review): the test split is therefore very small - confirm this is intended.
val, test = train_test_split(late, test_size=0.01, random_state=SPLIT_SEED)

# Building vocabulary and tokenizing data with text pipeline

In [None]:
# Use the full spaCy model name (the pipeline downloaded earlier in this
# notebook) instead of the legacy 'es' shortcut. Short aliases are not
# resolved by spaCy 3 itself and only work through torchtext's fallback path,
# which emits a warning (and is absent in older torchtext releases).
tokenizer = get_tokenizer(tokenizer='spacy', language='es_core_news_sm')


def yield_tokens(batch):
    """Yield the token list of every text in ``batch``.

    ``batch`` is an iterable of ``(text, label)`` pairs; the label is ignored.
    """
    for _text, _label in batch:
        yield tokenizer(_text)


# The vocabulary is built from the training rows only; any token not seen
# there is mapped to the '<unk>' index.
vocab = build_vocab_from_iterator(yield_tokens(train), specials=['<unk>'])
vocab.set_default_index(vocab['<unk>'])

In [None]:
def text_pipeline(x):
    """Tokenize the raw string ``x`` and return its vocabulary indices."""
    tokens = tokenizer(x)
    return vocab(tokens)

# Parameters for DataLoader and model training

In [None]:
# --- optimisation schedule ---
EPOCHS = 35      # number of passes over the training set
LR = 0.1         # initial learning rate handed to the optimizer
STEP = 5         # StepLR: epochs between learning-rate drops
GAMMA = 0.1      # StepLR: multiplicative factor applied at each drop
DECAY = 0.9      # NOTE(review): not referenced anywhere in the visible notebook

# --- data loading ---
BATCH = 32       # batch size for both DataLoaders

# --- model dimensions ---
EM_SIZE = 64               # embedding width
NUM_CLASSES = 2            # width of the final linear layer
VOCAB_SIZE = len(vocab)    # number of rows in the embedding table

# 1-based epoch numbers, used as the x axis of the history plots.
epochs = list(range(1, EPOCHS + 1))

# Batch collate function for DataLoader

In [None]:
def collate_batch(batch):
    """Pack a list of ``(text, label)`` pairs into EmbeddingBag-style tensors.

    Returns a tuple ``(tokens, labels, offsets)``:
      * ``tokens``  - all token ids of the batch concatenated into one 1-D int64 tensor,
      * ``labels``  - int64 tensor holding one label per sample,
      * ``offsets`` - start position of every sample inside ``tokens``.
    """
    token_chunks = []
    labels = []
    lengths = []
    for _text, _label in batch:
        ids = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        token_chunks.append(ids)
        labels.append(_label)
        lengths.append(ids.size(0))

    tokens = torch.cat(token_chunks)
    label_tensor = torch.tensor(labels, dtype=torch.int64)

    # Start of sample k is the sum of the lengths of samples 0..k-1, so prepend
    # a zero, drop the final length, and take the running sum.
    starts = ([0] + lengths)[:-1]
    offsets = torch.tensor(starts).cumsum(dim=0)

    return tokens, label_tensor, offsets

In [None]:
# Settings shared by both loaders; only the training loader shuffles.
loader_kwargs = dict(batch_size=BATCH, collate_fn=collate_batch)

train_dl = DataLoader(train, shuffle=True, **loader_kwargs)
val_dl = DataLoader(val, shuffle=False, **loader_kwargs)

# Model Architecture

In [None]:
class SpanishClassifier(torch.nn.Module):
    """Bag-of-embeddings text classifier.

    An ``EmbeddingBag`` pools the token embeddings of each sample into one
    vector of width ``em_size``. Two hidden blocks
    (Linear -> ReLU -> Dropout -> BatchNorm1d, widths 256 and 512) follow, and
    a final linear layer produces ``num_classes`` logits.
    """

    def __init__(self, vocab_size, em_size, num_classes):
        super().__init__()
        nn = torch.nn

        self.em = nn.EmbeddingBag(vocab_size, em_size)

        hidden_blocks = [
            nn.Linear(em_size, 256),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.BatchNorm1d(256),
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.BatchNorm1d(512),
        ]
        self.layers = nn.Sequential(*hidden_blocks)

        self.fc = nn.Linear(512, num_classes)

    def forward(self, x, offset):
        """Return the logits for a batch.

        ``x`` holds the concatenated token ids of all samples and ``offset``
        the start index of each sample inside ``x``.
        """
        pooled = self.em(x, offset)
        hidden = self.layers(pooled)
        return self.fc(hidden)

# Defining model, device, optimizer and criterion

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# NOTE(review): no torch seed is set in the visible notebook, so weight
# initialisation and batch shuffling differ from run to run.
model = SpanishClassifier(
    vocab_size=VOCAB_SIZE,
    em_size=EM_SIZE,
    num_classes=NUM_CLASSES,
).to(device)

criterion = torch.nn.CrossEntropyLoss()

# Adam, with the learning rate multiplied by GAMMA every STEP epochs.
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=STEP, gamma=GAMMA)

In [None]:
# Train for EPOCHS epochs. After every epoch the model is evaluated on the
# validation loader, and a copy of the weights with the highest validation
# accuracy seen so far is kept in `best_model`.
best_model = deepcopy(model)
best_acc = 0
train_history = []   # mean training loss per sample, one entry per epoch
acc_history = []     # validation accuracy, one entry per epoch
val_history = []     # mean validation loss per sample, one entry per epoch

for i in range(1, EPOCHS+1):
    # ---- training pass ----
    model.train()

    train_loss = 0.0
    train_total = 0
    for text, label, offset in train_dl:
        # `device` is the same device the model was moved to.
        text, label, offset = text.to(device), label.to(device), offset.to(device)

        optimizer.zero_grad()
        out = model(text, offset)
        loss = criterion(out, label)
        loss.backward()
        optimizer.step()

        # The criterion returns the *mean* loss of the batch. Weight it by the
        # batch size so that dividing by the number of samples below yields a
        # true per-sample average (the last batch may be smaller).
        n_batch = out.size(0)
        train_loss += loss.item() * n_batch
        train_total += n_batch

    train_end = train_loss/train_total

    # ---- validation pass ----
    val_loss = 0.0
    val_total = 0
    acc_loss = 0  # running count of correctly classified validation samples
    model.eval()
    # No gradients are needed for evaluation; skipping graph construction
    # saves memory and time.
    with torch.no_grad():
        for text, label, offset in val_dl:
            text, label, offset = text.to(device), label.to(device), offset.to(device)

            out = model(text, offset)
            loss = criterion(out, label)

            n_batch = out.size(0)
            val_loss += loss.item() * n_batch
            val_total += n_batch
            acc_loss += (out.argmax(1) == label).sum().item()

    val_end = val_loss/val_total
    acc_end = acc_loss/val_total

    train_history += [train_end]
    val_history += [val_end]
    acc_history += [acc_end]

    # Snapshot the model whenever validation accuracy improves.
    if acc_end > best_acc:
        best_model = deepcopy(model)
        best_acc = acc_end

    # The last value is an accuracy, not a loss, so it is labelled as such.
    print("Epoch {} || train loss: {} || val loss: {} || val acc: {}".format(i,
                                                                            train_end,
                                                                            val_end,
                                                                            acc_end))

    scheduler.step()

In [None]:
# One panel per recorded history, all plotted against the epoch number.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 8))

panels = (
    ("Train loss", train_history),
    ("Val loss", val_history),
    ("Accuracy", acc_history),
)
for axis, (title, history) in zip(axes, panels):
    axis.plot(epochs, history)
    axis.set_title(title)

fig.suptitle("Model training and evaluation performance")
fig.tight_layout()
plt.show()

# The best validation accuracy reached during training is

In [None]:
# Highest validation accuracy seen across all epochs (tracked by the training loop).
best_acc

In [None]:
def predict(text):
    """Classify a single raw text with ``best_model``.

    The text goes through ``text_pipeline`` and is fed to the model as a batch
    of one sample (a single bag starting at offset 0). Returns the index of
    the highest-scoring class as a NumPy integer.
    """
    best_model.eval()

    token_ids = torch.tensor(text_pipeline(text), dtype=torch.int64)
    bag_start = torch.tensor([0])
    if torch.cuda.is_available():
        token_ids = token_ids.cuda()
        bag_start = bag_start.cuda()

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        logits = best_model(token_ids, bag_start)

    class_ids = logits.argmax(1).cpu().numpy()
    return class_ids[0]

In [None]:
# First row of the test split, column 0: the value passed to predict() as the text below.
test[0, 0]

In [None]:
# Compare the model's answer with the stored label for the first test row.
sample_text, sample_label = test[0, 0], test[0, 1]
predicted_name = index_label[predict(sample_text)]
true_name = index_label[sample_label]

print("Prediciton is ", predicted_name)
print("Truth value is ", true_name)

In [None]:
# Predict every text in the test split (first column) one sample at a time.
preds = list(map(predict, test[:, 0]))

# Object dtype, matching the dtype of the label column it is compared with.
predicted = np.array(preds).astype(object)

# Ground-truth labels live in the last column of the split.
real = test[:, -1]

In [None]:
# Test accuracy: fraction of positions where prediction and label agree.
count = sum(1 for i, j in zip(predicted, real) if i == j)
count / len(predicted)