In [178]:
import numpy as np
import pandas as pd
from transformers import DistilBertTokenizerFast
import torch
from transformers import DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments

In [195]:
import numpy as np
data = pd.read_csv("emb_afr-amh_train.csv", header = None)
data.columns = ['Embeddings','Labels']
data['Labels']=data['Labels'].replace(-1,2)

In [194]:
data_dev = pd.read_csv("emb_afr-amh_dev.csv", header = None)
data_dev.columns = ['Embeddings','Labels']
data_dev['Labels']=data_dev['Labels'].replace(-1,2)

In [196]:
def embedding_cleaner(str_list):
    str_list = str_list.strip('[]')
    float_list = str_list.split()
    float_list = [float(i) for i in float_list]
    numpy_array = np.array(float_list)
    return numpy_array

In [197]:
import torch
from torch.utils.data import Dataset, DataLoader
import ast

class EmbeddingDataset(Dataset):
    def __init__(self, dataframe):
        """
        Args:
            dataframe (pandas.DataFrame): DataFrame containing embeddings and labels.
        """
        self.embeddings = dataframe['Embeddings'].apply(embedding_cleaner).tolist()
        self.labels = dataframe['Labels'].tolist()


    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        embedding = self.embeddings[idx]
        label = self.labels[idx]
        embedding = torch.tensor(embedding, dtype=torch.float)
        embedding = embedding.unsqueeze(0)

        return torch.tensor(embedding, dtype=torch.float), torch.tensor(label, dtype=torch.long)

embedding_dataset = EmbeddingDataset(data)
dev_dataset = EmbeddingDataset(data_dev)
batch_size = 1  
train_loader = DataLoader(embedding_dataset, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=True)


In [204]:
import torch
import torch.nn as nn

class SentimentBiLSTM(nn.Module):
    def __init__(self, embedding_dim, hidden_dims, output_dim, drop_prob=0.05):
        super(SentimentBiLSTM, self).__init__()
        
        self.hidden_dims = hidden_dims
        self.n_layers = len(hidden_dims)
        self.lstm_layers = nn.ModuleList()

        for i in range(self.n_layers):
            input_dim = embedding_dim if i == 0 else hidden_dims[i-1] * 2
            self.lstm_layers.append(nn.LSTM(input_dim, hidden_dims[i], 
                                            1, dropout=drop_prob, 
                                            batch_first=True, bidirectional=True))
        
        self.dropout = nn.Dropout(drop_prob)
        self.fc1 = nn.Linear(hidden_dims[-1] * 2, hidden_dims[-1]) 
        self.fc2 = nn.Linear(hidden_dims[-1], output_dim)      

    def forward(self, x):
        batch_size = x.size(0)
        hidden = self.init_hidden(batch_size)

        for i, lstm in enumerate(self.lstm_layers):
            x, hidden[i] = lstm(x, hidden[i])
        
        x = x[:, -1, :] 
        x = self.dropout(x)
        x = torch.relu(self.fc1(x)) 
        x = self.dropout(x)
        x = self.fc2(x)
        return x

    def init_hidden(self, batch_size):
        hidden_states = []
        for hidden_dim in self.hidden_dims:
            weight = next(self.parameters()).data
            hidden_states.append((weight.new(2, batch_size, hidden_dim).zero_(),
                                  weight.new(2, batch_size, hidden_dim).zero_()))
        return hidden_states


In [207]:
import torch
import torch.nn as nn


embedding_dim = 100 
hidden_dims = [50, 20] 
output_dim = 3

model = SentimentBiLSTM(embedding_dim, hidden_dims, output_dim)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)


In [211]:
def train_model(model, train_loader, criterion, optimizer, n_epochs, device):
    model.to(device) 
    model.train()

    for epoch in range(n_epochs):
        total_loss = 0

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            labels = torch.tensor(labels, device=device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        average_loss = total_loss / len(train_loader)
        print(f'Epoch {epoch+1}/{n_epochs}, Loss: {average_loss:.4f}')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

n_epochs = 50
train_model(model, train_loader, criterion, optimizer, n_epochs, device)


  return torch.tensor(embedding, dtype=torch.float), torch.tensor(label, dtype=torch.long)
  labels = torch.tensor(labels, device=device)


Epoch 1/50, Loss: 1.0292
Epoch 2/50, Loss: 1.0268
Epoch 3/50, Loss: 1.0263
Epoch 4/50, Loss: 1.0271
Epoch 5/50, Loss: 1.0256
Epoch 6/50, Loss: 1.0258
Epoch 7/50, Loss: 1.0271
Epoch 8/50, Loss: 1.0261
Epoch 9/50, Loss: 1.0266
Epoch 10/50, Loss: 1.0246
Epoch 11/50, Loss: 1.0263
Epoch 12/50, Loss: 1.0245
Epoch 13/50, Loss: 1.0245
Epoch 14/50, Loss: 1.0255
Epoch 15/50, Loss: 1.0259
Epoch 16/50, Loss: 1.0263
Epoch 17/50, Loss: 1.0246
Epoch 18/50, Loss: 1.0257
Epoch 19/50, Loss: 1.0300
Epoch 20/50, Loss: 1.0256
Epoch 21/50, Loss: 1.0254
Epoch 22/50, Loss: 1.0257
Epoch 23/50, Loss: 1.0264
Epoch 24/50, Loss: 1.0259
Epoch 25/50, Loss: 1.0464
Epoch 26/50, Loss: 1.0261
Epoch 27/50, Loss: 1.0247
Epoch 28/50, Loss: 1.0250
Epoch 29/50, Loss: 1.0255
Epoch 30/50, Loss: 1.0260
Epoch 31/50, Loss: 1.0275
Epoch 32/50, Loss: 1.0253
Epoch 33/50, Loss: 1.0259
Epoch 34/50, Loss: 1.0249
Epoch 35/50, Loss: 1.0261
Epoch 36/50, Loss: 1.0253
Epoch 37/50, Loss: 1.0255
Epoch 38/50, Loss: 1.0253
Epoch 39/50, Loss: 1.

In [210]:
from sklearn.metrics import f1_score

def evaluate_model(model, dev_loader, device):
    model.eval()  
    predictions = []
    actuals = []

    with torch.no_grad():
        for inputs, labels in dev_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            labels = torch.tensor(labels, device=device)
            
            outputs = model(inputs)
            predicted = torch.argmax(outputs, dim=1)

            predictions.extend(predicted.view(-1).cpu().numpy())
            actuals.extend(labels.view(-1).cpu().numpy())

    return predictions, actuals

predictions, actuals = evaluate_model(model, dev_loader, device)

f1 = f1_score(actuals, predictions, average = 'micro') 
print("F1 Score:", f1)


  return torch.tensor(embedding, dtype=torch.float), torch.tensor(label, dtype=torch.long)
  labels = torch.tensor(labels, device=device)


F1 Score: 0.5170340681362725


In [174]:
actuals

[0,
 2,
 1,
 0,
 0,
 2,
 2,
 0,
 2,
 1,
 0,
 2,
 0,
 0,
 0,
 1,
 2,
 2,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 2,
 0,
 1,
 0,
 2,
 0,
 2,
 1,
 0,
 0,
 0,
 2,
 0,
 2,
 0,
 2,
 2,
 0,
 2,
 2,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 1,
 2,
 2,
 0,
 1,
 1,
 1,
 0,
 2,
 1,
 1,
 2,
 1,
 0,
 0,
 2,
 0,
 2,
 0,
 0,
 0,
 1,
 1,
 0,
 2,
 0,
 2,
 1,
 0,
 2,
 0,
 0,
 2,
 2,
 0,
 2,
 1,
 1,
 0,
 2,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 2,
 2,
 1,
 0,
 0,
 0,
 2,
 0,
 2,
 0,
 0,
 0,
 2,
 0,
 1,
 2,
 0,
 1,
 0,
 1,
 2,
 2,
 1,
 0,
 2,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 2,
 2,
 2,
 2,
 1,
 2,
 1,
 0,
 2,
 0,
 1,
 2,
 1,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 2,
 0,
 2,
 2,
 1,
 1,
 2,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 2,
 2,
 1,
 0,
 0,
 2,
 0,
 0,
 0,
 2,
 0,
 0,
 2,
 0,
 1,
 2,
 0,
 0,
 1,
 1,
 0,
 0,
 2,
 0,
 1,
 1,
 1,
 2,
 0,
 1,
 1,
 0,
 0,
 2,
 0,
 1,
 1,
 0,
 1,
 0,
 2,
 0,
 0,
 0,
 2,
 0,
 2,
 2,
 1,
 0,
 2,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 2,
 2,
 2,
 0,
 1,
 1,
 0,
 2,
 2,
 0,
 2,
 1,
 0,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 1,
