In [1]:
import torch.nn as nn
import torch.nn.functional as F
import torch
from datasets import load_metric
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from torch.utils.data import (
    DataLoader, TensorDataset
) 
from sklearn.model_selection import train_test_split
import scipy.sparse



In [2]:
#loading train dataset
train_data = pd.read_csv('split/x_train.csv')
val_data = pd.read_csv('split/x_val.csv')
test_data = pd.read_csv('split/x_test.csv')

train_data = train_data.dropna(subset = ['sar_text'])
train_data = train_data.reset_index(drop=True)

val_data = val_data.dropna(subset = ['sar_text'])
val_data = val_data.reset_index(drop=True)

test_data = test_data.dropna(subset = ['sar_text'])
test_data = test_data.reset_index(drop=True)

In [3]:
users = []
train_users = []

for i in range(len(train_data['sar_user'])):
    user = train_data['sar_user'][i].split('|')
    users.append(int(user[-1]))
    train_users.append(int(user[-1]))
    
for i in range(len(val_data['sar_user'])):
    user = val_data['sar_user'][i].split('|')
    users.append(int(user[-1]))
    
for i in range(len(test_data['sar_user'])):
    user = test_data['sar_user'][i].split('|')
    users.append(int(user[-1]))
    
#number of disctinct users
users = list(dict.fromkeys(users))
train_users = list(dict.fromkeys(train_users))
num_users = len(users)
print(num_users)

22622


In [5]:
#helper function 
def str_to_arr(string_array):
    string_array = string_array.replace("[", "")
    string_array = string_array.replace("]", "")
    string_array = string_array.replace("\n", "")
    string_array = string_array.replace("  ", " ")
    string_array = string_array.split(' ')

    for i in range(768):
        if string_array[i] == '':
            string_array.pop(i)
            
    if len(string_array) != 768:
        print("Duzina string arraya ", len(string_array))
    
    return [float(string_number) for string_number in string_array]

In [15]:
#loading embeddings (for all the users)
file = open('/../../../data/sarcasm_data/user_embeddings_3_final.csv')

embedding = []
tweets_to_users = {}
tweet_to_id = {}

num_tweets = 0
for line in file:
    tweet_id, tweet_embedding, user_id = line.split(',')
    
    if int(user_id) in users and tweet_id not in tweets_to_users:
    #    embedding.extend(str_to_arr(tweet_embedding))
        tweets_to_users[tweet_id] = user_id
        tweet_to_id[tweet_id] = len(tweet_to_id)
        num_tweets += 1

In [16]:
print(len(tweets_to_users))
print(len(tweet_to_id))
print(num_tweets)

2246293
2246293
2246293


In [19]:
#extracting users from the history folder
history_users = list(tweets_to_users.values())
history_users = list(dict.fromkeys(history_users))

i = 0
for user in history_users:
    history_users[i] = int(user.replace('\n',''))
    i += 1

In [20]:
len(history_users)

22315

In [21]:
#all the tweets to users
with open('tweets_to_users.pkl', 'wb') as f:
    pickle.dump(tweets_to_users, f)
    
#all the tweets to ids
with open('tweets_to_ids.pkl', 'wb') as f:
    pickle.dump(tweet_to_id, f)

In [44]:
#creating embedding layer (if it doesn't exist)
embedding = np.reshape(embedding, (-1, 768))
np.save('embdedding_matrix.npy',embedding)
final_embedding = torch.from_numpy(embedding)
embedding_layer = nn.Embedding.from_pretrained(final_embedding)

In [22]:
#loading the embedding layer
with open('embdedding_matrix.npy', 'rb') as f:
    embeddings = np.load(f)
final_embedding = torch.from_numpy(embeddings)
embedding_layer = nn.Embedding.from_pretrained(final_embedding)

In [23]:
embedding_layer

Embedding(2246293, 768)

In [24]:
#

In [53]:
#creating labels for training authors
author_to_label = {}

i = 0
for user in train_users:    
    if user not in author_to_label and user in history_users:
        author_to_label[user] = i
        i += 1
        
print(len(author_to_label))
        
#creating labels for the rest of the authors
for user in users:    
    if user not in author_to_label and user in history_users:
        author_to_label[user] = i
        i += 1
        
print(len(author_to_label))

15329
22315


In [54]:
with open('author_to_label.pkl', 'wb') as f:
    pickle.dump(author_to_label, f)

In [33]:
#just the users from training
all_labels = []
all_tweets = []

for tweet_id in tweets_to_users:
    
    user = int(tweets_to_users[tweet_id].replace('\n',''))
    
    if user in train_users and user in history_users:

        all_labels.append(author_to_label[user])
        all_tweets.append(tweet_to_id[tweet_id])

In [34]:
len(all_tweets)

1548879

In [35]:
len(all_labels)

1548879

In [65]:
#train test split
batch_size = 64
SEED = 1234

train_tweets, val_tweets, train_labels, val_labels = train_test_split(all_tweets, all_labels, test_size=0.2)

train_dataset = TensorDataset(torch.from_numpy(np.asarray(train_tweets)), torch.from_numpy(np.asarray(train_labels)))
train_loader = DataLoader(train_dataset, shuffle=False, batch_size=batch_size, drop_last=True)

val_dataset = TensorDataset(torch.from_numpy(np.asarray(val_tweets)), torch.from_numpy(np.asarray(val_labels)))
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size, drop_last=True)

In [36]:
is_cuda = torch.cuda.is_available()

if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")
    
torch.cuda.set_device(1)
print(torch.cuda.current_device())

GPU is available
1


In [37]:
#2LL Network
class MLPAttribution(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.linear1 = nn.Linear(input_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()
        
    def forward(self, input):
        output = self.relu(self.linear1(input))
        return self.linear2(F.dropout(output, p=0.2, training=self.training))

In [38]:
#helper function
def evaluate_mlp(model, dataloader):
    accuracy_metric = load_metric("accuracy")
    f1_macro = load_metric("f1")
    f1_weighted = load_metric("f1")
   
    model.eval()    
    
    model = model.float()
    for batch in dataloader:
        with torch.no_grad():
            input = batch[0]
            labels = batch[1].to(device)
            embedding = embedding_layer(input).to(device)
            logits = model(embedding.float())

        predictions = torch.argmax(logits, dim=-1)
        accuracy_metric.add_batch(predictions=predictions, references=labels)
        f1_macro.add_batch(predictions=predictions, references=labels)
        f1_weighted.add_batch(predictions=predictions, references=labels)

    return {'accuracy': accuracy_metric.compute()['accuracy'], 'f1_score_macro': f1_macro.compute(average="macro")['f1'], 
        'f1_score_weighted': f1_weighted.compute(average="weighted")['f1']}

In [39]:
#number of users in train
output_dim = len(list(dict.fromkeys(all_labels)))
print(output_dim)

15329


In [71]:
model = MLPAttribution(768, 384, output_dim).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

In [72]:
num_epochs = 100
num_training_steps = num_epochs * len(train_loader)
best_accuracy = 0
best_f1 = 0
samples_per_class = torch.bincount(torch.tensor(all_labels)).tolist()

In [74]:
def CB_loss(labels, logits, samples_per_cls, no_of_classes, loss_type, beta, gamma):
    """Compute the Class Balanced Loss between `logits` and the ground truth `labels`.
    Class Balanced Loss: ((1-beta)/(1-beta^n))*Loss(labels, logits)
    where Loss is one of the standard losses used for Neural Networks.
    Args:
      labels: A int tensor of size [batch].
      logits: A float tensor of size [batch, no_of_classes].
      samples_per_cls: A python list of size [no_of_classes].
      no_of_classes: total number of classes. int
      loss_type: string. One of "sigmoid", "focal", "softmax".
      beta: float. Hyperparameter for Class balanced loss.
      gamma: float. Hyperparameter for Focal loss.
    Returns:
      cb_loss: A float tensor representing class balanced loss
    """
    effective_num = 1.0 - np.power(beta, samples_per_cls)
    weights = (1.0 - beta) / np.array(effective_num)
    weights = weights / np.sum(weights) * no_of_classes

    labels_one_hot = F.one_hot(labels, no_of_classes).float()

    weights = torch.tensor(weights, dtype=torch.float32).to(device)
    weights = weights.unsqueeze(0)
    weights = weights.repeat(labels_one_hot.shape[0], 1) * labels_one_hot
    weights = weights.sum(1)
    weights = weights.unsqueeze(1)
    weights = weights.repeat(1, no_of_classes)

    if loss_type == "focal":
        cb_loss = focal_loss(labels_one_hot, logits, weights, gamma)
    elif loss_type == "sigmoid":
        cb_loss = F.binary_cross_entropy_with_logits(input=logits, target=labels_one_hot, weight=weights)
    elif loss_type == "softmax":
        pred = logits.softmax(dim=1)
        cb_loss = F.binary_cross_entropy(input=pred, target=labels_one_hot, weight=weights)

    return cb_loss

In [40]:
def loss_fn(output, targets, samples_per_cls, no_of_classes=2, loss_type = "softmax"):
    beta = 0.9999
    gamma = 2.0

    return CB_loss(targets, output, samples_per_cls, no_of_classes, loss_type, beta, gamma)

In [76]:
model = model.float()

for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        input = batch[0]
        labels = batch[1].to(device)
        embedding = embedding_layer(input).to(device)
        output = model(embedding.float())
        pred = output.softmax(dim=1)
        loss = loss_fn(output, labels, samples_per_class, output_dim)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    val_metric = evaluate_mlp(model, val_loader)

    print("Epoch {} **** Loss {} **** Metrics validation: {}".format(epoch, loss, val_metric))
    if val_metric['f1_score_weighted'] > best_f1:
        best_f1 = val_metric['f1_score_weighted']
        torch.save(model.state_dict(), './state_dict.pt')

Epoch 0 **** Loss 0.0006693075411021709 **** Metrics validation: {'accuracy': 0.0007360537190082645, 'f1_score_macro': 3.448788729577643e-05, 'f1_score_weighted': 2.8575128947129812e-05}
Epoch 1 **** Loss 0.0006280039087869227 **** Metrics validation: {'accuracy': 0.0027634297520661155, 'f1_score_macro': 0.0004702338373286316, 'f1_score_weighted': 0.0004551465427733109}
Epoch 2 **** Loss 0.0006047406350262463 **** Metrics validation: {'accuracy': 0.006230630165289256, 'f1_score_macro': 0.0016503023089930958, 'f1_score_weighted': 0.0016663771935469317}
Epoch 3 **** Loss 0.0005875469068996608 **** Metrics validation: {'accuracy': 0.009581611570247934, 'f1_score_macro': 0.0029144205169197718, 'f1_score_weighted': 0.002999214433726065}
Epoch 4 **** Loss 0.0005787684349343181 **** Metrics validation: {'accuracy': 0.01306172520661157, 'f1_score_macro': 0.0045341420979390845, 'f1_score_weighted': 0.004646668852316484}
Epoch 5 **** Loss 0.0005670087994076312 **** Metrics validation: {'accuracy