In [1]:
import pickle

import numpy as np
import pandas as pd
import scipy.sparse
import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import load_metric
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from torch.utils.data import (
    DataLoader, TensorDataset
)



In [22]:
# Read the saved embedding matrix from disk and wrap it in an nn.Embedding
# lookup layer (from_pretrained keeps the weights frozen by default).
with open('embdedding_matrix.npy', 'rb') as matrix_file:
    embeddings = np.load(matrix_file)

final_embedding = torch.from_numpy(embeddings)
embedding_layer = nn.Embedding.from_pretrained(final_embedding)

In [23]:
# Display the layer's repr to check its (num_embeddings, embedding_dim) size.
embedding_layer

Embedding(2246293, 768)

In [67]:
#

In [37]:
class MLPAttribution(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> dropout(p=0.2) -> Linear.

    Args:
        input_dim: size of each input feature vector.
        hidden_dim: width of the hidden layer.
        output_dim: number of output logits.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.linear1 = nn.Linear(input_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()

    def forward(self, input):
        """Return the raw logits for `input`; dropout is applied only in training mode."""
        hidden = self.linear1(input)
        hidden = self.relu(hidden)
        hidden = F.dropout(hidden, p=0.2, training=self.training)
        return self.linear2(hidden)

In [38]:
def evaluate_mlp(model, dataloader):
    """Score `model` on every batch of `dataloader`.

    Each batch is indexed as batch[0] (ids fed to the module-level
    `embedding_layer`) and batch[1] (reference labels). The model is switched
    to eval mode and cast to float32 in place; tensors are moved to the
    module-level `device`.

    Returns:
        dict with keys 'accuracy', 'f1_score_macro' and 'f1_score_weighted'.
    """
    acc_tracker = load_metric("accuracy")
    macro_tracker = load_metric("f1")
    weighted_tracker = load_metric("f1")

    model.eval()
    model = model.float()

    for batch in dataloader:
        with torch.no_grad():
            token_ids = batch[0]
            gold = batch[1].to(device)
            vectors = embedding_layer(token_ids).to(device)
            logits = model(vectors.float())

        predicted = logits.argmax(dim=-1)
        # Every metric accumulates the same predictions/references pair.
        for tracker in (acc_tracker, macro_tracker, weighted_tracker):
            tracker.add_batch(predictions=predicted, references=gold)

    accuracy = acc_tracker.compute()['accuracy']
    macro_f1 = macro_tracker.compute(average="macro")['f1']
    weighted_f1 = weighted_tracker.compute(average="weighted")['f1']

    return {
        'accuracy': accuracy,
        'f1_score_macro': macro_f1,
        'f1_score_weighted': weighted_f1,
    }

In [74]:
def _focal_loss(labels, logits, alpha, gamma):
    """Compute the alpha-weighted sigmoid focal loss.

    Args:
      labels: A float tensor of size [batch, no_of_classes] (one-hot targets).
      logits: A float tensor of size [batch, no_of_classes].
      alpha: A float tensor broadcastable to `logits`, the per-element weight.
      gamma: float. Focusing parameter; 0 disables the modulating factor.
    Returns:
      A scalar tensor: the summed weighted loss divided by the sum of `labels`.
    """
    bce = F.binary_cross_entropy_with_logits(input=logits, target=labels, reduction="none")
    if gamma == 0.0:
        modulator = 1.0
    else:
        # softplus(-x) equals log(1 + exp(-x)) but does not overflow for large |x|.
        modulator = torch.exp(-gamma * labels * logits - gamma * F.softplus(-logits))
    return torch.sum(alpha * modulator * bce) / torch.sum(labels)


def CB_loss(labels, logits, samples_per_cls, no_of_classes, loss_type, beta, gamma):
    """Compute the Class Balanced Loss between `logits` and the ground truth `labels`.
    Class Balanced Loss: ((1-beta)/(1-beta^n))*Loss(labels, logits)
    where Loss is one of the standard losses used for Neural Networks.
    Args:
      labels: A int tensor of size [batch].
      logits: A float tensor of size [batch, no_of_classes].
      samples_per_cls: A python list of size [no_of_classes].
      no_of_classes: total number of classes. int
      loss_type: string. One of "sigmoid", "focal", "softmax".
      beta: float. Hyperparameter for Class balanced loss.
      gamma: float. Hyperparameter for Focal loss.
    Returns:
      cb_loss: A float tensor representing class balanced loss
    Raises:
      ValueError: if `loss_type` is not one of the supported names.
    """
    if loss_type not in ("focal", "sigmoid", "softmax"):
        raise ValueError(
            'loss_type must be one of "sigmoid", "focal", "softmax"; got %r' % (loss_type,)
        )

    # Per-class weight (1 - beta) / (1 - beta^n), rescaled so the weights sum
    # to no_of_classes.
    effective_num = 1.0 - np.power(beta, samples_per_cls)
    class_weights = (1.0 - beta) / np.array(effective_num)
    class_weights = class_weights / np.sum(class_weights) * no_of_classes

    # Follow the logits' device and dtype instead of relying on a global `device`.
    labels_one_hot = F.one_hot(labels, no_of_classes).to(device=logits.device, dtype=logits.dtype)
    class_weights = torch.tensor(class_weights, dtype=logits.dtype).to(logits.device)

    # Pick each sample's weight (that of its true class) and copy it across all
    # class columns so it lines up element-wise with the logits.
    weights = (labels_one_hot * class_weights.unsqueeze(0)).sum(dim=1, keepdim=True)
    weights = weights.repeat(1, no_of_classes)

    if loss_type == "focal":
        cb_loss = _focal_loss(labels_one_hot, logits, weights, gamma)
    elif loss_type == "sigmoid":
        cb_loss = F.binary_cross_entropy_with_logits(input=logits, target=labels_one_hot, weight=weights)
    else:  # "softmax"
        pred = logits.softmax(dim=1)
        cb_loss = F.binary_cross_entropy(input=pred, target=labels_one_hot, weight=weights)

    return cb_loss

In [40]:
def loss_fn(output, targets, samples_per_cls, no_of_classes=2, loss_type = "softmax"):
    """Class-balanced loss of `output` against `targets` with fixed hyperparameters.

    Forwards to CB_loss with beta=0.9999 and gamma=2.0; note that CB_loss takes
    the labels first and the logits second.
    """
    hyperparams = {"beta": 0.9999, "gamma": 2.0}
    return CB_loss(targets, output, samples_per_cls, no_of_classes, loss_type, **hyperparams)

In [3]:
#

In [42]:
# Load the two tweet-keyed lookup tables produced upstream.
# NOTE(security): pickle.load can execute arbitrary code; only open files that
# come from a trusted source.
with open('tweets_to_ids.pkl', 'rb') as handle:
    tweets_to_ids = pickle.load(handle)

with open('tweets_to_users.pkl', 'rb') as handle:
    tweets_to_users = pickle.load(handle)

In [43]:
# Print the size of each mapping on its own line so they can be compared.
print(len(tweets_to_ids), len(tweets_to_users), sep='\n')

2246293
2246293


In [44]:
# Group tweet ids by author: user_tweets[user] is the list of ids (later used
# as row indices into `embeddings`) of every tweet written by that user.
user_tweets = {}

for tweet, raw_user in tweets_to_users.items():
    # The stored user id is text that may carry newline characters.
    user = int(raw_user.replace('\n', ''))

    # The loaded mapping is `tweets_to_ids`; the previous `tweet_to_id` name is
    # not defined anywhere in this notebook and fails on a fresh kernel.
    user_tweets.setdefault(user, []).append(tweets_to_ids[tweet])

In [None]:
import random

# Cap every author at MAX_TWEETS_PER_USER randomly chosen tweets.
# A dedicated, seeded generator makes the sample reproducible across kernel
# restarts without touching the global `random` state.
MAX_TWEETS_PER_USER = 100
SAMPLING_SEED = 0
_sampler = random.Random(SAMPLING_SEED)

for key in user_tweets:
    if len(user_tweets[key]) >= MAX_TWEETS_PER_USER:
        user_tweets[key] = _sampler.sample(user_tweets[key], MAX_TWEETS_PER_USER)

In [46]:
# Number of distinct authors collected in user_tweets.
len(user_tweets)

22315

In [47]:
# Pick the compute device. GPU_INDEX is the preferred CUDA device; it is only
# selected when CUDA is present and that index exists, so the cell also runs
# on CPU-only and single-GPU machines instead of raising.
GPU_INDEX = 1

is_cuda = torch.cuda.is_available()

if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
    if GPU_INDEX < torch.cuda.device_count():
        torch.cuda.set_device(GPU_INDEX)
    # Index of the CUDA device that the bare "cuda" device resolves to.
    print(torch.cuda.current_device())
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

GPU is available
1


In [48]:
# Number of train users; used below as the classifier's output dimension.
output_dim = 15329

In [49]:
# Rebuild the classifier and restore its saved weights.
model = MLPAttribution(768, 384, output_dim).to(device)

# map_location lets a checkpoint saved on one device load onto whatever
# `device` is here (for example a GPU checkpoint on a CPU-only host).
state_dict = torch.load('./state_dict.pt', map_location=device)

# strict=False tolerates key mismatches; report them so a checkpoint that does
# not match this architecture cannot silently leave layers randomly initialised.
load_report = model.load_state_dict(state_dict, strict=False)
if load_report.missing_keys or load_report.unexpected_keys:
    print("Checkpoint mismatch - missing keys:", load_report.missing_keys,
          "unexpected keys:", load_report.unexpected_keys)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Cast parameters to float64 in place - presumably to match the dtype of the
# embedding rows fed to the model later (TODO confirm). As the last expression,
# this also displays the model's repr.
model.double()

MLPAttribution(
  (linear1): Linear(in_features=768, out_features=384, bias=True)
  (linear2): Linear(in_features=384, out_features=15329, bias=True)
  (relu): ReLU()
)

In [50]:
def extract_batches(seq, batch_size=128):
    """Split `seq` into consecutive slices of at most `batch_size` items.

    All slices have exactly `batch_size` items except possibly the last one,
    which holds the remainder. An empty `seq` gives an empty list.
    """
    full_count, leftover = divmod(len(seq), batch_size)
    chunks = [seq[k * batch_size:(k + 1) * batch_size] for k in range(full_count)]
    if leftover != 0:
        chunks.append(seq[full_count * batch_size:])
    return chunks

In [63]:
# How per-user embeddings are built below: 'distribution' or 'prediction'
# (any other value makes the extraction cell raise).
embedding_type = 'prediction'

In [64]:
# Filled below: maps each author to the embedding computed from their tweets.
user_embeddings = {}

In [65]:
# Build one embedding per author from the classifier's outputs over that
# author's tweets:
#   'distribution' - mean of the L2-normalised output vectors over all tweets
#                    (stored as a NumPy array);
#   'prediction'   - fraction of tweets whose argmax falls on each output class
#                    (stored as a torch tensor, as before).
# The mode is validated once up front rather than inside the loop.
if embedding_type not in ('distribution', 'prediction'):
    raise ValueError("Wrong embedding type")

# Inference only: eval mode switches off the dropout in MLPAttribution.forward,
# which would otherwise randomly perturb every output.
model.eval()

for author in user_tweets:
    tweets = user_tweets[author]
    batches_tweets = extract_batches(tweets, 256)

    size = 0
    if embedding_type == 'distribution':
        embeddings_final = None  # running sum of the normalised outputs
    else:
        embeddings_final = torch.zeros(output_dim)  # per-class argmax counts

    for batch_ids in batches_tweets:
        # One array gather per batch instead of building a tensor from a Python
        # list of rows; assumes the ids are integer row indices into
        # `embeddings` - TODO confirm.
        batch = torch.from_numpy(embeddings[np.asarray(batch_ids)])
        size += batch.size()[0]

        with torch.no_grad():
            output = model(batch.to(device))

        if embedding_type == 'distribution':
            output = F.normalize(output, p=2, dim=1)
            # Accumulate the sum (not per-batch means) so a short final batch
            # is not over-weighted in the author's average.
            batch_sum = output.sum(dim=0).cpu()
            embeddings_final = batch_sum if embeddings_final is None else embeddings_final + batch_sum
        else:
            predictions = torch.argmax(output, dim=-1).cpu()
            counts = torch.bincount(predictions, minlength=output_dim)
            embeddings_final += counts.to(embeddings_final.dtype)

    if embedding_type == 'distribution':
        user_embeddings[author] = (embeddings_final / size).numpy()
    else:
        user_embeddings[author] = embeddings_final / size
        

In [70]:
# Persist the per-user embeddings. The file name records which embedding_type
# produced them, so a 'distribution' run is not saved under the 'prediction'
# name (for embedding_type == 'prediction' the name is unchanged).
with open(f'user_attribution_{embedding_type}.pickle', 'wb') as handle:
    pickle.dump(user_embeddings, handle, protocol=pickle.HIGHEST_PROTOCOL)