In [2]:
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
from scipy.special import softmax
from os.path import join
from torch import cuda

import numpy as np
import torch as th
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the raw tweet export (path is relative to the notebook's working directory).
df = pd.read_csv('data/tweets/IchBinHannaUser.csv')

# Render the timestamp as a fixed-width "YYYY-MM-DD HH:MM:SS" string; with this
# layout plain string comparison orders the rows chronologically.
df['new_date'] = pd.to_datetime(df['created_at']).dt.strftime('%Y-%m-%d %H:%M:%S')

# Keep tweets inside the study window (start exclusive, end inclusive) that
# are not retweets.
in_window = (df['new_date'] > '2021-06-01 00:00:00') & (df['new_date'] <= '2021-09-30 23:59:59')
not_retweet = df['reference_type'] != 'retweeted'

# .copy() materialises the filtered rows as an independent frame, so the column
# assignment below writes to a real DataFrame instead of a slice of the raw
# export (the original relied on chained assignment, whose warning was hidden
# by the blanket warnings filter).
df = df.loc[in_window & not_retweet].copy()
df['text'] = df['text'].astype(str)

# English-only subset; copied so later cells can modify it independently of df.
df_en = df.loc[df['lang'] == 'en'].copy()

In [4]:
# Hugging Face model identifier of the emotion classifier; it is the same
# string this notebook hands to `from_pretrained` for the model and tokenizer.
checkpoint = "lumalik/vent-roberta-emotion"

class InferenceDataset(Dataset):
    '''
    Map-style dataset built on the basic PyTorch "Dataset" class (see
    https://pytorch.org/tutorials/beginner/basics/data_tutorial.html) that
    tokenizes one DataFrame row per lookup. The DataFrame handed to the
    constructor must provide a field "text".

    Each item is a dict with the tensors "input_ids", "attention_mask" and
    "token_type_ids" (dtype long), padded/truncated to ``max_token_len``.
    '''

    def __init__(self, data, tokenizer, max_token_len):
        self.data, self.tokenizer, self.max_token_len = data, tokenizer, max_token_len

    def __len__(self):
        # One sample per DataFrame row.
        return len(self.data)

    def __getitem__(self, index):
        # Positional lookup, so the DataFrame's own index labels are irrelevant.
        text = self.data.iloc[index].text

        encoding = self.tokenizer.encode_plus(
            text=text,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=True,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
        )

        # Convert the tokenizer fields to long tensors, keeping this key order.
        wanted = ("input_ids", "attention_mask", "token_type_ids")
        return {key: th.tensor(encoding[key], dtype=th.long) for key in wanted}
    
# Load the pretrained, fine-tuned model and its byte-level tokenizer. Both use
# the `checkpoint` identifier defined at the top of this cell instead of
# repeating the string literal.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Wrap the English tweets in a Dataset / DataLoader.
# NOTE: the batch size 512 is more geared towards processing on a GPU. If
# memory errors occur, it makes sense to reduce the batch size.
# shuffle=False keeps predictions in the same order as the rows of df_en,
# which the id/emotion pairing at the end of this cell relies on.
inference_set = InferenceDataset(df_en, tokenizer, max_token_len=128)
inference_params = {'batch_size': 512, 'shuffle': False}
inference_loader = DataLoader(inference_set, **inference_params)

# These are dummy arguments; no training happens in this notebook, the
# Trainer is only used as a convenient prediction driver.
training_args = TrainingArguments(
    "test-trainer",
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    num_train_epochs = 5,
    learning_rate = 2e-5,
    weight_decay = 0.01,
    evaluation_strategy = "epoch"
)

# trainer object to perform the prediction
trainer = Trainer(
        model,
        training_args,
        tokenizer = tokenizer,
)

# Informational only: nothing below moves data to this device explicitly.
device = 'cuda' if cuda.is_available() else 'cpu'
print("using device: {}".format(device))

# raw_pred contains the logits of each emotion label, one row per tweet.
# NOTE(review): prediction_loop looks like a legacy transformers API; confirm
# it exists in the installed version (Trainer.predict is the documented entry
# point, but it would use per_device_eval_batch_size instead of 512).
raw_pred, _, _ = trainer.prediction_loop(inference_loader,
                                         description="prediction")

em_dict = {0:'Affection', 1:'Anger', 2:'Fear', 3:'Happiness', 4:'Sadness'}

# Normalise per tweet (axis=1). Without an axis, scipy's softmax normalises
# over the whole logits matrix: the values are then not per-tweet
# probabilities, and a row far below the global maximum can underflow to all
# zeros, which would make argmax silently return class 0.
probabilities = softmax(raw_pred, axis=1)
emotions = np.argmax(probabilities, axis=1)
emotions = [em_dict[e] for e in emotions]

# Pair each tweet id with its predicted emotion (same row order as df_en).
results_en = pd.DataFrame({'id':df_en['id'], 'em':emotions})

Some weights of the model checkpoint at lumalik/vent-roberta-emotion were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
***** Running prediction *****
  Num examples = 2622
  Batch size = 512


using device: cpu


KeyboardInterrupt: 

In [None]:
# Attach the predicted emotion to the English tweets; the join uses the
# columns both frames have in common.
df_en = pd.merge(df_en, results_en)

In [5]:
# Number of English tweets per predicted emotion.
df_en.loc[:, 'em'].value_counts()

Anger        959
Happiness    713
Affection    661
Fear         235
Sadness       54
Name: em, dtype: int64

In [6]:
# Keep only the tweet text and its predicted emotion for the export.
df_out = df_en.loc[:, ['text', 'em']]

In [7]:
# Export for further investigation: the CSV is written into the zip archive
# 'out.zip' under the member name 'vent-roberta-emotion.csv'; the DataFrame
# index is kept as the first column.
compression_opts = {'method': 'zip', 'archive_name': 'vent-roberta-emotion.csv'}
df_out.to_csv('out.zip', index=True, compression=compression_opts)

In [8]:
    
# Same pipeline as the English-only cell, applied to every tweet in `df`.
# Load the pretrained, fine-tuned model and its byte-level tokenizer via the
# `checkpoint` identifier defined earlier in the notebook.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Wrap all tweets in a Dataset / DataLoader.
# NOTE: the batch size 512 is more geared towards processing on a GPU. If
# memory errors occur, it makes sense to reduce the batch size.
# shuffle=False keeps predictions in the same order as the rows of df,
# which the id/emotion pairing at the end of this cell relies on.
inference_set = InferenceDataset(df, tokenizer, max_token_len=128)
inference_params = {'batch_size': 512, 'shuffle': False}
inference_loader = DataLoader(inference_set, **inference_params)

# These are dummy arguments; no training happens in this notebook, the
# Trainer is only used as a convenient prediction driver.
training_args = TrainingArguments(
    "test-trainer",
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    num_train_epochs = 5,
    learning_rate = 2e-5,
    weight_decay = 0.01,
    evaluation_strategy = "epoch"
)

# trainer object to perform the prediction
trainer = Trainer(
        model,
        training_args,
        tokenizer = tokenizer,
)

# Informational only: nothing below moves data to this device explicitly.
device = 'cuda' if cuda.is_available() else 'cpu'
print("using device: {}".format(device))

# raw_pred contains the logits of each emotion label, one row per tweet.
# NOTE(review): prediction_loop looks like a legacy transformers API; confirm
# it exists in the installed version (Trainer.predict is the documented entry
# point, but it would use per_device_eval_batch_size instead of 512).
raw_pred, _, _ = trainer.prediction_loop(inference_loader,
                                         description="prediction")

em_dict = {0:'Affection', 1:'Anger', 2:'Fear', 3:'Happiness', 4:'Sadness'}

# Normalise per tweet (axis=1). Without an axis, scipy's softmax normalises
# over the whole logits matrix: the values are then not per-tweet
# probabilities, and a row far below the global maximum can underflow to all
# zeros, which would make argmax silently return class 0.
probabilities = softmax(raw_pred, axis=1)
emotions = np.argmax(probabilities, axis=1)
emotions = [em_dict[e] for e in emotions]

# Pair each tweet id with its predicted emotion (same row order as df).
results = pd.DataFrame({'id':df['id'], 'em':emotions})

loading configuration file https://huggingface.co/lumalik/vent-roberta-emotion/resolve/main/config.json from cache at C:\Users\Admin/.cache\huggingface\transformers\128f89e8ec1fc72c5cf6c72711f17c28666a66c2ee7994c1095b0d19d92ae266.ef05c7dc35ead262c4dc7f15bbf8b0c1fa804e1aa98146fe6768906814cd71f2
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "Affection",
    "1": "Anger",
    "2": "Fear",
    "3": "Happiness",
    "4": "Sadness"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "Affection": 0,
    "Anger": 1,
    "Fear": 2,
    "Happiness": 3,
    "Sadness": 4
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "mode

using device: cuda


In [9]:
# Attach the predicted emotion to every tweet (the join uses the columns both
# frames have in common), then show how many tweets fall into each class.
df = pd.merge(df, results)
df.loc[:, 'em'].value_counts()

Anger        12321
Happiness     5360
Affection     3306
Fear           821
Sadness        116
Name: em, dtype: int64