In [1]:
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
from scipy.special import softmax
from os.path import join
from torch import cuda

import numpy as np
import torch as th
import pandas as pd
from typing import List
import re

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the tweets, keep one tweet per unique text (the oldest one), drop
# retweets and split the remainder by language.
# NOTE: hashtags are left in the text; nothing in this cell strips them.
df = pd.read_csv('data/tweets/IchBinHanna_updated.csv')
df['new_date'] = pd.to_datetime(df['created_at']).dt.strftime('%Y-%m-%d %H:%M:%S')
# sort by date so that duplicate removal keeps the oldest tweet
df = df.sort_values(by='new_date')
df = df.drop_duplicates(subset=['text'], keep='first')
# .copy() so the column write below hits an independent frame, not a
# filtered selection of the previous one
df = df.loc[df['reference_type'] != 'retweeted'].copy()
df['text'] = df['text'].astype(str)
# Independent copies per language: a later cell adds a column to df_ger, which
# must not be a write onto a slice of df.
df_en = df.loc[df['lang'] == 'en'].copy()
df_ger = df.loc[df['lang'] == 'de'].copy()

In [3]:
class SentimentModel():
    """Sentiment classifier built on a Hugging Face sequence-classification checkpoint.

    Texts are normalised with ``clean_text`` before being tokenized and passed
    to the model; predictions are returned as the label strings stored in the
    model config (``id2label``).
    """

    # Anything that is not an ASCII letter, a German umlaut / eszett or a space.
    clean_chars = re.compile(r'[^A-Za-züöäÖÜÄß ]', re.MULTILINE)
    # "http" / "https" followed by the rest of the whitespace-delimited token.
    # In a raw string the whitespace class is written \S; a doubled backslash
    # would match a literal backslash followed by "S" and never hit a URL.
    clean_http_urls = re.compile(r'https*\S+', re.MULTILINE)
    # "@" followed by the rest of the whitespace-delimited token.
    clean_at_mentions = re.compile(r'@\S+', re.MULTILINE)

    # Digit -> German number word, each prefixed with a space so that digits
    # glued to a word are separated from it.
    _DIGIT_WORDS = str.maketrans({
        "0": " null", "1": " eins", "2": " zwei", "3": " drei", "4": " vier",
        "5": " fünf", "6": " sechs", "7": " sieben", "8": " acht", "9": " neun",
    })

    def __init__(self, model_name: str):
        """Load model and tokenizer for ``model_name`` and move the model to
        the GPU when one is available, otherwise keep it on the CPU."""
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.device = 'cuda' if cuda.is_available() else 'cpu'
        self.model.to(self.device)

    def predict_sentiment(self, texts: List[str]) -> List[str]:
        """Return one predicted label per input text.

        Args:
            texts: iterable of raw strings; each one is cleaned with
                ``clean_text`` before tokenization.

        Returns:
            The ``id2label`` entry of the highest-scoring class for every
            text, in input order. An empty input yields an empty list.
        """
        cleaned = [self.clean_text(text) for text in texts]
        if not cleaned:
            # nothing to tokenize; avoid handing an empty batch to the tokenizer
            return []
        # add_special_tokens takes care of adding [CLS], [SEP], <s>... tokens
        # in the right way for each model.
        encoded = self.tokenizer.batch_encode_plus(cleaned, padding=True, add_special_tokens=True,
                                                   truncation=True, return_tensors="pt")
        encoded = encoded.to(self.device)
        with th.no_grad():
            outputs = self.model(**encoded)

        # first element of the model output holds the per-class scores
        label_ids = th.argmax(outputs[0], dim=1)
        return [self.model.config.id2label[label_id.item()] for label_id in label_ids]

    def replace_numbers(self, text: str) -> str:
        """Replace every digit 0-9 with its German word, preceded by a space."""
        return text.translate(self._DIGIT_WORDS)

    def clean_text(self, text: str) -> str:
        """Normalise ``text`` for the model.

        Steps, in order: newlines become spaces, URLs and @-mentions are
        removed, digits are spelled out, every character that is not a letter
        or a space is dropped, whitespace runs are collapsed and the result is
        lower-cased.
        """
        text = text.replace("\n", " ")
        text = self.clean_http_urls.sub('', text)
        text = self.clean_at_mentions.sub('', text)
        text = self.replace_numbers(text)
        text = self.clean_chars.sub('', text)  # keep only letters and spaces
        text = ' '.join(text.split())  # collapse whitespace runs into single spaces
        return text.strip().lower()
# Instantiate the classifier from the German sentiment BERT checkpoint.
model = SentimentModel("oliverguhr/german-sentiment-bert")

In [4]:
# Predict in chunks of n rows so that we do not run out of memory.
n = 400
list_df_ger = [df_ger.iloc[start:start + n] for start in range(0, len(df_ger), n)]
preds = []
for chunk in list_df_ger:
    preds.extend(model.predict_sentiment(chunk['text']))

In [7]:
# Attach the predictions as a new column. assign() builds a new frame, so the
# write never lands on a filtered selection of df.
df_ger = df_ger.assign(bert_sentiment=preds)

In [8]:
# Export the German tweets with their predicted sentiment for further investigation.
# The archive gets its own file name: another export in this notebook also
# targets 'out.zip' and would overwrite this one.
compression_opts = dict(method='zip',
                        archive_name='IchBinHanna_German_Bert_Sentiment.csv')
df_ger.to_csv('IchBinHanna_German_Bert_Sentiment.zip', index=True,
              compression=compression_opts)

In [4]:
# Hub id of the emotion checkpoint used in this cell.
checkpoint = 'lumalik/vent-roberta-emotion'

class InferenceDataset(Dataset):
    '''
    PyTorch "Dataset" (see
    https://pytorch.org/tutorials/beginner/basics/data_tutorial.html) that
    tokenizes one DataFrame row per access. The DataFrame handed to the
    constructor must provide a field "text".
    '''

    def __init__(self, data, tokenizer, max_token_len):
        # data: DataFrame with a "text" field
        # tokenizer: object exposing encode_plus()
        # max_token_len: every sample is padded / truncated to this many tokens
        self.data, self.tokenizer, self.max_token_len = data, tokenizer, max_token_len

    def __len__(self):
        # one sample per DataFrame row
        return len(self.data)

    def __getitem__(self, index):
        # rows are addressed by position, not by index label
        row = self.data.iloc[index]
        encode_kwargs = dict(
            text=row.text,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=True,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
        )
        encoded = self.tokenizer.encode_plus(**encode_kwargs)

        # turn each encoded field into a long tensor
        fields = ("input_ids", "attention_mask", "token_type_ids")
        return {field: th.tensor(encoded[field], dtype=th.long) for field in fields}
    
# load the pretrained and fine-tuned model (same hub id as `checkpoint` above)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
# load the byte-level tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# load the data and stuff it into a Dataset container.
# NOTE: the batch size 512 is more geared towards processing on a GPU. If
# memory errors occur, it makes sense to reduce the batch size.
inference_set = InferenceDataset(df_en, tokenizer, max_token_len=128)
inference_params = {'batch_size': 512, 'shuffle': False}
inference_loader = DataLoader(inference_set, **inference_params)

# these are dummy arguments, they are (to my knowledge) not needed
# to perform the classification
training_args = TrainingArguments(
    "test-trainer",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch"
)

# trainer object to perform the prediction
trainer = Trainer(
    model,
    training_args,
    tokenizer=tokenizer,
)

# Printed for information only; `device` is not passed on to anything in this cell.
device = 'cuda' if cuda.is_available() else 'cpu'
print("using device: {}".format(device))

# raw_pred contains the logits of each emotion label (one row per tweet).
# NOTE(review): prediction_loop looks like the legacy Trainer loop; Trainer.predict
# appears to be the documented entry point - confirm against the installed version.
raw_pred, _, _ = trainer.prediction_loop(inference_loader,
                                         description="prediction")

em_dict = {0: 'Affection', 1: 'Anger', 2: 'Fear', 3: 'Happiness', 4: 'Sadness'}
# softmax per row (axis=1): without an axis scipy normalises over the whole
# matrix, which yields the same argmax but not per-tweet probabilities.
emotion_ids = np.argmax(softmax(raw_pred, axis=1), axis=1)
emotions = [em_dict[e] for e in emotion_ids]

results_en = pd.DataFrame({'id': df_en['id'], 'em': emotions})

Some weights of the model checkpoint at lumalik/vent-roberta-emotion were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
***** Running prediction *****
  Num examples = 2622
  Batch size = 512


using device: cpu


KeyboardInterrupt: 

In [None]:
# Join the predicted emotions onto the English tweets (pandas merges on the
# columns both frames share when no key is given).
df_en = pd.merge(df_en, results_en)

In [5]:
# Number of tweets per predicted emotion.
df_en.loc[:, 'em'].value_counts()

Anger        959
Happiness    713
Affection    661
Fear         235
Sadness       54
Name: em, dtype: int64

In [6]:
# Keep only the tweet text and its predicted emotion for the export.
df_out = df_en.loc[:, ['text', 'em']]

In [7]:
# Write text + predicted emotion to a zipped CSV for further investigation.
compression_opts = {'method': 'zip', 'archive_name': 'vent-roberta-emotion.csv'}
df_out.to_csv('out.zip', compression=compression_opts, index=True)