### Run Emotion-English-DistilRoBERTa-base on multiple text documents

In [None]:
# install the transformers library
!pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 5.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 43.3 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.6 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 38.7 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 1.4 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
 

In [None]:
# import required packages
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer

# Create class for data preparation
class SimpleDataset:
    """Minimal indexable dataset over a batch of tokenized texts.

    ``tokenized_texts`` is a mapping from field name to a per-example
    sequence. It must contain an ``"input_ids"`` entry; the length of
    that entry defines the size of the dataset.
    """

    def __init__(self, tokenized_texts):
        # Keep a reference to the batch; nothing is copied or converted.
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        """Return the number of examples (length of ``input_ids``)."""
        input_ids = self.tokenized_texts["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        """Return example ``idx`` as ``{field: value_for_that_example}``."""
        example = {}
        for field, values in self.tokenized_texts.items():
            example[field] = values[idx]
        return example

In [None]:
# Load the tokenizer and the sequence-classification model from the same
# checkpoint name, then wrap the model in a Trainer.
model_name = "j-hartmann/emotion-english-distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# No training arguments or datasets are passed: in this notebook the trainer
# is only used to call predict() on the tokenized texts.
trainer = Trainer(model=model)

Downloading:   0%|          | 0.00/294 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.98k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/313M [00:00<?, ?B/s]

### **Option 1:** Create list of texts

In [None]:
# Example texts to classify; replace with your own
# (can be imported from .csv, .xls etc.)
pred_texts = [
    'I like that',
    'That is annoying',
    'This is great!',
    'Wouldn´t recommend it.',
]

### **Option 2:** Upload a file to the temporary Colab session storage

In [None]:
# Run this cell and select a file for upload.
# `files` is used again by the download cell at the end of the notebook.
from google.colab import files
# The return value of upload() is not stored; the next cell reads the CSV back
# by file name (presumably upload() saves it to the working directory — confirm).
files.upload()

MessageError: ignored

In [None]:
# name of the uploaded file and the column that holds the texts
file_name = "s21_posts.csv"  # note: you can right-click on your file and copy-paste the path to it here
text_column = "selftext"  # select the column in your csv that contains the text to be classified

# read the csv, drop rows whose text is missing, and keep the rest as a list of strings
df_pred = pd.read_csv(file_name)
pred_texts = (
    df_pred[text_column]
    .dropna()
    .astype('str')
    .tolist()
)

### **Option 3:** Connect to Google Drive and select file

In [None]:
# Make files stored on Google Drive accessible from this notebook.
from google.colab import drive
# Mounts Drive at /content/drive; Drive files are presumably reachable below
# this mount point (exact sub-folder layout is not shown here — confirm).
drive.mount('/content/drive')

In [None]:
# path of the csv file and the column that holds the texts
# NOTE(review): Drive is mounted at /content/drive in the previous cell, while this
# placeholder points directly below /content — make sure the path you paste here
# is the one of your file.
file_name = "/content/YOUR_FILENAME.csv"  # note: you can right-click on your file and copy-paste the path to it here
text_column = "text"  # select the column in your csv that contains the text to be classified

# read the csv, drop rows whose text is missing, and keep the rest as a list of strings
df_pred = pd.read_csv(file_name)
pred_texts = (
    df_pred[text_column]
    .dropna()
    .astype('str')
    .tolist()
)

### Classify texts with model

In [None]:
# Tokenize all texts in a single call (with truncation and padding enabled),
# then wrap the result so the trainer can index it example by example.
tokenized_texts = tokenizer(
    pred_texts,
    padding=True,
    truncation=True,
)
pred_dataset = SimpleDataset(tokenized_texts)

In [None]:
# Run the model over every item of pred_dataset.
# The result is accessed below as predictions.predictions / predictions[0];
# the following cells treat that array as raw class scores and apply a softmax to it.
predictions = trainer.predict(pred_dataset)

In [None]:
# Transform predictions to labels
# Raw class scores; assumed 2-D (one row per text) — the axis arguments below rely on it.
logits = predictions.predictions
preds = logits.argmax(-1)  # index of the highest-scoring class per row
labels = pd.Series(preds).map(model.config.id2label)  # class index -> label name
# Softmax over the class axis. Subtracting the row maximum first leaves the
# result mathematically unchanged but keeps np.exp from overflowing on large
# logits; the exponentials are computed once and reused.
shifted_logits = logits - logits.max(-1, keepdims=True)
exp_logits = np.exp(shifted_logits)
scores = (exp_logits / exp_logits.sum(-1, keepdims=True)).max(1)  # probability of the predicted class

In [None]:
# scores raw
# Full softmax matrix over the class axis (assumed layout: one row per text,
# one column per class). The row maximum is subtracted before exponentiating
# so np.exp cannot overflow; the result is mathematically the same softmax.
raw_logits = predictions.predictions
stable_exp = np.exp(raw_logits - raw_logits.max(-1, keepdims=True))
temp = stable_exp / stable_exp.sum(-1, keepdims=True)

In [None]:
# work in progress: split the score matrix into one list per emotion
# Per-emotion score lists, one entry per text in pred_texts.
# Column positions are looked up from the model's own id2label mapping instead
# of being hard-coded, so each list is guaranteed to hold the class it is named
# after (a missing label raises KeyError rather than silently mislabelling).
column_of = {label: idx for idx, label in model.config.id2label.items()}

# extract scores (as many entries as exist in pred_texts)
class_scores = temp[:len(pred_texts)]

# Slice whole columns instead of appending row by row.
anger = list(class_scores[:, column_of["anger"]])
disgust = list(class_scores[:, column_of["disgust"]])
fear = list(class_scores[:, column_of["fear"]])
joy = list(class_scores[:, column_of["joy"]])
neutral = list(class_scores[:, column_of["neutral"]])
sadness = list(class_scores[:, column_of["sadness"]])
surprise = list(class_scores[:, column_of["surprise"]])

In [None]:
# Assemble the results table: one row per text with its predicted class index,
# label, top score and the seven per-emotion scores.
result_columns = ['text', 'pred', 'label', 'score', 'anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']
result_rows = list(
    zip(pred_texts, preds, labels, scores, anger, disgust, fear, joy, neutral, sadness, surprise)
)
df = pd.DataFrame(result_rows, columns=result_columns)
df.head()

Unnamed: 0,text,pred,label,score,anger,disgust,fear,joy,neutral,sadness,surprise
0,My dining halls closed and im so thirsty but i...,5,sadness,0.570498,0.035763,0.006711,0.01321,0.005862,0.099877,0.570498,0.268078
1,Hi! I'm currently a senior in hs who got rejec...,5,sadness,0.250588,0.082646,0.014582,0.23832,0.026051,0.234978,0.250588,0.152835
2,Hey! Do you guys what kind of on campus jobs a...,3,joy,0.471242,0.071377,0.009663,0.026385,0.471242,0.254838,0.01,0.156496
3,Sigh...this is literally such a tell-tale sign...,5,sadness,0.735479,0.010508,0.131132,0.022837,0.00252,0.078439,0.735479,0.019086
4,Are you living with a romantic partner?\n\nYou...,4,neutral,0.859578,0.017785,0.004787,0.002489,0.073649,0.859578,0.00573,0.035982


### Export results

In [None]:
# Save the results table to a csv file.
YOUR_FILENAME = "YOUR_FILENAME_EMOTIONS.csv"  # name your output file (the download cell reuses this name)
# NOTE: to_csv is called with its default options, so pandas also writes the
# row index as an unnamed first column; pass index=False if that is not wanted.
df.to_csv(YOUR_FILENAME)

In [None]:
# download file
# Import here so this cell also works when the upload cell (Option 2) — the only
# other place that imports `files` — was not run in this session.
from google.colab import files
files.download(YOUR_FILENAME)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>