# Transformer-based classifier

[BETO, fine-tuned for emoji prediction](https://huggingface.co/ccarvajal/beto-emoji).

In [1]:
import csv
import urllib.request

import numpy as np
import torch
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer

In [2]:
# Preprocess text (username and link placeholders)
def preprocess(text):
    """Replace user mentions and links in ``text`` with fixed placeholders.

    The text is split on single spaces. Any token that starts with ``@`` and
    is longer than one character becomes ``@user``; any token that starts
    with ``http`` becomes ``http``. The tokens are then re-joined with single
    spaces, so runs of spaces in the input are preserved.
    """
    def _placeholder(token):
        # Mention check comes first; the link check then sees its result.
        if token.startswith('@') and len(token) > 1:
            token = '@user'
        if token.startswith('http'):
            token = 'http'
        return token

    return " ".join(_placeholder(piece) for piece in text.split(" "))

In [3]:
# Hub identifier of the fine-tuned model and the local folder used as a cache.
MODEL = "ccarvajal/beto-emoji"
folder = MODEL.replace('ccarvajal/','')

# Prefer the locally saved tokenizer; on failure download it from the Hub and
# save it to `folder` for the next run.
# OSError is accepted alongside the original ValueError because that is what
# the model-loading cell treats as "no local copy yet"; catching only
# ValueError would let a missing folder abort the first run.
try:
    tokenizer = AutoTokenizer.from_pretrained(folder)
except (OSError, ValueError):
    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    tokenizer.save_pretrained(folder)

In [4]:
# Download the tab-separated label mapping and keep the second column of every
# row that has at least two fields; `labels[i]` is then the emoji for class i.
mapping_link = "https://raw.githubusercontent.com/camilocarvajalreyes/beto-emoji/main/es_mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
csvreader = csv.reader(html, delimiter='\t')
labels = []
for row in csvreader:
    # Rows with fewer than two fields (e.g. a trailing empty line) are skipped.
    if len(row) > 1:
        labels.append(row[1])

In [5]:
def _load_or_download_model(local_dir, hub_id):
    """Load the classifier from ``local_dir``; on OSError fetch ``hub_id`` and save it there."""
    try:
        return AutoModelForSequenceClassification.from_pretrained(local_dir)
    except OSError:
        # The local load failed: download from the Hub and cache for next time.
        downloaded = AutoModelForSequenceClassification.from_pretrained(hub_id)
        downloaded.save_pretrained(local_dir)
        return downloaded


model = _load_or_download_model(folder, MODEL)

In [6]:
def eval_text(text):
    """Predict the emoji class for a single text.

    The text is normalised with ``preprocess``, tokenised with the global
    ``tokenizer`` and passed through the global ``model``. The first output
    of the model for the single input is converted to probabilities with a
    softmax.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    tuple
        ``(index, scores)`` where ``scores`` is the softmax of the model
        output for this text and ``index`` is ``np.argmax(scores)``.
    """
    text = preprocess(text)
    # Truncate to the model's position limit so that an over-long input is cut
    # rather than raising inside the model.
    encoded_input = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=model.config.max_position_embeddings,
    )
    # Inference only: disabling autograd avoids building a graph on every call
    # and makes the output directly convertible to NumPy.
    with torch.no_grad():
        output = model(**encoded_input)
    scores = output[0][0].numpy()
    scores = softmax(scores)
    return np.argmax(scores), scores

In [7]:
def rank_emojis_text(text):
    """Print every emoji label with its score for ``text``, highest score first.

    Each printed line has the form ``<rank>) <emoji> <score>``, with the score
    rounded to four decimals. Scores come from ``eval_text`` and labels from
    the global ``labels`` list.
    """
    _, scores = eval_text(text)
    # argsort is ascending; reverse it to get the best class first.
    descending = np.argsort(scores)[::-1]
    for position, class_index in enumerate(descending, start=1):
        emoji = labels[class_index]
        score = scores[class_index]
        print(f"{position}) {emoji} {np.round(float(score), 4)}")

In [8]:
# Single-sentence smoke test: show the predicted emoji and its class index.
ejemplo = "que viva españa"
label = eval_text(ejemplo)[0]
predicted_emoji = labels[label]
print(ejemplo+'\nemoji = {} ({})'.format(predicted_emoji,label))

que viva españa
emoji = 🇪🇸 (9)


In [9]:
# Print all emoji labels with their scores for the example sentence, best first.
rank_emojis_text(ejemplo)

1) 🇪🇸 0.2508
2) 😍 0.238
3) 👌 0.2225
4) 😂 0.0806
5) ❤ 0.0489
6) 😁 0.0415
7) 😜 0.0232
8) 😎 0.0229
9) 😊 0.0156
10) 😉 0.0119
11) 💜 0.0079
12) 💕 0.0077
13) 💪 0.0066
14) 💘 0.0054
15) 💙 0.0052
16) 💞 0.005
17) 😘 0.0034
18) 🎶 0.0022
19) ✨ 0.0007


In [10]:
import pickle
from config import file_names

# Load the pickled test set from the path configured in `config.file_names`.
# The context manager closes the file handle deterministically.
# NOTE(review): unpickling executes arbitrary code; only load files you trust.
with open(file_names['df_es_test'], "rb") as test_file:
    df_es_test = pickle.load(test_file)

In [11]:
%%time
# One model call per row of the test set; keep only the predicted class index.
y_pred = [eval_text(texto)[0] for texto in df_es_test['text']]

CPU times: user 49min 25s, sys: 22.3 s, total: 49min 47s
Wall time: 8min 19s


In [12]:
%%capture
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 on the test set.
# `labels` is passed explicitly so that row i of the report is always class i
# and lines up with `target_names` even if some class never appears.
# `zero_division=0` reports 0.0 for classes without predictions, without
# emitting a warning.
report = classification_report(
    df_es_test["label"].astype(int),
    y_pred,
    labels=list(range(len(labels))),
    target_names=labels,
    zero_division=0,
)

In [13]:
# `report` is a preformatted text table; print it so the line breaks render.
print(report)

              precision    recall  f1-score   support

           ❤       0.39      0.43      0.41      2141
           😍       0.29      0.39      0.33      1408
           😂       0.51      0.51      0.51      1499
           💕       0.09      0.05      0.06       352
           😊       0.12      0.23      0.16       514
           😘       0.24      0.23      0.24       397
           💪       0.37      0.43      0.40       307
           😉       0.15      0.17      0.16       453
           👌       0.09      0.16      0.11       180
          🇪🇸       0.46      0.46      0.46       424
           😎       0.12      0.11      0.11       339
           💙       0.36      0.02      0.04       413
           💜       0.00      0.00      0.00       235
           😜       0.04      0.02      0.02       274
           💞       0.00      0.00      0.00        93
           ✨       0.26      0.12      0.17       416
           🎶       0.25      0.24      0.24       212
           💘       0.00    

In [14]:
from sklearn.metrics import f1_score

# Macro-averaged F1: the unweighted mean of the per-class F1 scores.
y_true = df_es_test["label"].astype(int)
f1_score(y_true, y_pred, average='macro')

0.18153243138645886