## Installs and imports

In [None]:
!pip install --upgrade pip
!pip install sentencepiece
!pip install transformers

In [None]:
import numpy as np
import torch
from scipy.special import softmax
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import AutoModelForSequenceClassification

## Data

In [None]:
def preprocess(corpus):
  """Mask user mentions and links in every text of ``corpus``.

  Each text is split on single spaces. A token that starts with ``@`` and is
  longer than one character becomes ``@user``; a token that starts with
  ``http`` becomes ``http``. Tokens are joined back with single spaces, so
  runs of several spaces in the input are preserved.

  Args:
    corpus: iterable of strings.

  Returns:
    A new list holding one masked string per input text, in input order.
  """
  def mask(token):
    # A lone '@' is not a mention and is left untouched.
    if len(token) > 1 and token.startswith('@'):
      token = '@user'
    if token.startswith('http'):
      return 'http'
    return token

  return [" ".join(mask(token) for token in text.split(" ")) for text in corpus]

In [None]:
!wget https://raw.githubusercontent.com/cardiffnlp/xlm-t/main/data/sentiment/all/test_text.txt

In [None]:
dataset_path = './test_text.txt'
# The file mixes several scripts (Arabic, Latin, ...), so decode it explicitly
# as UTF-8 instead of relying on the platform default, and close the handle
# deterministically with a context manager.
with open(dataset_path, encoding='utf-8') as dataset_file:
  # One text per line; split('\n') leaves a final empty string when the file
  # ends with a newline.
  dataset = dataset_file.read().split('\n')

In [None]:
# The dataset covers 8 different languages. Show one line for each by stepping
# through it in strides of 870 (indices 0, 870, ..., 6090).
for example in range(0, 6091, 870):
  sample_text = dataset[example]
  print(sample_text)

نوال الزغبي (الشاب خالد ليس عالمي) هههههههه أتفرجي على ها الفيديو يا مبتدئة http vía @user
Trying to have a conversation with my dad about vegetarianism is the most pointless infuriating thing ever #caveman 
Royal: le président n'aime pas les pauvres? "c'est n'importe quoi" http …
@user korrekt! Verstehe sowas nicht...
CONGRESS na ye party kabhi bani hoti na india ka partition hota nd na hi humari country itni khokhli hoti   @ 
@user @user Ma Ferrero? il compagno Ferrero? ma il suo partito esiste ancora? allora stiamo proprio frecati !!!
todos os meus favoritos na prova de eliminação #MasterChefBR
@user jajajaja dale, hacete la boluda vos jajaja igual a vos nunca se te puede tomar en serio te mando un abrazo desde Perú!


## Model

In [None]:
# Run on the GPU whenever one is visible to PyTorch (in Colab: Runtime ->
# Change runtime Type -> GPU). Set to False to force CPU inference.
CUDA = torch.cuda.is_available()
BATCH_SIZE = 32
MODEL = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)
config = AutoConfig.from_pretrained(MODEL) # used for id to label name
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
if CUDA:
  model = model.to('cuda')
# Switch to evaluation mode; the return value is bound to `_` so the cell does
# not echo the module repr.
_ = model.eval()

## Forward

In [None]:
def forward(text, cuda=True):
  """Score a batch of texts with the module-level ``model``.

  The texts are run through ``preprocess``, tokenized with the module-level
  ``tokenizer`` (padded and truncated), passed through ``model`` and the first
  element of the model output is softmax-normalised over its last axis.

  Args:
    text: list of strings to classify.
    cuda: if True, move the encoded inputs to the 'cuda' device before the
      forward pass (the model is expected to live there already).

  Returns:
    numpy array of softmax scores; presumably one row per input text and one
    column per label -- confirm against the model configuration.
  """
  text = preprocess(text)
  encoded_input = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
  if cuda:
    encoded_input = encoded_input.to('cuda')
  # Inference only: disable autograd so no graph is built for the batch.
  with torch.no_grad():
    output = model(**encoded_input)
  # .cpu() is a no-op for tensors that are already on the CPU, so a single
  # code path serves both devices.
  scores = output[0].detach().cpu().numpy()
  return softmax(scores, axis=-1)

In [None]:
dl = DataLoader(dataset, batch_size=BATCH_SIZE)
n_batches = len(dl)
all_preds = []
for idx,batch in enumerate(dl, start=1):
  print('Batch ',idx,' of ',n_batches)
  # forward() runs preprocess() on its input itself, so the raw batch is
  # passed straight through instead of being preprocessed twice.
  scores = forward(batch, cuda=CUDA)
  # Predicted label id = index of the highest score in each row.
  preds = np.argmax(scores, axis=-1)
  all_preds.extend(preds)

In [None]:
# The dataset covers 8 different languages. Show one text per stride of 870
# lines (indices 0, 870, ..., 6090) next to its predicted label name.
for example in range(0, 6091, 870):
  pred = all_preds[example]
  label_name = config.id2label[pred]
  print(dataset[example], '--->', label_name)

نوال الزغبي (الشاب خالد ليس عالمي) هههههههه أتفرجي على ها الفيديو يا مبتدئة http vía @user ---> Neutral
Trying to have a conversation with my dad about vegetarianism is the most pointless infuriating thing ever #caveman  ---> Negative
Royal: le président n'aime pas les pauvres? "c'est n'importe quoi" http … ---> Negative
@user korrekt! Verstehe sowas nicht... ---> Negative
CONGRESS na ye party kabhi bani hoti na india ka partition hota nd na hi humari country itni khokhli hoti   @  ---> Negative
@user @user Ma Ferrero? il compagno Ferrero? ma il suo partito esiste ancora? allora stiamo proprio frecati !!! ---> Negative
todos os meus favoritos na prova de eliminação #MasterChefBR ---> Positive
@user jajajaja dale, hacete la boluda vos jajaja igual a vos nunca se te puede tomar en serio te mando un abrazo desde Perú! ---> Neutral
