In [None]:
!pip install transformers
!pip install datasets
!pip install umap-learn
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from transformers import AutoModelForSequenceClassification
from datasets import load_dataset
from transformers import AutoTokenizer, AutoConfig
from transformers import AutoModel
from torch import torch
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Column names assigned to the two CSV columns when the dataset is loaded.
TEXT_COL      = 'tweet'      # text column that gets tokenized
CLASS_COL_ST1 = 'sentiment'  # label column; "ST1" presumably stands for subtask 1 — TODO confirm

In [None]:
# Read the training and validation CSV files directly from their GitHub URLs.
# NOTE(review): the "validation" split is fed from test.csv, and `names=` sets the
# column names explicitly — if the files carry a header row it would be read as
# data; confirm against the CSVs.
train_url = 'https://raw.githubusercontent.com/carlossuazo/davincis-iberlef-2023/main/data/training_data/train.csv'
test_url = 'https://raw.githubusercontent.com/carlossuazo/davincis-iberlef-2023/main/data/test_data/test.csv'

df = load_dataset(
    "csv",
    data_files={"train": train_url, "validation": test_url},
    sep=",",
    names=[TEXT_COL, CLASS_COL_ST1],
)



  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Display the loaded DatasetDict (splits, column names and row counts).
df

DatasetDict({
    train: Dataset({
        features: ['tweet', 'sentiment'],
        num_rows: 3363
    })
    validation: Dataset({
        features: ['tweet', 'sentiment'],
        num_rows: 51
    })
})

In [None]:
# Checkpoint used both for tokenization and for feature extraction.
# Multilingual alternative: "cardiffnlp/twitter-xlm-roberta-base-sentiment"
model_ckpt = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
# Same checkpoint as `tokenizer`, so reuse the instance instead of loading it twice.
roberta_tokenizer = tokenizer

In [None]:
# Tokenize a single sentence to inspect the raw tokenizer output.
text = 'Tokenizar el texto es una tarea central del NLP.'
encoded_text = tokenizer(text)
print(encoded_text)

{'input_ids': [0, 45643, 1210, 271, 1615, 2788, 139, 2714, 542, 102, 326, 7907, 1353, 2424, 234, 21992, 4, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [None]:
# Map the input ids back to their sub-word tokens.
tokens = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)
print(tokens)

['<s>', 'Token', 'iz', 'ar', 'Ġel', 'Ġtext', 'o', 'Ġes', 'Ġun', 'a', 'Ġt', 'area', 'Ġcentral', 'Ġdel', 'ĠN', 'LP', '.', '</s>']


In [None]:
# Join the sub-word tokens back into one string (special tokens included).
print(tokenizer.convert_tokens_to_string(tokens))

<s>Tokenizar el texto es una tarea central del NLP.</s>


In [None]:
# Vocabulary size reported by the tokenizer.
tokenizer.vocab_size

50265

In [None]:
# Maximum sequence length the tokenizer reports for this checkpoint.
# The huge value shown for it means no real limit is configured, so
# truncation=True on its own does not shorten anything.
tokenizer.model_max_length

1000000000000000019884624838656

In [None]:
# Names of the tokenizer outputs that are later forwarded to the model.
tokenizer.model_input_names

['input_ids', 'attention_mask']

In [None]:
def tokenize(batch, max_length=512):
    """Tokenize the text column of a batch of examples.

    Args:
        batch: Mapping that holds the texts to encode under ``TEXT_COL``.
        max_length: Upper bound on the number of tokens per example. This
            checkpoint's tokenizer reports no usable ``model_max_length``, so
            ``truncation=True`` alone silently does nothing; an explicit limit
            keeps over-long texts within what the encoder can accept.

    Returns:
        The tokenizer output (``input_ids`` and ``attention_mask``), padded to
        the longest example in the batch.
    """
    return tokenizer(
        batch[TEXT_COL],
        padding=True,
        truncation=True,
        max_length=max_length,
    )

In [None]:
# Try the tokenize function on the first two training examples.
print(tokenize(df["train"][:2]))

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'input_ids': [[0, 90, 21210, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 10431, 17986, 4330, 1721, 2872, 260, 3213, 242, 263, 2664, 1526, 6852, 4842, 1177, 1615, 21691, 1535, 139, 1076, 40492, 1020, 263, 2575, 438, 1176, 1526, 4, 25840, 27102, 1977, 13300, 366, 4767, 139, 1893, 1517, 40645, 3964, 7822, 366, 1177, 1615, 228, 438, 5332, 4, 1437, 27672, 1180, 2520, 2727, 1076, 46341, 853, 2953, 1615, 1293, 4, 1437, 37522, 118, 1322, 14666, 897, 6296, 5014, 2727, 4, 1437, 1437, 8103, 9085, 18537, 15286, 6320, 15775, 271, 16723, 366, 1437, 849, 5320, 41312, 1866, 7199, 636, 5003, 1437, 2054, 2]], 'attention_mask': [[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
# batch_size=None hands each split to `tokenize` as one single batch, so
# padding=True pads every example of a split to the same length.
tweets_encoded = df.map(tokenize, batched=True, batch_size=None)

Map:   0%|          | 0/3363 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

In [None]:
# Applying map adds two new columns to the dataset: input_ids and attention_mask.
print(tweets_encoded["train"].column_names)

['tweet', 'sentiment', 'input_ids', 'attention_mask']


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the encoder without the classification head and move it to that device.
# The pooler weights are newly initialized for this checkpoint (see the warning
# printed on load), so only last_hidden_state carries pretrained information.
model = AutoModel.from_pretrained(model_ckpt).to(device)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment were not used when initializing RobertaModel: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictio

In [None]:
from transformers import TFAutoModel

#tf_model = TFAutoModel.from_pretrained(model_ckpt)

In [None]:
#tf_xlmr = TFAutoModel.from_pretrained("xlm-roberta-base")

In [None]:
#tf_xlmr = TFAutoModel.from_pretrained("xlm-roberta-base", from_pt=True)

In [None]:
# Encode a sample sentence as PyTorch tensors and check the input shape.
text = "this is a test"
inputs = tokenizer(text, return_tensors="pt")
print(f"Input tensor shape: {inputs['input_ids'].size()}")

Input tensor shape: torch.Size([1, 6])


In [None]:
# Move every input tensor to the model's device.
inputs = {k:v.to(device) for k,v in inputs.items()}
# Run a forward pass without tracking gradients (inference only).
with torch.no_grad():
    outputs = model(**inputs)
print(outputs)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-2.8212e-01, -5.4723e-01,  8.7227e-02,  ..., -6.1494e-01,
          -1.0628e-01, -6.2301e-01],
         [ 3.5916e-02, -8.6982e-01,  5.3141e-01,  ..., -7.1903e-01,
          -1.2872e-01, -2.8742e-01],
         [-9.1783e-02,  3.6825e-04,  1.3581e-01,  ..., -1.3801e+00,
          -2.0054e-01, -3.4174e-02],
         [ 1.5422e-02, -6.7264e-01,  1.4242e-01,  ..., -4.3557e-01,
           2.9332e-01, -1.9287e-01],
         [-5.2928e-02, -5.2406e-01,  2.5271e-01,  ..., -1.0034e+00,
           1.6826e-01, -4.1093e-01],
         [-2.9693e-01, -5.4961e-01,  8.7612e-02,  ..., -6.1877e-01,
          -1.0939e-01, -6.3301e-01]]]), pooler_output=tensor([[ 0.4531, -0.5993, -0.4677,  0.3618,  0.4891,  0.3940,  0.1796, -0.5176,
          0.3724, -0.2722, -0.3911,  0.0257, -0.0213, -0.2728,  0.0986, -0.0405,
          0.5638, -0.1555,  0.8277, -0.3910,  0.8432, -0.3286,  0.3826, -0.2016,
         -0.3739,  0.0089, -0.7108, -0.2826,  0

In [None]:
# Size of the last hidden state returned for the sample sentence.
outputs.last_hidden_state.size()

torch.Size([1, 6, 768])

In [None]:
# Size of the hidden state at position 0 of the sequence (the leading <s> token).
outputs.last_hidden_state[:,0].size()

torch.Size([1, 768])

In [None]:
def extract_hidden_states(batch):
    """Return the first-token hidden state for every example in a batch.

    Args:
        batch: Mapping of column name to values. Only the columns listed in
            ``tokenizer.model_input_names`` are forwarded to the model; their
            values may be tensors or (nested) lists of ids.

    Returns:
        dict with the single key ``"hidden_state"`` holding a NumPy array with
        one row per example.
    """
    # Convert to tensors if needed and place the model inputs on the model's device
    inputs = {k: torch.as_tensor(v).to(device)
              for k, v in batch.items()
              if k in tokenizer.model_input_names}
    # Extract last hidden states; gradients are not needed for feature extraction
    with torch.no_grad():
        last_hidden_state = model(**inputs).last_hidden_state
    # Keep only the vector at position 0 (the <s> token, RoBERTa's [CLS] equivalent)
    return {"hidden_state": last_hidden_state[:,0].cpu().numpy()}

In [None]:
# Have the dataset return the model inputs and the label column as PyTorch tensors.
tweets_encoded.set_format("torch",  columns=["input_ids", "attention_mask", CLASS_COL_ST1])

In [None]:
# Extract hidden states in small batches. With the default batch size (1000) each
# forward pass processes 1000 padded tweets at once, which is very slow on CPU and
# memory-hungry on GPU; 32 keeps memory bounded and lets the progress bar advance.
tweets_hidden = tweets_encoded.map(extract_hidden_states, batched=True, batch_size=32)

Map:   0%|          | 0/3363 [00:00<?, ? examples/s]

KeyboardInterrupt: ignored

In [None]:
def label_int2str(row):
    """Convert a label value of the sentiment column to its string form.

    Args:
        row: A single value taken from the ``CLASS_COL_ST1`` column.

    Returns:
        The label name when the column is a ClassLabel feature; otherwise the
        value itself converted to ``str``.
    """
    label_feature = df["train"].features[CLASS_COL_ST1]
    # Columns loaded from CSV are not ClassLabel features unless they were cast,
    # and only ClassLabel offers int2str — fall back to the raw value otherwise.
    if hasattr(label_feature, "int2str"):
        return label_feature.int2str(row)
    return str(row)

In [None]:
# Columns available after the hidden-state extraction step.
tweets_hidden["train"].column_names

In [None]:
import numpy as np

# Build the feature matrices (hidden states) and label vectors for both splits.
X_train = np.array(tweets_hidden["train"]["hidden_state"])
X_valid = np.array(tweets_hidden["validation"]["hidden_state"])
y_train = np.array(tweets_hidden["train"][CLASS_COL_ST1])
y_valid = np.array(tweets_hidden["validation"][CLASS_COL_ST1])
X_train.shape, X_valid.shape

In [None]:
# Release cached GPU memory that PyTorch is holding but not currently using.
import torch
torch.cuda.empty_cache()

In [None]:
import gc
# Delete large variables that are no longer needed (del <name>) before collecting.
gc.collect()

In [None]:
# Report GPU memory usage. The notebook falls back to the CPU when CUDA is not
# available, and the summary can only be produced when a GPU is present.
if torch.cuda.is_available():
    print(torch.cuda.memory_summary(device=None, abbreviated=False))
else:
    print("CUDA is not available; no GPU memory summary to show.")