In [1]:
import warnings 
warnings.filterwarnings('ignore')

# Pipeline

In [2]:
from transformers import pipeline

pipe  = pipeline('text-classification', model='finiteautomata/beto-sentiment-analysis')




Device set to use cuda:0


In [3]:
pipe("este producto es muy malo")

[{'label': 'NEG', 'score': 0.9990278482437134}]

In [4]:
pipe("este producto es bueno")

[{'label': 'POS', 'score': 0.9984679818153381}]

In [5]:
sentences = [
    "este producto es bueno",
    "este producto es malo"
]
pipe(sentences)

[{'label': 'POS', 'score': 0.9984679818153381},
 {'label': 'NEG', 'score': 0.9989533424377441}]

In [6]:
pipe

<transformers.pipelines.text_classification.TextClassificationPipeline at 0x209c5d1bb50>

# Tasks

## Zero-shot classification

In [7]:
pipe = pipeline("zero-shot-classification")

No model was supplied, defaulted to facebook/bart-large-mnli and revision d7645e1 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0


In [8]:
pipe(
    "This is a course about the transformers library",
    candidate_labels=["education","politics", "business"]
)

{'sequence': 'This is a course about the transformers library',
 'labels': ['education', 'business', 'politics'],
 'scores': [0.9192399978637695, 0.06077896058559418, 0.019981082528829575]}

In [9]:
pipe.__class__

transformers.pipelines.zero_shot_classification.ZeroShotClassificationPipeline

## Text generation

In [10]:
pipe = pipeline("text-generation", model="distilgpt2")

Device set to use cuda:0


In [11]:
pipe(
    "In this course, we will teach you how to",
    num_return_sequences=2,
    max_length=70
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'In this course, we will teach you how to manage your budget and to create efficient, reliable, and convenient ways to support your budget and ensure that you do not have to worry about spending any more on your expenses.\n\n\n\n\nThis course will guide you to a lot of things, including managing your budget and spending.\nThe Basics'},
 {'generated_text': 'In this course, we will teach you how to change the rules about a person/mom, and how we can make it change. So do not just do the following to your student in the course; you may look forward to learning more about who you are and how you can move forward. The lesson will also be written on the basis of what'}]

## NER

In [12]:
# Named Entity Recognition

In [13]:
# Token-classification (NER) pipeline using the library's default checkpoint.
# `aggregation_strategy="simple"` is the current spelling of the deprecated
# `grouped_entities=True` flag: sub-word pieces are merged into one entry per entity.
pipe = pipeline("ner", aggregation_strategy="simple")
pipe("Mi nombre es Eduardo y trabajo en la UCM que está en Madrid")

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision 4c53496 (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


[{'entity_group': 'PER',
  'score': 0.9222223,
  'word': 'Eduardo',
  'start': 13,
  'end': 20},
 {'entity_group': 'ORG',
  'score': 0.99872696,
  'word': 'UCM',
  'start': 37,
  'end': 40},
 {'entity_group': 'LOC',
  'score': 0.99876946,
  'word': 'Madrid',
  'start': 53,
  'end': 59}]

## QA

In [14]:
pipe = pipeline("question-answering")
pipe(
    question="Where do I work?",
    context="My name is Eduardo and I work at ucm in madrid"
)

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 564e9b5 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0


{'score': 0.7700124382972717, 'start': 33, 'end': 36, 'answer': 'ucm'}

## Summarization

In [15]:
pipe = pipeline("summarization")
pipe("""
Donald John Trump (born June 14, 1946) is an American politician, media personality, and businessman who is the 47th president of the United States. A member of the Republican Party, he served as the 45th president from 2017 to 2021.

Born into a wealthy family in the New York City borough of Queens, Trump graduated from the University of Pennsylvania in 1968 with a bachelor's degree in economics. He became the president of his family's real estate business in 1971, renamed it the Trump Organization, and began acquiring and building skyscrapers, hotels, casinos, and golf courses. He launched side ventures, many licensing the Trump name, and filed for six business bankruptcies in the 1990s and 2000s. From 2004 to 2015, he hosted the reality television show The Apprentice, bolstering his fame as a billionaire. Presenting himself as a political outsider, Trump won the 2016 presidential election against Democratic Party nominee Hillary Clinton.

During his first presidency, Trump imposed a travel ban on seven Muslim-majority countries, expanded the Mexico–United States border wall, and enforced a family separation policy on the border. He rolled back environmental and business regulations, signed the Tax Cuts and Jobs Act, and appointed three Supreme Court justices. In foreign policy, Trump withdrew the U.S. from agreements on climate, trade, and Iran's nuclear program, and initiated a trade war with China. In response to the COVID-19 pandemic from 2020, he downplayed its severity, contradicted health officials, and signed the CARES Act. After losing the 2020 presidential election to Joe Biden, Trump attempted to overturn the result, culminating in the January 6 Capitol attack in 2021. He was impeached in 2019 for abuse of power and obstruction of Congress, and in 2021 for incitement of insurrection; the Senate acquitted him both times.

In 2023, Trump was found liable in civil cases for sexual abuse and defamation and for business fraud. He was found guilty of falsifying business records in 2024, making him the first U.S. president convicted of a felony. After winning the 2024 presidential election against Kamala Harris, he was sentenced to a penalty-free discharge, and two felony indictments against him for retention of classified documents and obstruction of the 2020 election were dismissed without prejudice. A racketeering case related to the 2020 election in Georgia is pending.
""")

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0


[{'summary_text': ' Donald John Trump is the 47th president of the United States . He served as the 45th president from 2017 to 2021 . He was impeached in 2019 for abuse of power and obstruction of Congress, and in 2021 for incitement of insurrection . Trump was found guilty of falsifying business records in 2024, making him the first U.S. president convicted of a felony .'}]

## Translation

In [16]:
pipe = pipeline("translation", model='Helsinki-NLP/opus-mt-es-en')

Device set to use cuda:0


In [17]:
pipe("Mi nombre es Eduardo")

[{'translation_text': 'My name is Eduardo.'}]

In [18]:
pipe.__class__

transformers.pipelines.text2text_generation.TranslationPipeline

# AutoTokenizer

In [19]:
from transformers import AutoTokenizer

model_id = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"

tokenizer = AutoTokenizer.from_pretrained(model_id)

In [20]:
tokenizer.__class__

transformers.models.distilbert.tokenization_distilbert_fast.DistilBertTokenizerFast

In [21]:
tokenizer

DistilBertTokenizerFast(name_or_path='distilbert/distilbert-base-uncased-finetuned-sst-2-english', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [22]:
sentences

['este producto es bueno', 'este producto es malo']

In [23]:
# Step-by-step tour of the tokenizer API. Only the value of the last expression
# is rendered by the notebook; the earlier results are either discarded or kept
# in `tokenized` / `ids`.
# NOTE(review): `sentences` is a list of two strings at this point; `tokenize`
# and `encode` may interpret it as one text pair rather than a batch — confirm.
tokenized = tokenizer.tokenize(sentences)  # text -> tokens
tokenizer.convert_tokens_to_ids(tokenized)  # tokens -> vocabulary ids
ids = tokenizer.encode(sentences)  # text -> ids
tokenizer.decode(ids)  # ids -> text
tokenizer(sentences)  # batch call: input_ids and attention_mask for each sentence

{'input_ids': [[101, 28517, 4031, 2080, 9686, 20934, 16515, 102], [101, 28517, 4031, 2080, 9686, 15451, 2080, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]]}

In [24]:
sentences = [
    "I've been waiting for a Huggingface course my whole life.",
    "I hate this so much!"
]

inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
inputs

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}

# AutoModel

In [25]:
from transformers import AutoModel

model = AutoModel.from_pretrained(model_id)

In [26]:
model

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

## AutoConfig

In [27]:
from transformers import AutoConfig

config = AutoConfig.from_pretrained(model_id)

In [28]:
# config.attention_dropout = 0.2

In [29]:
# config

In [30]:
# model = AutoModel.from_config(config)

In [31]:
# model

In [32]:
outputs = model(**inputs)
outputs

BaseModelOutput(last_hidden_state=tensor([[[-0.1798,  0.2333,  0.6321,  ..., -0.3017,  0.5008,  0.1481],
         [ 0.2758,  0.6497,  0.3200,  ..., -0.0760,  0.5136,  0.1329],
         [ 0.9046,  0.0985,  0.2950,  ...,  0.3352, -0.1407, -0.6464],
         ...,
         [ 0.1466,  0.5661,  0.3235,  ..., -0.3376,  0.5100, -0.0561],
         [ 0.7500,  0.0487,  0.1738,  ...,  0.4684,  0.0030, -0.6084],
         [ 0.0519,  0.3729,  0.5223,  ...,  0.3584,  0.6500, -0.3883]],

        [[-0.2937,  0.7283, -0.1497,  ..., -0.1187, -1.0227, -0.0422],
         [-0.2206,  0.9384, -0.0951,  ..., -0.3643, -0.6605,  0.2407],
         [-0.1536,  0.8988, -0.0728,  ..., -0.2189, -0.8528,  0.0710],
         ...,
         [-0.3017,  0.9002, -0.0200,  ..., -0.1082, -0.8412, -0.0861],
         [-0.3338,  0.9674, -0.0729,  ..., -0.1952, -0.8181, -0.0634],
         [-0.3454,  0.8824, -0.0426,  ..., -0.0993, -0.8329, -0.1065]]],
       grad_fn=<NativeLayerNormBackward0>), hidden_states=None, attentions=None)

In [33]:
last_hidden_state = outputs.last_hidden_state
last_hidden_state.shape

torch.Size([2, 16, 768])

In [34]:
import torch

# Convert the hidden states to a NumPy array.
# Reading from `outputs` (rather than re-assigning `last_hidden_state` from
# itself) keeps this cell re-runnable: a second run no longer calls `.cpu()` on
# an ndarray. `.detach()` drops the autograd graph so `.numpy()` is permitted.
last_hidden_state = outputs.last_hidden_state.detach().cpu().numpy()

last_hidden_state

array([[[-0.17977962,  0.23332772,  0.6320998 , ..., -0.30166614,
          0.5008202 ,  0.14814308],
        [ 0.2757772 ,  0.64971167,  0.31997764, ..., -0.0759954 ,
          0.51361716,  0.13292125],
        [ 0.90458596,  0.09851333,  0.29497284, ...,  0.3351953 ,
         -0.1407413 , -0.6464039 ],
        ...,
        [ 0.14658983,  0.5660593 ,  0.32352832, ..., -0.3375748 ,
          0.5099788 , -0.05610962],
        [ 0.7500047 ,  0.04872601,  0.17380042, ...,  0.46841565,
          0.0029665 , -0.6083766 ],
        [ 0.05194494,  0.37294793,  0.52233267, ...,  0.35840583,
          0.65004313, -0.38829845]],

       [[-0.2937065 ,  0.72825634, -0.14972717, ..., -0.11868107,
         -1.0226725 , -0.04215725],
        [-0.22063631,  0.9383849 , -0.09512513, ..., -0.36431676,
         -0.660522  ,  0.24069716],
        [-0.15360813,  0.89875025, -0.07276463, ..., -0.21891764,
         -0.8527593 ,  0.0709941 ],
        ...,
        [-0.30174723,  0.9002209 , -0.01995075, ..., -

In [35]:
last_hidden_state.shape

(2, 16, 768)

In [36]:
last_hidden_state[0].shape

(16, 768)

In [37]:
last_hidden_state[0][1].shape

(768,)

# Ejercicio (clasificación)

In [38]:
import pandas as pd

# Load the Amazon sports reviews used for the classification exercise.
df = pd.read_csv("https://raw.githubusercontent.com/eduardofc/data/main/amazon_sports.csv")

# Normalise the review text: lower-case first, then remove every character
# outside the allowed set. The pattern only lists lower-case accented vowels, so
# lower-casing beforehand keeps accented capitals (Á, É, ...) as á, é, ...
# instead of deleting them.
df['review_body'] = (
    df['review_body']
    .str.lower()
    .str.replace("[^a-zA-ZñÑáéíóú .,]", "", regex=True)
)
df.head()

Unnamed: 0,stars,review_body,review_title,product_category
0,1,nunca llego el pedido y el vendedor pasa de to...,No llego nunca,sports
1,1,"no sé como es, porque debería haber llegado ay...",Todavía no ha llegado,sports
2,1,"guantes cómodos, no lo niego, pero de mala cal...",Guantes de baja calidad,sports
3,1,hasta hoy no he visto el producto. el pedido h...,Muy Mala experiencia,sports
4,1,"no puedo valorarla porque, después de casi una...",Paquete perdido?,sports


In [39]:
# Drop the 2-, 3- and 4-star reviews in a single mask.
# `.copy()` yields an independent frame, so the column assignment below does not
# write into a slice of the previous frame (SettingWithCopyWarning).
df = df[~df.stars.isin([2, 3, 4])].copy()

# Binary target: 1 when the review has more than 3 stars, otherwise 0.
df['good_product'] = (df.stars > 3).astype(int)

# Class balance check.
df.groupby('good_product').size()

good_product
0    2438
1    2512
dtype: int64

In [40]:
# 1) modelo
model_id = 'distilbert/distilbert-base-uncased-finetuned-sst-2-english'

# 2) preparamos modelo y tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id)

In [41]:
X = list(df['review_body'].values)
y = df['good_product']

In [42]:
X_inputs = tokenizer(X, padding=True, truncation=True, return_tensors='pt')
X_inputs

{'input_ids': tensor([[  101, 16634,  3540,  ...,     0,     0,     0],
        [  101,  2053,  7367,  ...,     0,     0,     0],
        [  101, 19739, 24985,  ...,     0,     0,     0],
        ...,
        [  101,  4206,  2319,  ...,     0,     0,     0],
        [  101, 24970, 15781,  ...,     0,     0,     0],
        [  101,  2064,  9386,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [44]:
# X_outputs = model(**X_inputs)

In [49]:
def forward_pass(x):
    """Embed one text with the notebook-level `tokenizer` and `model`.

    Args:
        x: Text handed to the tokenizer (the notebook calls this with one
            review string at a time).

    Returns:
        NumPy vector read from the model's last hidden state at sequence 0,
        token position 1.
    """
    inputs = tokenizer(x, padding=True, truncation=True, return_tensors='pt')
    # Inference only: run the forward pass itself under no_grad so that no
    # autograd graph is built (and held in memory) for every review.
    with torch.no_grad():
        outputs = model(**inputs)
    # NOTE(review): position 1 is the token right after the leading [CLS]
    # (id 101); position 0 is the more usual sentence-level feature — confirm
    # that this choice is intentional.
    return outputs.last_hidden_state[0][1].cpu().numpy()

In [50]:
%%time

last_layer_model = df['review_body'].map(forward_pass)

CPU times: total: 24min 32s
Wall time: 3min 5s


In [51]:
last_layer_model.shape

(4950,)

In [53]:
last_layer_model[0].shape

(768,)

In [56]:
# Expand the Series of per-review embedding vectors into a table with one column
# per embedding dimension (f_0 ... f_767).
# NOTE(review): the frame is built from a plain list, so it gets a fresh 0..n-1
# index; its rows correspond to `df` by position, not by index label — confirm
# that downstream code only relies on positional alignment.
pd_X = pd.DataFrame(last_layer_model.to_list(), columns=[f"f_{i}" for i in range(768)])
pd_X.head()

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_758,f_759,f_760,f_761,f_762,f_763,f_764,f_765,f_766,f_767
0,-0.14812,-0.379483,0.762809,-0.557681,-0.130396,1.108286,1.68766,0.95955,1.12163,-0.17472,...,0.189335,-0.601361,0.639028,-0.962111,0.088468,-0.441341,0.572124,-0.472088,0.042865,1.043632
1,-0.571195,0.6544,-0.146572,-0.37174,-0.561087,0.482954,0.573449,1.367549,0.715778,-0.132606,...,0.904775,0.101262,-0.045985,-0.905033,0.343865,-0.559881,0.769444,-0.414418,-0.041818,0.411064
2,-0.767859,-0.032666,0.192588,-0.222683,-0.234021,0.034018,0.298249,0.913226,0.543267,-0.024575,...,0.860801,-0.422094,0.225442,-1.015883,0.652911,0.123236,-0.05387,-0.334967,0.12581,0.478331
3,-0.191025,0.593415,0.110412,-0.693767,-0.555686,0.283406,0.834452,1.027178,0.514113,-0.352057,...,0.807701,-0.201013,0.141311,-0.729934,0.017483,-0.474524,0.901508,-0.185042,-0.411359,0.675568
4,-0.521726,0.500266,0.016186,-0.287753,-0.563475,0.629885,0.500428,0.971674,0.61566,0.037409,...,1.359321,-0.025231,-0.190737,-1.132495,0.245646,-0.599195,0.629616,-0.078504,-0.116343,0.992917


In [57]:
from sklearn.model_selection import train_test_split

X = pd_X
y = df.good_product

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=99)

In [58]:
len(X_train)

3960

In [59]:
len(X_test)

990

In [60]:
from sklearn.linear_model import LogisticRegression

# Train a linear classifier on the transformer embeddings and report its
# accuracy on the held-out split.
# Bound to `clf` rather than `model`: `forward_pass` reads the global `model`,
# so rebinding that name to a scikit-learn estimator would break any re-run of
# the feature-extraction cells.
clf = LogisticRegression(random_state=99)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.8313131313131313

# AutoModel for sequence classification

In [61]:
sentences

["I've been waiting for a Huggingface course my whole life.",
 'I hate this so much!']

In [62]:
model_id

'distilbert/distilbert-base-uncased-finetuned-sst-2-english'

In [63]:
from transformers import AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id)

In [65]:
# model

In [66]:
inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
print(inputs)

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}


In [67]:
outputs = model(**inputs)
print(outputs)

SequenceClassifierOutput(loss=None, logits=tensor([[-1.5607,  1.6123],
        [ 4.1692, -3.3464]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


In [68]:
outputs.logits

tensor([[-1.5607,  1.6123],
        [ 4.1692, -3.3464]], grad_fn=<AddmmBackward0>)

In [70]:
import torch

# Turn the logits into per-class probabilities.
# `dim=-1` normalises across the class axis explicitly; calling softmax without
# `dim` relies on a deprecated implicit choice of dimension.
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)

tensor([[4.0195e-02, 9.5980e-01],
        [9.9946e-01, 5.4418e-04]], grad_fn=<SoftmaxBackward0>)


In [72]:
# model.config.id2label

# All together in a pipeline

In [74]:
from transformers import pipeline
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer

model_id = 'distilbert/distilbert-base-uncased-finetuned-sst-2-english'

# Load the tokenizer and the sequence-classification model from the same checkpoint.
# padding / truncation / return_tensors are options of the tokenizer *call*, not
# of `from_pretrained`, so they are not passed here (a dangling `return_tensors=`
# with no value is a SyntaxError).
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id)

# Bundle tokenizer and model into a ready-to-call text-classification pipeline.
pipe = pipeline(
    task="text-classification",
    model=model,
    tokenizer=tokenizer,
)

Device set to use cuda:0
