<a href="https://colab.research.google.com/github/brunojaime/hugging_face_projects/blob/master/creating_publishing_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers evaluate datasets accelerate -q

## 1 Datasets
We can list the datasets hosted on the Hugging Face Hub; here we ask for the five most downloaded.

In [None]:
from huggingface_hub import list_datasets,dataset_info
from datasets import load_dataset, DatasetInfo
# Ask the Hub for at most five datasets, ordered by download count
# (direction=-1 requests descending order).
all_datasets = list_datasets(
    limit=5,
    sort="downloads",
    direction=-1,
)

In [None]:
# Show the next entry of the listing. NOTE: this consumes `all_datasets`, so
# every re-run of this cell displays a different dataset and, once the five
# requested results are used up, raises StopIteration.
next(all_datasets)

DatasetInfo(id='hails/mmlu_no_train', author='hails', sha='7f9d4f237bd7496914f430fa600c73017331885f', created_at=datetime.datetime(2023, 10, 31, 17, 25, 54, tzinfo=datetime.timezone.utc), last_modified=datetime.datetime(2024, 1, 22, 20, 46, 30, tzinfo=datetime.timezone.utc), private=False, gated=False, disabled=False, downloads=10338955, likes=7, paperswithcode_id=None, tags=['task_categories:question-answering', 'language:en', 'license:mit', 'region:us'], card_data=None, siblings=None)

In [None]:
# Look up the Hub metadata (id, tags, downloads, card data, ...) of a single dataset by its id.
dataset_info('acronym_identification')

DatasetInfo(id='acronym_identification', author=None, sha='15ef643450d589d5883e289ffadeb03563e80a9e', created_at=datetime.datetime(2022, 3, 2, 23, 29, 22, tzinfo=datetime.timezone.utc), last_modified=datetime.datetime(2024, 1, 9, 11, 39, 57, tzinfo=datetime.timezone.utc), private=False, gated=False, disabled=False, downloads=461, likes=18, paperswithcode_id='acronym-identification', tags=['task_categories:token-classification', 'annotations_creators:expert-generated', 'language_creators:found', 'multilinguality:monolingual', 'size_categories:10K<n<100K', 'source_datasets:original', 'language:en', 'license:mit', 'acronym-identification', 'croissant', 'arxiv:2010.14678', 'region:us'], card_data={'annotations_creators': ['expert-generated'], 'language_creators': ['found'], 'language': ['en'], 'license': ['mit'], 'multilinguality': ['monolingual'], 'size_categories': ['10K<n<100K'], 'source_datasets': ['original'], 'task_categories': ['token-classification'], 'task_ids': [], 'paperswithcod

## 2 Load the data
We load the dataset we are interested in.

In [None]:
# Load the `yelp_review_full` dataset; the result holds a `train` and a `test` split.
dataset = load_dataset("yelp_review_full")
# Display the splits with their column names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})

In [None]:
# Peek at one arbitrary training example to see the raw fields:
# an integer `label` and the review `text`.
dataset['train'][4550]

{'label': 1,
 'text': 'I brought two pairs of pants and two (cheap) blazers there to be altered.  He was very nice. However, he failed to mention to me that the cost of these 4 things- one of which only needed alternation tape- would come to $96!!!!!!  When I came to pick up my items I was extremely embarrassed because I had to tell him I could not afford it. He then told me he could \\"help me out \\" and make it $80.  I tried explaining to him that the two blazers were only $25 each to begin with.  Overall, this was an extremely humiliating experience and it all could have been solved if he were more professional and would have told me that simple alterations would cost so much.  I will definitely be searching for a new tailor in the future.  Way too expensive if you ask me.'}

In [None]:
# Work on tiny, reproducibly shuffled subsets so the demo runs quickly:
# 80 training reviews and 40 test reviews, both shuffled with seed 42.
small_train_dataset = (
    dataset["train"]
    .shuffle(seed=42)
    .select(range(80))
)
small_test_dataset = (
    dataset["test"]
    .shuffle(seed=42)
    .select(range(40))
)

### 3 Tokenizer
Here we convert the text of each review into a sequence of tokens that the model can consume.

In [None]:
from transformers import AutoTokenizer

In [None]:
# Name of the pretrained checkpoint. NOTE: at this point `model` is just the
# checkpoint *name*; a later cell rebinds `model` to the loaded network.
model = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model)


def tokenize_function(examples):
  """Tokenize the ``text`` field of a batch of examples.

  Sequences are padded with ``padding="max_length"`` and over-long ones are
  truncated, so the tokenizer output can be batched directly.
  """
  return tokenizer(
      examples["text"],
      truncation=True,
      padding="max_length",
  )


# Apply the tokenizer to both subsets, one batch of examples at a time.
small_train_dataset = small_train_dataset.map(tokenize_function, batched=True)
small_test_dataset = small_test_dataset.map(tokenize_function, batched=True)




### 4 Model


In [None]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained encoder with a classification head sized for the
# dataset's labels (num_labels=5).
#
# The checkpoint name is taken from the tokenizer instead of from `model`:
# this cell rebinds `model` to the loaded network, so reading the name from
# `model` would only work the first time the cell is executed. Reading it from
# the tokenizer keeps the cell re-runnable and guarantees that model and
# tokenizer come from the same checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    tokenizer.name_or_path,
    num_labels=5,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 5 Training

First we have to log in to the Hugging Face Hub, because the trained model is pushed there.

In [None]:
from huggingface_hub import notebook_login
# Show the Hub login widget. Credentials are needed because the training
# configuration in this notebook uses push_to_hub=True.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import numpy as np
import evaluate

In [None]:
# Load the accuracy metric; `compute_metrics` uses it to score predictions.
metric = evaluate.load("accuracy")

In [None]:
def compute_metrics(eval_pred):
  """Score one evaluation pass with the module-level ``metric``.

  ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class of each
  example is the index of its largest logit along the last axis; the result of
  ``metric.compute`` for those predictions against ``labels`` is returned.
  """
  logits, labels = eval_pred
  predicted_classes = np.argmax(logits, axis=-1)
  return metric.compute(references=labels, predictions=predicted_classes)


In [None]:
from transformers import TrainingArguments,Trainer

In [None]:
# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and later dropped the old spelling. The install cell does not pin a version,
# so pick whichever keyword the installed TrainingArguments actually defines.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    "mi-modelo",          # output directory; also used as the Hub repo name
    logging_steps=5,      # no eval_steps is given, so evaluation follows this interval too
    num_train_epochs=1,
    push_to_hub=True,     # requires being logged in to the Hub
    **{_eval_strategy_kwarg: "steps"},  # evaluate every N steps rather than per epoch
)


In [None]:
# Wire together the model, the training configuration, the two tokenized
# subsets and the metric callback.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=small_test_dataset,
    train_dataset=small_train_dataset,
)

In [None]:
# Fine-tune for the single configured epoch. On the 80-example subset this is
# 10 training steps, with loss/accuracy reported every 5 steps.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
5,1.6331,1.617779,0.15
10,1.6661,1.603998,0.15


TrainOutput(global_step=10, training_loss=1.649580478668213, metrics={'train_runtime': 791.5124, 'train_samples_per_second': 0.101, 'train_steps_per_second': 0.013, 'total_flos': 21049451397120.0, 'train_loss': 1.649580478668213, 'epoch': 1.0})

In [None]:
# Upload the trained model weights and training artifacts to the Hub repository
# of this run (committed as "End of training").
trainer.push_to_hub()

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

events.out.tfevents.1715371709.0efae9095793.337.0:   0%|          | 0.00/4.18k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.92k [00:00<?, ?B/s]

events.out.tfevents.1715371870.0efae9095793.30440.0:   0%|          | 0.00/6.26k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/BrunoJaime/mi-modelo/commit/f140fb98a077dd46ac7fe4c74c7ad54f5d61ba7d', commit_message='End of training', commit_description='', oid='f140fb98a077dd46ac7fe4c74c7ad54f5d61ba7d', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Publish the tokenizer next to the model. Only the repository name is given
# (it matches the TrainingArguments output directory), so the repo is resolved
# inside the namespace of whoever is logged in instead of a hard-coded user
# account that other people running this notebook cannot write to.
tokenizer.push_to_hub("mi-modelo")

README.md:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/BrunoJaime/mi-modelo/commit/c92f2a67a863cbf3334bca4beea33c539d8f6fe4', commit_message='Upload tokenizer', commit_description='', oid='c92f2a67a863cbf3334bca4beea33c539d8f6fe4', pr_url=None, pr_revision=None, pr_num=None)