In [None]:
!pip install transformers datasets

!pip install torch

!pip install tensorflow

In [4]:
import torch
from transformers  import pipeline



In [5]:
from transformers import AutoTokenizer

# Hub identifier of the checkpoint used by the inference cells of this notebook.
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"

# Load the tokenizer that belongs to this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [6]:
# Tokenize a single sentence and show the resulting dictionary
# (input_ids, token_type_ids and attention_mask).
encoding = tokenizer("We are very happy to celebrate the labor day.")
print(encoding)

{'input_ids': [101, 11312, 10320, 12495, 19308, 10114, 59571, 10103, 19106, 11111, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [9]:
# Tokenize two sentences as one batch and return TensorFlow tensors.
# padding=True pads the shorter sentence so both rows have the same length;
# truncation=True with max_length=512 caps every sequence at 512 tokens.
tf_batch = tokenizer(
    [
        "We are very happy to celebrate the labor day.",
        "Hope you don't just sleep all the time.",
    ],
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="tf",
)
print(tf_batch)



{'input_ids': <tf.Tensor: shape=(2, 13), dtype=int32, numpy=
array([[  101, 11312, 10320, 12495, 19308, 10114, 59571, 10103, 19106,
        11111,   119,   102,     0],
       [  101, 18763, 10855, 11530,   112,   162, 12125, 32524, 10367,
        10103, 10573,   119,   102]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(2, 13), dtype=int32, numpy=
array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(2, 13), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int32)>}


In [10]:
# Same two-sentence batch as the TensorFlow cell, returned as PyTorch tensors.
pt_batch = tokenizer(
    [
        "We are very happy to celebrate the labor day.",
        "Hope you don't just sleep all the time.",
    ],
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",
)

In [11]:
# AutoModel using PyTorch
from transformers import AutoModelForSequenceClassification

model_name = "nlptown/bert-base-multilingual-uncased-sentiment"

# Load the checkpoint through the sequence-classification auto class.
pt_model = AutoModelForSequenceClassification.from_pretrained(model_name)



Downloading pytorch_model.bin:   0%|          | 0.00/669M [00:00<?, ?B/s]

In [12]:
# Pass the pre-processed batch of inputs directly to the model. The `**` unpacks
# the tokenizer's dictionary into keyword arguments.
# This is inference only, so autograd is disabled: no graph is built and the
# resulting logits carry no grad_fn.
with torch.no_grad():
    pt_outputs = pt_model(**pt_batch)

In [13]:
# Predictions using PyTorch. The model exposes its final activations in the
# `logits` attribute; applying softmax over the last dimension turns them into
# probabilities.
from torch import nn

pt_prediction = nn.functional.softmax(pt_outputs.logits, dim=-1)
print(pt_prediction)

tensor([[0.0020, 0.0018, 0.0168, 0.2525, 0.7268],
        [0.1609, 0.1292, 0.1669, 0.2077, 0.3354]], grad_fn=<SoftmaxBackward0>)


In [14]:
# AutoModel using TensorFlow
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification

model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name)

# Forward pass on the TensorFlow batch, then softmax over the last axis of the
# logits to obtain probabilities.
tf_outputs = tf_model(tf_batch)
tf_predictions = tf.nn.softmax(tf_outputs.logits, axis=-1)

# Leave the tensor as the last expression so the notebook displays it.
tf_predictions


Downloading tf_model.h5:   0%|          | 0.00/670M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at nlptown/bert-base-multilingual-uncased-sentiment.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


<tf.Tensor: shape=(2, 5), dtype=float32, numpy=
array([[0.00204665, 0.0018379 , 0.01683202, 0.25250512, 0.72677827],
       [0.16092661, 0.12921175, 0.1668502 , 0.20765285, 0.3353586 ]],
      dtype=float32)>

In [None]:
# Building a custom model
#
# A model's configuration class controls how the model is built: the
# configuration specifies attributes such as the number of hidden layers or
# attention heads. Import AutoConfig, load the configuration of the pretrained
# model to be modified, and pass the attribute to override as a keyword argument
# to AutoConfig.from_pretrained().
from transformers import AutoConfig

# NOTE(review): 12 may already be this checkpoint's number of attention heads;
# confirm that the override actually changes the architecture.
my_config = AutoConfig.from_pretrained("distilbert-base-uncased", n_heads=12)


In [None]:
# PyTorch framework
from transformers import AutoModel

# Instantiate a model from the custom configuration created in the previous cell.
my_model = AutoModel.from_config(my_config)

In [None]:
# TensorFlow framework
from transformers import TFAutoModel

# Same configuration, TensorFlow class. Note that this rebinds `my_model`,
# replacing the PyTorch instance created in the previous cell.
my_model = TFAutoModel.from_config(my_config)

In [None]:
# Trainer - a PyTorch-optimized training loop

In [None]:
# 1. Start with a PreTrainedModel or a torch.nn.Module
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")

In [None]:

# 2. TrainingArguments contains the model hyperparameters you can change, such
# as the learning rate, the batch size and the number of epochs to train for.
# Default values are used for any training argument you do not specify.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="path/to/save/folder/",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
)

In [None]:
# 3. Load a preprocessing class like a tokenizer, image processor, feature extractor, or processor
from transformers import AutoTokenizer

# Rebinds `tokenizer` to the same checkpoint name the Trainer model was loaded from.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
# 4. Load a dataset
from datasets import load_dataset

# Load the "rotten_tomatoes" dataset by name.
dataset = load_dataset("rotten_tomatoes")

In [None]:
# 5a. Create a function to tokenize the dataset
def tokenize_dataset(dataset):
    """Tokenize the ``"text"`` entry of the examples handed over by ``map``.

    Args:
        dataset: Mapping with a ``"text"`` entry (with ``batched=True`` this is
            a batch of examples).

    Returns:
        The tokenizer output for the given text.
    """
    # truncation=True clips any text that exceeds the model's maximum input
    # length instead of producing a sequence the model cannot consume.
    return tokenizer(dataset["text"], truncation=True)


# 5b. Then apply it over the entire dataset with map
dataset = dataset.map(tokenize_dataset, batched=True)

# 5c. A DataCollatorWithPadding to create a batch of examples from your dataset
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [None]:
# 6. Gather all these classes in the Trainer
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# 7. Call train() to start training
trainer.train()

In [None]:
# Train with Tensorflow

# All models are standard tf.keras.Model instances, so they can be trained in TensorFlow with the Keras API. HF Transformers provides the prepare_tf_dataset() method to easily load a dataset as a tf.data.Dataset so that it can be
# trained right away with Keras' compile and fit methods.

In [None]:
# 1. Start with a TFPreTrainedModel
from transformers import TFAutoModelForSequenceClassification

model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")

# 2. Load a preprocessing class like a tokenizer, image processor, feature extractor, or processor
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


# 3. Create a function to tokenize the dataset
def tokenize_dataset(dataset):
    """Tokenize the ``"text"`` entry of the examples handed over by ``map``.

    Args:
        dataset: Mapping with a ``"text"`` entry (with ``batched=True`` this is
            a batch of examples).

    Returns:
        The tokenizer output for the given text.
    """
    # truncation=True clips any text that exceeds the model's maximum input
    # length instead of producing a sequence the model cannot consume.
    return tokenizer(dataset["text"], truncation=True)


# 4. Apply the tokenizer over the entire dataset with map, and then pass the
# dataset and tokenizer to prepare_tf_dataset.
# `dataset` is the object bound by the earlier "Load a dataset" cell.
# batched=True hands the tokenizer whole batches of examples, the same way the
# map call in the Trainer section does.
dataset = dataset.map(tokenize_dataset, batched=True)

tf_dataset = model.prepare_tf_dataset(
    dataset["train"],
    batch_size=16,
    shuffle=True,
    tokenizer=tokenizer,
)

# 5. Call compile and fit to start training
from tensorflow.keras.optimizers import Adam

# NOTE(review): no loss is passed to compile(); presumably the model falls back
# to its built-in loss - confirm against the Transformers version in use.
model.compile(optimizer=Adam(3e-5))

model.fit(tf_dataset)