In [1]:
!pip install transformers evaluate datasets -qU

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import transformers

#Set to avoid warning messages.
transformers.logging.set_verbosity_error()

## Pre-built pipelines
# transformers library has various pre-biult pipelines for various NLP and CV tasks
from transformers.pipelines import PIPELINE_REGISTRY

#Get the list of tasks that are supported by Huggingface pipeline

pip_list = PIPELINE_REGISTRY.get_supported_tasks()
for i in pip_list: print(i)


audio-classification
automatic-speech-recognition
conversational
depth-estimation
document-question-answering
feature-extraction
fill-mask
image-classification
image-feature-extraction
image-segmentation
image-to-image
image-to-text
mask-generation
ner
object-detection
question-answering
sentiment-analysis
summarization
table-question-answering
text-classification
text-generation
text-to-audio
text-to-speech
text2text-generation
token-classification
translation
video-classification
visual-question-answering
vqa
zero-shot-audio-classification
zero-shot-classification
zero-shot-image-classification
zero-shot-object-detection


In [4]:
#Get information about a specific task, pt - PyTorch, tf - TensorFlow
print("\nDefault Model for Sentiment Analysis: ")
print(PIPELINE_REGISTRY.check_task('sentiment-analysis')[1].get('default'))


Default Model for Sentiment Analysis: 
{'model': {'pt': ('distilbert/distilbert-base-uncased-finetuned-sst-2-english', 'af0f99b'), 'tf': ('distilbert/distilbert-base-uncased-finetuned-sst-2-english', 'af0f99b')}}


## Sentiment Analysis

In [5]:
from transformers import pipeline
import os

#Load a pipeline. This will download the model checkpoint from huggingface and cache it
#locally on disk. If model is already available in cache, it will simply use the cached version
#Download will usually take a long time, depending on network bandwidth

sentiment_classifier = pipeline("sentiment-analysis")

#Cache usually available at : <<user-home>>.cache\huggingface\hub

cache_dir = os.path.expanduser('~') + "/.cache/huggingface/hub"
print("Huggingface Cache directory is : ", cache_dir)

#Contents of cache directory
os.listdir(cache_dir)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Huggingface Cache directory is :  /root/.cache/huggingface/hub


['models--distilbert--distilbert-base-uncased-finetuned-sst-2-english',
 '.locks',
 'version.txt']

In [6]:
#Predict sentiment using the pipeline
sentiment_results=sentiment_classifier("This is a great course")
print(sentiment_results)

[{'label': 'POSITIVE', 'score': 0.9998713731765747}]


In [7]:
# using a specific model in the pipeline
sentiment_classifier = pipeline(task="sentiment-analysis",
                                model="finiteautomata/bertweet-base-sentiment-analysis")

sentiment_result=sentiment_classifier("This is a great course")

print(sentiment_result)

#Contents of cache directory
os.listdir(cache_dir)

config.json:   0%|          | 0.00/949 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/540M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/338 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/843k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/22.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

[{'label': 'POS', 'score': 0.9920700192451477}]


['models--finiteautomata--bertweet-base-sentiment-analysis',
 'models--distilbert--distilbert-base-uncased-finetuned-sst-2-english',
 '.locks',
 'version.txt']

## Named Entity Recognition

In [8]:
# extract NERs from text (person, company, location, date, custom, order_number, etc)
from transformers import pipeline

input_text="Sam went to California on the 23rd of August. \
There, he visited Google headquarters with John Smith and bought a cap for $23"

basic_ner = pipeline("ner")

basic_ner(input_text)

config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

[{'entity': 'I-PER',
  'score': 0.99887806,
  'index': 1,
  'word': 'Sam',
  'start': 0,
  'end': 3},
 {'entity': 'I-LOC',
  'score': 0.99972683,
  'index': 4,
  'word': 'California',
  'start': 12,
  'end': 22},
 {'entity': 'I-ORG',
  'score': 0.9960085,
  'index': 15,
  'word': 'Google',
  'start': 64,
  'end': 70},
 {'entity': 'I-PER',
  'score': 0.99891376,
  'index': 18,
  'word': 'John',
  'start': 89,
  'end': 93},
 {'entity': 'I-PER',
  'score': 0.99921584,
  'index': 19,
  'word': 'Smith',
  'start': 94,
  'end': 99}]

In [9]:
#Using a Custom Model and tokenizer
from transformers import AutoTokenizer, TFAutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/camembert-ner-with-dates",
                                          from_pt=True)

model = TFAutoModelForTokenClassification.from_pretrained("Jean-Baptiste/camembert-ner-with-dates",
                                                          from_pt=True)

print(model.config.id2label)

tokenizer_config.json:   0%|          | 0.00/423 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/970 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

{0: 'O', 1: 'I-LOC', 2: 'I-PER', 3: 'I-MISC', 4: 'I-ORG', 5: 'I-DATE'}


In [10]:
#Prediction
enhanced_ner = pipeline('ner',
                        model=model,
                        tokenizer=tokenizer,
                        aggregation_strategy="simple")
enhanced_ner(input_text)

[{'entity_group': 'PER',
  'score': 0.9776213,
  'word': 'Sam',
  'start': 0,
  'end': 3},
 {'entity_group': 'LOC',
  'score': 0.9936407,
  'word': 'California',
  'start': 11,
  'end': 22},
 {'entity_group': 'DATE',
  'score': 0.9235597,
  'word': 'August',
  'start': 37,
  'end': 44},
 {'entity_group': 'ORG',
  'score': 0.57216954,
  'word': 'Google',
  'start': 63,
  'end': 70},
 {'entity_group': 'PER',
  'score': 0.9938346,
  'word': 'John Smith',
  'start': 88,
  'end': 99},
 {'entity_group': 'DATE',
  'score': 0.6406439,
  'word': '23',
  'start': 122,
  'end': 124}]

## Question answering

In [11]:
# importing answering pipeline from Transformers and pre-trained model deepset/minilm-uncased-squad2
# model is trained on domain-specific limited text, fast and accurate

from transformers import pipeline

# providing context for the model
context="""
Earth is the third planet from the Sun and the only astronomical object
known to harbor life. While large volumes of water can be found
throughout the Solar System, only Earth sustains liquid surface water.
About 71% of Earth's surface is made up of the ocean, dwarfing
Earth's polar ice, lakes, and rivers. The remaining 29% of Earth's
surface is land, consisting of continents and islands.
Earth's surface layer is formed of several slowly moving tectonic plates,
interacting to produce mountain ranges, volcanoes, and earthquakes.
Earth's liquid outer core generates the magnetic field that shapes Earth's
magnetosphere, deflecting destructive solar winds.
"""

# activating pipelone
quan_pipeline = pipeline("question-answering",
                         model="deepset/minilm-uncased-squad2")

# generating answer for our question, based on our context
answer=quan_pipeline(question="How much of earth is land?",
             context=context)
print(answer)

config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/107 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

{'score': 0.9553403258323669, 'start': 327, 'end': 330, 'answer': '29%'}


## Natural Language Generation

In [12]:
# cintent creation
from transformers import pipeline

text_generator = pipeline("text-generation",
                          model="gpt2")
transformers.set_seed(1)

input_text="Natural Language Processing is a \
growing domain in machine learning"

synthetic_text=text_generator(input_text,
                              num_return_sequences=3,
                              max_new_tokens=50)

for text in synthetic_text:
    print(text.get("generated_text") ,"\n-----------------")

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Natural Language Processing is a growing domain in machine learning. At Caltech's Computer Science & Artificial Intelligence Lab, we design, benchmark, and train applications to detect, extract, and interpret binary numbers and other information by searching neural networks. Our algorithms are designed to identify natural languages by the complexity of their 
-----------------
Natural Language Processing is a growing domain in machine learning, as well as as in non-interactive programming using machine learning techniques in general.

Machine Learning In Machine Learning

In addition to machine learning, machine learning also incorporates the use of many other processes, including reinforcement learning for examples, 
-----------------
Natural Language Processing is a growing domain in machine learning, providing solutions for many human-related problems including visual memory and complex language processing. Its main challenge has been overcoming the computational complexities of dee

## Bot conversation

In [13]:
# chatbot conversation example
from transformers import  Conversation

conversational_pipeline = pipeline("conversational",
                                   model="facebook/blenderbot_small-90M")

print(conversational_pipeline.model.config)

config.json:   0%|          | 0.00/1.51k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/350M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/311 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/964k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/345k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

BlenderbotSmallConfig {
  "_name_or_path": "facebook/blenderbot_small-90M",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BlenderbotSmallForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 512,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 2048,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 8,
  "decoder_start_token_id": 1,
  "do_blenderbot_90_layernorm": true,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 2048,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 8,
  "eos_token_id": 2,
  "extra_pos_embeddings": 0,
  "force_bos_token_to_be_generated": false,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id

In [14]:
#Sample inputs
first_input="Do you have any hobbies?"
second_input = "I like to watch movies"
third_input = "action movies"

#Create a context
bot_conversation = Conversation(first_input)

print("\nFirst Exchange: \n--------------------")

conversational_pipeline(bot_conversation)
print(" User Input:", bot_conversation.past_user_inputs[0])
print(" Bot Output:", bot_conversation.generated_responses[0])

print("\nSecond Exchange: \n--------------------")
bot_conversation.add_user_input(second_input)
conversational_pipeline(bot_conversation)

print(" User Input:", bot_conversation.past_user_inputs[1])
print(" Bot Output:", bot_conversation.generated_responses[1])

print("\nThird Exchange: \n--------------------")
bot_conversation.add_user_input(third_input)
conversational_pipeline(bot_conversation)

print(" User Input:", bot_conversation.past_user_inputs[2])
print(" Bot Output:", bot_conversation.generated_responses[1])

print("\nAccessing All Responses: ")
print(bot_conversation)


First Exchange: 
--------------------
 User Input: Do you have any hobbies?
 Bot Output: yes, i love going to the beach. what about you? do you have any hobbies?

Second Exchange: 
--------------------
 User Input: I like to watch movies
 Bot Output: i love going to the beach. i also like to watch movies. what kind of movies do you like?

Third Exchange: 
--------------------
 User Input: action movies
 Bot Output: i love going to the beach. i also like to watch movies. what kind of movies do you like?

Accessing All Responses: 
Conversation id: 1a7a6e2b-4d50-4bc6-b519-b7d8042389f5
user: Do you have any hobbies?
assistant: yes, i love going to the beach. what about you? do you have any hobbies?
user: I like to watch movies
assistant: i love going to the beach. i also like to watch movies. what kind of movies do you like?
user: action movies
assistant: i love going to the beach as well. i like action movies as well, but i don't get to see them often.

