In [None]:
!pip install transformers



In [None]:
!nvidia-smi

Sun Nov 17 07:49:47 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

# Hugging Face Tasks

In [None]:
from transformers import pipeline

#------------------------------------------------#
#                  NLP TASKS                     #
#------------------------------------------------#

'''
1. Text Classification: Assigning a category to a piece of text.
Sentiment Analysis
Topic Classification
Spam Detection '''

classifier = pipeline("text-classification")

'''
2. Token Classification: Assigning labels to individual tokens in a sequence.
Named Entity Recognition (NER)
Part-of-speech Tagging
'''

token_classifier = pipeline("token-classification")

'''
3. Question Answering: Extracting an answer from a given context based on a question.
'''

question_answerer = pipeline("question-answering")

'''
4. Text Generation: Generating text based on a given prompt
Language Modeling
Story Generation
'''

text_generator = pipeline("text-generation")

'''
5. Summurization: Condensing long documents into shorter summaries.
'''

summarizer = pipeline("summarization")

'''
6. Text2Text Generation: General-purpose text transformation, including summarization and translation.
'''

text2text_generator = pipeline("text2text-generation")

'''
7. Fill-Mask: Predicting the masked token in a sequence.
'''

fill_mask = pipeline("fill-mask")

'''
Translation: Translating text from one language to another.
'''

translator = pipeline("translation", model="facebook/mbart-large-50-many-to-many-mmt")

'''
8. Feature Extraction: Extracting hidden states or features from text.
'''

feature_extractor = pipeline("feature-extraction")

'''
9. Sentence Similarity: Measuring the similarity between two sentences.
'''
sentence_similarity = pipeline("sentence-similarity")

#---------------------------------------------------#
#             Computer Vision TASKS                 #
#---------------------------------------------------#

'''
1. Image Classification: Classifying the main content of an image.

'''

image_classifier = pipeline("image-classification")

'''
2. Object Detection: Identifying objects within an image and their bounding boxes.
'''

object_detector = pipeline("object-detection")

'''
3. Image Segmentation: Segmenting different parts of an image into classes.
'''

image_segmenter = pipeline("image-segmentation")

'''
4. Image Generation: Generating images from textual descriptions (using DALL-E or similar models).
'''

#---------------------------------------------------#
#             Speech Processing TASKS               #
#---------------------------------------------------#

'''
1. utomatic Speech Recognition (ASR): Converting spoken language into text.
'''

speech_recognizer = pipeline("automatic-speech-recognition")

'''
2. Speech Translation: Translating spoken language from one language to another.
3. Audio Classification: Classifying audio signals into predefined categories.
'''

#---------------------------------------------------#
#                   Multimodal TASKS                #
#---------------------------------------------------#

'''
1. Image Captioning: Generating a textual description of an image.
'''
image_captioner = pipeline("image-to-text")
'''
2. Visual Question Answering (VQA): Answering questions about the content of an image.
'''

#---------------------------------------------------#
#                     Other TASKS                   #
#---------------------------------------------------#
'''
1. Table Question Answering: Answering questions based on tabular data.
'''
table_qa = pipeline("table-question-answering")

'''
2. Document Question Answering: Extracting answers from documents like PDFs.

'''
doc_qa = pipeline("document-question-answering")
'''
3. Time Series Forecasting: Predicting future values in time series data (not directly supported in the main Transformers library but available through extensions).
'''

# NLP Tasks

# Sentiment Analysis

In [None]:
from transformers import pipeline

classifier = pipeline("sentiment-analysis")
result = classifier("I was not happy with the last Mission Impossible movie.")
print(result)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'label': 'NEGATIVE', 'score': 0.9997472167015076}]


In [None]:
result = classifier("I love to travel to London.")
print(result)

[{'label': 'POSITIVE', 'score': 0.999681830406189}]


In [None]:
result = classifier("I don't hate any James Bond movies.")
print(result)

[{'label': 'POSITIVE', 'score': 0.993482768535614}]


In [None]:
result = classifier("I hate any James Bond movies.")
print(result)

[{'label': 'NEGATIVE', 'score': 0.9991707801818848}]


In [None]:
pipeline(task="sentiment-analysis")("I was confused with the Barbie movie")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'label': 'NEGATIVE', 'score': 0.9992005228996277}]

In [None]:
classifier = pipeline(task = "sentiment-analysis")

task_list = ["I really like Autoencoders, best models for Anomaly Detection", \
            "I am not sure if we CAN actually Evaluate LLMs.", \
            "PassiveAgressive is the name of a Linear Regression Model that so many people do not know.",\
            "I hate long Meetings."]
classifier(task_list)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'label': 'POSITIVE', 'score': 0.9978686571121216},
 {'label': 'NEGATIVE', 'score': 0.9995476603507996},
 {'label': 'NEGATIVE', 'score': 0.9983083009719849},
 {'label': 'NEGATIVE', 'score': 0.9969879984855652}]

# Text Generation

In [None]:
# Use pipeline as a high-level helper

from transformers import pipeline

text_generator = pipeline("text-generation", model = "distilbert/distilgpt2")
generated_text = text_generator("Today is a rainy day in London", truncation=True, num_return_sequences=2)
print("Generated_text:\n ", generated_text[0]["generated_text"])

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated_text:
  Today is a rainy day in London.

On the other side of Brighton and Chelsea, on the east edge of London all night long the sight of trains is a perfect chance for traffic jams and noise.
It's the same situation yesterday when


# Question Answering

In [None]:
from transformers import pipeline

qa_model = pipeline("question-answering")
question = "What is my job?"
context = "I sell digital products online."
qa_model(question=question, context=context)

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 564e9b5 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


{'score': 0.3406529724597931,
 'start': 7,
 'end': 30,
 'answer': 'digital products online'}

# Tokenization

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DistilBertTokenizer, DistilBertForSequenceClassification, pipeline

In [None]:
model_name2 = "nlptown/bert-base-multilingual-uncased-sentiment"
mymodel2 = AutoModelForSequenceClassification.from_pretrained(model_name2)
mytokenizer2 = AutoTokenizer.from_pretrained(model_name2)

classifier = pipeline("sentiment-analysis", model=mymodel2, tokenizer=mytokenizer2)
res = classifier("I was so not happy with the Barbie movie.")
print(res)

config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/669M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'label': '2 stars', 'score': 0.5481419563293457}]


In [None]:
from transformers import AutoTokenizer

#load a pre-trained tokenizer
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

#example text
text = "I was so not happy with the Barbie movie."

#tokenize the text
tokens = tokenizer.tokenize(text)
print("Tokens:", tokens)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Tokens: ['i', 'was', 'so', 'not', 'happy', 'with', 'the', 'barbie', 'movie', '.']


In [None]:
#convert tokens to input IDs

input_ids = tokenizer.convert_tokens_to_ids(tokens)
print("Input IDs:", input_ids)

Input IDs: [1045, 2001, 2061, 2025, 3407, 2007, 1996, 22635, 3185, 1012]


In [None]:
#encode the text (tokenziation + converting to input IDs)

encoded_input = tokenizer(text)
print("Encoded Input:", encoded_input)

Encoded Input: {'input_ids': [101, 1045, 2001, 2061, 2025, 3407, 2007, 1996, 22635, 3185, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [None]:
# Decode the text

decoded_output = tokenizer.decode(input_ids)
print("Decoded Output:", decoded_output)

Decoded Output: i was so not happy with the barbie movie.


# Fine Tuning using a pre-trained Model

## Step1. Install Necessary Libraries

In [None]:
!pip install datasets



In [None]:
!pip install transformers



## Step2. Load and Prepare the Datasets

In [None]:
from datasets import load_dataset
dataset = load_dataset("stanfordnlp/imdb")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [None]:
dataset["train"][0]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

## Step3. Preprocess the Data
Tokenize the dataset using the tokenizer associated with the pre-trained model.

In [None]:
from transformers import AutoTokenizer

#load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

#tokenize the dataset
def tokenize_function(examples):
  return tokenizer(examples["text"], padding="max_length", truncation=True)


tokenize_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [None]:
tokenize_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 50000
    })
})

In [None]:
tokenize_datasets["train"][0]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

## Step4. Set up the training arguments

Specify the hyperparameters and training settings

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",                    #Output directory
    eval_strategy="epoch",                     #Evaluate every epoch
    learning_rate=2e-5,                        #Learning rate
    per_device_train_batch_size=16,            #Batch size for training
    per_device_eval_batch_size=16,             #Batch size for evaluation
    num_train_epochs=3,                        #Number of training epochs
    weight_decay=0.01                          #strength of weight decay
)

training_args

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=epoch,
eval_use_gather_object

## Step5. Initialize the model

Load the pre-trained model and define the training procedure

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer

#load pre-trained model
model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-uncased", num_labels=2)

#initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenize_datasets["train"],
    eval_dataset=tokenize_datasets["test"]
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Step6. Train the model

Fine-tune the pre-trained model on your specific dataset

In [None]:
# Train the model
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,0.2072,0.2052


## Step7. Evaluate the model
Assess the model's performence on a validation set

In [None]:
# Evaluate the model
results = trainer.evaluate()
print(results)

## Step8. Save the fine-tuned model
Save the fine-tuned model for later use

In [None]:
# Save the model
model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-tokenizer')