#### **Load Tokenized Data**

In [None]:
# Install required packages
!pip install datasets --quiet
!pip install evaluate --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Find Colab environment
import os

# The GOOGLE_CLOUD_PROJECT environment variable is used as the signal here:
# when it is set the notebook treats the runtime as Colab Enterprise,
# otherwise as Colab Research. `colab_env` keeps the raw value (or None).
colab_env = os.environ.get('GOOGLE_CLOUD_PROJECT')
if colab_env is None:
    print("Using Colab Research")
else:
    print("Using Colab Enterprise")

Using Colab Enterprise


In [None]:
# Login to Hugging Face
from huggingface_hub import login

if colab_env == None:
    # Get access token from Hugging Face hub
    from google.colab import userdata

    HF_TOKEN = userdata.get('HF_TOKEN')
else:
    # Get access token from Secret Manager
    !pip install google-cloud-secret-manager --quiet
    from google.cloud import secretmanager

    client = secretmanager.SecretManagerServiceClient()
    project_id = !gcloud config get-value project
    secret_name = f"projects/{project_id[0]}/secrets/HF_TOKEN/versions/latest"
    response = client.access_secret_version(request={"name": secret_name})
    HF_TOKEN = response.payload.data.decode("UTF-8")

if HF_TOKEN:
    login(HF_TOKEN)
    print("Successfully logged in to Hugging Face!")
else:
    print("Token is not set. Please set the token first.")

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/218.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━[0m [32m153.6/218.1 kB[0m [31m4.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m218.1/218.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hSuccessfully logged in to Hugging Face!


In [None]:
# Pull the pre-tokenized dataset dict anjan-k/Sentiment-Analysis-Tokenized
# from the Hugging Face Hub
from datasets import load_dataset

sentiment_analysis_tokenized = load_dataset("anjan-k/Sentiment-Analysis-Tokenized")

# Show the splits together with their columns and row counts
print(
    "Tokenized sentiment analysis dataset dict:",
    sentiment_analysis_tokenized,
    sep="\n",
)

README.md:   0%|          | 0.00/670 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/4.34M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/737k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/720k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/31232 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5205 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5206 [00:00<?, ? examples/s]

Tokenized sentiment analysis dataset dict:
DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label', 'sentiment', 'input_ids', 'attention_mask'],
        num_rows: 31232
    })
    validation: Dataset({
        features: ['id', 'text', 'label', 'sentiment', 'input_ids', 'attention_mask'],
        num_rows: 5205
    })
    test: Dataset({
        features: ['id', 'text', 'label', 'sentiment', 'input_ids', 'attention_mask'],
        num_rows: 5206
    })
})


In [None]:
# Pick out the tokenized train and validation splits used for fine-tuning
tokenized_train, tokenized_val = (
    sentiment_analysis_tokenized[split_name]
    for split_name in ("train", "validation")
)

#### **Training Prerequisites**

In [None]:
# Load the tokenizer and the sequence-classification model from one checkpoint
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "distilbert/distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# num_labels=3 requests a classification head with three output classes
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
)

In [None]:
# Collator that batches the training samples and converts them to PyTorch
# tensors; it is passed to the Trainer as `data_collator`.
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
import numpy as np
import evaluate

# Define a function to compute metrics

# Metric objects are loaded lazily and kept here, so evaluate.load() runs once
# per metric instead of on every evaluation pass.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the `evaluate` metric called `name`, loading it on first use only."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each
            sample is the argmax over the last axis of ``logits``.

    Returns:
        A dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    accuracy = _get_metric("accuracy").compute(
        predictions=predictions, references=labels
    )["accuracy"]
    f1 = _get_metric("f1").compute(
        predictions=predictions, references=labels, average="weighted"
    )["f1"]
    return {"accuracy": accuracy, "f1": f1}

#### **Train Model**

In [None]:
# Use Hugging Face Trainer
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

# Define training arguments
training_args = TrainingArguments(
    output_dir="Sentiment-Analysis-FineTune-HuggingFace",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Upper bound only: the early-stopping callback below normally ends
    # training well before 25 epochs.
    num_train_epochs=25,
    weight_decay=0.01,
    # Evaluate and checkpoint on the same schedule (every epoch) so the best
    # checkpoint can be restored when training ends.
    eval_strategy="epoch",
    save_strategy="epoch",
    optim="adamw_torch",
    # No metric_for_best_model is given; per the Transformers documentation the
    # evaluation loss is then used to pick the best checkpoint.
    load_best_model_at_end=True,
    report_to="none",
)

# Instantiate the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    # `processing_class` replaces the deprecated `tokenizer` argument, which
    # triggers a deprecation warning on this Trainer(...) call.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # Stop once the tracked metric has not improved for 3 consecutive evaluations
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# Train the model
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6061,0.576078,0.761575,0.762049
2,0.4957,0.593605,0.758886,0.760285
3,0.382,0.663915,0.753506,0.754288
4,0.2679,0.852115,0.75293,0.751589


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

TrainOutput(global_step=7808, training_loss=0.4436363831895297, metrics={'train_runtime': 2165.6318, 'train_samples_per_second': 360.541, 'train_steps_per_second': 22.534, 'total_flos': 1.422857703158784e+16, 'train_loss': 0.4436363831895297, 'epoch': 4.0})

In [None]:
# Publish the fine-tuned model and its tokenizer to the Hugging Face Hub;
# the returned commit information is shown as the cell output.
commit_info = trainer.push_to_hub()
commit_info

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...eTune-HuggingFace/model.safetensors:   2%|1         | 4.59MB /  268MB            

  ...eTune-HuggingFace/training_args.bin:   7%|7         |   401B / 5.37kB            

CommitInfo(commit_url='https://huggingface.co/anjan-k/Sentiment-Analysis-FineTune-HuggingFace/commit/adf772afc2c3d75c5ebbbd8714357fa183fb4e91', commit_message='End of training', commit_description='', oid='adf772afc2c3d75c5ebbbd8714357fa183fb4e91', pr_url=None, repo_url=RepoUrl('https://huggingface.co/anjan-k/Sentiment-Analysis-FineTune-HuggingFace', endpoint='https://huggingface.co', repo_type='model', repo_id='anjan-k/Sentiment-Analysis-FineTune-HuggingFace'), pr_revision=None, pr_num=None)