# Final Project: Hugging Face 

This notebook is based on the Hugging Face Datasets quickstart (vision section): <https://huggingface.co/docs/datasets/en/quickstart#vision>

In [1]:
# # Pip installing necessary items
# ! pip install datasets
# ! pip install datasets[vision] #used to work with the Image features 

# # I will be using PyTorch for this project, so I will install it here. But you can use TensorFlow if you prefer
# ! pip install torch
# ! pip install torchvision


# # I will be using the Hugging Face Transformers library for this project
# ! pip install transformers[torch]
# ! pip install "accelerate>=0.26.0"
# ! pip install transformers
# ! pip install evaluate
# ! pip install Pillow
# ! pip install evaluate



In [2]:
import numpy as np 
import evaluate
import torch
import matplotlib.pyplot as plt

from datasets import load_dataset, Image
from transformers import AutoImageProcessor
from torchvision.transforms import Compose, ColorJitter, ToTensor,  RandomResizedCrop, Normalize
from transformers import DefaultDataCollator
from torch.utils.data import DataLoader
from transformers import AutoModelForImageClassification, TrainingArguments, Trainer
from transformers import ViTFeatureExtractor, ViTForImageClassification, ViTImageProcessor, SwinForImageClassification
from transformers import pipeline
from PIL import Image


# Accuracy metric loaded through the `evaluate` library; the later
# `compute_metrics(eval_pred)` definition calls `accuracy.compute(...)`.
accuracy = evaluate.load("accuracy")



In [3]:
# Load the "beans" dataset by name. Later cells read its "train" and "validation"
# splits directly, so the manual train/test split below stays disabled.
dataset = load_dataset("beans")
# dataset = dataset.train_test_split(test_size=0.2)

In [4]:
# Show the column schema of the training split (no trailing comma, so the schema
# itself is displayed rather than a one-element tuple wrapping it).
# dataset["train"][0]  # uncomment to inspect a single example instead
dataset["train"].features

({'image_file_path': Value(dtype='string', id=None),
  'image': Image(mode=None, decode=True, id=None),
  'labels': ClassLabel(names=['angular_leaf_spot', 'bean_rust', 'healthy'], id=None)},)

In [5]:
# Keep a handle on the `labels` column feature of the training split.
# NOTE: `labels` is rebound in a later cell to the list of class-name strings
# (`.names`), so this object is only valid until that cell runs.
labels = dataset['train'].features['labels']


In [6]:
# An image processor is needed to turn each image into a `pixel_values` tensor.
# It is loaded from the same `checkpoint` string that is later used to load the model.
# checkpoint = "google/vit-base-patch16-224-in21k"
checkpoint = "microsoft/swin-tiny-patch4-window7-224"
image_processor = AutoImageProcessor.from_pretrained(checkpoint)

In [7]:
def process_example(example):
    """Preprocess a single dataset row.

    Args:
        example: Mapping with an ``'image'`` entry and a ``'labels'`` entry.

    Returns:
        The output of ``image_processor`` for the image (PyTorch tensors
        requested), with the row's label stored under ``'labels'``.
    """
    encoded = image_processor(example['image'], return_tensors='pt')
    # Carry the target through so the processed row is self-contained.
    encoded['labels'] = example['labels']
    return encoded


In [8]:
def transform(example_batch):
    """Preprocess a batch of dataset rows.

    Args:
        example_batch: Mapping whose ``'image'`` entry is an iterable of images
            and whose ``'labels'`` entry holds the matching labels.

    Returns:
        The output of ``image_processor`` for the whole batch (PyTorch tensors
        requested), with the batch labels stored under ``'labels'``.
    """
    # Materialise the images as a plain list before handing them to the processor.
    images = list(example_batch['image'])
    batch_inputs = image_processor(images, return_tensors='pt')

    # The labels must travel with the pixel values.
    batch_inputs['labels'] = example_batch['labels']
    return batch_inputs

In [9]:
# Wrap the dataset so that indexing it returns `transform`-processed batches
# (the next cell's output shows the resulting `pixel_values`).
prepared_ds = dataset.with_transform(transform)

In [10]:
# Sanity check: slicing two rows should come back as processed `pixel_values` tensors.
prepared_ds['train'][0:2]

{'pixel_values': tensor([[[[-1.1760, -1.1760, -1.1760,  ..., -0.0458,  0.6563, -0.5253],
          [-1.2617, -1.2445, -1.2274,  ...,  0.0569,  0.1083, -0.5938],
          [-1.3130, -1.3302, -1.2959,  ..., -0.4397, -0.7650, -0.4226],
          ...,
          [-1.1075, -1.1247, -1.1932,  ..., -0.0801, -0.0801, -0.2342],
          [-1.1589, -1.2103, -1.2103,  ...,  0.8447,  0.4508,  0.3652],
          [-1.1932, -1.2274, -1.2617,  ...,  0.6563,  0.3652,  0.5193]],

         [[-1.3880, -1.3880, -1.4755,  ..., -0.6702, -0.0049, -1.2129],
          [-1.4405, -1.4405, -1.5105,  ..., -0.5301, -0.4076, -1.1429],
          [-1.4755, -1.4755, -1.5280,  ..., -0.9678, -1.3179, -1.0028],
          ...,
          [-1.5105, -1.5455, -1.6155,  ..., -0.3375,  0.0476,  0.3452],
          [-1.5805, -1.6155, -1.6155,  ...,  0.6954,  0.9930,  1.0805],
          [-1.6155, -1.6331, -1.6506,  ...,  0.3102,  0.8529,  1.0980]],

         [[-1.7870, -1.7870, -1.8044,  ..., -0.8284, -0.2532, -1.2467],
          [-1

In [11]:
import torch

def collate_fn(batch):
    """Merge a list of processed samples into one batch dictionary.

    Args:
        batch: Sequence of mappings, each with a ``'pixel_values'`` tensor and a
            ``'labels'`` value.

    Returns:
        dict with ``'pixel_values'`` (the per-sample tensors stacked along a new
        leading dimension) and ``'labels'`` (a tensor built from the per-sample
        labels).
    """
    stacked_pixels = torch.stack([sample['pixel_values'] for sample in batch])
    label_tensor = torch.tensor([sample['labels'] for sample in batch])
    return dict(pixel_values=stacked_pixels, labels=label_tensor)


In [12]:
import evaluate

metric = evaluate.load("accuracy")


def compute_metrics(p):
    """Compute accuracy from a prediction object.

    NOTE: a later cell defines another ``compute_metrics``; when the notebook is
    run top to bottom that later definition replaces this one.

    Args:
        p: Object exposing ``predictions`` (per-class scores) and ``label_ids``.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max class ids versus
        ``p.label_ids``.
    """
    predicted_ids = np.argmax(p.predictions, axis=1)
    return metric.compute(predictions=predicted_ids, references=p.label_ids)


In [13]:
# Create a function that passes the predictions and labels to the accuracy metric
def compute_metrics(eval_pred):
    """Compute accuracy for a ``(predictions, labels)`` pair.

    Args:
        eval_pred: Two-item iterable; the first item holds per-class scores and
            the second the reference labels.

    Returns:
        Whatever ``accuracy.compute`` returns for the arg-max class ids versus
        the reference labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [14]:
from transformers import AutoModelForImageClassification
import torch.nn as nn

# Class-name strings of the `labels` column, as listed by the dataset feature.
labels = dataset['train'].features['labels'].names

# Load the pretrained checkpoint with a classification head sized for this dataset.
# The checkpoint ships a 1000-class head; `ignore_mismatched_sizes=True` lets
# `from_pretrained` skip those weights and create a freshly initialised head with
# `num_labels` outputs (the load log reports exactly this for classifier.weight/bias).
# No manual replacement of `model.classifier` is therefore needed; doing so would
# only throw away the head that was just built.
model = AutoModelForImageClassification.from_pretrained(
    checkpoint,
    num_labels=len(labels),
    id2label={str(i): c for i, c in enumerate(labels)},
    label2id={c: str(i) for i, c in enumerate(labels)},
    ignore_mismatched_sizes=True
)


Some weights of SwinForImageClassification were not initialized from the model checkpoint at microsoft/swin-tiny-patch4-window7-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([3]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
from transformers import TrainingArguments

# Training configuration.
# NOTE(review): the output directory name says "vit" although the checkpoint in use is
# a Swin model; it is left unchanged so existing artifacts are not relocated.
training_args = TrainingArguments(
    output_dir="./vit-base-beans",
    per_device_train_batch_size=16,
    eval_strategy="steps",  # current name of the deprecated `evaluation_strategy`
    num_train_epochs=4,
    fp16=False,  # Ensure fp16 is disabled
    save_steps=100,  # kept equal to eval_steps so checkpoints line up with evaluations
    eval_steps=100,
    logging_steps=10,
    learning_rate=2e-4,
    weight_decay=0.01,
    save_total_limit=2,
    remove_unused_columns=False,  # `transform` reads the raw `image` column, so keep it
    push_to_hub=True,
    report_to='tensorboard',
    load_best_model_at_end=True,
    use_cpu=True,  # Force the use of CPU (current name of the deprecated `no_cuda`)
)




In [16]:
from transformers import Trainer

# Wire the model, training arguments, batching, metric and datasets together.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    train_dataset=prepared_ds["train"],
    eval_dataset=prepared_ds["validation"],
    # `processing_class` is the current name of the deprecated `tokenizer` argument.
    processing_class=image_processor,
)


  trainer = Trainer(


In [None]:
# Fine-tune the model, then save the model, the training metrics and the trainer state.
# The order matters: the metrics logged/saved below come from the finished training run.
train_results = trainer.train()
trainer.save_model()
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()


  0%|          | 0/260 [00:00<?, ?it/s]

{'loss': 0.9649, 'grad_norm': 15.253263473510742, 'learning_rate': 0.00019230769230769233, 'epoch': 0.15}
{'loss': 0.4952, 'grad_norm': 12.642136573791504, 'learning_rate': 0.00018461538461538463, 'epoch': 0.31}
{'loss': 0.3586, 'grad_norm': 6.72530460357666, 'learning_rate': 0.00017692307692307693, 'epoch': 0.46}
{'loss': 0.4554, 'grad_norm': 36.84693908691406, 'learning_rate': 0.00016923076923076923, 'epoch': 0.62}
{'loss': 0.3472, 'grad_norm': 8.32830810546875, 'learning_rate': 0.00016153846153846155, 'epoch': 0.77}
{'loss': 0.2445, 'grad_norm': 12.446823120117188, 'learning_rate': 0.00015384615384615385, 'epoch': 0.92}
{'loss': 0.2426, 'grad_norm': 9.914612770080566, 'learning_rate': 0.00014615384615384615, 'epoch': 1.08}
{'loss': 0.1964, 'grad_norm': 9.404363632202148, 'learning_rate': 0.00013846153846153847, 'epoch': 1.23}
{'loss': 0.0612, 'grad_norm': 3.2337546348571777, 'learning_rate': 0.00013076923076923077, 'epoch': 1.38}
{'loss': 0.1068, 'grad_norm': 37.80059051513672, 'lea

  0%|          | 0/17 [00:00<?, ?it/s]

{'eval_loss': 0.06387340277433395, 'eval_accuracy': 0.9774436090225563, 'eval_runtime': 5.761, 'eval_samples_per_second': 23.086, 'eval_steps_per_second': 2.951, 'epoch': 1.54}
{'loss': 0.1322, 'grad_norm': 35.027339935302734, 'learning_rate': 0.00011538461538461538, 'epoch': 1.69}
{'loss': 0.144, 'grad_norm': 0.8483041524887085, 'learning_rate': 0.0001076923076923077, 'epoch': 1.85}
{'loss': 0.104, 'grad_norm': 0.03202478587627411, 'learning_rate': 0.0001, 'epoch': 2.0}
{'loss': 0.0698, 'grad_norm': 0.2923188805580139, 'learning_rate': 9.230769230769232e-05, 'epoch': 2.15}
{'loss': 0.0238, 'grad_norm': 0.008691404946148396, 'learning_rate': 8.461538461538461e-05, 'epoch': 2.31}


In [None]:
# Evaluate on the `eval_dataset` handed to the Trainer (the "validation" split).
metrics = trainer.evaluate()
# Optional extras: print the evaluation metrics and save them.
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

  0%|          | 0/17 [00:00<?, ?it/s]

***** eval metrics *****
  epoch                   =        4.0
  eval_accuracy           =        1.0
  eval_loss               =     0.0043
  eval_runtime            = 0:00:04.59
  eval_samples_per_second =     28.974
  eval_steps_per_second   =      3.703
