In [None]:
pip install transformers datasets torchvision evaluate

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━

In [None]:
from google.colab import drive
import os
import pandas as pd
import torch
from torchvision import transforms
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, classification_report
from torchvision.transforms import v2
from transformers import AutoModelForImageClassification, TrainingArguments, Trainer, AutoImageProcessor
from evaluate import load
from datasets import load_dataset, ClassLabel
import matplotlib.pyplot as plt


In [None]:
# Toggle between the Google Drive copy of the dataset (Colab) and a local checkout.
read_from_drive = True

if read_from_drive:
  # Mount exactly once, and only when the Drive copy is requested, so the
  # local branch does not require Drive access.
  drive.mount('/content/drive')
  train_path = '/content/drive/MyDrive/COMP90086_2024_Project_train/train'
  test_path = '/content/drive/MyDrive/COMP90086_2024_Project_test/test'
else:
  train_path = "./COMP90086_2024_Project_train/train"
  test_path = "./COMP90086_2024_Project_test/test"


Mounted at /content/drive
Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# Load the training images with the generic "imagefolder" builder; the result is indexed by split name below.
dataset = load_dataset("imagefolder", data_dir=train_path)


Resolving data files:   0%|          | 0/7681 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/7681 [00:00<?, ?files/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Pretrained checkpoint used for both the classifier and its image processor.
model_checkpoint = "facebook/dinov2-base"
# Per-device batch size, used for both training and evaluation.
BATCH_SIZE = 32
# Number of training epochs.
EPOCH = 20

In [None]:
# Stable heights are stored as the integers 1..6; expose them as a six-way ClassLabel
# whose class indices are 0..5.
class_labels = ClassLabel(names=["1", "2", "3","4","5","6"])

print("Original features:", dataset['train'].features)

def convert_labels(example):
    """Replace the raw ``stable_height`` of one example with its ClassLabel index.

    Args:
        example: a single dataset row (dict) containing ``stable_height``.

    Returns:
        The same dict with ``stable_height`` rewritten via ``class_labels.str2int``.
    """
    example['stable_height'] = class_labels.str2int(str(example['stable_height']))
    return example

# Convert only once: re-running this cell on an already-converted dataset would push
# class indices (0..5) back through a mapping that expects raw heights (1..6).
if not isinstance(dataset['train'].features['stable_height'], ClassLabel):
    dataset = dataset.map(convert_labels)
    dataset = dataset.cast_column('stable_height', class_labels)

print("Updated features:", dataset['train'].features)

Original features: {'image': Image(mode=None, decode=True, id=None), 'stable_height': Value(dtype='int64', id=None)}


Map:   0%|          | 0/7680 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7680 [00:00<?, ? examples/s]

Updated features: {'image': Image(mode=None, decode=True, id=None), 'stable_height': ClassLabel(names=['1', '2', '3', '4', '5', '6'], id=None)}


In [None]:
# Keep only the "train" split. From here on `dataset` is that single split rather than the
# split mapping, so this cell is not meant to be re-run without reloading the data.
dataset = dataset["train"]

In [None]:
# Hold out 20% of the labelled data for validation with a fixed seed.
# Stratifying on the ClassLabel column keeps the share of every height the same in both
# splits, which matters because the classes are imbalanced.
splits = dataset.train_test_split(test_size=0.2, seed=42, stratify_by_column="stable_height")
train_ds = splits['train']
val_ds = splits['test']

In [None]:
# Sanity check: display the feature type of the label column after the split.
train_ds.features["stable_height"]

ClassLabel(names=['1', '2', '3', '4', '5', '6'], id=None)

In [None]:
# Class index (0..5) -> stable height (1..6), and the inverse mapping.
id2label = dict(enumerate(range(1, 7)))
label2id = {height: index for index, height in id2label.items()}
id2label

{0: 1, 1: 2, 2: 3, 3: 4, 4: 5, 5: 6}

In [None]:
# Channel-wise normalisation statistics shared by the training and validation pipelines.
NORM_MEAN = [0.485, 0.456, 0.406]
NORM_STD = [0.229, 0.224, 0.225]

# Training pipeline: light augmentation (horizontal flip, colour jitter), then scaling to
# float32 in [0, 1] and normalisation. The steps are v2 transforms, so they are chained
# with v2.Compose rather than the legacy Compose.
train_transform = v2.Compose([
    v2.ToImage(),
    v2.RandomHorizontalFlip(p=0.5),
    v2.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    v2.ToDtype(torch.float32, scale=True),
    v2.Normalize(mean=NORM_MEAN, std=NORM_STD)])

# Validation pipeline: identical scaling and normalisation, without any random augmentation.
validation_transform = v2.Compose([
    v2.ToImage(),
    v2.ToDtype(torch.float32, scale=True),
    v2.Normalize(mean=NORM_MEAN, std=NORM_STD)])


In [None]:
def train_transforms(images):
    """Add augmented ``pixel_values`` to a batch of examples.

    Each entry of ``images['image']`` is converted to RGB and passed through
    ``train_transform``; the results are stored under ``'pixel_values'`` and the
    same batch dict is returned.
    """
    augmented = []
    for picture in images['image']:
        augmented.append(train_transform(picture.convert("RGB")))
    images['pixel_values'] = augmented
    return images

def val_transforms(images):
    """Add deterministic ``pixel_values`` to a batch of examples.

    Each entry of ``images['image']`` is converted to RGB and passed through
    ``validation_transform``; the results are stored under ``'pixel_values'`` and
    the same batch dict is returned.
    """
    rgb_pictures = (picture.convert("RGB") for picture in images['image'])
    images['pixel_values'] = list(map(validation_transform, rgb_pictures))
    return images

In [None]:
# Attach the augmenting pipeline to the training split and the deterministic one to validation.
train_ds.set_transform(train_transforms)
val_ds.set_transform(val_transforms)

In [None]:
# Load the pretrained checkpoint as an image classifier over the stable-height labels.
model = AutoModelForImageClassification.from_pretrained(
    model_checkpoint,
    label2id=label2id,
    id2label=id2label,
    # Derive the head size from the label mapping instead of repeating the literal 6.
    num_labels=len(id2label),
    # Allow loading even when the checkpoint's head shape differs from the requested one.
    ignore_mismatched_sizes=True,
)

In [None]:
# Fine-tuning configuration: evaluate and checkpoint once per epoch, then reload the
# checkpoint with the best validation accuracy when training finishes.
training_args = TrainingArguments(
    output_dir='./results',
    # `evaluation_strategy` is the deprecated spelling of this argument and is rejected by
    # recent transformers releases; `eval_strategy` is the current name.
    eval_strategy="epoch",
    save_strategy="epoch",
    # Keep only the most recent checkpoints (the best one is always retained when
    # load_best_model_at_end is set) so per-epoch checkpoints do not fill the disk.
    save_total_limit=2,
    learning_rate=5e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCH,
    weight_decay=0.01,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # The dataset transforms and collate_fn need the raw `image` / `stable_height` columns.
    remove_unused_columns=False,
    warmup_ratio=0.1,
    report_to="none",
)



In [None]:
# Accuracy metric object reused by every evaluation pass.
metric = load("accuracy")

def compute_metrics(eval_pred):
    """Return the accuracy of the arg-max predictions in ``eval_pred``.

    ``eval_pred.predictions`` holds the per-class scores and ``eval_pred.label_ids``
    the reference class indices; the result is whatever ``metric.compute`` returns.
    """
    scores = eval_pred.predictions
    references = eval_pred.label_ids
    predicted_ids = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_ids, references=references)

def collate_fn(examples):
    """Assemble a list of transformed examples into one model-ready batch.

    Args:
        examples: iterable of dicts with a ``pixel_values`` tensor and a
            ``stable_height`` class index.

    Returns:
        dict with ``pixel_values`` (all image tensors stacked along a new leading
        dimension) and ``labels`` (a tensor of the class indices).
    """
    image_tensors = []
    heights = []
    for item in examples:
        image_tensors.append(item["pixel_values"])
        heights.append(item["stable_height"])
    return {
        "pixel_values": torch.stack(image_tensors),
        "labels": torch.tensor(heights),
    }

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
from transformers import BeitImageProcessor
# Image processor that ships with the checkpoint; handed to the Trainer and used for test-set inference below.
image_processor  = AutoImageProcessor.from_pretrained(model_checkpoint)

preprocessor_config.json:   0%|          | 0.00/436 [00:00<?, ?B/s]

In [None]:
# Wire the model, training arguments, splits, metric and collator into a Trainer.
trainer = Trainer(
    model,
    training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    # NOTE(review): the image processor is passed through the `tokenizer` argument; newer
    # transformers releases may expect `processing_class` instead — confirm against the installed version.
    tokenizer=image_processor,
    compute_metrics=compute_metrics,
    # Builds {"pixel_values", "labels"} batches from the transformed examples.
    data_collator=collate_fn,
)

In [None]:
# Run fine-tuning; `train_results.metrics` carries the summary logged and saved below.
train_results = trainer.train()

# Persist the trained model to the Drive folder, then log and save the training metrics.
# NOTE(review): this path is used even when read_from_drive is False — confirm Drive is mounted.
trainer.save_model("/content/drive/MyDrive")
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)

Epoch,Training Loss,Validation Loss,Accuracy
1,1.398,1.346626,0.413411
2,1.3577,1.445327,0.358073
3,1.2228,1.273948,0.472656
4,1.2478,1.17562,0.515625
5,1.0518,1.130393,0.542318
6,0.9438,1.125372,0.557943
7,0.7718,1.175479,0.563151
8,0.7198,1.119183,0.582682
9,0.6271,1.207124,0.556641
10,0.5422,1.254647,0.572917


***** train metrics *****
  epoch                    =         20.0
  total_flos               = 8949857480GF
  train_loss               =       0.5651
  train_runtime            =   0:42:40.95
  train_samples_per_second =       47.982
  train_steps_per_second   =        1.499


In [None]:
# Evaluate the current model on the validation split given to the Trainer.
metrics = trainer.evaluate()

# Log and save the evaluation metrics under the "eval" prefix.
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

***** eval metrics *****
  epoch                   =       20.0
  eval_accuracy           =     0.6146
  eval_loss               =     3.6157
  eval_runtime            = 0:00:11.39
  eval_samples_per_second =    134.751
  eval_steps_per_second   =      4.211


### Evaluation on validation set

In [None]:
# Get predictions for the validation dataset
predictions = trainer.predict(val_ds)

# Class indices are 0-based while stable heights are 1-based, hence the +1.
predicted_labels = np.argmax(predictions.predictions, axis=1) + 1

# Recover every image id from its file name (e.g. ".../123.jpg" -> 123), in the same
# order in which the predictions were produced.
id_list = [
    int(os.path.splitext(os.path.basename(instance['image'].filename))[0])
    for instance in val_ds
]
validation_df = pd.DataFrame({'id': id_list, 'predicted_label': predicted_labels})

# Join the predictions with the per-image metadata for the breakdowns computed later.
metadata_df = pd.read_csv('/content/drive/MyDrive/COMP90086_2024_Project_train/train.csv')
merged_df = pd.merge(metadata_df, validation_df, on='id', how='inner')

# An inner join silently drops (or duplicates) rows whose ids do not line up, which would
# bias every metric computed from merged_df; fail loudly instead.
if len(merged_df) != len(validation_df):
    raise ValueError(
        f"Merged {len(merged_df)} rows but expected {len(validation_df)}: "
        "validation ids do not match train.csv one-to-one"
    )

merged_df.to_csv('/content/drive/MyDrive/merged_df.csv', index=False)

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, f1_score

def grouped_score(frame, column, scorer=accuracy_score):
    """Score predictions separately for every distinct value of ``column``.

    Args:
        frame: DataFrame with ``stable_height`` (truth) and ``predicted_label`` columns.
        column: name of the metadata column to group by.
        scorer: callable ``scorer(y_true, y_pred)``; defaults to ``accuracy_score``.

    Returns:
        dict mapping each group value to its score.
    """
    # Iterating the groups directly avoids groupby.apply, which warns when the applied
    # function also receives the grouping column.
    return {
        group_value: scorer(group['stable_height'], group['predicted_label'])
        for group_value, group in frame.groupby(column)
    }

# Compute overall performance accuracy / classification_report.
# The results are bound to new names so the imported `classification_report` function is
# not overwritten by its own output.
overall_accuracy = accuracy_score(merged_df['stable_height'], merged_df['predicted_label'])
report_text = classification_report(merged_df['stable_height'], merged_df['predicted_label'])

print(f"Overall accuracy: {overall_accuracy}")
print(f"Classification report:\n{report_text}")

# Compare the accuracy across different camera angle / Only cube versus different shapes / instability type / easy or hard
angle_accuracy = grouped_score(merged_df, 'cam_angle')
shape_accuracy = grouped_score(merged_df, 'shapeset')
instability_accuracy = grouped_score(merged_df, 'instability_type')
type_accuracy = grouped_score(merged_df, 'type')

# Macro-F1 per group is available through the same helper, e.g.
#   grouped_score(merged_df, 'cam_angle', lambda y, p: f1_score(y, p, average="macro"))

Overall accuracy: 0.6145833333333334
Classification report:
              precision    recall  f1-score   support

           1       0.72      0.61      0.66       373
           2       0.69      0.70      0.70       380
           3       0.61      0.69      0.65       303
           4       0.54      0.56      0.55       233
           5       0.49      0.46      0.48       169
           6       0.33      0.37      0.35        78

    accuracy                           0.61      1536
   macro avg       0.56      0.57      0.56      1536
weighted avg       0.62      0.61      0.62      1536



  angle_accuracy = merged_df.groupby('cam_angle').apply(lambda x: accuracy_score(x['stable_height'], x['predicted_label'])).to_dict()
  shape_accuracy = merged_df.groupby('shapeset').apply(lambda x: accuracy_score(x['stable_height'], x['predicted_label'])).to_dict()
  instability_accuracy = merged_df.groupby('instability_type').apply(lambda x: accuracy_score(x['stable_height'], x['predicted_label'])).to_dict()
  type_accuracy = merged_df.groupby('type').apply(lambda x: accuracy_score(x['stable_height'], x['predicted_label'])).to_dict()
  angle_f1 = merged_df.groupby('cam_angle').apply(lambda x: f1_score(x['stable_height'], x['predicted_label'], average="macro")).to_dict()
  shape_f1 = merged_df.groupby('shapeset').apply(lambda x: f1_score(x['stable_height'], x['predicted_label'], average="macro")).to_dict()
  instability_f1 = merged_df.groupby('instability_type').apply(lambda x: f1_score(x['stable_height'], x['predicted_label'], average="macro")).to_dict()
  type_f1 = merged_df.groupby

### Predict for test set

In [None]:
import os
import re
from PIL import Image
import matplotlib.pyplot as plt


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def get_number_from_filename(filename):
  """Return the integer id encoded in a file name such as ``"123.jpg"``."""
  return int(filename.split(".")[0])

# Keep only entries whose stem is numeric so stray files in the folder cannot break the
# numeric sort, then order the images by id.
file_names = [name for name in os.listdir(test_path) if name.split(".")[0].isdecimal()]
file_names.sort(key=get_number_from_filename)

# Make sure the model sits on the inference device and runs in evaluation mode.
model.to(device)
model.eval()

predicted_labels = []
# Iterate through the sorted file names and predict one image at a time
for file_name in file_names:
  file_path = os.path.join(test_path, file_name)

  # Preprocess exactly as during validation: the model was trained and evaluated on
  # tensors from train_transform / validation_transform, whereas the checkpoint's image
  # processor may apply its own resizing or cropping.
  with Image.open(file_path) as img:  # the context manager closes the file handle
    pixel_values = validation_transform(img.convert("RGB")).unsqueeze(0).to(device)

  with torch.no_grad():
      logits = model(pixel_values=pixel_values).logits

  # Map the arg-max class index back to its stable-height label.
  predicted_class_idx = logits.argmax(-1).item()
  predicted_labels.append(model.config.id2label[predicted_class_idx])


In [None]:
import shutil
import pandas as pd

# Write the predictions to a new CSV next to the original test metadata; test.csv itself
# is only read, so no intermediate copy is needed.
original_csv_path = '/content/drive/MyDrive/COMP90086_2024_Project_test/test.csv'
new_csv_path = '/content/drive/MyDrive/COMP90086_2024_Project_test/test_predict_DINO.csv'

df = pd.read_csv(original_csv_path)

# predicted_labels follows the numerically sorted file names, not the row order of the CSV.
labels_by_id = dict(zip((get_number_from_filename(name) for name in file_names), predicted_labels))

if 'id' in df.columns and df['id'].isin(list(labels_by_id)).all():
  # Attach each prediction to the row with the matching image id, independent of row order.
  df['stable_height'] = df['id'].map(labels_by_id)
else:
  # No usable id column: fall back to positional assignment, which requires equal lengths.
  if len(df) != len(predicted_labels):
    raise ValueError(
        f"test.csv has {len(df)} rows but {len(predicted_labels)} predictions were produced"
    )
  df['stable_height'] = predicted_labels

df.to_csv(new_csv_path, index=False)
