[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/17GWLNju7k-MaqNGCQVvA_m4g_d0_v35D?usp=sharing)


In [None]:
import os
# Restrict CUDA to device index 1 and force synchronous kernel launches
# (a debugging aid). Both must be set before CUDA is first initialised.
os.environ.update({
    'CUDA_VISIBLE_DEVICES': '1',
    'CUDA_LAUNCH_BLOCKING': '1',
})

In [None]:
import torch
# Use the first visible GPU when CUDA is available; otherwise fall back to a
# real CPU device (not None) so every later `.to(device)` call stays valid.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Show which accelerator is in use. On a CPU-only machine this displays a
# label instead of raising, so the notebook still survives "Run All".
gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu (no CUDA device available)"
gpu_name

'Quadro RTX 6000'

In [None]:
# Print the driver/CUDA versions and the current memory use and utilisation of each GPU.
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Sat May 27 23:26:40 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.57.02    Driver Version: 516.93       CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:0B:00.0 Off |                  N/A |
| 35%   56C    P0   112W / 350W |     14MiB / 12288MiB |      0%      Default |
|                               |     

In [None]:
# Create the working directory (-p: no error if it already exists, so the cell
# can be re-run) and switch into it. The relative paths used later
# (./checkpoints, ./model, ...) resolve inside this directory.
!mkdir -p image2music
%cd image2music

/home/revolt/image2music_min/image2music


In [None]:
from datasets import load_dataset

# Fetch the paired image / ABC-notation dataset from the Hugging Face Hub
# (served from the local cache on later runs).
data = load_dataset(path='AnyaSchen/image2music_abc')

Found cached dataset parquet (/home/revolt/.cache/huggingface/datasets/AnyaSchen___parquet/AnyaSchen--image2music_abc-784eee9f15716c2e/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|█████████████████████████████████████████████████| 1/1 [00:00<00:00, 398.28it/s]


In [None]:
from PIL import Image
from torch.utils.data import Dataset
from transformers import ViTFeatureExtractor, AutoTokenizer

class ImageMusicDataset(Dataset):
    """Turn (image, ABC-notation tune) rows into tensors for seq2seq training.

    Args:
        dataset: Indexable collection of rows; each row provides ``'image'``
            (an object with a ``convert`` method, e.g. a PIL image) and
            ``'music'`` (the tune as a string).
        vit_feature_extractor: Callable image processor returning a mapping
            with a ``"pixel_values"`` tensor that has a leading batch axis.
        tokenizer: Callable tokenizer returning ``"input_ids"`` and
            ``"attention_mask"`` tensors that have a leading batch axis.
        max_length: Length every tokenized tune is padded/truncated to.
            Defaults to 450.
    """

    # Label value skipped by the cross-entropy loss (its default ``ignore_index``).
    IGNORE_INDEX = -100

    def __init__(self, dataset, vit_feature_extractor, tokenizer, max_length=450):
        self.dataset = dataset
        self.vit_feature_extractor = vit_feature_extractor
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the tensors for row ``idx``.

        Returns:
            dict with ``pixel_values`` (preprocessed image), ``input_ids`` and
            ``attention_mask`` (tokenized tune, length ``max_length``) and
            ``labels`` (a copy of ``input_ids`` whose padded positions are set
            to ``IGNORE_INDEX`` so they do not contribute to the loss).
        """
        # Fetch the row once: every lookup may decode the image again.
        row = self.dataset[idx]

        # Load and preprocess the image
        image = row['image'].convert("RGB")
        inputs = self.vit_feature_extractor(images=image, return_tensors="pt")
        pixel_values = inputs["pixel_values"].squeeze(0)

        # Wrap the ABC notation text in explicit start/end markers
        text = f"<bos>{row['music']}<eos>"

        # Tokenize the ABC notation text to a fixed length
        tokens = self.tokenizer(
            text,
            return_tensors="pt",
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
        )
        input_ids = tokens["input_ids"].squeeze(0)
        attention_mask = tokens["attention_mask"].squeeze(0)

        # Mask the padding in the labels; otherwise the loss is dominated by
        # the (trivial) prediction of pad tokens.
        labels = input_ids.clone()
        labels[attention_mask == 0] = self.IGNORE_INDEX

        return {
            "pixel_values": pixel_values,
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            'labels': labels
        }

# Image preprocessor matching the pretrained ViT encoder checkpoint
vit_feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")

# Tokenizer of the text-to-music checkpoint, extended with the special tokens
# (including the <bos>/<eos> markers that ImageMusicDataset wraps around each tune)
SPECIAL_TOKENS = {'bos_token':'<bos>','eos_token' :'<eos>', 'pad_token':'<pad>', 'sep_token': '<sep>'}
tokenizer = AutoTokenizer.from_pretrained('sander-wood/text-to-music')
tokenizer.add_special_tokens(SPECIAL_TOKENS)

# Wrap the training split so that indexing yields model-ready tensors
dataset = ImageMusicDataset(
    dataset=data['train'],
    vit_feature_extractor=vit_feature_extractor,
    tokenizer=tokenizer,
)

# Sanity check: print the tensor shapes of the first example
# (processed image, token ids, attention mask)
sample = dataset[0]
for field in ("pixel_values", "input_ids", "attention_mask"):
    print(sample[field].shape)



torch.Size([3, 224, 224])
torch.Size([450])
torch.Size([450])


In [None]:
import torch
from torch.utils.data import DataLoader, random_split
from transformers import VisionEncoderDecoderModel, VisionEncoderDecoderConfig, TrainingArguments, Trainer

# Define the encoder and decoder checkpoints
encoder_name = 'google/vit-base-patch16-224-in21k'
decoder_name = 'sander-wood/text-to-music'

# Build a VisionEncoderDecoderModel from the two pretrained checkpoints
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    encoder_name, decoder_name)

model.to(device)
# Special tokens were added to the tokenizer, so the decoder embedding matrix
# must be resized to the new vocabulary size.
model.decoder.resize_token_embeddings(len(tokenizer))
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# Split the dataset into train and validation sets (95-5 split).
# The seeded generator makes the split identical on every run.
split_seed = 42
train_size = int(0.95 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(split_seed),
)

# Create data loaders.
# NOTE: these are not passed to the Trainer below (it receives the datasets
# directly); they are kept for manual inspection of batches.
batch_size = 20
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./checkpoints",
    overwrite_output_dir=True,
    num_train_epochs=150,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps = 10,
    evaluation_strategy="epoch",
    # Log the training loss once per epoch as well, so the results table
    # shows a value next to each validation loss instead of "No log".
    logging_strategy="epoch",
    logging_dir="./image_music_logs",
    save_steps = 1000,
    learning_rate=3e-5,
    weight_decay=0.01,
    # fp16=True,  # Use mixed precision training if possible (requires an NVIDIA GPU with Tensor Cores)
)

# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
trainer.train()

# Save the fine-tuned model together with its tokenizer and image processor
model.save_pretrained('./model')
tokenizer.save_pretrained('./tokenizer')
vit_feature_extractor.save_pretrained('./vit_feature_extractor')

Some weights of the model checkpoint at sander-wood/text-to-music were not used when initializing BartForCausalLM: ['model.encoder.layers.1.fc1.weight', 'model.encoder.layers.5.self_attn_layer_norm.weight', 'model.encoder.layers.4.self_attn.k_proj.bias', 'model.encoder.layers.0.fc2.bias', 'model.encoder.layers.3.final_layer_norm.weight', 'model.encoder.layers.1.self_attn.v_proj.weight', 'model.encoder.layernorm_embedding.bias', 'model.encoder.layers.3.final_layer_norm.bias', 'model.encoder.layers.5.fc1.bias', 'model.encoder.layers.2.final_layer_norm.bias', 'model.encoder.layers.5.fc2.bias', 'model.encoder.layers.4.self_attn.out_proj.bias', 'model.encoder.layers.2.self_attn.q_proj.bias', 'model.encoder.layers.2.fc2.bias', 'model.encoder.layers.4.final_layer_norm.weight', 'model.encoder.layers.0.self_attn_layer_norm.bias', 'model.encoder.layers.3.self_attn.q_proj.weight', 'model.encoder.layernorm_embedding.weight', 'model.encoder.layers.2.self_attn.q_proj.weight', 'model.encoder.layers.4

Epoch,Training Loss,Validation Loss
0,No log,4.246109
1,No log,3.143557
2,No log,2.71325
3,No log,2.46816
5,No log,2.329846
5,No log,2.240067
6,No log,2.135495
7,No log,2.048815
8,No log,1.970275
10,No log,1.896106


['./vit_feature_extractor/preprocessor_config.json']

# Upload to Hugging Face

In [None]:
%pip install huggingface_hub

# Read the access token at run time (HF_TOKEN environment variable, otherwise
# an interactive prompt) so the cell works on a fresh kernel and the secret is
# never stored in the notebook.
import os
from getpass import getpass

auth_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
!huggingface-cli login --token {auth_token}

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid.
Your token has been saved to /home/revolt/.cache/huggingface/token
Login successful


In [None]:
# Upload the artifacts produced by the training cell to the Hub repository.
# The trained objects are named `model` and `vit_feature_extractor` there.
path = 'AnyaSchen/image2music'
model.push_to_hub(path)
tokenizer.push_to_hub(path)
vit_feature_extractor.push_to_hub(path)

# Generation

In [None]:
import torch

In [None]:
# Prefer the GPU when present; otherwise run generation on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
from PIL import Image
import requests
from transformers import AutoTokenizer, CLIPProcessor, VisionEncoderDecoderModel, ViTImageProcessor

def generate_music(fine_tuned_model, image, tokenizer, image_processor=None):
    """Generate an ABC-notation tune for an image.

    Args:
        fine_tuned_model: Model exposing ``generate`` and ``device`` (here the
            fine-tuned VisionEncoderDecoderModel).
        image: Input picture. Objects with a ``convert`` method (e.g. PIL
            images) are converted to RGB first, matching the training-time
            preprocessing; anything else is passed to the processor unchanged.
        tokenizer: Tokenizer providing ``pad_token_id``, ``eos_token_id`` and
            ``decode``.
        image_processor: Callable turning the image into ``pixel_values``.
            Defaults to the notebook-level ``feature_extractor``.

    Returns:
        str: The first generated sequence, decoded with special tokens removed.
    """
    if image_processor is None:
        # Backward-compatible default: the processor loaded at notebook level.
        image_processor = feature_extractor

    # Grayscale / RGBA / palette inputs would otherwise not have 3 channels.
    if hasattr(image, "convert"):
        image = image.convert("RGB")

    # Preprocess the image and place it on the same device as the model
    pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(fine_tuned_model.device)

    # Generate the tune with the fine-tuned VisionEncoderDecoder model
    # (beam search combined with nucleus sampling)
    generated_tokens = fine_tuned_model.generate(
        pixel_values,
        max_length=300,
        num_beams=5,
        top_p=0.9,
        temperature=2.0,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Decode the generated tokens back into ABC notation text
    generated_music = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
    return generated_music


# Fetch the published model, image processor and tokenizer from the Hub

path = 'AnyaSchen/image2music'
fine_tuned_model = VisionEncoderDecoderModel.from_pretrained(path)
fine_tuned_model = fine_tuned_model.to(device)
feature_extractor = ViTImageProcessor.from_pretrained(path)
tokenizer = AutoTokenizer.from_pretrained(path)

In [None]:
# Download a test picture and print the tune generated for it
url = 'https://anandaindia.org/wp-content/uploads/2018/12/happy-man.jpg'
response = requests.get(url, stream=True)
image = Image.open(response.raw)

generated_music = generate_music(fine_tuned_model, image, tokenizer)
print(generated_music)

X:1
L:1/4
M:2/2
K:F
V:1 treble nm="Piano" snmata!PMm="Pno""C7" c3/2 c/ c2- | c C D E |"F" F F F/E/F/G/ |
"Dm" A2"G7" G2 |"Gm7" d d d/c/d/e/ | d2 G F |"C" E C"Cm6" D _E |"Bb" D F G _A |"D7" A c c/=B/ c |
 c c _A c |"Eb7" _B _e e d |"Ab" _d2 c2 | c c d e | f f e/d/"Db7"_d/=d/ |"Gb" c c c B | c2 G"F6" A |
 A c"C+7" ^c"D9" d | e e"Ab6" f e |"Db9" e d d c | d3 c/B/ | A A _A"C9" G | B B A G | c4 | c3 z |]
