In [1]:
import os

import torch
import numpy as np
from IPython.utils import io
import random
from evaluate import load
from torchvision import transforms
from tqdm import tqdm
from undecorated import undecorated
from types import MethodType
from torch.utils.data import DataLoader
from torchmetrics.multimodal import CLIPScore
from peft import PeftModel, PeftModelForCausalLM, prepare_model_for_int8_training, LoraConfig, get_peft_model
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel, AutoFeatureExtractor, AutoModelForSeq2SeqLM

from generate_model_seq import SeqGen
from general_dataset import GeneralDataset
from agi_utils import *
from combine_model_seq import SeqCombine

  from .autonotebook import tqdm as notebook_tqdm



Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /common/home/yg334/anaconda3/lib/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 111
CUDA SETUP: Loading binary /common/home/yg334/anaconda3/envs/vicuna/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda111.so...


In [2]:
"""
assign openagi data path 
"""
data_path = "YOUR_DATA_PATH"

task_discriptions = txt_loader("./task_description.txt")
training_task_idx = [7,20,30,40,50,60]
training_dataloaders = []
for i in tqdm(training_task_idx):
    dataset = GeneralDataset(i,data_path)
    dataloader = DataLoader(dataset, batch_size=5)
    training_dataloaders.append(dataloader)

test_task_idx = [2,3,10,15,20,35,45,55,65,70,70,90,106,107]
test_dataloaders = []
for i in tqdm(test_task_idx):
    dataset = GeneralDataset(i, data_path)
    dataloader = DataLoader(dataset, batch_size=5)
    test_dataloaders.append(dataloader)
    
test_tasks = [task_discriptions[i].strip() for i in test_task_idx]
training_tasks = [task_discriptions[i].strip() for i in training_task_idx]

100%|█████████████████████████████████████████████| 6/6 [00:01<00:00,  3.40it/s]
100%|███████████████████████████████████████████| 14/14 [00:03<00:00,  3.75it/s]


In [3]:
base_model = "google/flan-t5-large"
load_8bit = True

max_memory_mapping = {
    0: "24GB",
    1: "24GB",
    2: "0GB",
    3: "0GB",
    4: "0GB",
    5: "0GB",
    6: "0GB",
    7: "0GB",
}


tokenizer = AutoTokenizer.from_pretrained(
    base_model,
)
tokenizer.add_special_tokens({'pad_token': '<pad>'})

model = AutoModelForSeq2SeqLM.from_pretrained(
    base_model,
    device_map="auto",
    max_memory=max_memory_mapping,
)

generate_with_grad = undecorated(model.generate)
model.generate_with_grad = MethodType(generate_with_grad, model)


lora_weights = "YOUR_LORA_WEIGHTS"

model = PeftModelForCausalLM.from_pretrained(
                                            model,
                                            lora_weights,
                                            torch_dtype=torch.float16,
                                            is_trainable=True
                                            )


model.print_trainable_parameters()


seqGen = SeqGen(model, tokenizer)

trainable params: 2359296 || all params: 785509376 || trainable%: 0.30035236651331837


In [4]:
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--seed", type=int, default=42)
# parser.add_argument("--model_name", type=str, default="google/flan-t5-large")
parser.add_argument("--epochs", type=int, default=10)
parser.add_argument("--batch_size", type=int, default=5)
parser.add_argument("--num_seq", type=int, default=1)
parser.add_argument("--learning_rate", type=float, default=1e-5)
parser.add_argument("--epsilon", type=float, default=0.2)
parser.add_argument("--decay_rate", type=float, default=0.9)
parser.add_argument("--weight_decay", type=float, default=1e-6)
parser.add_argument("--accumulate_steps", type=int, default=1)
parser.add_argument("--warm_up_proportion", type=float, default=0.1)

args = parser.parse_args("")


# output["scores"]
import openai
# prompt_length = len(input_text)
openai.api_key = "YOUR_OPENAPI_KEY"

def generate_module_list_with_gpt(generated_module_seq):
    todo_prompt = "You are a key phrase extractor who is able to extract potential module names from the given context. You have already known all the module names in the full module list. The full module list is: [Image Classification, Colorization, Object Detection, Image Deblurring, Image Denoising, Image Super Resolution, Image Captioning, Text to Image Generation, Visual Question Answering, Sentiment Analysis, Question Answering, Text Summarization, Machine Translation]. Given the following context: '{}'. Please extract a module sequence from this context and remove module names which do not exist in the full module list from this sequence. Output the module sequence after filtering as the format of 'module: module1, module: module2, module: module3, etc...'. "
    prompt = todo_prompt.format(generated_module_seq)

    completion = openai.ChatCompletion.create(
      model="gpt-4",
      messages=[
        {"role": "user", "content": prompt}
      ]
    )

    content = completion.choices[0].message["content"]
    
    # print(content)
    
    content = content.split("module: ")[1:]
    
    result = ""
    for c in content:
        result += c
    
    # result = result[:-1] if len(result) > 0 else result
    
    return result

# generated_module_list = generate_module_list_with_gpt(response[prompt_length:])
# print(generated_module_list)

In [5]:
"""
Loading Evaluation Metrics
"""
from sentence_transformers import SentenceTransformer, util
sentence_model = SentenceTransformer('all-MiniLM-L6-v2', device="cpu")


clip_score = CLIPScore(model_name_or_path="openai/clip-vit-base-patch16")


# Load a pre-trained Vision Transformer model and its feature extractor
vit_ckpt = "nateraw/vit-base-beans"
vit = AutoModel.from_pretrained(vit_ckpt)
vit.eval()
vit_extractor = AutoFeatureExtractor.from_pretrained(vit_ckpt)

f = transforms.ToPILImage()
bertscore = load("bertscore")

# device_list = ["cuda:1","cuda:2","cuda:3","cuda:4","cuda:5","cuda:7","cpu"]
device_list = ["cuda:4","cpu"]
seqCombination = SeqCombine(device_list)

from utils import construct_optimizer
optimizer, scheduler = construct_optimizer(args, model, args.epochs)

Some weights of the model checkpoint at nateraw/vit-base-beans were not used when initializing ViTModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing ViTModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTModel were not initialized from the model checkpoint at nateraw/vit-base-beans and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
rewards = []
clips = []
berts = []
similairies = []

module_length = 4
num_beams = 1
num_return_sequences = 1

eval_device = "cuda:3"

epsilon = args.epsilon
decay_rate = args.decay_rate
module_length = 10

for e in range(args.epochs):
    baseline = 0
    rewards = []

    for i, task_description in enumerate(tqdm(training_tasks)):
        # if i == 1:
        #     break

        print(task_description)
        optimizer.zero_grad()
        task_rewards = []
        input_s = [task_description]
        input_ids = tokenizer.batch_encode_plus(
            input_s, padding="longest", return_tensors="pt"
        )["input_ids"].to(eval_device)

        generated_module_seq, log_prob = seqGen.generate_sequence(input_ids,\
                                                                   module_length=4, \
                                                                   beam_size=20, \
                                                                   num_seq=20,\
                                                                   top_k=40,\
                                                                   top_p=0.75,\
                                                                   temperature=0.2,\
                                                                   num_beam_groups=1,\
                                                                   max_length=50)
        
    
        if random.random() >= epsilon:
            action = torch.argmax(torch.stack(log_prob).detach())
        else:
            action = torch.distributions.Categorical(torch.stack(log_prob).detach()).sample()

        # decrease epsilon by the decay rate after each step
        epsilon *= decay_rate

    
        vicuna_steps = generated_module_seq[action].strip()[:-1].split(",")
        module_list = match_module_seq(vicuna_steps, sentence_model)
        # print(output_sequence)
        print(module_list)
        


        # if len(module_list) >= 1 and module_seq_filter(module_list, test_task_idx[i]):
        if len(module_list) >= 1 and whole_module_seq_filter(module_list, test_task_idx[i]):
            seqCombination.construct_module_seq(module_list)

            for idx, batch in enumerate(test_dataloaders[i]):
                inputs = list(batch['input'][0])
                # print("Inputs: ", inputs)
                try:
                    predictions = seqCombination.run_module_seq(inputs)
                except:
                    ave_task_reward = 0
                    break

                if 0 <= test_task_idx[i] <= 14:
                    outputs = list(batch['output'][0])
                    dist = image_similarity(predictions, outputs, vit, vit_extractor)
                    task_rewards.append(dist / 100)
                elif 15 <= test_task_idx[i] <= 104 or 107 <= test_task_idx[i]:
                    outputs = list(batch['output'][0])
                    f1 = np.mean(txt_eval(predictions, outputs, bertscore, device=eval_device))

                    task_rewards.append(f1)
                else:
                    score = clip_score(predictions, inputs)
                    task_rewards.append(score.detach()/100)

            ave_task_reward = np.mean(task_rewards)  
            rewards.append(ave_task_reward)
            seqCombination.close_module_seq()

        else:
            rewards.append(-1)     

        # if 0 <= test_task_idx[i] <= 14:
        #     similairies.append(ave_task_reward)
        # elif 15 <= test_task_idx[i] <= 104 or 107 <= test_task_idx[i]:
        #     berts.append(ave_task_reward)
        # else:
        #     clips.append(ave_task_reward)

           
    

        avg_reward = np.mean(rewards)
        # print("Average reward: " + str(avg_reward))
        loss = -log_prob[action] * (avg_reward - baseline)
        print("Loss: "+ str(loss.item()))
        loss.backward()
        optimizer.step()
        scheduler.step()
        # baseline = avg_reward

print("Finished training!")    

  0%|                                                     | 0/6 [00:00<?, ?it/s]

Given grayscale image, how to return the regular image step by step?
Object Detection
Loss: -0.749862551689148


 17%|███████▌                                     | 1/6 [00:12<01:02, 12.55s/it]

Given low-resolutioned noisy blurry grayscale image, how to return the object names in English step by step?
Object Detection, Text to Image Generation, Colorization



  0%|                                                    | 0/50 [00:00<?, ?it/s][A
  2%|▉                                           | 1/50 [00:02<02:09,  2.65s/it][A
  4%|█▊                                          | 2/50 [00:03<01:29,  1.86s/it][A
  6%|██▋                                         | 3/50 [00:05<01:16,  1.62s/it][A
  8%|███▌                                        | 4/50 [00:06<01:08,  1.50s/it][A
 10%|████▍                                       | 5/50 [00:07<01:04,  1.44s/it][A
 12%|█████▎                                      | 6/50 [00:09<01:01,  1.40s/it][A
 14%|██████▏                                     | 7/50 [00:10<00:59,  1.37s/it][A
 16%|███████                                     | 8/50 [00:11<00:57,  1.36s/it][A
 18%|███████▉                                    | 9/50 [00:13<00:55,  1.35s/it][A
 20%|████████▌                                  | 10/50 [00:14<00:53,  1.34s/it][A
 22%|█████████▍                                 | 11/50 [00:15<00:52,  1.34

Loss: -0.003980204463005066


 33%|██████████████▋                             | 2/6 [24:01<56:22, 845.52s/it]

Given low-resolutioned blurry grayscale image, how to return the caption in English step by step?
Object Detection
Loss: -0.13277265429496765


 50%|██████████████████████                      | 3/6 [24:04<23:03, 461.05s/it]

Given low-resolutioned noisy grayscale image, how to return the class label in German step by step?
Object Detection




Loss: -0.009254304692149162


 67%|█████████████████████████████▎              | 4/6 [24:34<09:41, 290.69s/it]

Given noisy grayscale image, how to return the object names in English step by step?
Object Detection




Loss: 7.045271195238456e-05


 83%|████████████████████████████████████▋       | 5/6 [24:51<03:12, 192.25s/it]

Given grayscale image, how to return the caption in English step by step?
Object Detection




Loss: 0.016524262726306915


100%|████████████████████████████████████████████| 6/6 [25:08<00:00, 251.49s/it]
  0%|                                                     | 0/6 [00:00<?, ?it/s]

Given grayscale image, how to return the regular image step by step?
Machine Translation
Loss: -7.511727333068848


 17%|███████▌                                     | 1/6 [00:03<00:19,  3.83s/it]

Given low-resolutioned noisy blurry grayscale image, how to return the object names in English step by step?
Object Detection
Loss: -0.07251397520303726


 33%|███████████████                              | 2/6 [00:06<00:13,  3.38s/it]

Given low-resolutioned blurry grayscale image, how to return the caption in English step by step?
Object Detection
Loss: -0.4005163609981537


 50%|██████████████████████▌                      | 3/6 [00:10<00:10,  3.49s/it]

Given low-resolutioned noisy grayscale image, how to return the class label in German step by step?
Object Detection




Loss: -0.03787778690457344


 67%|██████████████████████████████               | 4/6 [00:26<00:16,  8.47s/it]

Given noisy grayscale image, how to return the object names in English step by step?
Object Detection




Loss: -0.029240602627396584


 83%|█████████████████████████████████████▌       | 5/6 [00:43<00:11, 11.34s/it]

Given grayscale image, how to return the caption in English step by step?
Object Detection




Loss: -0.03953194618225098


100%|█████████████████████████████████████████████| 6/6 [00:59<00:00,  9.85s/it]
  0%|                                                     | 0/6 [00:00<?, ?it/s]

Given grayscale image, how to return the regular image step by step?
Object Detection
Loss: -0.7586144804954529


 17%|███████▌                                     | 1/6 [00:03<00:19,  3.90s/it]

Given low-resolutioned noisy blurry grayscale image, how to return the object names in English step by step?
Object Detection
Loss: -0.0729927122592926


 33%|███████████████                              | 2/6 [00:06<00:12,  3.19s/it]

Given low-resolutioned blurry grayscale image, how to return the caption in English step by step?
Object Detection
Loss: -0.403041273355484


 50%|██████████████████████▌                      | 3/6 [00:09<00:08,  2.87s/it]

Given low-resolutioned noisy grayscale image, how to return the class label in German step by step?
Object Detection




Loss: -0.03801935538649559


 67%|██████████████████████████████               | 4/6 [00:24<00:15,  7.63s/it]

Given noisy grayscale image, how to return the object names in English step by step?
Object Detection




Loss: -0.02938924916088581


 83%|█████████████████████████████████████▌       | 5/6 [00:40<00:10, 10.70s/it]

Given grayscale image, how to return the caption in English step by step?
Object Detection




Loss: -0.03974370285868645


100%|█████████████████████████████████████████████| 6/6 [00:58<00:00,  9.68s/it]
  0%|                                                     | 0/6 [00:00<?, ?it/s]

Given grayscale image, how to return the regular image step by step?
Object Detection
Loss: -0.76043701171875


 17%|███████▌                                     | 1/6 [00:05<00:25,  5.11s/it]

Given low-resolutioned noisy blurry grayscale image, how to return the object names in English step by step?
Object Detection
Loss: -0.07328777015209198


 33%|███████████████                              | 2/6 [00:07<00:14,  3.63s/it]

Given low-resolutioned blurry grayscale image, how to return the caption in English step by step?
Object Detection
Loss: -0.4041570723056793


 50%|██████████████████████▌                      | 3/6 [00:10<00:09,  3.06s/it]

Given low-resolutioned noisy grayscale image, how to return the class label in German step by step?
Object Detection




Loss: -0.03807120770215988


 67%|██████████████████████████████               | 4/6 [00:25<00:15,  7.77s/it]

Given noisy grayscale image, how to return the object names in English step by step?
Object Detection




Loss: -0.02942909300327301


 83%|█████████████████████████████████████▌       | 5/6 [00:41<00:10, 10.83s/it]

Given grayscale image, how to return the caption in English step by step?
Object Detection




Loss: -0.039777807891368866


100%|█████████████████████████████████████████████| 6/6 [00:57<00:00,  9.57s/it]
  0%|                                                     | 0/6 [00:00<?, ?it/s]

Given grayscale image, how to return the regular image step by step?
Object Detection
Loss: -0.7606476545333862


 17%|███████▌                                     | 1/6 [00:04<00:21,  4.30s/it]

Given low-resolutioned noisy blurry grayscale image, how to return the object names in English step by step?
Object Detection
Loss: -0.07330051064491272


 33%|███████████████                              | 2/6 [00:06<00:13,  3.27s/it]

Given low-resolutioned blurry grayscale image, how to return the caption in English step by step?
Object Detection
Loss: -0.4041570723056793


 50%|██████████████████████▌                      | 3/6 [00:09<00:08,  2.89s/it]

Given low-resolutioned noisy grayscale image, how to return the class label in German step by step?
Object Detection




Loss: -0.03807120770215988


 67%|██████████████████████████████               | 4/6 [00:24<00:15,  7.72s/it]

Given noisy grayscale image, how to return the object names in English step by step?
Object Detection




Loss: -0.02942909300327301


 83%|█████████████████████████████████████▌       | 5/6 [00:40<00:10, 10.86s/it]

Given grayscale image, how to return the caption in English step by step?
Object Detection




Loss: -0.039777807891368866


100%|█████████████████████████████████████████████| 6/6 [00:56<00:00,  9.43s/it]
  0%|                                                     | 0/6 [00:00<?, ?it/s]

Given grayscale image, how to return the regular image step by step?
Object Detection
Loss: -0.7606476545333862


 17%|███████▌                                     | 1/6 [00:06<00:33,  6.73s/it]

Given low-resolutioned noisy blurry grayscale image, how to return the object names in English step by step?
Object Detection
Loss: -0.07330051064491272


 33%|███████████████                              | 2/6 [00:09<00:17,  4.42s/it]

Given low-resolutioned blurry grayscale image, how to return the caption in English step by step?
Object Detection
Loss: -0.4041570723056793


 50%|██████████████████████▌                      | 3/6 [00:11<00:10,  3.52s/it]

Given low-resolutioned noisy grayscale image, how to return the class label in German step by step?
Object Detection




Loss: -0.03807120770215988


 67%|██████████████████████████████               | 4/6 [00:26<00:16,  8.05s/it]

Given noisy grayscale image, how to return the object names in English step by step?
Object Detection




Loss: -0.02942909300327301


 83%|█████████████████████████████████████▌       | 5/6 [00:43<00:11, 11.19s/it]

Given grayscale image, how to return the caption in English step by step?
Object Detection




Loss: -0.039777807891368866


100%|█████████████████████████████████████████████| 6/6 [00:59<00:00,  9.88s/it]
  0%|                                                     | 0/6 [00:00<?, ?it/s]

Given grayscale image, how to return the regular image step by step?
Object Detection
Loss: -0.7606476545333862


 17%|███████▌                                     | 1/6 [00:04<00:21,  4.30s/it]

Given low-resolutioned noisy blurry grayscale image, how to return the object names in English step by step?
Object Detection
Loss: -0.07330051064491272


 33%|███████████████                              | 2/6 [00:07<00:13,  3.39s/it]

Given low-resolutioned blurry grayscale image, how to return the caption in English step by step?
Object Detection
Loss: -0.4041570723056793


 50%|██████████████████████▌                      | 3/6 [00:10<00:10,  3.55s/it]

Given low-resolutioned noisy grayscale image, how to return the class label in German step by step?
Object Detection




Loss: -0.03807120770215988


 67%|██████████████████████████████               | 4/6 [00:25<00:15,  7.81s/it]

Given noisy grayscale image, how to return the object names in English step by step?
Object Detection




Loss: -0.02942909300327301


 83%|█████████████████████████████████████▌       | 5/6 [00:40<00:10, 10.41s/it]

Given grayscale image, how to return the caption in English step by step?
Object Detection




Loss: -0.039777807891368866


100%|█████████████████████████████████████████████| 6/6 [00:56<00:00,  9.44s/it]
  0%|                                                     | 0/6 [00:00<?, ?it/s]

Given grayscale image, how to return the regular image step by step?
Object Detection
Loss: -0.7606476545333862


 17%|███████▌                                     | 1/6 [00:04<00:20,  4.11s/it]

Given low-resolutioned noisy blurry grayscale image, how to return the object names in English step by step?
Object Detection
Loss: -0.07330051064491272


 33%|███████████████                              | 2/6 [00:07<00:13,  3.40s/it]

Given low-resolutioned blurry grayscale image, how to return the caption in English step by step?
Object Detection
Loss: -0.4041570723056793


 50%|██████████████████████▌                      | 3/6 [00:09<00:09,  3.15s/it]

Given low-resolutioned noisy grayscale image, how to return the class label in German step by step?
Object Detection




Loss: -0.03807120770215988


 67%|██████████████████████████████               | 4/6 [00:25<00:16,  8.10s/it]

Given noisy grayscale image, how to return the object names in English step by step?
Object Detection




Loss: -0.02942909300327301


 83%|█████████████████████████████████████▌       | 5/6 [00:43<00:11, 11.65s/it]

Given grayscale image, how to return the caption in English step by step?
Object Detection




Loss: -0.039777807891368866


100%|█████████████████████████████████████████████| 6/6 [00:59<00:00,  9.91s/it]
  0%|                                                     | 0/6 [00:00<?, ?it/s]

Given grayscale image, how to return the regular image step by step?
Object Detection
Loss: -0.7606476545333862


 17%|███████▌                                     | 1/6 [00:04<00:24,  4.83s/it]

Given low-resolutioned noisy blurry grayscale image, how to return the object names in English step by step?
Object Detection
Loss: -0.07330051064491272


 33%|███████████████                              | 2/6 [00:07<00:14,  3.54s/it]

Given low-resolutioned blurry grayscale image, how to return the caption in English step by step?
Object Detection
Loss: -0.4041570723056793


 50%|██████████████████████▌                      | 3/6 [00:09<00:09,  3.00s/it]

Given low-resolutioned noisy grayscale image, how to return the class label in German step by step?
Object Detection




Loss: -0.03807120770215988


 67%|██████████████████████████████               | 4/6 [00:24<00:15,  7.74s/it]

Given noisy grayscale image, how to return the object names in English step by step?
Object Detection




Loss: -0.02942909300327301


 83%|█████████████████████████████████████▌       | 5/6 [00:41<00:10, 10.95s/it]

Given grayscale image, how to return the caption in English step by step?
Object Detection




Loss: -0.039777807891368866


100%|█████████████████████████████████████████████| 6/6 [00:58<00:00,  9.79s/it]
  0%|                                                     | 0/6 [00:00<?, ?it/s]

Given grayscale image, how to return the regular image step by step?
Object Detection
Loss: -0.7606476545333862


 17%|███████▌                                     | 1/6 [00:05<00:25,  5.03s/it]

Given low-resolutioned noisy blurry grayscale image, how to return the object names in English step by step?
Object Detection
Loss: -0.07330051064491272


 33%|███████████████                              | 2/6 [00:07<00:14,  3.59s/it]

Given low-resolutioned blurry grayscale image, how to return the caption in English step by step?
Object Detection
Loss: -0.4041570723056793


 50%|██████████████████████▌                      | 3/6 [00:11<00:11,  3.86s/it]

Given low-resolutioned noisy grayscale image, how to return the class label in German step by step?
Object Detection




Loss: -0.03807120770215988


 67%|██████████████████████████████               | 4/6 [00:27<00:17,  8.57s/it]

Given noisy grayscale image, how to return the object names in English step by step?
Object Detection




Loss: -0.02942909300327301


 83%|█████████████████████████████████████▌       | 5/6 [00:44<00:11, 11.47s/it]

Given grayscale image, how to return the caption in English step by step?
Object Detection




Loss: -0.039777807891368866


100%|█████████████████████████████████████████████| 6/6 [01:00<00:00, 10.03s/it]

Finished training!





In [16]:
from sentence_transformers import SentenceTransformer, util
sentence_model = SentenceTransformer('all-MiniLM-L6-v2', device="cpu")

rewards = []
clips = []
berts = []
similairies = []

module_length = 10
num_beams = 1
num_return_sequences = 1

eval_device = "cuda:2"

for i, task_description in enumerate(tqdm(test_tasks)):
    print(task_description)
    task_rewards = []
    with torch.no_grad():
        input_s = [task_description]
        # input_s = ["### Human: "+task_description]
        input_ids = tokenizer.batch_encode_plus(
            input_s, padding="longest", return_tensors="pt"
        )["input_ids"].to(eval_device)
        output = model.generate(
            input_ids=input_ids,
            max_length=512, return_dict_in_generate=True, output_scores=True, output_hidden_states=True,repetition_penalty=1.25
        )
        
    generated_seq = tokenizer.decode(
        output["sequences"][0], skip_special_tokens=True, temperature=0, top_p=0.8, repetition_penalty=1.25
    )#[len(input_s[0]):]
    
    print(generated_seq)
    
    vicuna_steps = generate_module_list_with_gpt(generated_seq).split(",")
    module_list = match_module_seq(vicuna_steps, sentence_model)
    print(module_list)

    

    if len(module_list) >= 1 and whole_module_seq_filter(module_list, test_task_idx[i]):
        seqCombination.construct_module_seq(module_list)

        for idx, batch in enumerate(test_dataloaders[i]):
            inputs = list(batch['input'][0])
            # print("Inputs: ", inputs)
            try:
                predictions = seqCombination.run_module_seq(inputs)
            except:
                ave_task_reward = 0
                break

            if 0 <= test_task_idx[i] <= 14:
                outputs = list(batch['output'][0])
                dist = image_similarity(predictions, outputs, vit, vit_extractor)
                task_rewards.append(dist / 100)
            elif 15 <= test_task_idx[i] <= 104 or 107 <= test_task_idx[i]:
                outputs = list(batch['output'][0])
                f1 = np.mean(txt_eval(predictions, outputs, bertscore, device=eval_device))
                
                task_rewards.append(f1)
            else:
                score = clip_score(predictions, inputs)
                task_rewards.append(score.detach()/100)
                
        ave_task_reward = np.mean(task_rewards)    
        seqCombination.close_module_seq()
            
    else:
        ave_task_reward = 0
        
    print(ave_task_reward)
        
    if 0 <= test_task_idx[i] <= 14:
        similairies.append(ave_task_reward)
    elif 15 <= test_task_idx[i] <= 104 or 107 <= test_task_idx[i]:
        berts.append(ave_task_reward)
    else:
        clips.append(ave_task_reward)

    rewards.append(ave_task_reward)     
    

print("Finished testing!")    

  0%|                                                    | 0/14 [00:00<?, ?it/s]

Given low-resolutioned blurry grayscale image, how to return the regular image step by step?
The image is a grayscale image with a blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry blurry b

  7%|███▏                                        | 1/14 [01:04<14:03, 64.85s/it]

0.7541353912353517
Given blurry grayscale image, how to return the regular image step by step?
The blurred grayscale image is a grayscale image of a white plate. The regular image is a white plate with a blurred grayscale image of a white plate.
Image Deblurring, Colorization


 14%|██████▎                                     | 2/14 [01:41<09:39, 48.26s/it]

0.7349049491882323
Given low-resolutioned blurry image, how to return the regular image step by step?
The blurry image is a blurry image with a low resolution. To return the regular image, we need to reduce the blurry image to a high resolution.
Image Deblurring, Image Super Resolution


 21%|█████████▍                                  | 3/14 [03:36<14:24, 78.57s/it]

0.6046432552337646
Given low-resolutioned noisy blurry grayscale image, how to return the caption in German step by step?
Nach einer kleinen Rekordierung, lautet die Titel:


 29%|████████████▌                               | 4/14 [03:40<08:12, 49.24s/it]


0
Given low-resolutioned noisy blurry grayscale image, how to return the object names in English step by step?
The object is a white, smudge-like substance that is a blob of ice.
Object Detection


 36%|███████████████▋                            | 5/14 [03:54<05:29, 36.61s/it]

0.6551846090704203
Given blurry grayscale image, how to return the object names in German step by step?
Objekte können in der Bild mit einer beliebigen Farbe beziehen.


 43%|██████████████████▊                         | 6/14 [03:57<03:20, 25.07s/it]

Object Detection, Colorization
0
Given noisy grayscale image, how to return the caption in German step by step?
Nach der grünen Bild ist die Titel in grayscale.


 50%|██████████████████████                      | 7/14 [03:59<02:03, 17.59s/it]


0
Given low-resolutioned grayscale image, how to return the class label in English step by step?
The class name is a glyph for class.


 57%|█████████████████████████▏                  | 8/14 [04:01<01:15, 12.61s/it]


0
Given low-resolutioned noisy blurry image, how to return the object names in German step by step?
Der Objekt wurde in einer low-resolutionen, blurrigen Bild.


 64%|████████████████████████████▎               | 9/14 [04:03<00:47,  9.44s/it]

Object Detection, Image Deblurring, Image Super Resolution
0
Given noisy blurry image, how to return the class label in German step by step?
Der Objekt wird in der Bild eingeschränkt, um die Farbe zu beziehen.


 71%|██████████████████████████████▋            | 10/14 [04:06<00:28,  7.18s/it]

Object Detection, Colorization
0
Given noisy blurry image, how to return the class label in German step by step?
Der Objekt wird in der Bild eingeschränkt, um die Farbe zu beziehen.


 79%|█████████████████████████████████▊         | 11/14 [04:08<00:17,  5.68s/it]

Object Detection, Colorization
0
Given low-resolutioned noisy image, how to return the caption in English step by step?
The image is low-resolution, noisy.


 86%|████████████████████████████████████▊      | 12/14 [04:11<00:09,  4.93s/it]

Image Denoising, Image Super Resolution
0
Given English text, how to generate a image step by step?
The sand is a sand castle.


 93%|███████████████████████████████████████▉   | 13/14 [04:14<00:04,  4.19s/it]


0
Given clozed English text, how to return the summarization in German step by step?
Wie in der englischen Text ausgefüllt wurde, können wir die Zusammenfassung in zwei Schritte abholen.


100%|███████████████████████████████████████████| 14/14 [04:16<00:00, 18.35s/it]


0
Finished testing!





In [18]:
np.mean(clips), np.mean(berts), np.mean(similairies), (np.mean(clips) + np.mean(berts) + np.mean(similairies))/3

(0.0, 0.06551846090704203, 0.6978945318857829, 0.25447099759760833)