In [1]:
from torch.utils.data import DataLoader
import os
os.chdir('../')
os.environ["TRANSFORMERS_CACHE"]="/common/users/zl502/huggingface/cache/"

import torch

# from peft import PeftModel, PeftModelForCausalLM, prepare_model_for_int8_training, LoraConfig, get_peft_model
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel, AutoFeatureExtractor

from general_dataset import GeneralDataset
from agi_utils import *
from tqdm import tqdm
from undecorated import undecorated
from types import MethodType


import numpy as np
from IPython.utils import io
import random
from evaluate import load
from torchvision import transforms
from torchmetrics.multimodal import CLIPScore
from combine_model_seq import SeqCombine

In [2]:
"""
assign openagi data path 
"""
# Root directory of the OpenAGI data. Set the OPENAGI_DATA_PATH environment variable to
# point somewhere else; the fallback is the location this notebook was originally run on.
data_path = os.environ.get("OPENAGI_DATA_PATH", "/common/users/zl502/openagi_data/")

task_discriptions = txt_loader("./task_description.txt")

# Indices (into task_description.txt) of the tasks evaluated in this run.
# Other subsets used previously:
#   [0,21,61,105,110,120,10,35,62,107,115]
#   list(range(183))
# NOTE(review): 70 appears twice, so that task is evaluated twice and counted twice in the
# averages computed at the end of the notebook — confirm this is intended.
test_task_idx = [2,3,10,15,20,35,45,55,65,70,70,90,106,107]

# One DataLoader per selected task, in the same order as test_task_idx.
test_dataloaders = [
    DataLoader(GeneralDataset(task_idx, data_path), batch_size=20)
    for task_idx in tqdm(test_task_idx)
]

# Natural-language description of every selected task, aligned with test_dataloaders.
test_tasks = [task_discriptions[task_idx].strip() for task_idx in test_task_idx]

100%|███████████████████████████████████████████| 14/14 [00:05<00:00,  2.50it/s]


In [3]:
# Checkpoint used as the planning LLM. Alternatives that were tried with this notebook:
#   "eachadea/vicuna-7b-1.1", "NousResearch/Nous-Hermes-Llama2-13b",
#   "chainyo/alpaca-lora-7b", "meta-llama/Llama-2-70b-chat-hf",
#   "TheBloke/Llama-2-13B-chat-GGML"
base_model = "meta-llama/Llama-2-13b-chat-hf"

# NOTE(review): load_8bit is never passed to from_pretrained below, so it currently has no
# effect on how the model is loaded — confirm whether 8-bit loading was intended.
load_8bit = True

# Access token for the Hugging Face Hub. Prefer the HF_TOKEN environment variable so the
# secret never has to be written into the notebook; the placeholder is the previous default.
hf_token = os.environ.get("HF_TOKEN", "YOUR TOKEN")

# Single definition of the download cache shared by the tokenizer and the model.
hf_cache_dir = "/common/users/zl502/huggingface/cache"

# Per-GPU memory budget handed to device_map="auto": GPUs 0-3 get nothing, GPUs 4-7 get
# 48GB each. (A previous 4-GPU setup used {0: "0GB", 1: "0GB", 2: "24GB", 3: "24GB"}.)
max_memory_mapping = {
    0: "0GB",
    1: "0GB",
    2: "0GB",
    3: "0GB",
    4: "48GB",
    5: "48GB",
    6: "48GB",
    7: "48GB",
}

tokenizer = AutoTokenizer.from_pretrained(
    base_model,
    cache_dir=hf_cache_dir,
    use_auth_token=hf_token
    # padding_side='left'
)
# Use token id 0 for padding instead of registering a new '<pad>' special token.
tokenizer.pad_token_id = 0

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    cache_dir=hf_cache_dir,
    device_map="auto",
    max_memory=max_memory_mapping,
    use_auth_token=hf_token
)

# print_trainable_parameters() is provided by PEFT model wrappers, and the PEFT import in
# this notebook is commented out. Calling it unconditionally on the plain transformers model
# raises AttributeError, so fall back to a simple parameter count when it is unavailable.
if hasattr(model, "print_trainable_parameters"):
    model.print_trainable_parameters()
else:
    total_params = sum(param.numel() for param in model.parameters())
    print(f"{base_model}: {total_params:,} parameters")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
# output["scores"]
import openai
# prompt_length = len(input_text)
# Read the OpenAI key from the OPENAI_API_KEY environment variable so the secret is not
# stored in the notebook; the placeholder is kept as the fallback for backward compatibility.
openai.api_key = os.environ.get("OPENAI_API_KEY", "YOUR TOKEN")

def generate_module_list_with_gpt(generated_module_seq):
    """Ask gpt-3.5-turbo to extract known module names from free-form text.

    The text is embedded in a fixed instruction prompt that lists the allowed
    module names and requests the answer in the form
    'module: module1, module: module2, ...'.

    Args:
        generated_module_seq: Text to extract a module sequence from.

    Returns:
        The reply with everything before the first "module: " marker dropped and
        all "module: " markers removed; an empty string when the reply contains
        no such marker.
    """
    instruction_template = "You are a key phrase extractor who is able to extract potential module names from the given context. You have already known all the module names in the full module list. The full module list is: [Image Classification, Colorization, Object Detection, Image Deblurring, Image Denoising, Image Super Resolution, Image Captioning, Text to Image Generation, Visual Question Answering, Sentiment Analysis, Question Answering, Text Summarization, Machine Translation]. Given the following context: '{}'. Please extract a module sequence from this context and remove module names which do not exist in the full module list from this sequence. Output the module sequence after filtering as the format of 'module: module1, module: module2, module: module3, etc...'. "
    chat_messages = [
        {"role": "user", "content": instruction_template.format(generated_module_seq)}
    ]

    response = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=chat_messages)
    reply_text = response.choices[0].message["content"]

    # The chunk before the first marker is preamble and is discarded; the remaining
    # chunks are glued back together without the markers.
    _preamble, *module_chunks = reply_text.split("module: ")
    return "".join(module_chunks)

# generated_module_list = generate_module_list_with_gpt(response[prompt_length:])
# print(generated_module_list)

In [7]:
"""
Loading Evaluation Metrics
"""

# CLIPScore metric, used later in the notebook to score predictions against their inputs.
clip_score = CLIPScore(model_name_or_path="openai/clip-vit-base-patch16")

# Pre-trained Vision Transformer (switched to eval mode) and its matching feature extractor,
# both handed to image_similarity() in the evaluation loop.
vit_ckpt = "nateraw/vit-base-beans"
vit = AutoModel.from_pretrained(vit_ckpt).eval()
vit_extractor = AutoFeatureExtractor.from_pretrained(vit_ckpt)

# Tensor -> PIL image converter.
f = transforms.ToPILImage()
# BERTScore metric loaded through the `evaluate` package.
bertscore = load("bertscore")

# Devices made available to the module pipeline. A multi-GPU run used a longer list such as
# ["cuda:1", "cuda:2", "cuda:3", "cuda:4", "cuda:5", "cuda:7", "cpu"].
device_list = ["cuda:0", "cpu"]
seqCombination = SeqCombine(device_list)

Some weights of the model checkpoint at nateraw/vit-base-beans were not used when initializing ViTModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing ViTModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTModel were not initialized from the model checkpoint at nateraw/vit-base-beans and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fetching 20 files:   0%|          | 0/20 [00:00<?, ?it/s]

The config attributes {'scaling_factor': 0.18215} were passed to AutoencoderKL, but are not expected and will be ignored. Please verify your config.json configuration file.


In [8]:
from sentence_transformers import SentenceTransformer, util

# Sentence encoder passed to match_module_seq() together with the LLM's "Step ..." lines.
sentence_model = SentenceTransformer('all-MiniLM-L6-v2', device="cpu")


# Per-task average rewards: all tasks, and split by the metric that scored them.
rewards = []
clips = []
berts = []
similairies = []

module_length = 10
num_beams = 1
num_return_sequences = 1

# NOTE(review): max_memory_mapping in the model-loading cell gives GPUs 0-3 a 0GB budget,
# yet the prompt tensors and BERTScore are sent to cuda:1 — confirm this device choice.
eval_device = "cuda:1"


def _metric_for_task(task_idx):
    """Return the metric family that scores a task: "similarity", "bert" or "clip".

    Task indices 0-14 are scored with image similarity, 15-104 and 107+ with BERTScore,
    and every remaining index (105, 106) with CLIP score.
    """
    if 0 <= task_idx <= 14:
        return "similarity"
    if 15 <= task_idx <= 104 or 107 <= task_idx:
        return "bert"
    return "clip"


for i, task_description in enumerate(tqdm(test_tasks)):
    print(task_description)
    task_idx = test_task_idx[i]
    metric = _metric_for_task(task_idx)
    task_rewards = []

    with torch.no_grad():
        input_s = ["### Human: "+ task_description +"\n### Assistant:"]
        input_ids = tokenizer.batch_encode_plus(
            input_s, padding="longest", return_tensors="pt"
        )["input_ids"].to(eval_device)
        # Only the generated token ids are used below, so per-step scores and hidden
        # states are no longer requested (they were kept in memory for nothing).
        output = model.generate(
            input_ids=input_ids,
            max_length=2048,
            return_dict_in_generate=True,
            repetition_penalty=1.25
        )
    # Sampling options (temperature/top_p/repetition_penalty) belong to generate(); they
    # used to be passed to decode(), where they do not influence the generated text.
    generated_seq = tokenizer.decode(output["sequences"][0], skip_special_tokens=True)

    # Drop everything up to and including the "### Human: " prompt marker.
    llama_solution = generated_seq.split("### Human: ")[1]
    print(llama_solution)

    # Keep only the lines that start with "Step"; these are matched against module names.
    llama_steps = [line for line in llama_solution.split("\n") if line.startswith("Step")]
    print(llama_steps)

    module_list = match_module_seq(llama_steps, sentence_model)
    print("Module list: ", module_list)

    # vicuna_steps = generate_module_list_with_gpt(generated_seq).split(",")
    # module_list = match_module_seq(vicuna_steps, sentence_model)

    # A task scores 0 unless a valid module sequence ran on every batch without failing.
    ave_task_reward = 0

    # if len(module_list) >= 1 and module_seq_filter(module_list, test_task_idx[i]):
    if len(module_list) >= 1 and whole_module_seq_filter(module_list, task_idx):
        seqCombination.construct_module_seq(module_list)
        run_failed = False

        for batch in tqdm(test_dataloaders[i]):
            inputs = list(batch['input'][0])
            try:
                predictions = seqCombination.run_module_seq(inputs)
            except Exception as err:
                # Previously a bare `except:` set the reward to 0 here, but the mean taken
                # after the loop overwrote it. Record the failure explicitly instead.
                print(f"Module sequence failed on task {task_idx}: {err!r}")
                run_failed = True
                break

            if metric == "similarity":
                outputs = list(batch['output'][0])
                dist = image_similarity(predictions, outputs, vit, vit_extractor)
                task_rewards.append(dist / 100)
            elif metric == "bert":
                outputs = list(batch['output'][0])
                f1 = np.mean(txt_eval(predictions, outputs, bertscore, device=eval_device))
                task_rewards.append(f1)
            else:
                score = clip_score(predictions, inputs)
                # Store a plain float so np.mean never has to convert tensors.
                task_rewards.append(score.detach().item() / 100)

        # Guard against an empty reward list, for which np.mean would return nan.
        if not run_failed and task_rewards:
            ave_task_reward = float(np.mean(task_rewards))
        seqCombination.close_module_seq()

    print(ave_task_reward)

    if metric == "similarity":
        similairies.append(ave_task_reward)
    elif metric == "bert":
        berts.append(ave_task_reward)
    else:
        clips.append(ave_task_reward)

    rewards.append(ave_task_reward)


print("Finished testing!")

  0%|                                                    | 0/14 [00:00<?, ?it/s]

Given low-resolutioned blurry grayscale image, how to return the regular image step by step?
Given low-resolutioned blurry grayscale image, how to return the regular image step by step?
### Assistant: Sure! Here's a step-by-step guide on how to enhance and sharpen a low-resolutioned blurry grayscale image using various techniques. Please note that these are general steps and may require some adjustments based on the specific image you're working with.

Step 1: Noise Reduction
To start, we need to reduce the noise in the image. We can use a technique called "Gaussian Blur" to smooth out the image and remove any unwanted noise. This will help us improve the overall quality of the image before moving on to other steps.

Step 2: Sharpening
Once we have reduced the noise, we can move on to sharpening the image. There are several ways to do this, but one effective method is to use the "Unsharp Mask" filter. This filter will help bring out the details in the image and make it look more define

  7%|███                                        | 1/14 [02:09<27:59, 129.16s/it]

Module list Image Denoising, Colorization, Object Detection
0
Given blurry grayscale image, how to return the regular image step by step?
Given blurry grayscale image, how to return the regular image step by step?
### Assistant: Sure! Here's a step-by-step guide on how to enhance and sharpen a blurry grayscale image using Python:

Step 1: Load the Image
---------------

First, load the blurry grayscale image into your Python script. You can use libraries like OpenCV or Pillow for this purpose. For example:
```python
import cv2

blurry_image = cv2.imread('path/to/blurry/image', cv2.IMREAD_GRAYSCALE)
```
Step 2: Apply Bilateral Filtering
---------------------------

Bilateral filtering is an effective way to reduce noise in images while preserving edges. It works by weighting pixels based on their similarity in intensity and spatial distance. To apply bilateral filtering, you can use the following code:
```python
bilateral_filtered_image = cv2.bilateralFilter(blurry_image, 5, 90, 90)
```

 14%|██████▏                                    | 2/14 [06:14<39:28, 197.38s/it]

Module list Image Captioning, Image Denoising, Fill Mask
0
Given low-resolutioned blurry image, how to return the regular image step by step?
Given low-resolutioned blurry image, how to return the regular image step by step?
### Assistant: Sure! Here's a step-by-step guide on how to enhance and sharpen a low-resolutioned blurry image using various techniques. Please note that the results may vary depending on the quality of the input image.

Step 1: Noise Reduction
The first step is to reduce the noise in the image. You can use a noise reduction tool or filter to remove any unwanted specks and grains. This will help improve the overall clarity of the image.

Step 2: Sharpening
Once you have reduced the noise, it's time to sharpen the image. There are several methods for doing this, including unsharp masking, kernel sharpening, and smart sharpening. Unsharp masking involves adjusting the contrast of the image to bring out more details, while kernel sharpening uses a mathematical formula


0it [00:00, ?it/s][A
1it [00:09,  9.87s/it][A
2it [00:19,  9.78s/it][A
3it [00:28,  9.17s/it][A
4it [00:36,  9.02s/it][A
5it [00:46,  9.33s/it][A
 21%|█████████▏                                 | 3/14 [09:12<34:37, 188.83s/it]

0.7521700439453125
Given low-resolutioned noisy blurry grayscale image, how to return the caption in German step by step?
Given low-resolutioned noisy blurry grayscale image, how to return the caption in German step by step?
### Assistant: Sure! Here's a step-by-step guide on how to recognize objects in a low-resolutioned noisy blurry grayscale image and generate a description of them in German.

Step 1: Preprocessing
First, we need to preprocess the input image to enhance its quality. We can use techniques such as filtering or thresholding to remove noise and improve contrast. For example, you could apply a median filter to reduce salt and pepper noise, or a Gaussian filter to smooth out the image. Additionally, you may want to normalize the pixel values to ensure that they fall within a reasonable range for processing.

Step 2: Object Detection
Next, we need to detect objects in the preprocessed image. One way to do this is through edge detection using algorithms like Canny edge dete


0it [00:00, ?it/s][A
1it [00:22, 22.05s/it][A
2it [00:25, 11.16s/it][A
3it [00:29,  7.76s/it][A
4it [00:32,  6.13s/it][A
5it [00:36,  7.21s/it][A
 29%|████████████▎                              | 4/14 [12:33<32:13, 193.30s/it]

0.45538244783878323
Given low-resolutioned noisy blurry grayscale image, how to return the object names in English step by step?
Given low-resolutioned noisy blurry grayscale image, how to return the object names in English step by step?
### Assistant: Sure! Here's a step-by-step guide on how to extract objects from a low-resolutioned noisy blurry grayscale image and recognize them in English. Please note that this is a complex task and may require some preprocessing and fine-tuning of the model.

Step 1: Preprocess the Image
Before we can start recognizing objects, we need to preprocess the input image to enhance its quality and remove any noise. You can use techniques such as filtering, normalization, or edge detection to improve the image's resolution and contrast.

Step 2: Load a Model for Object Detection
Next, you will need to load an object detection model that has been trained on a dataset similar to your target images. There are many open-source models available online, such a

 36%|███████████████▎                           | 5/14 [14:38<25:17, 168.64s/it]

Module list Text to Image Generation, Object Detection, Machine Translation
0
Given blurry grayscale image, how to return the object names in German step by step?
Given blurry grayscale image, how to return the object names in German step by step?
### Assistant: Sure! Here's a step-by-step guide on how to return the object names in German given a blurry grayscale image:

Step 1: Preprocessing (5 minutes)

* Apply noise reduction techniques such as median filtering or bilateral filtering to reduce noise in the input image.
* Use edge detection algorithms like Canny edge detection or Sobel operator to extract edges from the preprocessed image.
* Threshold the edged image to obtain binary mask.

Step 2: Object Detection (10 minutes)

* Load a trained deep learning model for object detection (e.g., YOLOv3, SSD, Faster R-CNN).
* Pass the preprocessed and thresholded image through the object detection model to get bounding boxes around detected objects.

Step 3: Object Recognition (15 minute


0it [00:00, ?it/s][A


3it [00:10,  3.38s/it][A

5it [00:16,  3.28s/it][A
 43%|██████████████████▍                        | 6/14 [16:24<19:39, 147.45s/it]

0.5886177115142346
Given noisy grayscale image, how to return the caption in German step by step?
Given noisy grayscale image, how to return the caption in German step by step?
### Assistant: Sure! Here's a step-by-step guide on how to return the caption for a given noisy grayscale image in German.

Step 1: Load the Noisy Grayscale Image
-------------------------------------

First, you need to load the noisy grayscale image into your program or model. You can use libraries like OpenCV or Pillow to load images in Python.
```python
import cv2
img = cv2.imread('noisy_grayscale_image.png', cv2.IMREAD_GRAYSCALE)
```
Step 2: Preprocess the Image (Optional)
------------------------------------------

Next, you may want to preprocess the image to enhance its quality and remove any noise that might affect the OCR process. This can include filtering, thresholding, or other techniques to improve the contrast and clarity of the image.
```scss
# Apply Gaussian filter to reduce noise
img = cv2.Gaus

 50%|█████████████████████▌                     | 7/14 [18:56<17:22, 149.00s/it]

Module list Image Denoising, Text to Image Generation, Machine Translation
0
Given low-resolutioned grayscale image, how to return the class label in English step by step?
Given low-resolutioned grayscale image, how to return the class label in English step by step?
### Assistant: Sure! Here's a step-by-step guide on how to return the class label of a low-resolutioned grayscale image using deep learning techniques. Please note that this is just one possible approach and there may be other methods available as well.

Step 1: Preprocessing - Resize the input image to a fixed size (e.g., 256x256 pixels) while maintaining its aspect ratio. This will help improve computational efficiency and reduce memory usage during training. Next, normalize the pixel values between 0 and 1.

Step 2: Build Model Architecture - Design an appropriate neural network architecture for your classification task. A simple yet effective option could be a Convolutional Neural Network (CNN) with the following layers


0it [00:00, ?it/s][A
1it [00:10, 10.77s/it][A
2it [00:21, 10.84s/it][A
3it [00:32, 10.93s/it][A
4it [00:43, 11.02s/it][A
5it [00:54, 10.94s/it][A
 57%|████████████████████████▌                  | 8/14 [21:41<15:24, 154.14s/it]

0.8642942658066751
Given low-resolutioned noisy blurry image, how to return the object names in German step by step?
Given low-resolutioned noisy blurry image, how to return the object names in German step by step?
### Assistant: Sure! Here's a step-by-step guide on how to recognize objects in a low-resolutioned noisy blurry image and retrieve their names in German. Please note that this is a complex task and may require some preprocessing of the input images before proceeding.

Step 1: Preprocess the Image
Before we can start recognizing objects, we need to clean up the image and enhance its quality. Apply filters such as Gaussian Blur (to reduce noise), Median Filter (to remove artifacts), and Bilateral Filtering (to smooth out edges) to improve the overall appearance of the image. You can use libraries like OpenCV or TensorFlow for implementing these filters.

Step 2: Object Detection
Once you have preprocessed the image, you can use an object detection algorithm to identify objects

 64%|███████████████████████████▋               | 9/14 [24:02<12:30, 150.04s/it]

Module list Text to Image Generation, Object Detection, Image Classification, Machine Translation
0
Given noisy blurry image, how to return the class label in German step by step?


 71%|██████████████████████████████            | 10/14 [25:21<08:32, 128.16s/it]

Given noisy blurry image, how to return the class label in German step by step?
### Assistant: I'd be happy to help! To return a class label from a noisy and blurry image in German, you can follow these steps:

1. **Preprocessing**: Clean up the noise and sharpen the image using techniques like median filtering or Gaussian smoothing. You can use libraries like OpenCV for this purpose.
2. **Object Detection**: Use an object detection algorithm such as YOLO (You Only Look Once), SSD (Single Shot Detector) or Faster R-CNN (Region-based Convolutional Neural Networks) to identify objects in the preprocessed image. These algorithms are widely used for detecting objects in images and can be implemented using popular deep learning frameworks like TensorFlow or PyTorch.
3. **Classification**: After identifying objects in the image, apply classification techniques to determine their labels. For example, if you want to classify cars and buses, you could train a machine learning model on labeled d

 79%|█████████████████████████████████▊         | 11/14 [25:33<04:37, 92.50s/it]

Given noisy blurry image, how to return the class label in German step by step?
### Assistant: Hello! I'd be happy to help you with that. To start, could you please provide me with the noisy blurry image and tell me what type of object or scene you would like to identify in the image? Additionally, do you have any specific requirements for the classification task, such as a certain level of accuracy or a particular programming language/framework you would like to use?
[]
Module list 
0
Given low-resolutioned noisy image, how to return the caption in English step by step?
Given low-resolutioned noisy image, how to return the caption in English step by step?
### Assistant: Sure! Here's a step-by-step guide on how to generate an accurate English caption for a given low-resolutioned noisy image using our AI model. Please note that this process may take some time and computational resources depending on the complexity of the image and the desired level of accuracy.

Step 1: Preprocessing (5


0it [00:00, ?it/s][A
1it [00:04,  4.13s/it][A
2it [00:07,  3.56s/it][A
3it [00:11,  3.77s/it][A
4it [00:14,  3.46s/it][A
5it [00:18,  3.65s/it][A
 86%|████████████████████████████████████      | 12/14 [29:21<04:27, 133.83s/it]

0.44406582504510883
Given English text, how to generate a image step by step?


 93%|███████████████████████████████████████   | 13/14 [31:04<02:04, 124.46s/it]

Given English text, how to generate a image step by step?
### Assistant: Sure! Here's an outline of the steps involved in generating an image from given English text using Deep Learning techniques:

1. Text Preprocessing: The input text is preprocessed to convert it into a numerical representation that can be fed into a neural network. This includes tokenization, stemming or lemmatizing, removing stop words and punctuation, and converting all texts to lowercase.
2. Tokenizer: A tokenizer is used to break down the text into individual tokens, such as words or phrases. These tokens are then fed into the next step.
3. Embeddings: Each token is embedded into a vector space using a learned embedding layer. This allows the model to understand the meaning of each word in relation to other words.
4. Encoder: An encoder module takes the embedded tokens and generates a continuous representation of the input text. This could be done using Recurrent Neural Network (RNN) like LSTM or GRU or Transfo


0it [00:00, ?it/s][A
1it [00:08,  8.98s/it][A
2it [00:15,  7.30s/it][A
3it [00:21,  6.81s/it][A
4it [00:29,  7.15s/it][A
5it [00:40,  8.07s/it][A
100%|██████████████████████████████████████████| 14/14 [32:57<00:00, 141.24s/it]

0.1968381442129612
Finished testing!





In [9]:
# Mean reward of each metric family, in the order CLIP / BERTScore / image similarity.
avg_clips, avg_berts, avg_similarities = (
    np.mean(family_rewards) for family_rewards in (clips, berts, similairies)
)
# Overall score: unweighted mean of the three family means, so each family counts equally
# regardless of how many tasks it contains.
avg_rewards = (avg_clips + avg_berts + avg_similarities) / 3
print([avg_clips, avg_berts, avg_similarities, avg_rewards])

[0.0, 0.2549198394417763, 0.2507233479817708, 0.1685477291411824]
