In [3]:
from transformers import MarianMTModel, MarianTokenizer
import torch

# Load the tokenizer and model
model_name_translate= "Helsinki-NLP/opus-mt-vi-en"  # English to Vietnamese
tokenizer_translate = MarianTokenizer.from_pretrained(model_name_translate)
model_translate = MarianMTModel.from_pretrained(model_name_translate)
model_translate.to("cuda")  # Move the model to the GPU

def translate_text(text, src_lang="vi", tgt_lang="en"):
    # Tokenize the input text
    tokenized_text = tokenizer_translate(text, return_tensors="pt", padding=True, truncation=True).to("cuda")
    # Generate translation
    with torch.no_grad():
        translation = model_translate.generate(**tokenized_text)
    # Decode the generated tokens into text
    translated_text = tokenizer_translate.decode(translation[0], skip_special_tokens=True)
    return translated_text

# Example usage
# text = "This is a test sentence for translation."
text = "chúng tôi là người Việt Nam"
translated_text = translate_text(text)
print(translated_text)


We're Vietnamese.


In [2]:
import requests
from PIL import Image

url = "https://media.newyorker.com/cartoons/63dc6847be24a6a76d90eb99/master/w_1160,c_limit/230213_a26611_838.jpg"
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
# display(image.resize((596, 437)))

In [3]:
from transformers import AutoProcessor, Blip2ForConditionalGeneration
import torch

processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16
)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()
inputs = processor(image, return_tensors="pt").to(device, torch.float16)
with torch.no_grad():
    generated_ids = model.generate(**inputs, max_new_tokens=20)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[
    0
].strip()
print(generated_text)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Expanding inputs for image tokens in BLIP-2 should be done in processing. Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
Both `max_new_tokens` (=20) and `max_length`(=51) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


two cartoon monsters sitting around a campfire


In [4]:
import os
from tqdm import tqdm


def scan_file_path_in_folder(directory):
    lst = []
    for root, dirs, files in os.walk(directory):
        # check files in the directory
        for file in files:
            lst.append(os.path.join(root, file))
    return lst


def scan_file_name_in_folder(directory):
    lst = []
    for root, dirs, files in os.walk(directory):
        # check files in the directory
        for file in files:
            # remove the extension
            file_name = os.path.splitext(file)[0]
            lst.append(file_name)
    return lst


audio_text = r"Data\rawData\Audio_text"

In [5]:
path_lst = scan_file_path_in_folder("Data\\PreporcessData\\Audio_key_text\\L01_V001")
name_lst = scan_file_name_in_folder(audio_text)
print(name_lst)

['L01_V001', 'L01_V002', 'L01_V003', 'L01_V004', 'L01_V005', 'L01_V006', 'L01_V007', 'L01_V008', 'L01_V009', 'L01_V010', 'L01_V011', 'L01_V012', 'L01_V013', 'L01_V014', 'L01_V015', 'L01_V016', 'L01_V017', 'L01_V018', 'L01_V019', 'L01_V020', 'L01_V021', 'L01_V022', 'L01_V023', 'L01_V024', 'L01_V025', 'L01_V026', 'L01_V027', 'L01_V028', 'L01_V029', 'L01_V030', 'L01_V031', 'L02_V001', 'L02_V002', 'L02_V003', 'L02_V004', 'L02_V005', 'L02_V006', 'L02_V007', 'L02_V008', 'L02_V009', 'L02_V010', 'L02_V011', 'L02_V012', 'L02_V013', 'L02_V014', 'L02_V015', 'L02_V016', 'L02_V017', 'L02_V018', 'L02_V019', 'L02_V020', 'L02_V021', 'L02_V022', 'L02_V023', 'L02_V024', 'L02_V025', 'L02_V026', 'L02_V027', 'L02_V028', 'L02_V029', 'L02_V030', 'L02_V031', 'L03_V001', 'L03_V002', 'L03_V003', 'L03_V004', 'L03_V005', 'L03_V006', 'L03_V007', 'L03_V008', 'L03_V009', 'L03_V010', 'L03_V011', 'L03_V012', 'L03_V013', 'L03_V014', 'L03_V015', 'L03_V016', 'L03_V017', 'L03_V018', 'L03_V019', 'L03_V020', 'L03_V021', 'L0

In [6]:
from transformers import logging
# Optionally, you can suppress all warnings from the transformers library
logging.set_verbosity_error()

In [7]:
from concurrent.futures import ThreadPoolExecutor


def process_image(i):
    name = name_lst[i]
    path_lst = scan_file_path_in_folder(
        f"Data\\rawData\\Keyframes\\Keyframes_{name[0:3]}\\keyframes\\{name}"
    )
    sub_name_lst = scan_file_name_in_folder(
        f"Data\\rawData\\Keyframes\\Keyframes_{name[0:3]}\\keyframes\\{name}"
    )

    for path, sub_name in zip(path_lst, sub_name_lst):
        out_path = (
            f"Data\\PreporcessData\\keyframes_ImageCaption\\{name}_{sub_name}.txt"
        )

        if os.path.exists(out_path):
            continue

        try:
            # open the image
            image = Image.open(path).convert("RGB")
            # Preprocess the image
            inputs = processor(image, return_tensors="pt").to(device, torch.float16)

            # Generate text
            with torch.no_grad():
                generated_ids = model.generate(**inputs, max_new_tokens=20)
                generated_text = processor.batch_decode(
                    generated_ids, skip_special_tokens=True
                )[0].strip()
                text = translate_text("i " + generated_text)

            # Translate text
            # text = translator.translate("i " + generated_text, src="en", dest="vi").text

            # Write the text to the file
            with open(out_path, "w", encoding="utf-8") as f:
                f.write(text)

        except Exception as e:
            print(f"Error processing {path}: {e}")


# Using ThreadPoolExecutor to process images in parallel
with ThreadPoolExecutor(
    max_workers=16
) as executor:  # Adjust the number of workers based on your CPU
    list(
        tqdm(
            executor.map(process_image, range(0, len(name_lst), 1)),
            total=len(name_lst),
        )
    )

100%|██████████| 363/363 [00:01<00:00, 307.55it/s]


In [8]:
# for i in tqdm(range(0, len(name_lst), 2)):
#     name = name_lst[i]
#     path_lst = scan_file_path_in_folder(
#         f"Data\\rawData\\Keyframes\\Keyframes_{name[0:3]}\\keyframes\\{name}"
#     )
#     sub_name_lst = scan_file_name_in_folder(
#         f"Data\\rawData\\Keyframes\\Keyframes_{name[0:3]}\\keyframes\\{name}"
#     )
#     for path, sub_name in zip(path_lst, sub_name_lst):
#         out_path = (
#             f"Data\\PreporcessData\\keyframes_ImageCaption\\{name}_{sub_name}.txt"
#         )
#         # open the image
#         image = Image.open(path).convert("RGB")
#         # check if the file is already exist
#         if os.path.exists(out_path):
#             continue

#         inputs = processor(image, return_tensors="pt").to(device, torch.float16)
#         with torch.no_grad():
#             generated_ids = model.generate(**inputs, max_new_tokens=20)
#         generated_text = processor.batch_decode(
#             generated_ids, skip_special_tokens=True
#         )[0].strip()
#         text = translator.translate("i " + generated_text, src="en", dest="vi").text
#         # write the text to the file
#         with open(out_path, "w", encoding="utf-8") as f:
#             f.write(text)