In [12]:
import json
import random

def split_dataset(json_file_path, train_percentage=0.8, seed=None):
    """
    Randomly splits a JSON dataset into training and testing sets.

    Args:
        json_file_path: Path to a JSON file whose top-level value is a list.
        train_percentage: Fraction of the records (between 0.0 and 1.0 inclusive)
            assigned to the training split (default: 0.8, i.e. 80%).
        seed: Optional seed for a reproducible split. When given, the shuffle uses
            a private ``random.Random(seed)`` so the result is repeatable and the
            global random state is left untouched. When None (default), the
            module-level ``random.shuffle`` is used, exactly as before.

    Returns:
        A tuple containing two lists: (train_data, test_data), or None if there's
        an error (a message describing the problem is printed).
        Each list contains the JSON objects for that split.
    """

    try:
        # A negative fraction would turn the slices below into "everything but the
        # last k items", and a fraction above 1 is meaningless; reject both.
        if not 0.0 <= train_percentage <= 1.0:
            print("Error: train_percentage must be between 0 and 1.")
            return None

        with open(json_file_path, 'r', encoding='utf-8') as f:  # Handle potential encoding issues
            data = json.load(f)

        if not isinstance(data, list):  # Check if the JSON data is a list of objects
            print("Error: JSON data must be a list of objects.")
            return None

        # `data` is local to this call, so shuffling it in place is safe.
        if seed is None:
            random.shuffle(data)
        else:
            random.Random(seed).shuffle(data)

        train_size = int(len(data) * train_percentage)
        train_data = data[:train_size]
        test_data = data[train_size:]

        return train_data, test_data

    except FileNotFoundError:
        print(f"Error: File not found at {json_file_path}")
        return None
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in {json_file_path}")
        return None
    except Exception as e: # Catch other potential errors
        print(f"An unexpected error occurred: {e}")
        return None


def save_json(data, output_file_path):
    """
    Write ``data`` to ``output_file_path`` as pretty-printed UTF-8 JSON.

    Non-ASCII characters are written as-is rather than escaped. Any failure is
    reported with a printed message instead of being raised; the function
    always returns None.
    """
    try:
        with open(output_file_path, mode='w', encoding='utf-8') as out_handle:
            # Four-space indentation keeps the file human-readable.
            json.dump(data, out_handle, ensure_ascii=False, indent=4)
        print(f"Data saved to {output_file_path}")
    except Exception as err:
        print(f"Error saving JSON to {output_file_path}: {err}")



# Example usage:
input_file = "/Users/namle/DATN/Sniffer/datasets/ooc.json"  # Replace with your JSON file path
train_file = "train_ooc.json"
test_file = "val_ooc.json"

# split_dataset returns None on failure, so check the result before unpacking;
# unpacking None directly would raise a TypeError.
split_result = split_dataset(input_file, train_percentage=0.8)

if split_result is None:
    # Keep both names bound so later cells can test them without a NameError.
    train_data, test_data = None, None
else:
    train_data, test_data = split_result
    # A successful split is saved even when one side is empty (tiny datasets).
    save_json(train_data, train_file)
    save_json(test_data, test_file)

Data saved to train_ooc.json
Data saved to val_ooc.json


In [13]:
import argparse
import os
import random

import numpy as np
import torch
import torch.backends.cudnn as cudnn

import lavis.tasks as tasks
from lavis.common.config import Config
from lavis.common.dist_utils import get_rank, init_distributed_mode, is_main_process
from lavis.common.logger import setup_logger
from lavis.common.optims import (
    LinearWarmupCosineLRScheduler,
    LinearWarmupStepLRScheduler,
)
from lavis.common.registry import registry
from lavis.common.utils import now

from lavis.datasets.builders import *
from lavis.models import *
from lavis.processors import *
from lavis.runners import *
from lavis.tasks import *



In [14]:
import sys
import argparse

# Remove Jupyter's auto-generated arguments
sys.argv = [sys.argv[0]]


def _str2bool(value):
    """Convert a command-line string to a bool.

    argparse's ``type=bool`` calls ``bool()`` on the raw string, so any
    non-empty value (including "False") becomes True. This parser accepts the
    usual spellings and rejects anything else with a clear error.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    # The empty string stays False, matching the old bool("") behavior.
    if lowered in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


parser = argparse.ArgumentParser(description="Training")
# factvqa_newsclip_ft

parser.add_argument("--cfg-path", default = "lavis/projects/instructblip2/train/ooc_ft.yaml", help="path to configuration file.")
parser.add_argument(
    "--options",
    nargs="+",
    help="override some settings in the used config, the key-value pair "
    "in xxx=yyy format will be merged into config file (deprecate), "
    "change to --cfg-options instead.",
)

parser.add_argument(
    "--wname", type=str, default=None, help="wandb name",
)
parser.add_argument(
    "--use_lora", type=_str2bool, default=False, help="whether use lora to train LLM",
)

# With sys.argv reset above, every option takes its default value.
args = parser.parse_args()
# Print the `args` object in Jupyter Notebook
print("Arguments as Namespace:", args)
print("Arguments as Dictionary:", vars(args))



Arguments as Namespace: Namespace(cfg_path='lavis/projects/instructblip2/train/ooc_ft.yaml', options=None, wname=None, use_lora=False)
Arguments as Dictionary: {'cfg_path': 'lavis/projects/instructblip2/train/ooc_ft.yaml', 'options': None, 'wname': None, 'use_lora': False}


In [15]:
# Build the LAVIS configuration from the parsed argparse namespace, set up the
# task from it, and ask the task to build its datasets.
cfg = Config(args)
task = tasks.setup_task(cfg)
# Keep the run section of the config under its own name.
run_cfg = cfg.run_cfg
datasets = task.build_datasets(cfg)
# Task-level model construction is disabled here.
# model = task.build_model(cfg)

****************
/Users/namle/DATN/Sniffer/datasets/ooc.json


In [16]:
# Model section of the config; used below to look up and build the model.
model_config=cfg.model_cfg

In [17]:
# Look up the model class registered under the architecture name in the config.
model_cls = registry.get_model_class(model_config.arch)

In [18]:
# Display the class returned by the registry lookup.
model_cls

lavis.models.blip2_models.blip2_vicuna_instruct.Blip2VicunaInstruct

In [19]:
# Display the model settings that will be passed to from_config.
model_config

{'arch': 'blip2_vicuna_instruct', 'load_finetuned': False, 'load_pretrained': True, 'pretrained': 'https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna7b_trimmed.pth', 'finetuned': '', 'image_size': 224, 'drop_path_rate': 0, 'use_grad_checkpoint': True, 'vit_precision': 'fp16', 'freeze_vit': True, 'num_query_token': 32, 'llm_model': 'lmsys/vicuna-7b-v1.1', 'prompt': '', 'model_type': 'vicuna7b', 'use_lora': False, 'max_txt_len': 550}

In [20]:
# Build the model from the model config.
# NOTE(review): the config sets load_pretrained=True with a remote checkpoint
# URL, so this call presumably fetches/loads weights and may be slow — confirm.
# The recorded run ended in KeyboardInterrupt, so `model` may be unbound afterwards.
model = model_cls.from_config(model_config)



KeyboardInterrupt: 

In [4]:
# Display the datasets built by the task.
# NOTE(review): this cell's execution count (In [4]) is lower than the cells
# above it, so it was run out of order; `datasets` must already exist in the kernel.
datasets

{'ooc': {'train': <lavis.datasets.datasets.ooc_datasets.OOCDataset at 0x32d02ad60>}}

In [5]:
import json

In [6]:
# Load the raw annotation list. The context manager closes the file handle
# (a bare open() leaves it open), and the explicit utf-8 encoding matches how
# split_dataset reads the same file.
ann = []
with open("/Users/namle/DATN/Sniffer/datasets/ooc.json", "r", encoding="utf-8") as ann_file:
    ann.extend(json.load(ann_file))

In [9]:
# Keep only the first annotation record.
# NOTE(review): this rebinds `ann` from the full list to a single element, so
# the cell is not idempotent — re-running it indexes into that element instead.
ann = ann[0]

In [10]:
# Look up the image path of the selected annotation.
# NOTE(review): the recorded run raised KeyError: 'img_path', so this record has
# no such key — inspect the record's keys to find the right field name.
ann["img_path"]

KeyError: 'img_path'