In [None]:
import os 
!GRADIO_SHARE=1 llamafactory-cli webui

In [1]:
entity_unit_map = {
    'width': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'depth': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'height': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'item_weight': {'gram',
        'kilogram',
        'microgram',
        'milligram',
        'ounce',
        'pound',
        'ton'},
    'maximum_weight_recommendation': {'gram',
        'kilogram',
        'microgram',
        'milligram',
        'ounce',
        'pound',
        'ton'},
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': {'centilitre',
        'cubic foot',
        'cubic inch',
        'cup',
        'decilitre',
        'fluid ounce',
        'gallon',
        'imperial gallon',
        'litre',
        'microlitre',
        'millilitre',
        'pint',
        'quart'}
}

In [2]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from PIL import Image

# Load the dataset
dataframe = pd.read_csv('/dataset/test.csv')

# Get the unique classes from the 'entity_name' column
unique_classes = dataframe['entity_name'].unique()

# Define the total number of samples and calculate samples needed per class
total_samples = 2000
samples_per_class = total_samples // len(unique_classes)

# Sample data for each class, limiting the number to 'samples_per_class'
sampled_data = dataframe.groupby('entity_name').apply(lambda x: x.sample(min(len(x), samples_per_class)))

# Shuffle the dataset to mix samples randomly
sampled_data = sampled_data.sample(frac=1).reset_index(drop=True)

# If the total number of rows is less than 'total_samples' due to unequal class sizes,
# sample the remaining required rows from the entire dataset
if len(sampled_data) < total_samples:
    remaining_samples = total_samples - len(sampled_data)
    additional_data = dataframe.sample(remaining_samples, random_state=1)
    sampled_data = pd.concat([sampled_data, additional_data])

# Function to check if an image is corrupted
def is_image_corrupt(image_path):
    image_path = "../test/" + image_path.split("/")[-1]
    try:
        # Attempt to open and verify the image
        img = Image.open(image_path)
        img.verify()  # Force load image data to check for corruption
        return False
    except (IOError, SyntaxError):
        return True

# Filter out corrupted images
sampled_data = sampled_data[~sampled_data['image_link'].apply(is_image_corrupt)].reset_index(drop=True)

# Save the sampled dataset to a CSV file
sampled_data.to_csv('sampled_dataset.csv', index=False)
print(f"Sampled dataset contains {len(sampled_data)} rows.")

# Reload the sampled dataset
filtered_data = pd.read_csv("sampled_dataset.csv")

# Prepare the list for storing instructions and image paths for JSON generation
data_list = []

for index, row in filtered_data.iterrows():
    image_path = "/../test/" + row["image_link"].split("/")[-1]
    instruction = f"""<image>Extract the {row['entity_name']} of the item and its unit of measurement from the image,
    providing them separately. Ensure that the unit is one of the following: {entity_unit_map[row['entity_name']]}."""

    # Define the message object for JSON
    message_obj = {
        "messages": [
            {'content': instruction, 'role': 'user'},
            {'content': "", 'role': 'assistant'}
        ],
        "images": [image_path]
    }

    # Append the message object to the data list
    data_list.append(message_obj)

# Save the data list as a JSON file
with open("./data/aml1p.json", "w") as json_file:
    json.dump(data_list, json_file, ensure_ascii=False, indent=4)


  sampled_df = df.groupby('entity_name').apply(lambda x: x.sample(min(len(x), samples_per_class)))


Sampled dataset has 2000 rows.


In [54]:
import json

# Define the configuration parameters for the model
config_params = {
  "model_name_or_path": "Qwen/Qwen2-VL-7B-Instruct",
  # Uncomment and specify path if using a custom adapter: "adapter_name_or_path": "/data/poornash/AML/LLaMA-Factory/qwen2vl_lora_old",
  "stage": "sft",  # Set stage to 'sft' for supervised fine-tuning
  "do_predict": True,  # Enable prediction mode
  "do_train": False,  # Disable training
  "finetuning_type": "lora",  # Specify the finetuning method
  "eval_dataset": "aml1p",  # Evaluation dataset name
  "template": "qwen2_vl",  # Template to use for processing data
  "cutoff_len": 1024,  # Maximum length of the input sequence
  "max_samples": 1000000,  # Maximum number of samples to process
  "overwrite_cache": True,  # Overwrite cache to use fresh data
  "n_shot": 5,  # Number of examples to use for few-shot learning
  "preprocessing_num_workers": 16,  # Number of workers for data preprocessing
  "output_dir": "/data/poornash/AML/LLaMA-Factory/predictions",  # Directory to save prediction results
  "overwrite_output_dir": True,  # Overwrite existing files in the output directory
  "per_device_eval_batch_size": 6,  # Batch size for evaluation per device
  "predict_with_generate": True,  # Use generation for prediction
  "ddp_timeout": 180000000  # Timeout setting for distributed data parallelism
}

# Save the configuration parameters to a JSON file
with open("train_qwen2vl.json", "w", encoding="utf-8") as json_file: 
    json.dump(config_params, json_file, ensure_ascii=False, indent=4)


In [7]:
args = {"model_name_or_path": "Qwen/Qwen2-VL-7B-Instruct", 
        "adapter_name_or_path": "qwen2vl_lora", 
        "template": "qwen2_vl", 
        "finetuning_type": "lora", 
        "export_dir": "qwen2vl_2b_instruct_lora_merged", 
        "export_size": 2, 
        "export_device": "cpu" 
        }

with open("merge_qwen2vl.json", "w", encoding="utf-8") as f: 
    json.dump(args, f, ensure_ascii=False, indent=4)

In [9]:
import json

# Function to read a JSONL (JSON Lines) file and load its content into a list
def load_jsonl(file_path):
    content_list = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            content_list.append(json.loads(line))
    return content_list

# Example usage
jsonl_file_path = '/data/poornash/AML/LLaMA-Factory/predictions/generated_predictions.jsonl'  # Update with the correct file path
data_entries = load_jsonl(jsonl_file_path)