In [1]:
import json
from os import path, getcwd
import pandas as pd

In [2]:
# Directory the kernel was started in; later cells build their `%cd` targets from it.
TOP_LEVEL_DIR = getcwd()
TOP_LEVEL_DIR

'/Users/kepler/Documents/EPFL/MA6/Project/qvlm/eval/script_generators'

In [3]:
# Port of the model server API: passed to the server via --api-port and
# given to the connector that queries it.
API_PORT = 8660
API_PORT

8660

In [4]:
# Model selection used throughout the notebook:
#   model_name -> identifier handed to download-model.py
#   model      -> value of the server's --model flag, also used in generated file names
#   pipeline   -> value of the server's --multimodal-pipeline flag
# To switch variant, replace the active model_name/model pair with one of the
# commented pairs below.
MODEL_PARAMS = dict(
    # LLaVA 1.5-13B, 4 bits - 128g - actorder_True (active)
    model_name="TheBloke/llava-v1.5-13B-GPTQ",
    model="TheBloke_llava-v1.5-13B-GPTQ",

    # LLaVA 1.5-13B, 8 bits
    #model_name="TheBloke/llava-v1.5-13B-GPTQ:gptq-8bit-32g-actorder_True",
    #model="TheBloke_llava-v1.5-13B-GPTQ_gptq-8bit-32g-actorder_True",

    # LLaVA 1.5-13B, 4 bits
    #model_name="TheBloke/llava-v1.5-13B-GPTQ:gptq-4bit-32g-actorder_True",
    #model="TheBloke_llava-v1.5-13B-GPTQ_gptq-4bit-32g-actorder_True",

    pipeline="llava-v1.5-13b",
)

In [5]:
# Argument vector that launches the model server through start_linux.sh.
# Every flag and every value is its own list element, so the list is a valid
# argv for subprocess and still yields the same command line when joined with
# spaces for the job script.
SERVER_CMD = [
    "bash",
    "start_linux.sh",
    "--model", MODEL_PARAMS['model'],
    "--multimodal-pipeline", MODEL_PARAMS['pipeline'],
    "--disable_exllama",
    "--loader", "autogptq",
    "--no_inject_fused_attention", # Fused attention causes an error
    "--api", "--api-port", str(API_PORT),
]

In [6]:
# Contents of the SCITAS (Slurm) job script: resource requests, then the model
# server started in the background from ~/tgw, then the converted evaluation
# script run from the home directory.
# The text starts directly after the opening quotes so that the shebang is the
# very first line of the written file, and the script name is derived from
# MODEL_PARAMS['model'] so it always matches the file generated for that model.
SCITAS_PARAMS = f"""#!/bin/bash -l

#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --time 36:00:00
#SBATCH --cpus-per-task=10
#SBATCH --partition=gpu
#SBATCH --qos=gpu
#SBATCH --gres=gpu:2
#SBATCH --mem 64G

cd ~/tgw
{' '.join(SERVER_CMD)} &
cd ~/
ipython run_vqav2_{MODEL_PARAMS['model']}.py
"""

In [11]:
SCRIPT_NAME = "run_vqav2_" + MODEL_PARAMS['model']
# Generate SCITAS job script
with open(f'{SCRIPT_NAME}.run', 'w+') as job_file:
    job_file.write(SCITAS_PARAMS)

# Generate actual python script
!jupyter nbconvert --to script run_vqav2.ipynb --output {SCRIPT_NAME}.py \
    -TagRemovePreprocessor.enabled=True --TagRemovePreprocessor.remove_cell_tags noconvert

# 1. Loading the questions

In [4]:
%cd {TOP_LEVEL_DIR}/qvlm

questions_path = 'datasets/VQA_V2/v2_Questions_Test_mscoco/v2_OpenEnded_mscoco_test-dev2015_questions.json'
questions_json = json.load(open(questions_path))
questions_df = pd.DataFrame(questions_json['questions'])
questions_df

/home/warringt/qvlm


Unnamed: 0,image_id,question,question_id
0,262144,What credit card company is on the banner in t...,262144005
1,262144,Is the pitcher wearing a hat?,262144003
2,262144,Is the ball flying towards the batter?,262144000
3,524289,Are the horses playing a game?,524289001
4,524289,What is the color of water in the image?,524289002
...,...,...,...
107389,406773,What is the giraffe resting its head on?,406773002
107390,444850,Why is the woman standing next to the truck?,444850001
107391,554649,Is this a police van?,554649022
107392,372707,Who is wearing their hat backwards?,372707007


# 2. Launching the model server

In [5]:
# Work from the tgw directory; the cells below run download-model.py from there.
%cd {TOP_LEVEL_DIR}/tgw

/home/warringt/tgw


In [7]:
import threading
import subprocess
%cd {TOP_LEVEL_DIR}/tgw

def get_model_server_process(params: dict):
  """Return a zero-argument callable that runs the model server in the foreground.

  If the model folder cannot be found, the model is downloaded first with
  download-model.py. All paths are relative to the current working directory.

  Args:
    params: Model description with at least `model` (local folder name) and
      `model_name` (identifier passed to download-model.py).

  Returns:
    A callable that runs SERVER_CMD via subprocess.run and raises
    subprocess.CalledProcessError if the server exits with a non-zero status.

  Raises:
    subprocess.CalledProcessError: if the download step fails.
  """
  # download-model.py reports saving into models/<folder>, so look there too;
  # the bare folder name is still accepted for backward compatibility.
  model_dir_candidates = (params['model'], path.join('models', params['model']))
  if not any(path.exists(candidate) for candidate in model_dir_candidates):
    subprocess.run(['python', 'download-model.py', params['model_name']], check=True)
  # SERVER_CMD is an argv list, so it must not be combined with shell=True:
  # on POSIX that would execute only its first item ("bash") and hand the
  # remaining items to the shell itself instead of to start_linux.sh.
  return lambda: subprocess.run(SERVER_CMD, check=True, close_fds=True)

/home/warringt/tgw


In [8]:
%cd {TOP_LEVEL_DIR}/qvlm

import socket,time
from eval.connectors import Connector

def wait_for_port(connector: "Connector", delay: int = 3, max_retries: int = 1000):
  """Block until a TCP connection to the connector's address succeeds.

  One connection attempt is made immediately, followed by at most
  `max_retries` further attempts spaced `delay` seconds apart.

  Args:
    connector: Object exposing `url` and `port`; together they form the
      (host, port) pair passed to the socket.
    delay: Seconds to sleep between attempts.
    max_retries: Number of retries allowed after the first attempt.

  Raises:
    SystemExit: with code 1 if the port never accepted a connection.
  """
  conn_info = (connector.url, connector.port)

  def _port_is_open() -> bool:
    # A socket whose connect failed is not reliably reusable, so each probe
    # gets a fresh socket that is closed when the probe is done.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
      return probe.connect_ex(conn_info) == 0

  for retry in range(max_retries + 1):
    if _port_is_open():
      print("Port is open!")
      return
    if retry < max_retries:
      print(f"Port is not open, retrying in {delay}s...\t({retry}/{max_retries})")
      time.sleep(delay)

  print(f"Port was not open after n={max_retries} max retries")
  # Raise SystemExit directly rather than calling the interactive `exit`
  # helper, so the behaviour is the same in a notebook and in a script.
  raise SystemExit(1)

/home/warringt/qvlm


In [7]:
#%cd {TOP_LEVEL_DIR}/tgw
# Do not uncomment: launching the server from inside the notebook doesn't work yet.
#threading.Thread(target=get_model_server_process(MODEL_PARAMS), daemon=True).start()

/home/warringt/tgw
Downloading the model to models/TheBloke_llava-v1.5-13B-GPTQ_gptq-8bit-32g-actorder_True


# 3. Computing the responses

In [9]:
# Back to the qvlm directory: the `eval` imports and the relative `datasets/...`
# path used below are resolved from the current working directory.
%cd {TOP_LEVEL_DIR}/qvlm

/home/warringt/qvlm


In [10]:
#from eval.connectors.llamafile import LlamafileConnector
from eval.connectors.textgenerationwebui import TextGenerationWebUIConnector

# Connector aimed at the locally running server on API_PORT. The prompt format
# wraps each question ({prompt}) and appends a short-answer instruction.
connector = TextGenerationWebUIConnector(
    'localhost',
    API_PORT,
    prompt_format='{prompt}\nAnswer with a single word or phrase.',
)

In [12]:
from eval.evaluation.VQAV2 import VQAV2Evaluator

# Only the first 100 questions are handed to the evaluator.
evaluator = VQAV2Evaluator(questions_df.head(100))
evaluator.connect(connector)

# Block until the server's port accepts connections (3 s between attempts, 200 retries).
wait_for_port(connector, delay=3, max_retries = 200)
time.sleep(30) # The server can take some time to keep booting after the port has been opened...
# The path is named after the model folder; presumably get_responses writes the
# model's answers there as JSON lines -- confirm in VQAV2Evaluator.
evaluator.get_responses(f'datasets/VQA_V2/{MODEL_PARAMS["model"]}_responses.jsonl')

Port is not open, retrying in 3s...	(0/200)


KeyboardInterrupt: 