<a href="https://colab.research.google.com/github/aaronjyang/transformers-testing/blob/main/LLaVa-loftus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load model and necessary imports

In [2]:
# Pip installs
!pip install --upgrade -q accelerate bitsandbytes
!pip install git+https://github.com/huggingface/transformers.git
!pip install -q av

Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-cvjw66ot
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-cvjw66ot
  Resolved https://github.com/huggingface/transformers.git to commit 94ae1ba5b55e79ba766582de8a199d8ccf24a021
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [3]:
# Imports
from transformers import BitsAndBytesConfig, LlavaNextVideoForConditionalGeneration, LlavaNextVideoProcessor
import torch
from PIL import Image
import numpy as np
import av
import numpy as np
from huggingface_hub import hf_hub_download
from matplotlib import pyplot as plt
from matplotlib import animation
from IPython.display import HTML

In [4]:
# Load Model

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

processor = LlavaNextVideoProcessor.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf")
model = LlavaNextVideoForConditionalGeneration.from_pretrained(
    "llava-hf/LLaVA-NeXT-Video-7B-hf",
    quantization_config=quantization_config,
    device_map='auto'
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/741 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.50k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/209 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/70.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

# Load videos


In [5]:
def read_video_pyav(path, indices):
    '''
    Decode the video with PyAV decoder.

    Args:
        path (str): Path to the video file.
        indices (List[int]): List of frame indices to decode.

    Returns:
        np.ndarray: np array of decoded frames of shape (num_frames, height, width, 3).
    '''
    container = av.open(path)
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
            frames.append(frame)
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])

In [6]:
qualities = ["1080", "720", "480", "360", "240", "144"]
videos = {}
indices = np.arange(0, 51)
for quality in qualities:
  video = read_video_pyav("/content/quality_degradation/" + quality + "p10fps.mp4", indices)
  videos[quality] = video

# Test videos (optional)


In [7]:
# np array with shape (frames, height, width, channels)
video = videos["720"]

fig = plt.figure()
im = plt.imshow(video[0,:,:,:])

plt.close() # this is required to not display the generated image

def init():
    im.set_data(video[0,:,:,:])

def animate(i):
    im.set_data(video[i,:,:,:])
    return im

anim = animation.FuncAnimation(fig, animate, init_func=init, frames=video.shape[0],
                               interval=100)
HTML(anim.to_html5_video())

# Misc. Functions

In [8]:
def generate_response(conversation, video, max_new_tokens = 100, do_sample= True, top_p = 0.9):
  prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
  inputs = processor([prompt], videos=video, padding=True, return_tensors="pt").to(model.device)
  generate_kwargs = {"max_new_tokens": max_new_tokens, "do_sample": do_sample, "top_p": top_p}
  output = model.generate(**inputs, **generate_kwargs)
  generated_text = processor.batch_decode(output, skip_special_tokens=True)
  return generated_text[0][len(prompt) - 7 + 1:]

In [9]:
def ask_question(conversation, video, question):
  next_question = {
      "role": "user",
          "content": [
              {"type": "text", "text": question},
          ],
  }
  conversation.append(next_question)
  response = generate_response(conversation, video)
  assistant_response = {
      "role": "assistant",
      "content": [
          {"type": "text", "text": response},
      ],
  }
  conversation.append(assistant_response)
  return conversation, response

In [10]:
import random
def ask_math_questions(conversation, numQuestions, video):
  for i in range(numQuestions):
    num1 = random.randint(1, 10)
    num2 = random.randint(1, 10)
    next_question = {
      "role": "user",
          "content": [
              {"type": "text", "text": "What is " + str(num1) + " + " + str(num2) +
               "? Please answer in the form \"" +  str(num1) + " + " + str(num2) + " = sum\" with no more and no less words."},
          ],
    }
    conversation.append(next_question)

    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    print(len(processor.tokenizer.tokenize(prompt)))
    inputs = processor([prompt], padding=True, return_tensors="pt").to(model.device)
    generate_kwargs = {"max_new_tokens": 100, "do_sample": True, "top_p": 0.90}
    output = model.generate(**inputs, **generate_kwargs)
    generated_text = processor.batch_decode(output, skip_special_tokens=True)
    response =  generated_text[0][len(prompt) + 1:]
    assistant_response = {
      "role": "assistant",
      "content": [
          {"type": "text", "text": response},
      ],
    }
    conversation.append(assistant_response)
  return conversation

# Experiment

In [12]:
# Questions
questions = [ "Give an account of the accident you have just seen in the provided video.",
              "Does the video happen at day or at night?",
              "What is the weather in the video?",
              "What color were the vehicles involved in the accident?",
              "Were any pedestrians involved in the accident?",
              "Given the following ranges, please pick the letter option that corresponds best to how fast the cars were going when they collided: " +
                "A) 0 - 10 kilometers per hour\n" +
                "B) 10 - 20 kilometers per hour\n" +
                "C) 20 - 30 kilometers per hour\n" +
                "D) 30 - 40 kilometers per hour\n"]

In [11]:
# Questions
questions = [ "Give an account of the accident you have just seen in the provided video.",
              "Given the following ranges, please pick the letter option that corresponds best to how fast the cars were going when they collided: " +
                "A) 0 - 10 kilometers per hour\n" +
                "B) 10 - 20 kilometers per hour\n" +
                "C) 20 - 30 kilometers per hour\n" +
                "D) 30 - 40 kilometers per hour\n"]

In [12]:
def run_experiment(video, questions, numTrials = 1, numMathQuestions = 0):
  outputs = {}
  for i in range(numTrials):
    conversation = []
    conversation = ask_math_questions(conversation, numMathQuestions, video)
    conversation.append(
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Does a car accident occur in the provided video?"},
                {"type": "video"},
                ],
        }
    )
    generated_response = generate_response(conversation, video)
    if "Does a car accident occur in the provided video?" in outputs:
      outputs["Does a car accident occur in the provided video?"].append(generated_response)
    else:
      outputs["Does a car accident occur in the provided video?"] = [generated_response]
    assistant_response = {
        "role": "assistant",
        "content": [
            {"type": "text", "text": generated_response},
        ],
    }
    conversation.append(assistant_response)
    for question in questions:
      conversation, response = ask_question(conversation, video, question);
      if question in outputs:
        outputs[question].append(response)
      else:
        outputs[question] = [response]
  return outputs




In [14]:
o = run_experiment(videos["720"], questions, 1, 0)

In [15]:
o

{'Does a car accident occur in the provided video?': ['Yes, there is an accident scene visible in the video. The video appears to have been taken from inside a vehicle, possibly a car, facing the windshield, as there are dashcam recordings shown on the screen. The video shows a traffic light on a pole, and several cars, including a red car and a silver car, are involved in the accident. The video also shows that there were at least two individuals visible in the car from which the recording was taken, with at least'],
 'Give an account of the accident you have just seen in the provided video.': ['In the video, the camera is mounted in the car and shows a scene where a red car is stopped in traffic, and it is being hit by another car that appears to be a silver vehicle. The video also shows the immediate aftermath of the collision, with one individual standing on the side of the road with their back towards the vehicle and the other seemingly uninjured sitting in the car. The second per

In [16]:
import pandas as pd
pd.DataFrame(o)

Unnamed: 0,Does a car accident occur in the provided video?,Give an account of the accident you have just seen in the provided video.,"Given the following ranges, please pick the letter option that corresponds best to how fast the cars were going when they collided: A) 0 - 10 kilometers per hour\nB) 10 - 20 kilometers per hour\nC) 20 - 30 kilometers per hour\nD) 30 - 40 kilometers per hour\n"
0,"Yes, there is an accident scene visible in the...","In the video, the camera is mounted in the car...",A


In [None]:
for question in questions:
  next_question = {
      "role": "user",
          "content": [
              {"type": "text", "text": question},
          ],
  }
  conversation.append(next_question)
  response = generate_response(conversation, videos["720"])
  outputs[question] = response

  assistant_response = {
      "role": "assistant",
      "content": [
          {"type": "text", "text": response},
      ],
  }
  conversation.append(assistant_response)


In [None]:
conversation

[{'role': 'user',
  'content': [{'type': 'text',
    'text': 'Does a car accident occur in the provided video?'},
   {'type': 'video'}]},
 {'role': 'assistant',
  'content': [{'type': 'text',
    'text': 'Yes, a car accident does occur in the video, as a car is seen colliding with another vehicle while passing it.'}]},
 {'role': 'user',
  'content': [{'type': 'text',
    'text': 'Give an account of the accident you have just seen in the provided video.'}]},
 {'role': 'assistant',
  'content': [{'type': 'text',
    'text': "In the video, a car is seen accelerating and passing another vehicle. However, it doesn't pass by safely and collides with the second vehicle, resulting in an accident. The video clearly shows the collision and the damages sustained by both the vehicles. It's important to note that accidents can occur in any situation, and the importance of obeying traffic rules and being vigilant is always emphasized."}]},
 {'role': 'user',
  'content': [{'type': 'text',
    'text':

In [None]:
outputs

{'Does a car accident occur in the provided video?': 'Yes, a car accident does occur in the video, as a car is seen colliding with another vehicle while passing it.',
 'Give an account of the accident you have just seen in the provided video.': "In the video, a car is seen accelerating and passing another vehicle. However, it doesn't pass by safely and collides with the second vehicle, resulting in an accident. The video clearly shows the collision and the damages sustained by both the vehicles. It's important to note that accidents can occur in any situation, and the importance of obeying traffic rules and being vigilant is always emphasized.",
 'Does the video happen at day or at night?': 'The video appears to take place during the day, as the lighting is bright and clear, making it possible to see the details of the accident and the surrounding environment.',
 'What is the weather in the video?': 'The weather in the video appears to be clear and sunny, indicating that the video migh

In [None]:
conversation = [{
      "role": "user",
          "content": [
              {"type": "text", "text": "What is 2 + 1?"},
          ],
  }]

In [None]:
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
inputs = processor([prompt], padding=True, return_tensors="pt").to(model.device)
generate_kwargs = {"max_new_tokens": 100, "do_sample": True, "top_p": 0.90}
output = model.generate(**inputs, **generate_kwargs)
generated_text = processor.batch_decode(output, skip_special_tokens=True)

In [None]:
generated_text

['USER: What is 2 + 1? ASSISTANT: The sum of 2 and 1 is 3. округу']

In [None]:
conversation = ask_math_questions(conversation, 20)

TypeError: ask_math_questions() missing 1 required positional argument: 'video'

In [17]:
!pip install numba

from numba import cuda
device = cuda.get_current_device()
device.reset()
del model

