<a href="https://colab.research.google.com/github/Yogesh914/cv-model-exploration/blob/main/in_context_learning_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# In-Context Learning With Gemma-7b 🦋

## Initial setup

In [None]:
!pip install transformers pydub accelerate bitsandbytes hf_transfer

In [None]:
!pip install -U transformers

In [None]:
import torch
import transformers
from IPython.display import Markdown, display
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from google.colab import userdata
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from moviepy.editor import VideoFileClip
import numpy as np
import os
from pydub import AudioSegment
import pandas as pd

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

## Transcription Using Whisper v3 (large)

In [None]:
# Half precision on GPU, full precision on CPU.
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model_id = "openai/whisper-large-v3"

# Load the Whisper checkpoint and move it to the selected device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, use_safetensors=True, torch_dtype=torch_dtype, low_cpu_mem_usage=True
)
model.to(device)

# The processor bundles the tokenizer and the audio feature extractor.
processor = AutoProcessor.from_pretrained(model_id)

# Long-form speech recognition: audio is cut into 30 s chunks that are
# transcribed in batches of 16.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    # Whisper's decoder holds at most 448 positions, and the start-of-transcript
    # prompt tokens count against that limit. The previous value of 500 exceeds
    # it and is rejected by recent transformers releases, so stay below the cap.
    max_new_tokens=440,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    device=device,
    torch_dtype=torch_dtype
)

In [None]:
def process_video(video_file):
    """Transcribe the audio track of an mp4 file with the Whisper pipeline.

    The audio is decoded with pydub, resampled to 16 kHz, down-mixed to mono,
    scaled to float32 in [-1, 1) and passed to the module-level `pipe`.

    Args:
        video_file: Path to the mp4 file to transcribe.

    Returns:
        The transcribed text (the "text" entry of the pipeline result).
    """
    # The audio is decoded directly from the container. The previous
    # VideoFileClip handle was never used and never closed, leaking a reader
    # per processed video.
    audio_segment = AudioSegment.from_file(video_file, format="mp4")
    audio_segment = audio_segment.set_frame_rate(16000)
    samples = np.array(audio_segment.get_array_of_samples())

    # Samples are interleaved per channel; average across channels for any
    # multi-channel layout, not only stereo.
    n_channels = audio_segment.channels
    if n_channels > 1:
        samples = samples.reshape((-1, n_channels)).mean(axis=1)

    # Scale by the full-scale value of the actual sample width
    # (2**15 for the usual 16-bit audio) instead of assuming 16 bits.
    full_scale = np.float32(1 << (8 * audio_segment.sample_width - 1))
    samples = samples.astype(np.float32) / full_scale

    result = pipe(samples)
    return result["text"]

In [None]:
data_folder = '/content/drive/MyDrive/Colab Notebooks/data/beta_vids'

# Directory entries in sorted name order; anything that is not an .mp4 is skipped.
video_files = sorted(os.listdir(data_folder))
captions = [
    process_video(os.path.join(data_folder, name))
    for name in video_files
    if name.endswith('.mp4')
]

# One transcript per row, in the same order as the sorted file names.
df_captions = pd.DataFrame({'Captions': captions})
df_captions

## Curating Dataset

In [None]:
ema_survey = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/data/ema.csv")

# Rating columns used in the prompts, plus the trigger index.
filtered_columns = ['ema_aware', 'ema_support', 'ema_insight', 'ema_fulfilled', 'ema_hopeless', 'ema_anxious', 'Trigger.Index']
df_filtered = ema_survey[filtered_columns]

# Keep one participant's rows, drop rows with any missing value in the kept
# columns, and renumber from 0.
# NOTE(review): the address compared against is an empty string; with pandas'
# default CSV parsing empty fields are read as NaN, so this may select no rows
# -- confirm the intended address.
is_target_user = ema_survey['User.Email'] == ''
df_final = df_filtered[is_target_user].dropna().reset_index(drop=True)

In [None]:
merged_csv_path = "/content/drive/MyDrive/Colab Notebooks/data/merged_captions.csv"

# Column-wise concatenation: rows are paired by index label, so caption i is
# placed next to survey row i.
merge = pd.concat([df_captions, df_final], axis=1)
merge.to_csv(merged_csv_path, index=False)
merge

## Creating Custom Prompt

In [None]:
# `df` is a second name for the merged captions + ratings frame (no copy is made).
df = merge

def format_prompt(row):
    """Render one merged row as a few-shot example block.

    Args:
        row: Mapping-like row providing 'Captions' and the six 'ema_*' ratings.

    Returns:
        A string with the caption on the first line followed by one
        "question: rating" line per survey item; ratings are cast to int.
    """
    # (question text, rating column) pairs, in the order they appear in the prompt.
    rated_questions = (
        ("Had you noticed you were feeling this way before we asked?", 'ema_aware'),
        ("Did you feel you were supported by others?", 'ema_support'),
        ("Did you recognize how your feelings were influencing your outlook on things?", 'ema_insight'),
        ("How fulfilled did you feel?", 'ema_fulfilled'),
        ("How hopeless did you feel?", 'ema_hopeless'),
        ("How anxious did you feel?", 'ema_anxious'),
    )

    lines = [f"Caption: {row['Captions']}"]
    for question, column in rated_questions:
        lines.append(f"{question}: {int(row[column])}")
    return "\n".join(lines)

# Few-shot examples: rows 3-5 of the merged frame, each rendered by
# format_prompt and separated by a blank line.
example_rows = df.iloc[3:6]
prompt = "\n\n".join(example_rows.apply(format_prompt, axis=1))

# Instruction sent to the model (typo "the your answer" fixed).
instructions = "\n\nBased on the previous entries, predict the ratings for the following caption on a scale of 1 to 5 and make sure to give only your answer in json format and nothing else:"

# The query is the caption of the last row; its ratings are left blank for
# the model to fill in.
last_caption = df.iloc[-1]['Captions']

prompt += f"{instructions}\n\nCaption: {last_caption}\n" \
          "Had you noticed you were feeling this way before we asked?: \n" \
          "Did you feel you were supported by others?: \n" \
          "Did you recognize how your feelings were influencing your outlook on things?: \n" \
          "How fulfilled did you feel?: \n" \
          "How hopeless did you feel?: \n" \
          "How anxious did you feel?: "

print(prompt)

## Prompting Gemma-7b-it

In [None]:
# Make `device` the default device for tensors created from here on.
torch.set_default_device(device)

# Ask the Hugging Face Hub client to download through hf_transfer.
# NOTE(review): transformers is already imported at this point; if the hub
# client reads this flag at import time it has no effect here -- confirm.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

In [None]:
# 4-bit bitsandbytes quantization for the Gemma weights.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

# Hugging Face access token stored in the Colab secret named 'hgemma'.
hf_token = userdata.get('hgemma')

tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b-it", token=hf_token)

# The quantization config has to be passed to from_pretrained to take effect;
# previously it was created but never used, so the model loaded unquantized.
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-7b-it",
    quantization_config=quantization_config,
    torch_dtype="auto",
    device_map="auto",
    token=hf_token,
)

In [None]:
# Single-turn conversation holding the few-shot prompt as the user message.
chat = [{"role": "user", "content": prompt}]

# tokenize=False returns the formatted text instead of token ids, so the cell
# output shows exactly what the chat template produces.
# Note: the name `input` shadows the Python builtin of the same name.
input = tokenizer.apply_chat_template(
    chat,
    tokenize=False,
    add_generation_prompt=True,
)
input

In [None]:
# Wrap the prompt in the model's chat template: the instruction-tuned
# checkpoint expects the turn markers, which the raw prompt encoded before
# did not contain.
chat_text = tokenizer.apply_chat_template(
    [{"role": "user", "content": prompt}],
    tokenize=False,
    add_generation_prompt=True,
)

# add_special_tokens=False: the chat template is expected to emit the BOS
# token itself, so adding it again here would duplicate it.
encoded = tokenizer(chat_text, add_special_tokens=False, return_tensors="pt")
inputs = encoded["input_ids"]

# Use the device the model was actually placed on instead of a hard-coded
# "cuda", and pass the attention mask explicitly.
outputs = model.generate(input_ids=inputs.to(model.device),
                         attention_mask=encoded["attention_mask"].to(model.device),
                         max_new_tokens=512)

# generate() returns prompt + completion; keep only the newly generated tokens
# so `text` holds just the model's answer.
prompt_length = inputs.shape[-1]
text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True, clean_up_tokenization_spaces=True)
display(Markdown(text))