###  Audio Extraction

In [21]:
import os
import cv2  
import torch
import whisper
import easyocr
import subprocess
import numpy as np
from PIL import Image
from tqdm import tqdm
from torchvision import transforms
from transformers import CLIPProcessor, CLIPModel
from transformers import BlipProcessor, BlipForConditionalGeneration
from transformers import T5Tokenizer, T5ForConditionalGeneration


video_path = "sample.mp4" 
audio_path = "audio.wav"  
output_folder = "whisper_out"

In [None]:

# Extract Audio from Video
subprocess.run(["ffmpeg", "-y",  "-i", video_path, "-ac", "1", "-ar", "16000", audio_path])
model = whisper.load_model("medium")


# Transcribe returns (segments + timestamps + confidence)
result = model.transcribe(audio_path, verbose=True)


# apply bg music filter, threshold for Bg music I used is 0.3 confidance 
filtered_lines = []
for segment in result["segments"]:
    if segment.get("avg_logprob", 0) > -1.2:  # so the model is using the avg log Probability, so log (0.3) =-1.2 so that is why i have to use its after conversion 
        start = segment["start"]
        end = segment["end"]
        text = segment["text"].strip()
        timestamp = "[" + str(round(start, 2)) + " - " + str(round(end, 2)) + "]"
        filtered_lines.append(timestamp + " " + text)


output_file = os.path.join(output_folder, "filtered.txt")
with open(output_file, "w", encoding="utf-8") as f:
    for line in filtered_lines:
        f.write(line + "\n")



if not filtered_lines:
    print("No speech detected")
else:
    print(f"{len(filtered_lines)}")


Detecting language using up to the first 30 seconds. Use `--language` to specify the language
Detected language: English
[00:00.000 --> 00:04.680]  You know AI trends are moving very fast.
[00:04.680 --> 00:06.560]  It's almost on a daily basis.
[00:06.560 --> 00:09.620]  But I'm here to help your brand to catch the latest trend.
[00:09.620 --> 00:11.920]  So let me show you AI Capsule.
[00:11.920 --> 00:13.700]  It's an amazing solution.
[00:13.700 --> 00:16.560]  So here, as usual, that's the live view.
[00:16.560 --> 00:17.560]  They will place themselves.
[00:17.560 --> 00:20.880]  Let me take a pose.
[00:20.880 --> 00:27.480]  And now the AI will start to, in the back end, transform my photo into the capsule.
[00:27.480 --> 00:32.520]  And what's interesting about this capsule during your event is that we can put your
[00:32.520 --> 00:38.640]  prospect inside that capsule and inside that capsule will be the product that you are providing
[00:38.640 --> 00:39.640]  in the market.


###  Extract Unique Frames with CLIP

In [11]:
video_path = "./sample.mp4"
frame_output_dir = "keyframes"

clip_model_name = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
clip_model = CLIPModel.from_pretrained(clip_model_name).to("cpu")
clip_processor = CLIPProcessor.from_pretrained(clip_model_name)

video_capture = cv2.VideoCapture(video_path)



frames_per_second = int(video_capture.get(cv2.CAP_PROP_FPS))
print()
print("Frames in 1 sec = ", frames_per_second)
total_frames = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
print()
print("total_frames = ", total_frames)
video_duration = total_frames // frames_per_second  
print()
print("Length sec = ", video_duration)
print()
print()



frame_index = 0
current_second = 0
saved_frames = []
previous_clip_embedding = None


for current_second in range(video_duration):
    frame_position = current_second * frames_per_second
    video_capture.set(cv2.CAP_PROP_POS_FRAMES, frame_position)
    success, frame = video_capture.read()

    if success == False:
        break

    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(rgb_frame)
    model_inputs = clip_processor(images=pil_image, return_tensors="pt")

    model_inputs = model_inputs.to("cpu")
   
    with torch.no_grad():
        image_features = clip_model.get_image_features(**model_inputs)
        image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)


    # If the frame has a lot of similarities as previous then there is no need to save it 
    should_save_frame = False

    if previous_clip_embedding is None:
        should_save_frame = True
    else:
        similarity_score = torch.nn.functional.cosine_similarity(image_features, previous_clip_embedding)
        similarity_score = similarity_score.item()
        if similarity_score < 0.85:
            should_save_frame = True



    if should_save_frame == True:
        output_filename = "frame_" + str(frame_index).zfill(4) + ".jpg"
        output_path = os.path.join(frame_output_dir, output_filename)
        pil_image.save(output_path)
        saved_frames.append(output_path)
        previous_clip_embedding = image_features
        frame_index = frame_index + 1


print(f"{len(saved_frames)} unique keyframes ")


Frames in 1 sec =  23

total_frames =  2213

Length sec =  96


54 unique keyframes 


### Captioning Keyframes 

In [14]:
frame_folder = "keyframes"            
output_folder = "captions"           
output_file = os.path.join(output_folder, "blip_captions.txt") 
device = torch.device("cpu")

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base", use_safetensors=True)
model = model.to(device)

captions = []

for file_name in sorted(os.listdir(frame_folder)):

    if file_name.lower().endswith(".jpg"):
        image_path = os.path.join(frame_folder, file_name)
        image = Image.open(image_path).convert("RGB")
        inputs = processor(images=image, return_tensors="pt")
        inputs = inputs.to(device)

        with torch.no_grad():
            output = model.generate(**inputs, synced_gpus=False)
            caption = processor.decode(output[0], skip_special_tokens=True)

        full_caption = file_name + ": " + caption
        captions.append(full_caption)

        print(file_name, " : " , caption)



with open(output_file, "w", encoding="utf-8") as file:
    for line in captions:
        file.write(line + "\n")



frame_0000.jpg  :  a city with skyscrapers and cars
frame_0001.jpg  :  a man standing in front of a window looking out at the city
frame_0002.jpg  :  a man in a white shirt and sunglasses standing in front of a window
frame_0003.jpg  :  a man in a white shirt and sunglasses standing in front of a window
frame_0004.jpg  :  a man in a white shirt and sunglasses stands in front of a window
frame_0005.jpg  :  a man in a white shirt and sunglasses standing in front of a window
frame_0006.jpg  :  a man in a white shirt and sunglasses stands in front of a large screen
frame_0007.jpg  :  a man in a white shirt is holding a black tablet
frame_0008.jpg  :  a man is holding a tablet computer in his hand
frame_0009.jpg  :  a large screen with a picture of a man in a suit
frame_0010.jpg  :  a person is using a video camera to take pictures
frame_0011.jpg  :  a man is using a large screen to see a video
frame_0012.jpg  :  a man in a white lab coat is looking at a computer screen
frame_0013.jpg  :  a

### Sequential Captioning (5 Frame treated as a short clip) used to capture motions

In [None]:
# IGNORE THIS CODE CELL THIS INDICATES AN UNSOLVED QUERY IN MY MIND

# prompt = f"summarize: Write a concise YouTube video description based on these captions: {combined}. Focus on the product, service, or main concept shown. Describe its features, benefits, and how it’s used in the scene. Include a call to action like subscribing or visiting a website. Ignore details about people’s appearance unless relevant to the product."

#  Window 1  A man in a white shirt and sunglasses stands in front of a large screen. Describe the product, service, or main concept.
#  Window 2  A man in a white shirt is holding a tablet computer in his hand. A person is using a video camera to take pictures a man is using the computer screen a large screen with a picture of the man in the lab. Focus on the product, service, or main concept shown. Describe its features, benefits, and how it’s used in the scene.
#  Window 3  Describe the product, service, or main concept.
#  Window 4  A man in a white shirt and sunglasses is standing in front of a computer. He is holding a cigarette and holding his hands up. Explain the product, service, or main concept.
#  Window 5  A man in a white shirt and sunglasses stands in front of a window. A person holding a cell phone in their hand is pointing at a large screen. Describe its features, benefits, and how it’s used in the scene.
#  Window 6  A man in a white shirt and sunglasses is standing in front of a small house. Focus on the product, service, or main concept shown. Describe its features, benefits, and how it’s used in the scene.
#  Window 7  Describe the product, service, or main concept of the product.
#  Window 8  Describe the product’s features, benefits, and how it’s used.

In [20]:
input_file = "captions/blip_captions.txt"
output_file = "captions/window_summaries.txt"
window_size = 5
device = torch.device("cpu")

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small").to(device)

with open(input_file, "r", encoding="utf-8") as file:
    captions = [line.strip() for line in file.readlines() if line.strip()]

summaries = []

for start_index in range(0, len(captions), window_size):

    window = captions[start_index : start_index + window_size]
    combined_text = ""

    for caption in window:
        parts = caption.split(":", 1)

        if len(parts) == 2:
            text_only = parts[1].strip()
            combined_text += text_only + " "

    prompt = "summarize and don't add word like Summary: in the output: " + combined_text.strip()

    encoded_input = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)
    input_ids = encoded_input["input_ids"]
    attention_mask = encoded_input["attention_mask"]

    with torch.no_grad():
        output_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=100,
            num_beams=6,
            no_repeat_ngram_size=3,
            early_stopping=True,
            synced_gpus=False
        )

    summary_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    window_number = (start_index // window_size) + 1
    full_summary = f"Window {window_number}: {summary_text}"
    summaries.append(full_summary)

    print(full_summary)

with open(output_file, "w", encoding="utf-8") as file:
    for summary in summaries:
        file.write(summary + "\n")



Window 1: looking out at the city a man in a white shirt and sunglasses standing in front of a window
Window 2: a man in a white shirt and sunglasses standing in front of a window
Window 3: a person is using a video camera to take pictures
Window 4: a man points at a screen displaying a video game
Window 5: a man in a white shirt and sunglasses is standing in front of a computer
Window 6: a man in a suit is standing in front of a large window
Window 7: a man in a white shirt is pointing at a large screen
Window 8: A man in a white shirt and sunglasses stands in an office.
Window 9: a small house in the middle of a forest at night a large orange egg with the word ' home ' on it
Window 10: a person is reflected in a mirror a bottle with the word ' i do ' on it
Window 11: a toy in a glass ball with the word ' botme ' a black background with a crescent in the middle


### OCR to get the text from frames

In [22]:

frame_folder = "keyframes"                  
output_folder = "ocr"                    
output_file = os.path.join(output_folder, "all_ocr.txt") 

reader = easyocr.Reader(['en'], gpu=False)
ocr_results = []

for file_name in sorted(os.listdir(frame_folder)):
    if file_name.lower().endswith(".jpg"):
        
        image_path = os.path.join(frame_folder, file_name)
        text_list = reader.readtext(image_path, detail=0) 
        
        # data processing like I am handling white sapaces etc  
        cleaned_texts = []
        for text in text_list:
            if text.strip():
                cleaned_texts.append(text.strip())

        if cleaned_texts:
            final_line = " | ".join(cleaned_texts)
            ocr_results.append(f"{file_name}: {final_line}")
            print(f"{file_name} : {final_line}")

        else:
            print(f"{file_name} : No text ")


with open(output_file, "w", encoding="utf-8") as file:
    for result_line in ocr_results:
        file.write(result_line + "\n")

Using CPU. Note: This module is much faster with a GPU.


frame_0000.jpg : No text 
frame_0001.jpg : #XoxO
frame_0002.jpg : #XoxQ
frame_0003.jpg : No text 
frame_0004.jpg : No text 
frame_0005.jpg : EXOXQ
frame_0006.jpg : UOKO | iboothme
frame_0007.jpg : boathne | Etee
frame_0008.jpg : iboothme
frame_0009.jpg : No text 
frame_0010.jpg : No text 
frame_0011.jpg : No text 
frame_0012.jpg : 3
frame_0013.jpg : No text 
frame_0014.jpg : Sed | ibootme
frame_0015.jpg : DASP | iboothme | FND
frame_0016.jpg : El | SeNd | Iothne
frame_0017.jpg : ibooothne
frame_0018.jpg : No text 
frame_0019.jpg : No text 
frame_0020.jpg : No text 
frame_0021.jpg : No text 
frame_0022.jpg : No text 
frame_0023.jpg : Fiboothme
frame_0024.jpg : iboothme
frame_0025.jpg : ix
frame_0026.jpg : No text 
frame_0027.jpg : No text 
frame_0028.jpg : No text 
frame_0029.jpg : btnn
frame_0030.jpg : iboothme
frame_0031.jpg : iboothme | Avat | Iboothma | Conntm
frame_0032.jpg : Gicims | iboothme
frame_0033.jpg : ibothne | Dar | contrm
frame_0034.jpg : No text 
frame_0035.jpg : No tex

### Merging all 

In [23]:
transcript_path = "whisper_out/filtered.txt"
captions_path = "captions/blip_captions.txt"
ocr_path = "ocr/all_ocr.txt"
summaries_path = "captions/window_summaries.txt"
output_path = "facts/facts.txt"
output = open(output_path, "w", encoding="utf-8")


# transcript 
output.write("\nTranscript\n")
file1 = open(transcript_path, "r", encoding="utf-8")

for line in file1:
    line = line.strip()
    if line:
        output.write("- " + line + "\n")

file1.close()


# image captions 
output.write("\nImage Captions\n")
file2 = open(captions_path, "r", encoding="utf-8")

for line in file2:
    line = line.strip()
    if line:
        output.write("- " + line + "\n")

file2.close()



# OCR text 
output.write("\nOCR Text\n")
file3 = open(ocr_path, "r", encoding="utf-8")

for line in file3:
    line = line.strip()
    if line:
        output.write("- " + line + "\n")

file3.close()



# Clips summary
output.write("\nFrame Summaries\n")
file4 = open(summaries_path, "r", encoding="utf-8")

for line in file4:
    line = line.strip()
    if line:
        output.write("- " + line + "\n")
file4.close()

### Multi-layer Prompting System
- 3 descriptions 
- Rerank all desscriptions using CLIP model(on visual similarity to keyframes) and select only best one 
- Add contact info + hashtags 


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, CLIPProcessor, CLIPModel
facts_file       = "facts/facts.txt"
keyframes_folder = "keyframes"
output_file      = "final_description.txt"
device           = torch.device("cpu")
# LLM Model
model_name = "google/flan-t5-large"
tokenizer  = T5Tokenizer.from_pretrained(model_name)
t5_model   = T5ForConditionalGeneration.from_pretrained(model_name).to(device)
t5_model.eval()

# Clip Model
clip_name      = "openai/clip-vit-base-patch32"
clip_model     = CLIPModel.from_pretrained(clip_name).to(device)
clip_processor = CLIPProcessor.from_pretrained(clip_name)
clip_model.eval()

with open(facts_file, "r", encoding="utf-8") as f:
    facts_text = f.read().strip()

sample_examples = """
Sample 1:
#DigitalArt #BrandActivation #PhotoBoothTechnology
When a leading paint manufacturer challenged us to create an unforgettable brand experience, we delivered with the AI Mural Painting Photo Experience…
Key Highlights
• AI-enhanced photography with vivid painting effects
• Interactive color selection tailored to brand palettes
• Live multi-screen mural display with rotating user content
Looking to transform your next event?

Sample 2:
Client: Huda Beauty  
With the Claw Machine by iboothme, you can:
• Create interactive experiences with branded quizzes and games
• Distribute products intelligently while capturing valuable data
Ready to take your brand activation to the next level?

Sample 3:
#AICapsule #capsuletrend #AIPhotoBooth
Meet the AI Capsule — your new secret weapon for events…
How it works:
• Guests register and enter details
• AI transforms images into a branded capsule, live
Perfect for product launches, retail activations, experiential events.
"""


candidates = []

for i in range(3):
    prompt = (
        "Write a concise (around 150 words) marketing description in the style of these examples:\n"
        + sample_examples
        + "\nFACTS:\n"
        + facts_text
        + "\n\nYOUR DESCRIPTION:"
    )
    enc = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(device)

    with torch.no_grad():
        out_ids = t5_model.generate(
            input_ids=enc["input_ids"],
            attention_mask=enc["attention_mask"],
            max_new_tokens=100,
            num_beams=4,
            no_repeat_ngram_size=3,
            early_stopping=True
        )

    text = tokenizer.decode(out_ids[0], skip_special_tokens=True).strip()
    candidates.append(text)
    print(f"Candidate {i+1}:\n{text}\n")




# Slecting the best

images = []

for fn in sorted(os.listdir(keyframes_folder)):

    if fn.lower().endswith((".jpg", ".png")):
        images.append(Image.open(os.path.join(keyframes_folder, fn)).convert("RGB"))

    if len(images) == 3:
        break

scores = []

if images:

    for idx, desc in enumerate(candidates):
        clip_in = clip_processor(
            text=[desc],
            images=images,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=77
        ).to(device)
       
        with torch.no_grad():
            score = clip_model(**clip_in).logits_per_text.mean().item()
        scores.append(score)
    best_index = scores.index(max(scores))

else:
    print("No keyframes found, defaulting to candidate 1")
    best_index = 0

print(f"\nSelected candidate: {best_index+1}\n")
final_description = candidates[best_index]



# hashtags
print("Generating hashtags...\n")
tag_prompt = "Extract 10-15 hashtags from this text:\n" + final_description
enc = tokenizer(tag_prompt, return_tensors="pt", truncation=True, max_length=512).to(device)
with torch.no_grad():
    tag_ids = t5_model.generate(
        input_ids=enc["input_ids"],
        attention_mask=enc["attention_mask"],
        max_new_tokens=50,
        do_sample=False
    )
raw_tags = tokenizer.decode(tag_ids[0], skip_special_tokens=True)
all_tags = [t for t in raw_tags.replace(",", " ").split() if t.startswith("#")][:15]
print(f"Parsed hashtags: {all_tags}\n")
if all_tags:
    final_description += "\n\n" + " ".join(all_tags)



# contact info 
if "info@iboothme.com" not in final_description:
    final_description += (
        "\n\nContact: info@iboothme.com | Phone: +971 4 448 8563 | https://www.iboothme.com"
    )

with open(output_file, "w", encoding="utf-8") as f:
    f.write(final_description)
print("Final description saved :", output_file)
print()
print("Final description")
print(final_description)


Candidate 1:
AI-enhanced photography with vivid painting effects. Live multi-screen mural display with rotating user content. Interactive quizzes and games.

Candidate 2:
AI-enhanced photography with vivid painting effects. Live multi-screen mural display with rotating user content. Interactive quizzes and games.

Candidate 3:
AI-enhanced photography with vivid painting effects. Live multi-screen mural display with rotating user content. Interactive quizzes and games.

Score for candidate 1: 21.1954
Score for candidate 2: 21.1954
Score for candidate 3: 21.1954

Selected candidate: 1

Generating hashtags...

Parsed hashtags: ['#iphone', '#iphone', '#iphone', '#iphone', '#iphone', '#iphone', '#iphone', '#iphone', '#iphone', '#iphone', '#iphone', '#iphone', '#iphone', '#iphone', '#iphone']

Appended contact info.

Final description saved to: final_description.txt

===== FINAL DESCRIPTION =====

AI-enhanced photography with vivid painting effects. Live multi-screen mural display with rotat

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

input_file  = "final_description.txt"
output_file = "expanded_description.txt"
device      = torch.device("cpu")

model_name = "google/flan-t5-large"
tokenizer  = T5Tokenizer.from_pretrained(model_name)
model      = T5ForConditionalGeneration.from_pretrained(model_name).to(device)
model.eval()

with open(input_file, "r", encoding="utf-8") as f:
    text = f.read().strip()

sentences = [s.strip() for s in text.split('.') if s.strip()]
main_sentences = sentences[:3]

prompt = (
    "You are a senior marketing copywriter.\n"
    "Below are three main feature sentences. For each one, write 2–3 sentences explaining:\n"
    "- Why this feature matters\n"
    "- How it impacts marketing and brand image\n"
    "- What it does for the target audience\n\n"
)
for i, sent in enumerate(main_sentences, 1):
    prompt += f"{i}. {sent}.\n"

prompt += "\nCompose these expansions into a single flowing marketing paragraph, no hashtags:\n\n"

enc = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(device)
with torch.no_grad():
    out = model.generate(
        input_ids=enc["input_ids"],
        attention_mask=enc["attention_mask"],
        min_length=150,
        max_new_tokens=200,
        num_beams=4,
        no_repeat_ngram_size=3,
        early_stopping=True
    )
expanded = tokenizer.decode(out[0], skip_special_tokens=True).strip()

expanded += "\n\nContact:\n- info@iboothme.com\n- +971 4 448 8563\n- https://www.iboothme.com"

with open(output_file, "w", encoding="utf-8") as f:
    f.write(expanded)

print("Expanded description saved to:", output_file)
print("\nEXPANDED DESCRIPTION \n")
print(expanded)


Expanded description saved to: expanded_description.txt

===== EXPANDED DESCRIPTION =====

1. AI-enhanced photography with vivid painting effects. 2. Live multi-screen mural display with rotating user content. 3. Interactive quizzes and games. Each feature has a different purpose, but they all have one thing in common: They all have a big impact on marketing and brand image. The first feature focuses on AI. The second feature is a live mural display. The third feature is an interactive quiz and game. The final feature is the last one, but it's the most important one. It's about how it affects marketing, brand image, and what it does for the target audience. For example, the first feature is about how AI can be used to create a mural.

Contact:
- info@iboothme.com
- +971 4 448 8563
- https://www.iboothme.com


### Fix to hastag bug

In [44]:
import re
from collections import Counter

with open("expanded_description.txt", "r", encoding="utf-8") as f:
    text = f.read()

stopwords = {
    "about","after","again","against","among","around","because","before","being",
    "below","between","both","could","during","each","first","found","from","have",
    "having","however","into","other","over","through","under","while","which","your",
    "their","there","where","with","this","that","these","those","would","should"
}

words = re.findall(r"\b[a-zA-Z]{5,}\b", text.lower())

candidates = [w for w in words if w not in stopwords]
most_common = Counter(candidates).most_common(15)
hashtags = [f"#{word}" for word, count in most_common]
final_text = text.strip() + "\n\n" + " ".join(hashtags)

with open("expanded_with_hashtags.txt", "w", encoding="utf-8") as f:
    f.write(final_text)

print("Generated hashtags:", hashtags)
print("KINDLYY SEE expanded_with_Hastags.txt for my Literally the final output")


Generated hashtags: ['#feature', '#mural', '#display', '#interactive', '#marketing', '#brand', '#image', '#iboothme', '#enhanced', '#photography', '#vivid', '#painting', '#effects', '#multi', '#screen']
KINDLYY SEE expanded_with_Hastags.txt for my Literally the final output
