Detecting Advertisement from image

In [13]:
!pip3 install -q -U transformers
!pip3 install -q -U torch
!pip3 install -q -U huggingface_hub
!pip3 install -q -U dotenv

In [14]:
from huggingface_hub import login
import os
from dotenv import load_dotenv

hf_token = os.getenv("HF_TOKEN")
login(token=hf_token)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [17]:
from transformers import pipeline
import torch

device = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using device: {device}")

model_id = "google/gemma-3-4b-it"

pipe = pipeline(
    "image-text-to-text",
    model=model_id,
    device=device,          
    torch_dtype=torch.bfloat16
)

Using device: mps


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use mps


In [18]:
messages = [
    {
        "role": "system",
        "content": [
            {"type": "text", "text": (
                "You are a potential customer looking at Google review images. "
                # "Always respond in two clearly labeled sections:\n"
                # "Answer: <short, factual yes/no/N/A>\n"
                # "Rationale: <explain based on visible features>"
            )}
        ]
    },
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://lh5.googleusercontent.com/p/AF1QipMBzN4BJV9YCObcw_ifNzFPm-u38hO3oimOA8Fb=w150-h150-k-no-p"},
            {"type": "text", "text": (
                "Describe the image in detail."
            )}
        ]
    }
]

output = pipe(text=messages, max_new_tokens=128)
print(output[0]["generated_text"][-1]["content"])

Okay, here's a detailed description of the image I’m seeing:

**Overall Impression:** The image appears to be an interior shot of what looks like a reception or waiting area, possibly for a medical or wellness practice. It has a modern and somewhat minimalist aesthetic.

**Key Elements:**

*   **Ceiling:** The ceiling is a neutral gray, with a slightly textured or popcorn finish. There are recessed lighting fixtures evenly spaced across the ceiling.
*   **Lighting:** There are four pendant lights hanging down from the ceiling. They have a distinctive design – a dark base with a light wooden or painted top.


Image Analysis

Created two columns via prompt engineering:
- Full description of the image
- Check if the image looks like an advertisement

In [None]:
# Take first 3 reviews
pics_test = pics.head(3).copy()

for col in ["Image_Description", "Is_Advertisement"]:
    if col not in pics.columns:
        pics[col] = ""

advertisement_examples = [
    "Billboard with product name and price",
    "Social media post promoting a sale",
    "Banner showing a company logo with a slogan",
    "Flyer with a discount coupon"
]

for idx, row in pics_test.iterrows():
    pics_list = ast.literal_eval(row['pics_collapsed'])

    answers = []
    descriptions = []
    
    for image_url in pics_list:
        messages = [
            {
                "role": "system",
                "content": [
                    {"type": "text", "text": (
                        "You are an AI assistant that classifies images as advertisements. "
                        "Always respond in two clearly labeled sections:\n"
                        "Answer: <Yes/No/N/A>\n"
                        "Description: Two sentences to describe what’s happening in the photo, "
                        "and hence the rationale.\n"
                        f"Examples of advertisements include: {', '.join(advertisement_examples)}"
                    )}
                ]
            },
            {
                "role": "user",
                "content": [
                    {"type": "image", "url": image_url},
                    {"type": "text", "text": "Is this image an advertisement? If unclear, say Answer: N/A."}
                ]
            }
        ]

        output = pipe(text=messages, max_new_tokens=128)
        response = output[0]["generated_text"][-1]["content"]

        # Default values
        answer, rationale = "N/A", ""

        for line in response.splitlines():
            if line.startswith("Answer:"):
                answer = line.split(":", 1)[1].strip()
            elif line.startswith("Description:"):
                rationale = line.split(":", 1)[1].strip()

        answers.append(answer)
        descriptions.append(rationale)

    # Final decision rule
    if "Yes" in answers:
        final_answer = "Yes"
    elif "No" in answers:
        final_answer = "No"
    else:
        final_answer = "N/A"

    pics.at[idx, 'Is_Advertisement'] = final_answer
    pics.at[idx, 'Image_Description'] = " | ".join(descriptions)

print(pics[['user_id', 'Image_Description', 'Is_Advertisement']].head(3))

                 user_id                                   Full_Description  \
0  101856865551768948430  The image shows a close-up of a rusty motorcyc...   
1  107787438275893909028  The image shows a family holding a flag of Col...   
2  108564695687320799964  The image shows a person skiing in a snowy lan...   

  Is_Advertisement  
0               No  
1              Yes  
2               No  


Create column for key words for the image:
- setting
- objects
- business type
- activities
- text

In [62]:
# Use .loc to avoid SettingWithCopyWarning
pics_test.loc[:, 'Key_Words'] = ""

# Prompt template
PROMPT = """You are an image analyst. 
Describe this image in detail in the following structured format only:

- Setting: <indoor/outdoor, type of place>
- Objects: <things visible>
- Business type: <restaurant, shop, etc., best guess or N/A>
- Activities: <what is happening>
- Text: <any text visible in the image or N/A>

Do not include any extra commentary, questions, or greetings.  
If unsure about a category, write 'N/A'.
Here is the image: <image>
"""

# Loop through first 3 rows
for idx, row in pics_test.iterrows():
    pics_list = ast.literal_eval(row['pics_collapsed'])  # convert string to list
    descriptions = []

    for i, image_url in enumerate(pics_list, start=1):
        messages = [
            {
                "role": "system",
                "content": [
                    {"type": "text", "text": PROMPT}
                ]
            },
            {
                "role": "user",
                "content": [
                    {"type": "image", "url": image_url},
                    {"type": "text", "text": "Describe the image in the structured format above."}
                ]
            }
        ]

        try:
            output = pipe(text=messages, max_new_tokens=200)
            desc = output[0]["generated_text"][-1]["content"]
            descriptions.append(f"--- Image {i} ---\n{desc}")
        except Exception as e:
            descriptions.append(f"--- Image {i} ---\nN/A (Error processing image: {e})")

    # Combine multiple images into a single string per review
    pics_test.loc[idx, 'Key_Words'] = "\n\n".join(descriptions)

# View results
for idx, row in pics_test.iterrows():
    print(f"Review {idx}:")
    print(row['Key_Words'])
    print("="*80)

Review 0:
--- Image 1 ---
- Setting: Outdoor, garage
- Objects: Brake rotor, bolts, wheel hub, tire, concrete floor
- Business type: N/A
- Activities: Maintenance or repair
- Text: N/A

--- Image 2 ---
- Setting: Indoor
- Objects: Hand, bolt, metal, thread
- Business type: N/A
- Activities: N/A
- Text: N/A
Review 1:
--- Image 1 ---
- Setting: Outdoor, roadside
- Objects: Four people, a large flag, a van, a building, trees, a road, a utility pole
- Business type: N/A
- Activities: Group photo
- Text: “FROM RECOVERY STATE”

--- Image 2 ---
- Setting: Indoor
- Objects: Sign, purple fabric, logos (Recovery Support, Readdy, Phoenix Recovery, and Quantum Health)
- Business type: N/A
- Activities: N/A
- Text: RECOVERY SUPPORT
Review 2:
--- Image 1 ---
- Setting: Outdoor, ski resort
- Objects: Person, skis, snow, trees, buildings, ski lift
- Business type: N/A
- Activities: Skiing, taking a photo
- Text: N/A

--- Image 2 ---
- Setting: Outdoor, snow-covered landscape
- Objects: Snow, trees, sk

In [None]:
print(pics_test[['user_id', 'Image_Description', 'Key_Words','Is_Advertisement']])

                 user_id Full_Description  \
0  101856865551768948430                    
1  107787438275893909028                    
2  108564695687320799964                    

                                           Key_Words Is_Advertisement  
0  --- Image 1 ---\n- Setting: Outdoor, garage\n-...               No  
1  --- Image 1 ---\n- Setting: Outdoor, roadside\...              Yes  
2  --- Image 1 ---\n- Setting: Outdoor, ski resor...               No  


Create a 

In [None]:
pics_test.loc[:, 'Key_Words'] = ""

# Prompt template
PROMPT = """You are an image analyst. 
Describe this image in detail in the following structured format only:

- Setting: <indoor/outdoor, type of place>
- Objects: <things visible>
- Business type: <restaurant, shop, etc., best guess or N/A>
- Activities: <what is happening>
- Text: <any text visible in the image or N/A>

Do not include any extra commentary, questions, or greetings.  
If unsure about a category, write 'N/A'.
Here is the image: <image>
"""