# Classification System
The goal of this classification system is to determine whether a given image is a meme or not.

In [None]:
import torch
import os
import shutil
import cv2
import math
import numpy as np
import matplotlib.pyplot as plt


## Taking input for image

In [None]:
# Read the image id typed by the user; it is the file stem under ../data/img.
# Surrounding whitespace is stripped so a stray space cannot produce a bogus path.
img_id = input().strip()
img_path = f"../data/img/{img_id}.png"
# Check if the input image exists (a directory with that name does not count)
if not os.path.isfile(img_path):
    print("Input image not found.")
    # FileNotFoundError describes the failure and carries the offending path.
    raise FileNotFoundError(img_path)


# Start from an empty working directory so files left by a previous run
# cannot be picked up by the following cells.
file_dir = "curdir"
if os.path.exists(file_dir):
    shutil.rmtree(file_dir)
os.makedirs(file_dir)

# Define the output image path
output_image_path = os.path.join(file_dir, "caption.jpg")
# Copy the input image to the output folder with the specified name.
# shutil.copy does not re-encode: the file keeps its PNG bytes despite the
# ".jpg" name, which the later cells refer to literally.
shutil.copy(img_path, output_image_path)


In [None]:
import keras_ocr
# helper Function to calculate midpoint of a line
def midpoint(x1, y1, x2, y2):
    """Return the midpoint of the segment (x1, y1)-(x2, y2) as an int pair.

    Each coordinate is averaged and then truncated toward zero by ``int``.
    """
    return (int((x1 + x2) / 2), int((y1 + y2) / 2))


# Initialize keras-ocr pipeline
pipeline = keras_ocr.pipeline.Pipeline()

# Path to the image with text
image_path = 'curdir/caption.jpg'

# Read the image
image = keras_ocr.tools.read(image_path)

# Recognize text in the image; predictions[0] holds the detections for the
# single image passed in, each indexed below as box[1] = four (x, y) corners.
predictions = pipeline.recognize([image])

# Create a mask for inpainting: same height/width as the image, 255 where
# text was detected and 0 elsewhere.
mask = np.zeros(image.shape[:2], dtype="uint8")

# Iterate through predicted text regions and create mask
for box in predictions[0]:
    x0, y0 = box[1][0]
    x1, y1 = box[1][1]
    x2, y2 = box[1][2]
    x3, y3 = box[1][3]

    # The box is covered by one thick line drawn between the midpoints of
    # two opposite edges (corners 1-2 and corners 0-3).
    x_mid0, y_mid0 = midpoint(x1, y1, x2, y2)
    x_mid1, y_mid1 = midpoint(x0, y0, x3, y3)

    # Line thickness is the length of the 1-2 edge. It is clamped to at least
    # 1 because cv2.line rejects a non-positive thickness, which a degenerate
    # (sub-pixel) box would otherwise produce after truncation.
    thickness = max(1, int(math.hypot(x2 - x1, y2 - y1)))

    # Draw line on mask
    cv2.line(mask, (x_mid0, y_mid0), (x_mid1, y_mid1), 255, thickness)

# Inpaint the text regions (Navier-Stokes method, radius 7)
inpainted_image = cv2.inpaint(image, mask, 7, cv2.INPAINT_NS)


# Save the image without text. The channel order is swapped before writing;
# COLOR_RGB2BGR names the direction actually needed here if keras-ocr returns
# RGB (TODO confirm against its docs), and it performs the same swap as the
# COLOR_BGR2RGB constant used previously.
nocaption_path = 'curdir/nocaption.jpg'
if not cv2.imwrite(nocaption_path, cv2.cvtColor(inpainted_image, cv2.COLOR_RGB2BGR)):
    # imwrite reports failure through its return value; surface it here
    # instead of letting the next cell fail on a missing file.
    raise IOError(f"Could not write {nocaption_path}")

### Text Extraction

The meme text is read directly from the dataset, where it is already provided. For more general applications, OCR-based text extraction is implemented in `text_recognition`.

In [None]:
import jsonlines


# Annotation files searched, in order, for the record of the chosen image.
file_paths = ['../data/train.jsonl', '../data/test.jsonl', '../data/dev.jsonl']

# Parse the id once instead of once per record; record ids are compared as ints.
target_id = int(img_id)

given_text = None
for file_path in file_paths:
    with jsonlines.open(file_path) as reader:
        for obj in reader:
            if obj["id"] == target_id:
                given_text = obj["text"]
                break
    # Stop at the first match rather than scanning the remaining files
    # (assumes an id carries the same text wherever it appears - TODO confirm).
    if given_text is not None:
        break

if given_text is None:
    # Fail here with a clear message; otherwise None would reach the sentence
    # encoder in the similarity cell and fail there with an unrelated error.
    raise ValueError(f"No record with id {target_id} found in {file_paths}")

print(given_text)


### Object Detection without caption
YOLOv8 from the `ultralytics` package is used because its pretrained model gave the highest accuracy.

In [None]:
import matplotlib.image as mpimg
from ultralytics import YOLO
import numpy as np
model = YOLO('yolov8m.pt')

# Run detection on the text-free image and save the annotated copy to
# curdir/no_captions. exist_ok=True keeps that directory stable when the cell
# is re-run; without it each run goes to a fresh "no_captions2", "no_captions3",
# ... folder while the imread below keeps showing the first run's image.
predictions = model(
    source="curdir/nocaption.jpg",
    show=False,
    conf=0.5,
    save=True,
    project='curdir',
    name='no_captions',
    exist_ok=True,
    save_txt=False,
)

# Class-id -> name mapping as reported by the model itself, so it cannot drift
# from the weights that produced the predictions.
labels = predictions[0].names

# labels_seen records every detected class id (duplicates included);
# prompt_caption_gen collects each distinct non-person class name once.
# Class 0 ("person") is skipped here; people are described by the
# face-analysis cell instead.
labels_seen = []
prompt_caption_gen = " "
for cls_tensor in predictions[0].boxes.cls:
    cls = int(cls_tensor.item())

    if cls != 0 and cls not in labels_seen:
        prompt_caption_gen += labels[cls] + " "
    labels_seen.append(cls)

# Show the annotated image written by the detector.
image = mpimg.imread("curdir/no_captions/nocaption.jpg")

plt.imshow(image)
plt.axis('off')  # Turn off axis
plt.show()

### Race Detection
Using the DeepFace module, in particular its gender, race, and emotion detectors. The model also supports age detection.

In [None]:
from deepface import DeepFace

print(prompt_caption_gen)
# Analyse the faces in the original image. enforce_detection=False makes the
# call return a result instead of raising when no face is found.
analysis = DeepFace.analyze(img_path, actions=["gender", "race", "emotion"], enforce_detection=False)
print(analysis)

# Append each distinct (race, gender) combination to the prompt once.
# Deduplication is on the pair: checking the two words separately dropped any
# face that shared only its race or only its gender with an earlier face.
# NOTE(review): this cell appends to prompt_caption_gen, so re-running it
# without re-running the detection cell adds the tags a second time.
tags_seen = []
for person in analysis:
    tag = (person["dominant_race"], person["dominant_gender"])
    if tag not in tags_seen:
        prompt_caption_gen += tag[0] + " " + tag[1] + " "
        tags_seen.append(tag)


print(prompt_caption_gen)


### Caption generation

In [None]:

from promptcap import PromptCap

# Load the PromptCap captioner (OFA checkpoints such as "OFA-Sys/ofa-large"
# are also supported).
promptcap_checkpoint = "tifa-benchmark/promptcap-coco-vqa"
model = PromptCap(promptcap_checkpoint)

# Caption the text-free image, using the collected object and face tags as
# the prompt.
image = "curdir/nocaption.jpg"
caption_generated = model.caption(prompt_caption_gen, image)
print(caption_generated)

### Sentence Similarity using semantic analysis

Here, sentence similarity is used as the measure to determine whether an image is a meme or not. The two text fields are:

1. The given text in the image.
2. A caption generated from the image, conditioned on the races and objects detected in it.


In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the pretrained sentence-embedding model.
model = SentenceTransformer('bert-base-nli-mean-tokens')

# Row 0 embeds the text given with the image, row 1 the generated caption.
sentences = [given_text, caption_generated]
vector = model.encode(sentences)

# Cosine similarity between the two embeddings, printed as a 1x1 matrix.
similarity = cosine_similarity([vector[0]], vector[1:])
print(similarity)