### Captioning Images

In [3]:
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
import torch
from PIL import Image

  from .autonotebook import tqdm as notebook_tqdm
2023-10-06 16:02:14.547442: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-10-06 16:02:14.698437: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
modelAddress = "myModel/snapshots/dc68f91c06a1ba6f15268e5b9c13ae7a7c514084/"

In [5]:
model = VisionEncoderDecoderModel.from_pretrained(modelAddress)

In [6]:
feature_extractor = ViTFeatureExtractor.from_pretrained(modelAddress)
tokenizer = AutoTokenizer.from_pretrained(modelAddress)



In [7]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")
model.to(device)

VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_featur

In [8]:
max_length = 16
num_beams = 4

gen_kwargs = {"max_length": max_length, "num_beams": num_beams}

In [9]:
def predict_step(imagePath):
    image = Image.open(imagePath)
    if image.mode != "RGB":
        image = image.convert("RGB")
    
    pixelValue = feature_extractor(images=image, return_tensors="pt").pixel_values.to(device)

    outputId = model.generate(pixelValue, **gen_kwargs)

    pred = tokenizer.decode(outputId[0], skip_special_tokens=True)
    preds = pred.strip()

    return preds


In [10]:
predict_step("testImages/test1.jpg")

'a woman walking down the street with a suitcase'

---

#### Text to speech

In [11]:
# Import the required module for text   
from gtts import gTTS # to speech conversion
import os # This module is imported so that we can play the converted audio  

In [12]:
# The text that you want to convert to audio 
mytext = predict_step("testImages/test1.jpg")

In [13]:
# Language in which you want to convert 
language = 'en'

In [14]:
# Passing the text and language to the engine,  
# here we have marked slow=False. Which tells  
# the module that the converted audio should  
# have a high speed 
myobj = gTTS(text=mytext, lang=language, slow=False) 
myobj.save("welcome.mp3") 

---

In [15]:
n = 8
for i in range(1, n + 1):
    print(i, predict_step(f"testImages/test{i}.jpg"), sep = " ---> ")

1 ---> a woman walking down the street with a suitcase
2 ---> a red stop sign sitting on the side of a road
3 ---> a red traffic light sitting on the side of a road
4 ---> a man sitting in a chair with a remote in his hand
5 ---> a bed room filled with lots of blankets and pillows
6 ---> a broken pipe sitting on the side of a road
7 ---> a car driving down a road with a car behind it
8 ---> people sitting around a table


### Face Recognision

In [16]:
import face_recognition
import cv2
import numpy as np
import os
import math

In [17]:
def faceConfidence(face_distance, face_match_threshold=0.6):
    rang = (1 - face_match_threshold)
    linear_val = (1 - face_distance) / (rang * 2)

    if face_distance > face_match_threshold:
        return str(round(linear_val * 100, 2)) + "%"
    else:
        value = (linear_val + ((1 - linear_val) * math.pow((linear_val - 0.5) * 2, 0.2))) * 100
        return str(round(value, 2)) + "%"

In [18]:
def faceConfidence(face_distance, face_match_threshold=0.6):
    rang = (1 - face_match_threshold)
    linear_val = (1 - face_distance) / (rang * 2)

    if face_distance > face_match_threshold:
        return str(round(linear_val * 100, 2)) + "%"
    else:
        value = (linear_val + ((1 - linear_val) * math.pow((linear_val - 0.5) * 2, 0.2))) * 100
        return str(round(value, 2)) + "%"


class faceRecognition:
    face_locations = []

    face_encodings = []
    face_names = []

    known_face_encodings = []
    known_face_names = []


    def __init__(self):
        self.encode_faces()


    def encode_faces(self):
        for image in os.listdir("faces"):
            face_image = face_recognition.load_image_file("faces/" + image)
            face_encoding = face_recognition.face_encodings(face_image)[0]

            self.known_face_encodings.append(face_encoding)
            self.known_face_names.append(image.split(".")[0])
        
    def run_recognition(self, imgPath):
        frame = cv2.imread(imgPath)

        small_frame = cv2.resize(frame, (0, 0), fx=0.25, fy=0.25)
        rgb_small_frame = small_frame[:, :, ::-1] # BGR to RGB

        self.face_locations = face_recognition.face_locations(rgb_small_frame)
        self.face_encodings = face_recognition.face_encodings(rgb_small_frame, self.face_locations)

        self.face_names = []
        flag = False
        name = "Unknown"
        confidence = "0%"
        for face_encoding in self.face_encodings:
            matches = face_recognition.compare_faces(self.known_face_encodings, face_encoding)
            name = "Unknown"
            confidence = "0%"

            face_distances = face_recognition.face_distance(self.known_face_encodings, face_encoding)
            best_match_index = np.argmin(face_distances)

            if matches[best_match_index]:
                name = self.known_face_names[best_match_index]
                confidence = faceConfidence(face_distances[best_match_index])
                flag = True
                break
        
        if not flag:
            return "Unrecognized Face !", -1
        return name, confidence



In [19]:
fr = faceRecognition()

In [20]:
fr.run_recognition("testImages/test1.jpg")

('Unrecognized Face !', -1)

### -- Testing --

In [21]:
def predict(imagePath):
    preds = predict_step(imagePath)
    person, confidence = fr.run_recognition(imagePath)

    print("\n")

    if confidence is None:
        print("Person : Unidentified")
    else:
        print("Person :", person)
        print("Confidence :", confidence)

    print("\n")

    print("Caption :", preds)


In [22]:
predict("testImages/test9.jpeg")



Person : Unrecognized Face !
Confidence : -1


Caption : a laptop computer sitting on top of a table


---

## Caption generation in live cam

In [23]:
# prediction
def predictCaption(image):
    # image = Image.open(imagePath)
    image = Image.fromarray(image)
    if image.mode != "RGB":
        image = image.convert("RGB")
    
    pixelValue = feature_extractor(images=image, return_tensors="pt").pixel_values.to(device)

    outputId = model.generate(pixelValue, **gen_kwargs)

    pred = tokenizer.decode(outputId[0], skip_special_tokens=True)
    preds = pred.strip()


    # -- Face recognision
    

    return preds


In [24]:
import cv2
import numpy as np

In [25]:
# Initialize the webcam
try:
    cap = cv2.VideoCapture(0)

    frameCount = 0

    records = set()

    while True:
        # Read each frame from the webcam
        _, frame = cap.read()

        x, y, c = frame.shape

        # Flip the frame vertically
        # frame = cv2.flip(frame, 1)
        framergb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        if frameCount % 10 == 0:
            className = predictCaption(framergb)
            records.add(className)


        # # show the prediction on the frame
        cv2.putText(frame, className, (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 
                    0.5, (0,0,255), 2, cv2.LINE_AA)

        # Show the final output
        cv2.imshow("Output", frame) 


        if cv2.waitKey(1) == ord('q'): # Quits the program by breaking the loop
            break

        frameCount += 1
        frameCount %= 100

    # release the webcam and destroy all active windows
    cap.release()
    cv2.destroyAllWindows()

    recordFile = open("records.txt", "w")
    for i in records:
        recordFile.write(i + "\n")
    recordFile.close()

except Exception as e:
    cap.release()
    cv2.destroyAllWindows() 
    print("An error occured !")
    print(e)


# To quit the program, press 'q'

qt.qpa.plugin: Could not find the Qt platform plugin "wayland" in "/home/alok/.local/lib/python3.10/site-packages/cv2/qt/plugins"


## Face recognision in liive cam

In [26]:
import face_recognition
import cv2
import numpy as np
import os, sys
import math

In [27]:
def faceConfidence(face_distance, face_match_threshold=0.6):
    rang = (1 - face_match_threshold)
    linear_val = (1 - face_distance) / (rang * 2)

    if face_distance > face_match_threshold:
        return str(round(linear_val * 100, 2)) + "%"
    else:
        value = (linear_val + ((1 - linear_val) * math.pow((linear_val - 0.5) * 2, 0.2))) * 100
        return str(round(value, 2)) + "%"

In [28]:
class faceRecognition:
    face_locations = []

    face_encodings = []
    face_names = []

    known_face_encodings = []
    known_face_names = []

    process_current_frame = True

    def __init__(self):
        self.encode_faces()


    def encode_faces(self):
        for image in os.listdir("faces"):
            face_image = face_recognition.load_image_file("faces/" + image)
            face_encoding = face_recognition.face_encodings(face_image)[0]

            self.known_face_encodings.append(face_encoding)
            self.known_face_names.append(image.split(".")[0])
        
        print(self.known_face_names)

    def run_recognition(self):
        video_capture = cv2.VideoCapture(0)

        if not video_capture.isOpened():
            print("Cannot open camera")
            exit()

        frameCount = 0

        className = ""
        
        while True:
            ret, frame = video_capture.read()

            if frameCount % 10 == 0:
                framergb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)                
                className = predictCaption(framergb)

            if self.process_current_frame:
                small_frame = cv2.resize(frame, (0, 0), fx=0.25, fy=0.25)
                rgb_small_frame = small_frame[:, :, ::-1] # BGR to RGB

                self.face_locations = face_recognition.face_locations(rgb_small_frame)
                self.face_encodings = face_recognition.face_encodings(rgb_small_frame, self.face_locations)

                self.face_names = []
                for face_encoding in self.face_encodings:
                    matches = face_recognition.compare_faces(self.known_face_encodings, face_encoding)
                    name = "Unknown"
                    confidence = "0%"

                    face_distances = face_recognition.face_distance(self.known_face_encodings, face_encoding)
                    best_match_index = np.argmin(face_distances)

                    if matches[best_match_index]:
                        name = self.known_face_names[best_match_index]
                        confidence = faceConfidence(face_distances[best_match_index])
                    
                    self.face_names.append(name + " " + confidence)
            
            self.process_current_frame = not self.process_current_frame

            for (top, right, bottom, left), name in zip(self.face_locations, self.face_names):
                top *= 4
                right *= 4
                bottom *= 4
                left *= 4

                cv2.rectangle(frame, (left, top), (right, bottom), (0, 0, 255), 2)

                cv2.rectangle(frame, (left, bottom - 35), (right, bottom), (0, 0, 255), cv2.FILLED)

                font = cv2.FONT_HERSHEY_DUPLEX
                cv2.putText(frame, name, (left + 6, bottom - 6), font, 0.8, (255, 255, 255), 1)
                cv2.putText(frame, className, (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 
                            0.5, (0,0,255), 2, cv2.LINE_AA)
                
            
            cv2.imshow("Face Recognition", frame)

            frameCount += 1
            frameCount %= 100

            if cv2.waitKey(1) == ord('q'):
                break
        
        video_capture.release()
        cv2.destroyAllWindows()

In [31]:
fr = faceRecognition()
fr.run_recognition()

['alok', 'einstien', 'alok', 'einstien', 'alok', 'einstien']
