# Downloading the dataset with kagglehub

The dataset contains images of people, each labelled with the person's Instagram username and the picture name.  
The images were scraped using the Instagram private API and browser extensions.  

In [None]:
import kagglehub

# Download the latest version of the "adcollege/face-data" dataset via kagglehub.
# `path` holds whatever dataset_download() returns; it is printed below as the
# location of the dataset files.
path = kagglehub.dataset_download("adcollege/face-data")

print("Path of dataset files:", path)

In [None]:
import os

# NOTE(review): hard-coded location; presumably the same directory the download
# cell prints as `path` — confirm before running on another machine or after a
# new dataset version is published.
dataset_path = "/root/.cache/kagglehub/datasets/adcollege/face-data/versions/1"
print(os.listdir(dataset_path))  # List all files in the dataset folder


# Dependencies to install


In [None]:
# Dependencies to install (notebook shell commands):
#   faiss-cpu        - imported in later cells as `faiss`
#   facenet-pytorch  - imported in later cells as `facenet_pytorch` (MTCNN, InceptionResnetV1)
#   gradio           - imported in a later cell as `gradio`
#   Pillow and torchvision are upgraded to their latest releases
!pip install faiss-cpu
!pip install facenet-pytorch
!pip install gradio
!pip install --upgrade Pillow torchvision


# Data filtering  
# Separate the pictures that contain a face from those that do not

In [None]:
import os
from facenet_pytorch import MTCNN
from PIL import Image
from IPython.display import display
import shutil
import zipfile

# Initialize MTCNN for face detection.
# Only the presence of at least one detection (boxes is not None) is used
# below; the individual boxes are not inspected.
mtcnn = MTCNN(keep_all=True)

# Directory containing images
image_dir = "/content/extracted_files/downloaded_pics"  # Replace with your directory

# Create directories for face and no-face images
face_dir = "faces"
no_face_dir = "no_faces"
os.makedirs(face_dir, exist_ok=True)
os.makedirs(no_face_dir, exist_ok=True)

# sorted() makes the processing order reproducible (os.listdir order is arbitrary)
for filename in sorted(os.listdir(image_dir)):
    if not filename.lower().endswith(('.png', '.jpg', '.jpeg')):
        continue
    img_path = os.path.join(image_dir, filename)

    try:
        # The context manager closes the file as soon as the RGB copy exists
        with Image.open(img_path) as raw_img:
            img = raw_img.convert('RGB')
    except OSError as exc:
        # Truncated or non-image files (PIL.UnidentifiedImageError is an
        # OSError) are reported and skipped instead of aborting the batch
        print(f"Skipping unreadable image {img_path}: {exc}")
        continue

    # Detect faces in the image
    boxes, _ = mtcnn.detect(img)

    # Copy (the original file is left in place) into the matching folder
    target_dir = face_dir if boxes is not None else no_face_dir
    shutil.copy(img_path, os.path.join(target_dir, filename))

# Zip the folders
def zip_folder(folder_path, zip_filename):
    """Write every file found under ``folder_path`` into a new zip archive.

    Member names are computed relative to the parent of ``folder_path``, so
    each entry keeps the folder's own name as its first path component. Only
    files are written; a directory without files produces no entry.

    Args:
        folder_path: Directory whose files are archived (walked recursively).
        zip_filename: Path of the archive to create; opened in ``'w'`` mode.
    """
    parent_dir = os.path.join(folder_path, '..')
    with zipfile.ZipFile(zip_filename, 'w') as archive:
        for current_dir, _subdirs, names in os.walk(folder_path):
            for name in names:
                source = os.path.join(current_dir, name)
                archive.write(source, os.path.relpath(source, parent_dir))

# Archive both output folders into the current working directory
for folder_name, archive_name in ((face_dir, "faces.zip"), (no_face_dir, "no_faces.zip")):
    zip_folder(folder_name, archive_name)

print("Images processed and zipped successfully!")

# Main program: search the dataset for faces similar to an uploaded image

In [None]:
import os
import torch
import numpy as np
import faiss
from PIL import Image
import gradio as gr
from facenet_pytorch import MTCNN, InceptionResnetV1
from tqdm import tqdm

# Path of images: the "good_faces" folder inside the kagglehub cache of
# adcollege/face-data, version 1 (hard-coded).
IMAGE_FOLDER = "/root/.cache/kagglehub/datasets/adcollege/face-data/versions/1/good_faces"

# Initialize models
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# MTCNN is called on PIL images below; its output is fed to the embedding network.
mtcnn = MTCNN(device=device)
# InceptionResnetV1 with the 'vggface2' pretrained weights, put in eval mode.
# The FAISS index built later in this cell assumes 512-dimensional embeddings.
resnet = InceptionResnetV1(pretrained='vggface2', device=device).eval()

# Load images
def load_images_from_folder(folder):
    """Load every PNG/JPEG image in ``folder`` as an RGB PIL image.

    Args:
        folder: Directory to scan (not recursive).

    Returns:
        ``(images, image_paths)``: two parallel lists holding the decoded RGB
        images and the path each one was read from.
    """
    images, image_paths = [], []
    # sorted() gives a reproducible order; os.listdir() order is arbitrary.
    for filename in sorted(os.listdir(folder)):
        # Compare the real extension (dot included) so a name that merely ends
        # in "png"/"jpg"/"jpeg" is not mistaken for an image.
        extension = os.path.splitext(filename)[1].lower()
        if extension not in ('.png', '.jpg', '.jpeg'):
            continue
        img_path = os.path.join(folder, filename)
        # Close the file handle as soon as the pixel data has been copied.
        with Image.open(img_path) as raw_img:
            images.append(raw_img.convert("RGB"))
        image_paths.append(img_path)
    return images, image_paths

# Load the whole dataset up front; images[i] was read from image_paths[i].
# Both names are notebook-level and are used again when embeddings are extracted.
images, image_paths = load_images_from_folder(IMAGE_FOLDER)

# Extract embeddings
def extract_embeddings(images, paths=None):
    """Compute one face embedding per image in which a face is detected.

    Args:
        images: PIL images to embed.
        paths: Paths parallel to ``images``. Defaults to the notebook-level
            ``image_paths`` so existing ``extract_embeddings(images)`` calls
            keep working.

    Returns:
        ``(embeddings, valid_paths)``: the embeddings stacked row-wise into a
        single array, and the paths of the images that produced them. Images
        without a detected face are dropped from both.

    Raises:
        ValueError: If no face is detected in any image.
    """
    if paths is None:
        paths = image_paths
    embeddings, valid_paths = [], []
    # Inference only: no_grad() avoids building an autograd graph per image.
    with torch.no_grad():
        for img, path in tqdm(zip(images, paths), total=len(images)):
            face = mtcnn(img)
            if face is None:
                continue
            batch = face.unsqueeze(0).to(device)  # add the batch dimension
            embeddings.append(resnet(batch).detach().cpu().numpy())
            valid_paths.append(path)
    if not embeddings:
        # np.vstack([]) would fail with an unrelated-looking message.
        raise ValueError("No faces detected in any image; cannot build embeddings.")
    return np.vstack(embeddings), valid_paths

# Embed the whole dataset once; valid_paths[i] is the file behind embeddings row i.
embeddings, valid_paths = extract_embeddings(images)

# Store embeddings in FAISS
# Flat L2 index over 512-dimensional vectors; the rows are cast to float32
# before being added.
index = faiss.IndexFlatL2(512)
index.add(embeddings.astype(np.float32))

# Function to find similar images
def find_similar(image):
    """Return the five dataset faces closest to the face in ``image``.

    Args:
        image: File path of the query picture, or ``None`` when no picture
            was supplied.

    Returns:
        A flat 10-tuple ``(img1, text1, ..., img5, text5)``: one value per
        image/markdown output pair. Slots without a match hold ``None`` and
        ``""``. When there is no usable query face, the first text slot
        carries the explanation and every image slot is ``None``.
    """
    top_k = 5  # the UI shows exactly five image/markdown pairs

    def _message_only(message):
        # Same shape as a normal result, so every output gets exactly one value.
        return (None, message) + (None, "") * (top_k - 1)

    if image is None:
        return _message_only("No image provided!")

    with Image.open(image) as raw_img:
        img = raw_img.convert("RGB")
    face = mtcnn(img)
    if face is None:
        return _message_only("No face detected!")

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        batch = face.unsqueeze(0).to(device)
        emb = resnet(batch).detach().cpu().numpy().astype(np.float32)
    distances, indices = index.search(emb, top_k)

    # FAISS pads the result with index -1 when it holds fewer than top_k
    # vectors; valid_paths[-1] would silently return the last image instead.
    hits = [
        (dist, valid_paths[idx])
        for dist, idx in zip(distances[0], indices[0])
        if idx >= 0
    ]

    outputs = []
    for rank, (dist, path) in enumerate(hits, start=1):
        # Distance 0 maps to 1.0; larger distances approach 0.
        score = round(float(1 / (1 + dist)), 3)
        # The file name without its extension is used as the profile handle.
        profile_name = os.path.splitext(os.path.basename(path))[0]
        with Image.open(path) as match_img:
            outputs.append(match_img.convert("RGB"))
        outputs.append(
            f"Match {rank} - Score: {score}\n[View Profile](https://www.instagram.com/{profile_name})"
        )

    # Pad so the tuple always has one value per output component.
    outputs.extend([None, ""] * (top_k - len(hits)))
    return tuple(outputs)

# Gradio UI
# One file-path image input, followed by five (image, markdown) output pairs
# created in display order: Match 1, Link 1, ..., Match 5, Link 5.
query_input = gr.Image(type="filepath")
match_outputs = [
    component
    for rank in range(1, 6)
    for component in (
        gr.Image(label=f"Match {rank}"),
        gr.Markdown(label=f"Link {rank}"),
    )
]

iface = gr.Interface(
    fn=find_similar,
    inputs=query_input,
    outputs=match_outputs,
    title="Face Similarity Search",
    description="Upload an image to find similar faces from the dataset. Click the links below the images to view the Instagram profiles. Matching scores are displayed next to the links.",
)

# share=True is passed through to launch() unchanged.
iface.launch(share=True)


# **Main program: search the dataset for similar faces manually by giving the picture path**

In [None]:
import os
import torch
import numpy as np
import faiss
from PIL import Image
import matplotlib.pyplot as plt
from facenet_pytorch import MTCNN, InceptionResnetV1
from tqdm import tqdm


# Path of the folder holding the dataset images (hard-coded).
IMAGE_FOLDER ="/content/images_dataset"


# Initialize models
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# MTCNN is called on PIL images below; its output is fed to the embedding network.
mtcnn = MTCNN(device=device)
# InceptionResnetV1 with the 'vggface2' pretrained weights, put in eval mode.
# The FAISS index built later in this cell assumes 512-dimensional embeddings.
resnet = InceptionResnetV1(pretrained='vggface2', device=device).eval()


# Loading the images
def load_images_from_folder(folder):
    """Read all PNG/JPEG files directly inside ``folder`` as RGB PIL images.

    Args:
        folder: Directory to scan; sub-directories are not visited.

    Returns:
        ``(images, image_paths)``: parallel lists of the decoded RGB images
        and the paths they were loaded from.
    """
    allowed_extensions = ('.png', '.jpg', '.jpeg')
    images = []
    image_paths = []
    # Sort for a stable order; os.listdir() makes no ordering promise.
    for filename in sorted(os.listdir(folder)):
        # Match on the actual extension (with the dot), not on any name that
        # happens to end in the letters "png"/"jpg"/"jpeg".
        if os.path.splitext(filename)[1].lower() not in allowed_extensions:
            continue
        img_path = os.path.join(folder, filename)
        # The with-block releases the file handle once the RGB copy is made.
        with Image.open(img_path) as raw_img:
            images.append(raw_img.convert("RGB"))
        image_paths.append(img_path)
    return images, image_paths

# Load the whole dataset up front; images[i] was read from image_paths[i].
# Both names are notebook-level and are used again when embeddings are extracted.
images, image_paths = load_images_from_folder(IMAGE_FOLDER)

# Extract embeddings
def extract_embeddings(images, paths=None):
    """Embed the detected face of each image and keep track of its path.

    Args:
        images: PIL images to process.
        paths: Paths parallel to ``images``. When omitted, the notebook-level
            ``image_paths`` is used, which keeps ``extract_embeddings(images)``
            working as before.

    Returns:
        ``(embeddings, valid_paths)``: a row-stacked array of embeddings and
        the matching list of paths. Images in which no face is detected
        appear in neither.

    Raises:
        ValueError: If none of the images yields a face.
    """
    if paths is None:
        paths = image_paths
    embeddings = []
    valid_paths = []
    # Pure inference, so gradient tracking is switched off for the whole loop.
    with torch.no_grad():
        for img, path in tqdm(zip(images, paths), total=len(images)):
            face = mtcnn(img)
            if face is None:
                continue
            batch = face.unsqueeze(0).to(device)  # add the batch dimension
            embeddings.append(resnet(batch).detach().cpu().numpy())
            valid_paths.append(path)
    if not embeddings:
        # Fail with a clear message instead of np.vstack's generic one.
        raise ValueError("No faces detected in any image; cannot build embeddings.")
    return np.vstack(embeddings), valid_paths

# Embed the whole dataset once; valid_paths[i] is the file behind embeddings row i.
embeddings, valid_paths = extract_embeddings(images)

# Store embeddings in FAISS
# Flat L2 index over 512-dimensional vectors; the rows are cast to float32
# before being added.
index = faiss.IndexFlatL2(512)
index.add(embeddings.astype(np.float32))

# Function to find and display similar images and converting them into RGB
def find_similar(image_path, top_k=5):
    """Plot the query image next to its nearest faces from the index.

    Args:
        image_path: Path of the query picture.
        top_k: Number of neighbours requested from the FAISS index.

    Returns:
        None. The figure is shown with matplotlib. If no face is detected in
        the query, a message is printed and nothing is plotted.
    """
    with Image.open(image_path) as raw_img:
        img = raw_img.convert("RGB")
    face = mtcnn(img)
    if face is None:
        print("No face detected!")
        return

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        batch = face.unsqueeze(0).to(device)
        emb = resnet(batch).detach().cpu().numpy().astype(np.float32)

    _distances, indices = index.search(emb, top_k)
    # FAISS pads with index -1 when it holds fewer than top_k vectors; using
    # -1 as a list index would silently show the last dataset image.
    similar_images = [valid_paths[i] for i in indices[0] if i >= 0]

    # One panel for the query plus one per match. The canvas grows with the
    # panel count (about 3 inches each) instead of a fixed 150x50 inches.
    # squeeze=False keeps `axes` indexable even with a single panel.
    n_panels = len(similar_images) + 1
    _fig, axes = plt.subplots(1, n_panels, figsize=(3 * n_panels, 3.5), squeeze=False)
    axes = axes[0]

    axes[0].imshow(img)
    axes[0].set_title("Query Image")
    axes[0].axis("off")

    for i, img_path in enumerate(similar_images):
        with Image.open(img_path) as match_img:
            sim_img = match_img.convert("RGB")
        axes[i + 1].imshow(sim_img)
        axes[i + 1].set_title(f"Match {i+1}")
        axes[i + 1].axis("off")

    plt.show()

# Entry query image path
# NOTE(review): hard-coded sample file under /content — replace with your own image.
query_image = "/content/471584015_947115076820441_3422149765727453140_n.jpg"
# Shows the query and its matches; find_similar() returns None.
find_similar(query_image)
