In [1]:
# Import necessary libraries
import torch
from torchvision import models, transforms
from PIL import Image
from io import BytesIO
import requests
import os
from transformers import GPT2Tokenizer, GPT2LMHeadModel




In [2]:
# Load the pre-trained ResNet model
model = models.resnet50(pretrained=True)
model.eval()



ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [3]:
# Load ImageNet class labels
with open('imagenet_classes.txt') as f:
    labels = [line.strip() for line in f.readlines()]

In [4]:
# Define image transformations
transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

In [5]:
def recognize_image(image_path):
    # Check if the image_path is a URL or a local file path
    if image_path.startswith('http://') or image_path.startswith('https://'):
        # Download the image from the URL
        response = requests.get(image_path)
        image = Image.open(BytesIO(response.content))
    else:
        # Load the image from the local file system
        if not os.path.isfile(image_path):
            raise FileNotFoundError(f"Local file not found: {image_path}")
        image = Image.open(image_path)
    
    # Transform the image
    image = transform(image).unsqueeze(0)  # Add batch dimension
    
    # Run the model
    with torch.no_grad():
        outputs = model(image)
    
    # Get the predicted class
    _, predicted = outputs.max(1)
    return predicted.item()


In [6]:
# Load pre-trained GPT model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt_model = GPT2LMHeadModel.from_pretrained('gpt2')

In [7]:
def generate_description(object_name):
    # Generate a description using GPT-2
    query = f"Describe a {object_name}."
    inputs = tokenizer.encode(query, return_tensors='pt')
    outputs = gpt_model.generate(inputs, max_length=100, do_sample=True)
    description = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return description

In [8]:
def chatbot(image_path):
    # Recognize objects in the image
    object_id = recognize_image(image_path)
    
    # Map the object ID to a label from ImageNet
    recognized_object = labels[object_id]
    
    # Generate a description of the detected object
    description = generate_description(recognized_object)
    
    return description

In [9]:
# Test with a local file path
local_image_path = r"C:\Users\adity\Downloads\download (1).jpeg" # Replace with the actual path to your local image
print(chatbot(local_image_path))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Describe a assault_rifle.txt


Name: The first name of a rifle that was manufactured prior to 1863.


A: A short description of a firearm.


B: An abbreviation or general abbreviation of, e.g., "I've got a shotgun", "I have a pistol" or "I use a carbine". For example, if my name is "Samford Arms", "Sherlington Arms". "I do, sir".





In [10]:
# Test with an image URL
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/0/09/A6-EDY_A380_Emirates_31_jan_2013_jfk_%288442269364%29_%28cropped%29.jpg/1200px-A6-EDY_A380_Emirates_31_jan_2013_jfk_%288442269364%29_%28cropped%29.jpg"  # Replace with the actual URL to your image
print(chatbot(image_url))


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Describe a airliner.

"I want to get you home," said he, "I'm Shia and I've lived here all my life, the last five years. I've seen no Taliban. I've never had anything but that one night when I came home from Pakistan, with my family on a boat to the Chinese capital of Shanghai, the first thing there was a bomb, like, "This is what this is after?" It's like it's some kind of video that you
