<a href="https://colab.research.google.com/github/barnaghosh/Burger-Builder/blob/main/VQA_Med1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

# Load the training set
train_df = pd.read_json("/content/drive/MyDrive/VQA/input/vqa rad/Visual-Question-Answering-master/Visual-Question-Answering-master/trainset.json")

# Load the test set
test_df = pd.read_json("/content/drive/MyDrive/VQA/input/vqa rad/Visual-Question-Answering-master/Visual-Question-Answering-master/testset.json")

In [2]:
from PIL import Image
import numpy as np

# Define the image size and the path to the images
image_size = (224, 224)
image_folder = "/content/drive/MyDrive/VQA/input/vqa rad/Visual-Question-Answering-master/Visual-Question-Answering-master/VQA_RAD Image Folder"

def load_and_preprocess_image(image_name):
    # Open the image file
    img = Image.open(f"{image_folder}/{image_name}")
    # Resize the image
    img = img.resize(image_size)
    # Convert the image data to a numpy array
    image = np.array(img)
    # Normalize the image
    image = image / 255.0

    return image
# Load and preprocess the images in the training set
train_images = np.array([load_and_preprocess_image(image_name) for image_name in train_df["image_name"]])

# Load and preprocess the images in the test set
test_images = np.array([load_and_preprocess_image(image_name) for image_name in test_df["image_name"]])

In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Define the maximum sequence length
max_seq_length = 100

# Create a tokenizer
tokenizer = Tokenizer()

# Fit the tokenizer on the questions
tokenizer.fit_on_texts(train_df["question"])

# Convert the questions to sequences of tokens
train_questions = tokenizer.texts_to_sequences(train_df["question"])
test_questions = tokenizer.texts_to_sequences(test_df["question"])

# Pad the sequences
train_questions = pad_sequences(train_questions, maxlen=max_seq_length)
test_questions = pad_sequences(test_questions, maxlen=max_seq_length)

In [4]:
from sklearn.preprocessing import LabelBinarizer

# Create a binarizer
binarizer = LabelBinarizer()

# Ensure all answers are strings
train_df["answer"] = train_df["answer"].astype(str)
test_df["answer"] = test_df["answer"].astype(str)

# Fit the binarizer and transform the answers
train_answers = binarizer.fit_transform(train_df["answer"])
test_answers = binarizer.transform(test_df["answer"])

In [5]:
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Flatten, concatenate

# Define the image model
image_input = Input(shape=(224, 224, 3))
image_model = VGG16(include_top=False, weights='imagenet', input_tensor=image_input)
for layer in image_model.layers:
    layer.trainable = False
image_model = Flatten()(image_model.output)

# Define the question model
question_input = Input(shape=(max_seq_length,))
question_model = Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=256, input_length=max_seq_length)(question_input)
question_model = LSTM(256)(question_model)

In [6]:
# Combine the models
combined = concatenate([image_model, question_model])

# Add the classifier on top
output = Dense(len(binarizer.classes_), activation='softmax')(combined)

# Create the model
model = Model(inputs=[image_input, question_input], outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [7]:
from tensorflow.keras.layers import Dropout

# Define the image model
image_input = Input(shape=(224, 224, 3))
image_model = VGG16(include_top=False, weights='imagenet', input_tensor=image_input)
for layer in image_model.layers:
    layer.trainable = False
image_model = Flatten()(image_model.output)
image_model = Dropout(0.5)(image_model)  # Add dropout layer

# Define the question model
question_input = Input(shape=(max_seq_length,))
question_model = Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=256, input_length=max_seq_length)(question_input)
question_model = LSTM(256)(question_model)
question_model = Dropout(0.5)(question_model)  # Add dropout layer
# Define the question model
question_input = Input(shape=(max_seq_length,))
question_model = Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=256, input_length=max_seq_length)(question_input)
question_model = LSTM(256)(question_model)
question_model = Dropout(0.5)(question_model)  # Add dropout layer

# Combine the models
combined = concatenate([image_model, question_model])

# Add the classifier on top
output = Dense(len(binarizer.classes_), activation='softmax')(combined)

# Create the model
model = Model(inputs=[image_input, question_input], outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [8]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Define the callbacks
callbacks = [
    EarlyStopping(monitor='val_loss', patience=3),
    ModelCheckpoint('vqa_model.h5', monitor='val_loss', save_best_only=True)
]

# Train the model
history = model.fit(
    [train_images, train_questions],
    train_answers,
    epochs=30,  # Increased number of epochs
    validation_data=([test_images, test_questions], test_answers),

)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [9]:
# Evaluate the model
loss, accuracy = model.evaluate([test_images, test_questions], test_answers)
print(f"Test loss: {loss}")
print(f"Test accuracy: {accuracy}")

Test loss: 4.6711745262146
Test accuracy: 0.5388026833534241


In [11]:
from keras.utils.vis_utils import plot_model
model = model([image_input, question_input], [output])
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

TypeError: ignored

In [12]:
from tensorflow.keras.models import load_model

# Load the best model
model = load_model('vqa_model.h5')

In [None]:
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.optimizers import Adam

# Define the image model
image_input = Input(shape=(224, 224, 3))
image_model = ResNet50(include_top=False, weights='imagenet', input_tensor=image_input)
for layer in image_model.layers:
    layer.trainable = False
image_model = Flatten()(image_model.output)
image_model = Dropout(0.5)(image_model)  # Add dropout layer

# Define the question model
question_input = Input(shape=(max_seq_length,))
question_model = Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=256, input_length=max_seq_length)(question_input)
question_model = LSTM(256)(question_model)
question_model = Dropout(0.5)(question_model)  # Add dropout layer

# Combine the models
combined = concatenate([image_model, question_model])
# Add the classifier on top
output = Dense(len(binarizer.classes_), activation='softmax')(combined)

# Create the model
model = Model(inputs=[image_input, question_input], outputs=output)

# Compile the model with a lower learning rate
model.compile(optimizer=Adam(learning_rate=0.0001), loss='categorical_crossentropy', metrics=['accuracy'])

# Define the callbacks
callbacks = [
    EarlyStopping(monitor='val_loss', patience=3),
    ModelCheckpoint('vqa_model.h5', monitor='val_loss', save_best_only=True)
]
# Train the model
history = model.fit(
    [train_images, train_questions],
    train_answers,
    epochs=20,  # Increased number of epochs
    validation_data=([test_images, test_questions], test_answers),
    callbacks=callbacks  # Added callbacks
)

In [None]:
# Evaluate the model
loss, accuracy = model.evaluate([test_images, test_questions], test_answers)
print(f"Test loss: {loss}")
print(f"Test accuracy: {accuracy}")

In [None]:
def predict(image, question):
    # Preprocess the image
    image = load_and_preprocess_image(image)
    image = np.expand_dims(image, axis=0)

    # Preprocess the question
    question = tokenizer.texts_to_sequences([question])
    question = pad_sequences(question, maxlen=max_seq_length)

    # Make prediction
    prediction = model.predict([image, question])

    # Decode the prediction
    answer = binarizer.inverse_transform(prediction)

    return answer[0]

In [None]:
image = "synpic100132.jpg"
question = "What is organ is present?"
print(predict(image, question))
