<div style="border-radius:10px; padding: 15px; background-color: #ffeacc; font-size:130%; text-align:left">
Import Libraries and Initialize Variables

In [9]:
import os
import h5py
import numpy as np
import pytesseract
from PIL import Image
from VGG_feature_extractor import VGGNet
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

# Folder that holds every image to be indexed
images_path = "all_images/"
# Full relative path of each entry found in that folder
img_list = [
    os.path.join(images_path, entry)
    for entry in os.listdir(images_path)
]

# Build the VGGNet wrapper used as the image feature extractor
print("Start feature extraction")
model = VGGNet()

# Accumulators filled by the extraction loop:
# image feature vectors, image file names, and OCR text (one entry per image)
feats, names, texts = [], [], []


Start feature extraction
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 385ms/step


<div style="border-radius:10px; padding: 15px; background-color: #ffeacc; font-size:130%; text-align:left">
Function to Extract Text Using Tesseract

In [10]:
# Function to extract text from an image using Tesseract
def extract_text(image_path):
    """Run Tesseract OCR on one image and return the recognised text.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The string produced by ``pytesseract.image_to_string``, or an empty
        string if opening the image or running OCR raised any exception
        (the error is printed and otherwise ignored, so one bad file does
        not abort a batch run).
    """
    try:
        # The context manager guarantees the underlying file handle is
        # released even when OCR fails part-way through.
        with Image.open(image_path) as image:
            return pytesseract.image_to_string(image)
    except Exception as e:
        # Deliberate best-effort: report the problem and fall back to "no text".
        print(f"Error processing {image_path}: {e}")
        return ""


<div style="border-radius:10px; padding: 15px; background-color: #ffeacc; font-size:130%; text-align:left">
Extract Features from Images

In [11]:
# Iterate through all images to extract VGG-16 features and text features.
# Start from fresh lists: `feats` is rebound to an ndarray at the end of this
# cell, so without the reset a second run would fail on `feats.append` (and
# `names` / `texts` would accumulate duplicates).
feats = []
names = []
texts = []

# os.listdir returns entries in arbitrary order; sorting makes the row order of
# the saved features reproducible across runs and machines.
for im in sorted(os.listdir(images_path)):
    image_path = os.path.join(images_path, im)
    if not os.path.isfile(image_path):
        # Sub-directories are not images; skip them instead of feeding them
        # to the model.
        continue

    print("Extracting features from image - ", im)

    # Extract VGG-16 features
    X = model.extract_feat(image_path)
    feats.append(X)
    names.append(im)

    # Extract text from the image (empty string when OCR fails), keeping
    # `texts` aligned index-for-index with `feats` and `names`
    text = extract_text(image_path)
    texts.append(text)

# Convert the list of per-image feature vectors to a numpy array
feats = np.array(feats)

Extracting features from image -  0.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 235ms/step
Extracting features from image -  0000001.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 197ms/step
Extracting features from image -  0000002.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 181ms/step
Extracting features from image -  0000003.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 205ms/step
Extracting features from image -  0000004.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 226ms/step
Extracting features from image -  0000005.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 179ms/step
Extracting features from image -  0000006.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 202ms/step
Extracting features from image -  0000007.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 231ms/step
Extracting features from image -  0000008.jpg
[1m1/1

<div style="border-radius:10px; padding: 15px; background-color: #ffeacc; font-size:130%; text-align:left">
Vectorize Text Features and Save the Vectorizer

In [12]:
# Fit a TF-IDF model on the OCR text of every image and turn each text
# into a dense feature row
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(texts)
text_features = tfidf_matrix.toarray()

# Persist the fitted vectorizer to disk
with open("vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

<div style="border-radius:10px; padding: 15px; background-color: #ffeacc; font-size:130%; text-align:left">
Save Combined Features to HDF5 File

In [13]:
# Ensure text_features have the same number of rows as feats
if text_features.shape[0] != feats.shape[0]:
    print("Mismatch between image features and text features dimensions.")
else:
    # Combine the VGG-16 image features and text features column-wise.
    # NOTE(review): assumes feats is 2-D (one feature vector per image) —
    # confirm against VGGNet.extract_feat.
    combined_features = np.hstack((feats, text_features))

    # Output file for the extracted features
    output = "CombinedFeatures.h5"

    # Store the image names as fixed-width byte strings. `np.string_` was
    # removed in NumPy 2.0, so build the array explicitly; UTF-8 encoding also
    # lets non-ASCII file names be stored instead of raising.
    name_bytes = np.array([name.encode("utf-8") for name in names], dtype="S")

    # Write the combined features to an HDF5 file; the context manager closes
    # the file even if creating a dataset fails.
    print("Writing feature extraction results to h5 file")
    with h5py.File(output, 'w') as h5f:
        h5f.create_dataset('dataset_1', data=combined_features)
        h5f.create_dataset('dataset_2', data=name_bytes)

Writing feature extraction results to h5 file
