In [61]:
#!sudo apt install -y tesseract-ocr
#!pip install pytesseract
#!pip install opencv-python
#!pip install thefuzz

In [62]:
import pytesseract
import cv2
from pytesseract import*
import argparse 
import os
import clip
import torch
from torch.utils.data import Dataset, DataLoader
import PIL
import pickle
from tqdm import tqdm
from thefuzz import fuzz
import pandas as pd

In [63]:
class Images(Dataset):
        """Images dataset"""
        
        def __init__(self, image_list, transform):
            """
            Args:
                image_list: List of image paths.
                transform : Transform to be applied on a sample.
            """
            self.image_list = image_list
            self.transform = transform
        
        def __len__(self):
            return len(self.image_list)
        
        def __getitem__(self, idx):
            image_path = self.image_list[idx]
            print("image_path: ", image_path)
            image = PIL.Image.open(image_path)
            image = self.transform(image)
            data = {'image':image, 
                    'img_path': image_path}
            return data

In [64]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load('ViT-B/32', device, jit=False)
print(f'Device used: {device}')

folder_path = 'images/'
image_list = [folder_path + file for file in os.listdir(folder_path)]

Device used: cpu


In [65]:
print('Attempting to open images...')
cleaned_image_list = []
for image_path in image_list:
    try:
        print("image_path: ", image_path)
        PIL.Image.open(image_path)
        cleaned_image_list.append(image_path)
    except:
        print(f"Failed for {image_path}")

print(f"There are {len(cleaned_image_list)} images that can be processed")    
dataset = Images(cleaned_image_list,preprocess)

dataloader = DataLoader(dataset, 
                        batch_size=256,
                        shuffle=True)



Attempting to open images...
image_path:  images/Test250.jpg
image_path:  images/Test230.jpg
image_path:  images/Test3159.jpg
image_path:  images/Test2870.jpg
image_path:  images/Test1717.jpg
image_path:  images/Test482.jpg
image_path:  images/.ipynb_checkpoints
Failed for images/.ipynb_checkpoints
image_path:  images/Test1620.jpg
image_path:  images/Test1856.jpg
image_path:  images/Test581.jpg
image_path:  images/Test1902.jpg
image_path:  images/Test129.jpg
image_path:  images/Test2371.jpg
image_path:  images/Test519.jpg
image_path:  images/Test946.jpg
image_path:  images/17.jpg
image_path:  images/Test133.jpg
image_path:  images/Test1889.jpg
image_path:  images/Test2068.jpg
image_path:  images/Test2590.jpg
image_path:  images/Test1359.jpg
image_path:  images/Test109.jpg
image_path:  images/0.jpg
image_path:  images/Test677.jpg
image_path:  images/Test1785.jpg
image_path:  images/Test1788.jpg
image_path:  images/Test172.jpg
image_path:  images/Test188.jpg
image_path:  images/Test206.j

In [66]:
print("Processing images...")
image_paths = []
embeddings = []

Processing images...


In [67]:
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

In [68]:
ktr = 1
for data in tqdm(dataloader):
    print("Ktr: ", ktr)
    with torch.no_grad():
        try:
            X = data['image'].to(device)
            image_embedding = model.encode_image(X)
            img_path = data['img_path']
            image_paths.extend(img_path)
            embeddings.extend([torch.Tensor(x).unsqueeze(0).cpu() for x in image_embedding.tolist()])
            ktr = ktr + 1
        except:
            pass
            print("Error")

  0%|          | 0/2 [00:00<?, ?it/s]

image_path:  images/Test677.jpg
image_path:  images/Test149.jpg
image_path:  images/19.jpg
image_path:  images/8.jpg
image_path:  images/Test2007.jpg
image_path:  images/Test400.jpg
image_path:  images/Test489.jpg
image_path:  images/Test162.jpg
image_path:  images/Test3749.jpg
image_path:  images/Test176.jpg
image_path:  images/Test160.jpg
image_path:  images/Test221.jpg
image_path:  images/Test778.jpg
image_path:  images/Test3789.jpg
image_path:  images/Test126.jpg
image_path:  images/Test168.jpg
image_path:  images/Test3265.jpg
image_path:  images/Test3118.jpg
image_path:  images/Test929.jpg
image_path:  images/Test228.jpg
image_path:  images/Test803.jpg
image_path:  images/Test225.jpg
image_path:  images/Test2616.jpg
image_path:  images/Test3985.jpg
image_path:  images/Test103.jpg
image_path:  images/Test665.jpg
image_path:  images/Test1240.jpg
image_path:  images/Test624.jpg
image_path:  images/Test3024.jpg
image_path:  images/Test2959.jpg
image_path:  images/Test837.jpg
image_pat

100%|██████████| 2/2 [00:06<00:00,  3.37s/it]

image_path:  images/Test644.jpg
image_path:  images/12.jpg
image_path:  images/Test937.jpg
Ktr:  2





In [69]:
myRevList = os.listdir(folder_path)

In [70]:
import re
lstImage_text = []
# initializing bad_chars_list
bad_chars = [';', ':', '!', "*", "{", "}"]

for image in  myRevList:
    try:
        #print(f'Working on - {folder_path}/{image}')
        img = cv2.imread(f'{folder_path}/{image}')
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        text_data = pytesseract.image_to_string(img_rgb)
        regex = r"([\w ,]+)\n(?:([\w ,.]+)\n)?(\(\w+ \d+, \d+\)\d*)"
        subst = "$1$2 $3"
        text_data = re.sub(regex, subst, text_data, 0)

        text_data = re.sub(r"(http[s]?\://\S+)|([\[\(].*[\)\]])|([#@]\S+)|\n", "",text_data)
        text_data = re.sub('\s+',' ',text_data)
        text_data = re.sub('\n+',' ',text_data)
        text_data = ''.join((filter(lambda i: i not in bad_chars, text_data)))
        
        lstImage_text.append((image, text_data))        
        
        #cv2.imshow("Image", img)
        #cv2.waitKey(0)
        #cv2.destroyAllWindows()
    except:
        pass

In [71]:
image_embeddings = dict(zip(image_paths,embeddings))

# save to pickle file for the app
print("Saving image embeddings")
with open('embeddings.pkl','wb') as f:
    pickle.dump(image_embeddings,f)

Saving image embeddings


In [72]:
lstImage_text

[('Test250.jpg', ' WWW.GDBLOG.NET '),
 ('Test230.jpg', 'Top Pro & ConQuotes '),
 ('Test3159.jpg', '4 '),
 ('Test2870.jpg',
  ' Woe) ee the stem ELL) eel eean alien from outer space.. its funny” '),
 ('Test1717.jpg', '1009 MIE“of aCag '),
 ('Test482.jpg', 'We are legal In all fifty states '),
 ('Test1620.jpg', ' LOVEIS TOO BEAUTO BE HIDIN THE CLOior '),
 ('Test1856.jpg',
  " It's funny how we know gaypeople exist and we don’thave proof of God but weee uous eeEta tated pied esos Pe)NM ke eee ekee Being Gay & Proud Quoteswww. geckoandfly.comi» OR Wiz Tin . ] ke Bi "),
 ('Test581.jpg',
  '‘There are poop have said that Pmmeng trace for being pends sopperticefy marriage. gy ateption. Whidee renpecthmbty det. bom(met Mrng brane Pa deers bameimg- Love lea human experience,et peta tater '),
 ('Test1902.jpg', ' Se aCe en asCard———] '),
 ('Test129.jpg', ' '),
 ('Test2371.jpg',
  "When people askwhat I see in you,I just smile andlook away becauseT'm afraid if they knew,they'd fall in lovewith you

In [73]:
dfImages = pd.DataFrame(lstImage_text)

In [74]:
dfImages

Unnamed: 0,0,1
0,Test250.jpg,WWW.GDBLOG.NET
1,Test230.jpg,Top Pro & ConQuotes
2,Test3159.jpg,4
3,Test2870.jpg,Woe) ee the stem ELL) eel eean alien from out...
4,Test1717.jpg,1009 MIE“of aCag
...,...,...
254,Test131.jpg,
255,Test140.jpg,
256,Test489.jpg,NE ea stUa ry4i a dePe
257,Test201.jpg,


In [75]:
dfImages.to_csv("images_to_txt.csv")