In [1]:
import requests
import json
import concurrent.futures
import time
from tqdm import tqdm
import pandas as pd
import os

In [2]:
df_pages = pd.read_csv('letsreadasia_pages.csv')
#Filter Pages that not have "Badan Pengembangan" or "lembaga" in 'extractedLongContentValue' column
df_pages = df_pages[~df_pages['extractedLongContentValue'].str.contains('Badan Pengembangan|lembaga|Lembaga|Foundation|SMART', na=False, regex=True)]
df_pages = df_pages.dropna(subset=['id','imageServingUrl'])

In [3]:
df_pages.iloc[10]['imageServingUrl']

'https://lh3.googleusercontent.com/xvhhn13sPGzf0EvoBT2Q5J3GqYEahWanm2zUAqfArZ5t1e_XpiJF9KgCMfz4DvEHYiQ8KBKuDhrYimleODZcnHs'

In [4]:
def row_scrape_image(row):
    try:
        image_url = row['imageServingUrl']
        image_name = f"{row['id']}.jpg"
        image_path = os.path.join('./images', image_name)
        r = requests.get(image_url, allow_redirects=True)
        open(image_path, 'wb').write(r.content)
        return image_path
    except Exception as e:
        print(e)
        return None

In [5]:
if not os.path.exists('./images'):
    os.mkdir('./images')
image_paths = []
with concurrent.futures.ThreadPoolExecutor() as executor:

    futures = []

    for index,row in df_pages.iterrows():
        futures.append(executor.submit(row_scrape_image, row=row))

    for future in tqdm(concurrent.futures.as_completed(futures)):
        image_paths.append(future.result())
df_pages['image_path'] = image_paths

7245it [02:15, 53.55it/s] 


In [6]:
df_pages.to_csv('letsreadasia_pages_images_filtered.csv', index=False)

### BLIP Captioning

In [2]:
df_pages = pd.read_csv('letsreadasia_pages_images_filtered.csv')

In [3]:
import torch
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

processor = BlipProcessor.from_pretrained("/mnt/e/AI-Project/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("/mnt/e/AI-Project/blip-image-captioning-large", torch_dtype=torch.float16).to("cuda")

2023-03-22 16:01:15.803766: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-22 16:01:17.040103: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/usr/lib/wsl/lib::/home/karuniaperjuangan/anaconda3/lib::/usr/local/cuda-11.2/targets/x86_64-linux/lib::
2023-03-22 16:01:17.041956: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/usr/lib/wsl/lib::/home

In [6]:
list_corrupted_paths = []
def predict_caption(image_path):
    img = Image.open(image_path).convert('RGB')
    inputs = processor(img, return_tensors="pt",max_length=75).to("cuda", torch.float16)
    outputs = model.generate(**inputs)
    caption = processor.batch_decode(outputs, skip_special_tokens=True)[0]
    return caption

def predict_caption_in_batch(image_paths):
    imgs = []
    for image_path in image_paths:
        try:
            img = Image.open(image_path).convert('RGB')
        except:
            img = Image.new('RGB', (224, 224))
            list_corrupted_paths.append(image_path)
            print(f"{image_path} corrupted or not found!")
        imgs.append(img)
    inputs = processor(imgs, return_tensors="pt").to("cuda", torch.float16)
    outputs = model.generate(**inputs)
    captions = processor.batch_decode(outputs, skip_special_tokens=True, max_length=75)
    return captions

In [7]:
BATCH_SIZE = 64
image_paths = df_pages['image_path'].tolist()

captions = []
for i in tqdm(range(0, len(image_paths), BATCH_SIZE)):
    batch_image_paths = image_paths[i:i+BATCH_SIZE]
    batch_captions = predict_caption_in_batch(batch_image_paths)
    captions.extend(batch_captions)
    
df_pages['caption'] = captions

 83%|████████▎ | 95/114 [07:27<01:26,  4.58s/it]

./images/5008034443034624.jpg corrupted or not found!
./images/5289509419745280.jpg corrupted or not found!
./images/6263704907677696.jpg corrupted or not found!
./images/5916196419403776.jpg corrupted or not found!


100%|██████████| 114/114 [08:49<00:00,  4.65s/it]


In [8]:
df_pages[df_pages['id'].astype(str).str.contains('5916196419403776')]

Unnamed: 0,id,extractedLongContentValue,bookId,pageNum,imageUrl,imageServingUrl,thumborImageUrl,thumborImageUrlWithourResizing,imageWidth,imageHeight,...,audio,video,translatedState,idClone,deleted,bookName,bookTotalPages,isEnding,image_path,caption
6136,5916196419403776,Kuda Nil ingin menari. Ia melompat-lompat di t...,f4b70634-bd75-4568-bd69-c26edad2e7e1,1,https://storage.googleapis.com/lets-read-asia/...,https://lh3.googleusercontent.com/yvHbe8NjlQgS...,https://letsread-images.hamropatro.com/ilKchoC...,https://letsread-images.hamropatro.com/bvSkkDi...,800,800,...,,,False,5916196419403776,False,Kuda Nil ingin menari,13,False,./images/5775458931048448.jpg,cartoon hippoid hippoid hippoid hippoid hippoi...


In [9]:
                    
df_pages = df_pages[~df_pages['image_path'].isin(list_corrupted_paths)]

In [10]:
df_pages.to_csv('letsreadasia_pages_images_filtered_captioned.csv', index=False)