### Importing libraries

In [1]:
from google.cloud import storage
from google.cloud import bigquery
import pandas as pd
from tqdm import tqdm
import os
from datetime import datetime
import pytz
import shutil

### Setup GCS Path & Configuration

In [2]:
project_id = 'gurihmas-corp-prd'
bucket_name_ops = 'ml_nitro'
dir_image_ops = 'ops/image/asli'
dir_data_ops = 'ops/data'

bucket_name_all = 'ldg_gurih_ktp'
dir_image_all = 'image/'

In [3]:
def download_blob(bucket_name, source_blob_name, destination_file_name):
    
    storage_client = storage.Client()

    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(source_blob_name)
    blob.download_to_filename(destination_file_name)

### List KTP Images

In [5]:
IMG_ASLI_EXIST = '/home/jupyter/gurih_mas/OCR/image_samples/221018_Image_Asli.csv'
IMG_ASLI_REMAIN = '/home/jupyter/gurih_mas/OCR/image_samples/221018_Remaining_Image_Asli.csv'

df_img_exist = pd.read_csv(IMG_ASLI_EXIST, header=0, names=['ktp_filename'])
df_img_remain = pd.read_csv(IMG_ASLI_REMAIN, header=0, names=['ktp_filename'])
df_img_remain

Unnamed: 0,ktp_filename
0,0000d496-0c45-41a6-8419-1b975f296075
1,0001c4ef-5d44-4827-b247-168b64971aa8
2,00060617-0f7e-4e13-b5da-12319c049b02
3,0008f114-0fd9-409c-8cc8-98706e8a6d95
4,000a582f-ab51-4dec-8b60-2ffb834b0f9a
...,...
7733,ffb784f9-6e2c-4921-9431-f32cbcaf8f87
7734,ffc042be-590e-4770-b31e-a18f78f5327f
7735,ffc469f2-8c01-4a56-88d8-735c3f38ee86
7736,ffe09dfc-778d-4d60-ba45-41abcea845c2


In [6]:
df_img_exist

Unnamed: 0,ktp_filename
0,960e1b2b-8ca3-47dd-953c-deb355840791.jpg
1,954f69e1-e731-4b2e-9158-7e55f60b750e.jpg
2,c719bce5-0a01-4575-923d-3bedb7758463.jpg
3,00a2bd2d-4160-4b0a-8a3a-e135fb3b2853.jpg
4,4e509652-f84b-4455-af3c-e63bc8450c10.jpg
...,...
1348,a7530ac5-c0d3-478e-b282-6e5dba6571f0.jpg
1349,028239a4-af0a-42dd-bf45-95ef2624a2d5.jpg
1350,eb98b55d-b466-4bf6-b92f-05b31e05a295.jpg
1351,029811f7-8bd7-4f4f-bf80-c907c8dc86f0.jpg


In [10]:
df_img_added = df_img_remain.merge(df_img_exist.drop_duplicates(), on=['ktp_filename'], how='left', indicator=True)
df_img_added = df_img_added[df_img_added['_merge'] == 'left_only']
df_img_added

Unnamed: 0,ktp_filename,_merge
0,0000d496-0c45-41a6-8419-1b975f296075,left_only
1,0001c4ef-5d44-4827-b247-168b64971aa8,left_only
2,00060617-0f7e-4e13-b5da-12319c049b02,left_only
3,0008f114-0fd9-409c-8cc8-98706e8a6d95,left_only
4,000a582f-ab51-4dec-8b60-2ffb834b0f9a,left_only
...,...,...
7733,ffb784f9-6e2c-4921-9431-f32cbcaf8f87,left_only
7734,ffc042be-590e-4770-b31e-a18f78f5327f,left_only
7735,ffc469f2-8c01-4a56-88d8-735c3f38ee86,left_only
7736,ffe09dfc-778d-4d60-ba45-41abcea845c2,left_only


### Download Images

In [11]:
#Download 202218 Dataset

for i in tqdm(range(df_img_exist.shape[0])):
    image_folder_bucket = 'image'
    image_download_directory = '/home/jupyter/gurih_mas/OCR/image_samples/image_asli'
    image_filename_bucket = df_img_exist['ktp_filename'][i] ##.split(".")[0]
    image_full_path_bucket = '{}/{}'.format(image_folder_bucket, image_filename_bucket)
    image_full_path_dir = '{}/{}.jpg'.format(image_download_directory, image_filename_bucket)

    try:
        download_blob('ldg_gurih_ktp', image_full_path_bucket, image_full_path_dir)
        # print(image_full_path_bucket)
        # print(image_full_path_dir)
    except:
        ('Error file: {}'.format(image_full_path_dir))

100%|██████████| 1353/1353 [15:08<00:00,  1.49it/s]


In [17]:
#Download 202218 Dataset Additional

for i in tqdm(range(300)):
    image_folder_bucket = 'image'
    image_download_directory = '/home/jupyter/gurih_mas/OCR/image_samples/image_asli_added'
    image_filename_bucket = df_img_remain['ktp_filename'][i] ##.split(".")[0]
    image_full_path_bucket = '{}/{}'.format(image_folder_bucket, image_filename_bucket)
    image_full_path_dir = '{}/{}.jpg'.format(image_download_directory, image_filename_bucket)

    try:
        download_blob('ldg_gurih_ktp', image_full_path_bucket, image_full_path_dir)
        # print(image_full_path_bucket)
        # print(image_full_path_dir)
    except:
        ('Error file: {}'.format(image_full_path_dir))

100%|██████████| 300/300 [02:48<00:00,  1.78it/s]


### List downloaded images

In [18]:
folder_path = '/home/jupyter/gurih_mas/OCR/image_samples/image_asli_added'

df_img_added = pd.DataFrame([f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))])
df_img_added.columns = ['ktp_filename']
print(df_img_added.shape)
df_img_added

(300, 1)


Unnamed: 0,ktp_filename
0,0068ef20-1d1b-4961-af65-7d9d0a0897cc.jpg
1,006404a7-81d4-4d8c-9dbe-ce3c8f4df4e8.jpg
2,00ffe211-568a-4345-ba25-0829863d0f7d.jpg
3,006002a2-1345-4e49-97c5-f3b08f01852e.jpg
4,003e5596-fa43-48ae-b5ce-5fd389d25128.jpg
...,...
295,00e89aba-ea71-412a-b072-73e1f6f78cca.jpg
296,00b03b4c-9e70-40e9-8f27-be94aa0b0529.jpg
297,004acbe7-abfe-47a0-b18f-54fbb2444934.jpg
298,0104f6b8-fd15-421f-be92-82f6fcfc924e.jpg


In [19]:
df_img_added.to_csv('/home/jupyter/gurih_mas/OCR/image_samples/221018_Added_Image_Asli.csv', index=False)