In [3]:
import os
import torch
import pandas as pd
import numpy as np
import PIL

from PIL import Image
from tqdm import tqdm

In [86]:
# Read the training labels into a frame indexed by image_id and preview
# the first rows.
data = pd.read_csv(
    filepath_or_buffer='bms-molecular-translation/train_labels.csv',
    index_col='image_id',
)
data.head(n=5)

Unnamed: 0_level_0,InChI
image_id,Unnamed: 1_level_1
000011a64c74,InChI=1S/C13H20OS/c1-9(2)8-15-13-6-5-10(3)7-12...
000019cc0cd2,InChI=1S/C21H30O4/c1-12(22)25-14-6-8-20(2)13(1...
0000252b6d2b,InChI=1S/C24H23N5O4/c1-14-13-15(7-8-17(14)28-1...
000026b49b7e,InChI=1S/C17H24N2O4S/c1-12(20)18-13(14-7-6-10-...
000026fc6c36,InChI=1S/C10H19N3O2S/c1-15-10(14)12-8-4-6-13(7...


In [87]:
# Display the dimensions of the labels frame as a (rows, columns) tuple.
data.shape

(2424186, 1)

In [88]:
# Derive two features from each InChI string, whose layers are separated by '/':
#   layers_count - number of '/'-separated parts (the 'InChI=1S' prefix included)
#   chemical     - the second part, i.e. the text between the first and second '/'
# The split is done once with the vectorised .str accessor instead of two
# per-row Python lambdas; a missing value or a string without '/' yields NaN
# rather than aborting the whole cell with an exception.
inchi_layers = data['InChI'].str.split('/')
data['layers_count'] = inchi_layers.str.len()
data['chemical'] = inchi_layers.str[1]
# The intermediate Series holds one Python list per row; release it right away.
del inchi_layers

In [89]:
# Summarise how many distinct values each column of interest holds.
layer_counts = data["layers_count"]
unique_images = data.index.nunique()
unique_formulas = data["InChI"].nunique()
unique_layer_count = layer_counts.nunique()
unique_layer_values = layer_counts.unique()
unique_chemicals = data["chemical"].nunique()

print(f'Unique images: {unique_images}')
print(f'Unique formulas: {unique_formulas}')
print(f'Unique number of layers count: {unique_layer_count}')
print(f'Unique number of layers values: {unique_layer_values}')
print(f'Unique chemical elements: {unique_chemicals}')

Unique images: 2424186
Unique formulas: 2424186
Unique number of layers count: 9
Unique number of layers values: [ 4  7  5  8  6  9 10  3 11]
Unique chemical elements: 329768


In [90]:
# Preview the first rows of the frame, including the layers_count and
# chemical columns added above.
data.head()

Unnamed: 0_level_0,InChI,layers_count,chemical
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
000011a64c74,InChI=1S/C13H20OS/c1-9(2)8-15-13-6-5-10(3)7-12...,4,C13H20OS
000019cc0cd2,InChI=1S/C21H30O4/c1-12(22)25-14-6-8-20(2)13(1...,7,C21H30O4
0000252b6d2b,InChI=1S/C24H23N5O4/c1-14-13-15(7-8-17(14)28-1...,4,C24H23N5O4
000026b49b7e,InChI=1S/C17H24N2O4S/c1-12(20)18-13(14-7-6-10-...,4,C17H24N2O4S
000026fc6c36,InChI=1S/C10H19N3O2S/c1-15-10(14)12-8-4-6-13(7...,4,C10H19N3O2S


In [93]:
# Attach on-disk metadata (file path and pixel dimensions) to every labelled
# training image.
#
# The four columns start from sentinel defaults ('' / 0), so rows whose image
# file was never found stay identifiable afterwards (e.g. image_width == 0),
# and re-running the cell starts from a clean state.
data['image_url'] = ''
data['image_size'] = ''
data['image_width'] = 0
data['image_height'] = 0
folder_prefix = os.path.join('.', 'bms-molecular-translation', 'train')

# A single, manually advanced progress bar covers the whole directory walk;
# wrapping each directory's file list in its own bar prints one bar per folder.
with tqdm(desc='Scanning train images', unit='img') as progress:
    for root, dirs, files in os.walk(folder_prefix, topdown=True):
        for image in files:
            progress.update(1)
            try:
                # splitext strips only the final extension, so the id is not
                # truncated at an earlier dot in the file name.
                image_id = os.path.splitext(image)[0]
                if image_id not in data.index:
                    # Assigning through .at with an unknown label would
                    # silently append a row that has no InChI label.
                    print(f'Skipping image {image}: image_id not present in labels')
                    continue

                image_url = os.path.join(root, image)

                # The context manager closes the file handle as soon as the
                # size has been read, instead of leaving it to the garbage
                # collector for every one of the files visited.
                with PIL.Image.open(image_url) as image_obj:
                    width, height = image_obj.size

                data.at[image_id, 'image_url'] = image_url
                data.at[image_id, 'image_width'] = width
                data.at[image_id, 'image_height'] = height
                data.at[image_id, 'image_size'] = f'{width}x{height}'
            except Exception as ex:
                # Deliberately best-effort: report the unreadable file and
                # keep going so one bad image does not abort the scan.
                print(f'Exception occurred while processing image {image}: {str(ex)}')
data.head()

0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 592/592 [00:03<00:00, 164.60it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 577/577 [00:03<00:00, 168.52it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 624/624 [00:03<00:00, 157.93it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 578/578 [00:03<00:00, 150.43it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 557/557 [00:03<00:00, 143.96it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 589/589 [00:03<00:00, 152.35it/s]
1

Unnamed: 0_level_0,InChI,layers_count,chemical,image_url,image_size,image_width,image_height
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
000011a64c74,InChI=1S/C13H20OS/c1-9(2)8-15-13-6-5-10(3)7-12...,4,C13H20OS,.\bms-molecular-translation\train\0\0\0\000011...,325x229,325,229
000019cc0cd2,InChI=1S/C21H30O4/c1-12(22)25-14-6-8-20(2)13(1...,7,C21H30O4,.\bms-molecular-translation\train\0\0\0\000019...,288x148,288,148
0000252b6d2b,InChI=1S/C24H23N5O4/c1-14-13-15(7-8-17(14)28-1...,4,C24H23N5O4,.\bms-molecular-translation\train\0\0\0\000025...,509x335,509,335
000026b49b7e,InChI=1S/C17H24N2O4S/c1-12(20)18-13(14-7-6-10-...,4,C17H24N2O4S,.\bms-molecular-translation\train\0\0\0\000026...,243x177,243,177
000026fc6c36,InChI=1S/C10H19N3O2S/c1-15-10(14)12-8-4-6-13(7...,4,C10H19N3O2S,.\bms-molecular-translation\train\0\0\0\000026...,294x112,294,112


In [94]:
# Persist the enriched frame, with the image_id index written as the first
# column, so the image scan does not have to be repeated.
data.to_csv('extended_data.csv', index=True)

In [97]:
# Report the mean image dimensions and the number of distinct WxH combinations.
mean_width = data["image_width"].mean()
mean_height = data["image_height"].mean()
distinct_sizes = data["image_size"].nunique()

print(f'Average image width: {mean_width}')
print(f'Average image height: {mean_height}')
print(f'Unique image sizes: {distinct_sizes}')

Average image width: 380.95599017567133
Average image height: 219.82014870146102
Unique image sizes: 149000


In [104]:
# Rows still carrying the default width of 0 were never filled in by the
# image scan; show the first few of them, if any.
zero_width_mask = data['image_width'].eq(0)
data[zero_width_mask].head()

Unnamed: 0_level_0,InChI,layers_count,chemical,image_url,image_size,image_width,image_height
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1


In [5]:
# Reload the enriched table from the CSV file, restoring image_id as the
# index, and preview the first rows.
extended_data = pd.read_csv(
    filepath_or_buffer='extended_data.csv',
    index_col='image_id',
)
extended_data.head(n=5)

Unnamed: 0_level_0,InChI,layers_count,chemical,image_url,image_size,image_width,image_height
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
000011a64c74,InChI=1S/C13H20OS/c1-9(2)8-15-13-6-5-10(3)7-12...,4,C13H20OS,.\bms-molecular-translation\train\0\0\0\000011...,325x229,325,229
000019cc0cd2,InChI=1S/C21H30O4/c1-12(22)25-14-6-8-20(2)13(1...,7,C21H30O4,.\bms-molecular-translation\train\0\0\0\000019...,288x148,288,148
0000252b6d2b,InChI=1S/C24H23N5O4/c1-14-13-15(7-8-17(14)28-1...,4,C24H23N5O4,.\bms-molecular-translation\train\0\0\0\000025...,509x335,509,335
000026b49b7e,InChI=1S/C17H24N2O4S/c1-12(20)18-13(14-7-6-10-...,4,C17H24N2O4S,.\bms-molecular-translation\train\0\0\0\000026...,243x177,243,177
000026fc6c36,InChI=1S/C10H19N3O2S/c1-15-10(14)12-8-4-6-13(7...,4,C10H19N3O2S,.\bms-molecular-translation\train\0\0\0\000026...,294x112,294,112
