# Import modules

In [1]:
# OS
import os

# Typing
from typing import List

# Analyser
import pandas as pd
from tqdm import tqdm

# Local utilities
from utils.classes.adapters import CIF2PandasAdapter
from utils.functions import (
    get_cif_filepath,
    merge_metadata,
    read_pandas_pickle
)

# Constants
from src.constants import (
    DATASET_PATH,
    CIF_TRAIN_PATH,
    CIF_PRETEST_PATH,
    PREPROCESSING_TRAIN_PATH,
    PREPROCESSING_PRETEST_PATH
)

# pickle
import pickle

tqdm.pandas()

# Load Essentials

## Load Adapters

In [2]:
# One shared adapter instance for the whole notebook; later cells call its
# `.apply(cif_filepath)` method on individual CIF file paths.
cif2pandas_adapter = CIF2PandasAdapter()

## Load CSVs

In [3]:
# Read the train / pretest tables from DATASET_PATH. The `.replace` collapses
# the doubled slash that appears when DATASET_PATH already ends with "/".
# Row labels are shifted by one so they start at 1 instead of 0.
Train = pd.read_csv(f"{DATASET_PATH}/train.csv".replace("//", "/"))
Train.index += 1
Pretest = pd.read_csv(f"{DATASET_PATH}/pretest.csv".replace("//", "/"))
Pretest.index += 1

# Join data with CIF filename

In [4]:
# Pass the train table through get_cif_filepath with the train CIF directory,
# then preview the name / file-path pair for the first five rows.
Train = get_cif_filepath(Train, CIF_TRAIN_PATH)
Train.loc[:, ['MOFname', 'cif_filepath']].head()

Unnamed: 0,MOFname,cif_filepath
1,mof_unit_1,tmlcc-2021/mof_cif_train/mof_unit_1.cif
2,mof_unit_2,tmlcc-2021/mof_cif_train/mof_unit_2.cif
3,mof_unit_3,tmlcc-2021/mof_cif_train/mof_unit_3.cif
4,mof_unit_4,tmlcc-2021/mof_cif_train/mof_unit_4.cif
5,mof_unit_5,tmlcc-2021/mof_cif_train/mof_unit_5.cif


In [5]:
# Pass the pretest table through get_cif_filepath with the pretest CIF
# directory, then preview the name / file-path pair for the first five rows.
Pretest = get_cif_filepath(Pretest, CIF_PRETEST_PATH)
Pretest.loc[:, ['MOFname', 'cif_filepath']].head()

Unnamed: 0,MOFname,cif_filepath
1,mof_unit_pretest_1,tmlcc-2021/mof_cif_pretest/mof_unit_pretest_1.cif
2,mof_unit_pretest_2,tmlcc-2021/mof_cif_pretest/mof_unit_pretest_2.cif
3,mof_unit_pretest_3,tmlcc-2021/mof_cif_pretest/mof_unit_pretest_3.cif
4,mof_unit_pretest_4,tmlcc-2021/mof_cif_pretest/mof_unit_pretest_4.cif
5,mof_unit_pretest_5,tmlcc-2021/mof_cif_pretest/mof_unit_pretest_5.cif


# Test Adapter

## CIF

In [6]:
# Smoke-test the adapter on the CIF file of the row labelled 1 (the first row,
# since the index was shifted to start at 1). The following cells read the
# 'metadata' and 'loops' entries of the returned mapping.
adapted_cif: dict = cif2pandas_adapter.apply(Train.loc[1, "cif_filepath"])

In [7]:
pd.DataFrame(adapted_cif['metadata'])

Unnamed: 0,0,1
0,_audit_creation_date,2013-03-05T16:41:16-0500
1,_symmetry_space_group_name_H-M,P1
2,_symmetry_Int_Tables_number,1
3,_space_group_crystal_system,triclinic
4,_cell_length_a,10.609882
5,_cell_length_b,10.643578
6,_cell_length_c,9.890832
7,_cell_angle_alpha,89.904234
8,_cell_angle_beta,91.242355
9,_cell_angle_gamma,90.261935


In [8]:
pd.DataFrame(adapted_cif['loops'][0])

Unnamed: 0,_atom_site_label,_atom_site_type_symbol,_atom_site_description,_atom_site_fract_x,_atom_site_fract_y,_atom_site_fract_z,_atom_type_partial_charge
7,O1,O,O_R,0.500020,0.316790,0.614690,-0.628319
8,O2,O,O_R,0.500000,0.316760,0.385350,-0.604978
9,C1,C,C_R,0.511239,0.260617,0.499919,0.943344
10,O3,O,O_R,0.319080,0.500020,0.614640,-0.626555
11,O4,O,O_R,0.319070,0.499980,0.385300,-0.570235
...,...,...,...,...,...,...,...
77,H19,H,H_,0.720880,0.734379,0.913303,0.347912
78,C30,C,C_2,0.348847,0.101593,0.756535,0.967858
79,O20,O,O_2,0.293392,0.097996,0.863474,-0.568112
80,O21,O,O_R,0.284013,0.135186,0.637326,-0.818747


In [9]:
pd.DataFrame(adapted_cif['loops'][1])

Unnamed: 0,_atom_site_label,_atom_site_type_symbol,_atom_site_description,_atom_site_fract_x,_atom_site_fract_y,_atom_site_fract_z,_atom_type_partial_charge
7,O1,O,O_R,0.500020,0.316790,0.614690,-0.628319
8,O2,O,O_R,0.500000,0.316760,0.385350,-0.604978
9,C1,C,C_R,0.511239,0.260617,0.499919,0.943344
10,O3,O,O_R,0.319080,0.500020,0.614640,-0.626555
11,O4,O,O_R,0.319070,0.499980,0.385300,-0.570235
...,...,...,...,...,...,...,...
77,H19,H,H_,0.720880,0.734379,0.913303,0.347912
78,C30,C,C_2,0.348847,0.101593,0.756535,0.967858
79,O20,O,O_2,0.293392,0.097996,0.863474,-0.568112
80,O21,O,O_R,0.284013,0.135186,0.637326,-0.818747


# Apply Adapters

In [10]:
def chuck_pickle(df, n, path='.'):
    """Cache the adapted CIF data of ``df`` as pickle files of at most ``n`` rows.

    The frame is walked in consecutive positional slices of ``n`` rows. Each
    slice is stored under ``path`` as ``<first index label>_<last index
    label>.pickle``. A slice whose file already exists is skipped, so
    re-running the cell only generates the chunks that are still missing.

    Args:
        df: DataFrame with a ``cif_filepath`` column. It is not modified.
        n: Maximum number of rows per pickle file; must be at least 1.
        path: Directory that receives the pickle files (created on demand).

    Returns:
        None. Results are written to disk and progress is printed.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    # Iterate over 0-based positions up to len(df) so that a trailing chunk is
    # always emitted, even when it holds a single row.
    for offset in range(0, df.shape[0], n):
        rows = df.iloc[offset:offset + n]
        start = rows.index[0]
        end = rows.index[-1]
        filename = f'{start}_{end}.pickle'
        filepath = (path + '/' + filename).replace('//', '/')

        if os.path.exists(filepath):
            print(f"found {filepath}")
            continue

        print(f"Pickle file from {start} to {end} not found, generating the new one")
        # Work on an explicit copy: adding the column to a slice of ``df``
        # is what pandas flags with SettingWithCopyWarning.
        chunk = rows.copy()
        # ``progress_apply`` is registered on pandas objects by ``tqdm.pandas()``.
        chunk['cif'] = chunk.cif_filepath.progress_apply(
            lambda cif_filepath: cif2pandas_adapter.apply(cif_filepath)
        )
        # Make sure the target directory exists before writing the chunk.
        os.makedirs(os.path.dirname(filepath) or '.', exist_ok=True)
        chunk.to_pickle(filepath)


# Cache the adapted CIF data of both splits in pickle chunks of 10000 rows.
# chuck_pickle skips every chunk whose pickle file already exists, so
# re-running this cell only prints "found ..." for completed chunks.
chuck_pickle(Train, 10000, path=PREPROCESSING_TRAIN_PATH)
chuck_pickle(Pretest, 10000, path=PREPROCESSING_PRETEST_PATH) 

found preprocessing_results/train/1_10000.pickle
found preprocessing_results/train/10001_20000.pickle
found preprocessing_results/train/20001_30000.pickle
found preprocessing_results/train/30001_40000.pickle
found preprocessing_results/train/40001_50000.pickle
found preprocessing_results/train/50001_60000.pickle
found preprocessing_results/train/60001_68613.pickle
found preprocessing_results/pretest/1_2000.pickle


In [11]:
# Spot-check a single cached chunk: load the first train pickle and run it
# through merge_metadata. The path is built from PREPROCESSING_TRAIN_PATH (the
# directory chuck_pickle wrote to) rather than a hardcoded absolute path, so
# the cell does not depend on the notebook living under /work.
chuck = pd.read_pickle(f"{PREPROCESSING_TRAIN_PATH}/1_10000.pickle".replace("//", "/"))
merge_metadata(chuck)

Unnamed: 0,MOFname,volume [A^3],weight [u],surface_area [m^2/g],void_fraction,void_volume [cm^3/g],functional_groups,metal_linker,organic_linker1,organic_linker2,...,_symmetry_space_group_name_H-M,_symmetry_Int_Tables_number,_space_group_crystal_system,_cell_length_a,_cell_length_b,_cell_length_c,_cell_angle_alpha,_cell_angle_beta,_cell_angle_gamma,_cell_volume
1,mof_unit_1,1116.667429,875.240600,0.00,0.07899,0.0607,COOH-OEt,3,4,11,...,P1,1,triclinic,10.609882,10.643578,9.890832,89.904234,91.242355,90.261935,1116.66742888
2,mof_unit_2,2769.503842,2211.697211,603.61,0.13794,0.1040,F-OMe,10,44,57,...,P1,1,triclinic,8.463295,17.684225,18.960098,100.063446,91.815775,96.94221,2769.50384235
3,mof_unit_3,1089.818728,773.687960,788.50,0.14874,0.1262,OMe-COOH,2,22,24,...,P1,1,triclinic,10.73211,9.552271,10.631996,89.202223,89.943258,90.387501,1089.81872751
4,mof_unit_4,2205.198301,1304.638720,1441.53,0.21814,0.2220,H-SO3H,9,17,24,...,P1,1,triclinic,6.93553,17.504896,19.27498,109.537419,90.23458,90.532988,2205.19830122
5,mof_unit_5,1137.800963,901.736120,0.00,0.07778,0.0591,NHMe-OH,2,1,22,...,P1,1,triclinic,10.825925,9.699886,10.853274,89.694657,92.990983,91.404649,1137.80096321
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,mof_unit_9996,2733.592152,2128.550096,734.48,0.17093,0.1322,COOH-NO2,10,44,55,...,P1,1,triclinic,8.408072,17.316532,19.086984,98.096373,92.225944,95.739547,2733.59215194
9997,mof_unit_9997,2930.745926,1211.695820,3040.39,0.40394,0.5884,HCO-SO3H,3,14,29,...,P1,1,triclinic,27.732558,10.71514,9.868152,89.002148,90.135056,91.6441,2930.74592589
9998,mof_unit_9998,4142.678504,2475.175680,-1.00,-1.00000,0.0000,H-NO2,4,6,17,...,P1,1,triclinic,12.927189,13.236449,27.764004,94.30699,90.581332,118.869563,4142.67850366
9999,mof_unit_9999,1101.969860,1702.390120,0.00,0.09968,0.0389,I-Br,2,10,26,...,P1,1,triclinic,17.232074,8.78406,7.504137,103.137752,89.887983,94.877927,1101.96986013


In [12]:
# Reload each split from its directory of cached chunk pickles and feed the
# result straight into merge_metadata. The names printed below are emitted
# while the pickle directories are being read.
Train = merge_metadata(read_pandas_pickle(PREPROCESSING_TRAIN_PATH))

Pretest = merge_metadata(read_pandas_pickle(PREPROCESSING_PRETEST_PATH))

60001_68613.pickle
40001_50000.pickle
10001_20000.pickle
1_10000.pickle
20001_30000.pickle
50001_60000.pickle
30001_40000.pickle
1_2000.pickle


In [13]:
# Persist the merged frames (still containing the `cif` column) as pickles.
# NOTE(review): the absolute `/work/...` paths are hardcoded and the target
# directories must already exist; consider deriving them from a constant.
Train.to_pickle("/work/preprocessing_results/train_merged_CIF/Train.pickle")
Pretest.to_pickle("/work/preprocessing_results/pretest_merged_CIF/Pretest.pickle")

In [14]:
Train

Unnamed: 0,MOFname,volume [A^3],weight [u],surface_area [m^2/g],void_fraction,void_volume [cm^3/g],functional_groups,metal_linker,organic_linker1,organic_linker2,...,_symmetry_space_group_name_H-M,_symmetry_Int_Tables_number,_space_group_crystal_system,_cell_length_a,_cell_length_b,_cell_length_c,_cell_angle_alpha,_cell_angle_beta,_cell_angle_gamma,_cell_volume
1,mof_unit_1,1116.667429,875.240600,0.00,0.07899,0.0607,COOH-OEt,3,4,11,...,P1,1,triclinic,10.609882,10.643578,9.890832,89.904234,91.242355,90.261935,1116.66742888
2,mof_unit_2,2769.503842,2211.697211,603.61,0.13794,0.1040,F-OMe,10,44,57,...,P1,1,triclinic,8.463295,17.684225,18.960098,100.063446,91.815775,96.94221,2769.50384235
3,mof_unit_3,1089.818728,773.687960,788.50,0.14874,0.1262,OMe-COOH,2,22,24,...,P1,1,triclinic,10.73211,9.552271,10.631996,89.202223,89.943258,90.387501,1089.81872751
4,mof_unit_4,2205.198301,1304.638720,1441.53,0.21814,0.2220,H-SO3H,9,17,24,...,P1,1,triclinic,6.93553,17.504896,19.27498,109.537419,90.23458,90.532988,2205.19830122
5,mof_unit_5,1137.800963,901.736120,0.00,0.07778,0.0591,NHMe-OH,2,1,22,...,P1,1,triclinic,10.825925,9.699886,10.853274,89.694657,92.990983,91.404649,1137.80096321
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68609,mof_unit_68609,1188.302573,1001.700216,0.00,0.00000,0.0000,Pr-F,3,4,24,...,P1,1,triclinic,10.718161,10.88649,10.19387,90.842269,92.241073,90.753457,1188.3025732
68610,mof_unit_68610,1506.660363,1493.296496,0.00,0.01108,0.0000,SO3H,10,42,46,...,P1,1,triclinic,8.19262,12.57623,15.033794,95.184751,99.147187,97.430499,1506.66036263
68611,mof_unit_68611,2035.532738,1959.518320,0.00,0.00000,0.0000,OPr,4,14,22,...,P1,1,triclinic,11.237482,11.321902,18.60812,90.200546,90.118422,59.292454,2035.53273793
68612,mof_unit_68612,3985.426053,3638.677280,0.00,0.00000,0.0000,OPr-Me,4,4,15,...,P1,1,triclinic,19.396341,11.081428,18.544746,90.467105,90.841157,89.911826,3985.42605274


In [15]:
# Export CSV copies of the merged frames with the `cif` column dropped
# (presumably because it holds nested adapter output — TODO confirm).
# `drop` returns a new frame, so Train / Pretest keep their `cif` column.
# NOTE(review): the absolute `/work/...` paths are hardcoded.
Train.drop("cif", axis=1).to_csv("/work/preprocessing_results/train_merged_CIF/Train.csv")
Pretest.drop("cif", axis=1).to_csv("/work/preprocessing_results/pretest_merged_CIF/Pretest.csv")

In [16]:
len(set(Train['_space_group_crystal_system']))

5

In [17]:
set(Train['_space_group_crystal_system'])

{'cubic', 'monoclinic', 'orthorhombic', 'tetragonal', 'triclinic'}

In [19]:
set(Train['functional_groups'])

{'Br',
 'Br-CN',
 'Br-COOH',
 'Br-Cl',
 'Br-Et',
 'Br-F',
 'Br-H',
 'Br-HCO',
 'Br-I',
 'Br-Me',
 'Br-NH2',
 'Br-NHMe',
 'Br-NO2',
 'Br-OEt',
 'Br-OH',
 'Br-OMe',
 'Br-OPr',
 'Br-Ph',
 'Br-Pr',
 'Br-SO3H',
 'CN',
 'CN-Br',
 'CN-COOH',
 'CN-Cl',
 'CN-Et',
 'CN-F',
 'CN-H',
 'CN-HCO',
 'CN-I',
 'CN-Me',
 'CN-NH2',
 'CN-NHMe',
 'CN-NO2',
 'CN-OEt',
 'CN-OH',
 'CN-OMe',
 'CN-OPr',
 'CN-Ph',
 'CN-Pr',
 'CN-SO3H',
 'COOH',
 'COOH-Br',
 'COOH-CN',
 'COOH-Cl',
 'COOH-Et',
 'COOH-F',
 'COOH-H',
 'COOH-HCO',
 'COOH-I',
 'COOH-Me',
 'COOH-NH2',
 'COOH-NHMe',
 'COOH-NO2',
 'COOH-OEt',
 'COOH-OH',
 'COOH-OMe',
 'COOH-OPr',
 'COOH-Ph',
 'COOH-Pr',
 'COOH-SO3H',
 'Cl',
 'Cl-Br',
 'Cl-CN',
 'Cl-COOH',
 'Cl-Et',
 'Cl-F',
 'Cl-H',
 'Cl-HCO',
 'Cl-I',
 'Cl-Me',
 'Cl-NH2',
 'Cl-NHMe',
 'Cl-NO2',
 'Cl-OEt',
 'Cl-OH',
 'Cl-OMe',
 'Cl-OPr',
 'Cl-Ph',
 'Cl-Pr',
 'Cl-SO3H',
 'Et',
 'Et-Br',
 'Et-CN',
 'Et-COOH',
 'Et-Cl',
 'Et-F',
 'Et-H',
 'Et-HCO',
 'Et-I',
 'Et-Me',
 'Et-NH2',
 'Et-NHMe',
 'Et-NO2',
 'Et-OE

In [18]:
set(Pretest['_space_group_crystal_system'])

{'monoclinic', 'orthorhombic', 'tetragonal', 'triclinic'}

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=cf8541de-dbc3-45f6-bc1e-4fa446cacbcd' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>