# Import modules

In [1]:
# OS
import os

# Typing
from typing import List

# Analyser
import pandas as pd
from tqdm import tqdm

# Local utilities
from utils.classes.adapters import CIF2PandasAdapter
from utils.functions import get_cif_filepath

# Constants
from src.constants import (
    DATASET_PATH,
    TRAIN_PATH,
    TEST_PATH
)

# pickle
import pickle

# Register tqdm's `progress_apply` on pandas objects; the chunked
# CIF-parsing cell further down calls `Series.progress_apply`.
tqdm.pandas()

# Load Essentials

## Load Adapters

In [2]:
# Single adapter instance shared by every cell that parses a CIF file
# through `cif2pandas_adapter.apply(...)`.
cif2pandas_adapter = CIF2PandasAdapter()

## Load CSVs

In [3]:
# Read both tables first; "//" is collapsed in case DATASET_PATH already
# ends with a slash.
Train = pd.read_csv(
    f"{DATASET_PATH}/train.csv".replace("//", "/")
)
Pretest = pd.read_csv(
    f"{DATASET_PATH}/pretest.csv".replace("//", "/")
)

# Shift the default 0-based row labels to 1-based labels on both frames.
Train.index = Train.index + 1
Pretest.index = Pretest.index + 1

# Join data

In [4]:
# `get_cif_filepath` is a project helper; presumably it adds a `cif_filepath`
# column derived from TRAIN_PATH and each MOF name — verify against utils.functions.
# NOTE(review): `Train` is rebound here, so re-running this cell feeds the
# helper its own previous output.
Train = get_cif_filepath(Train, TRAIN_PATH)
# Preview the structure names next to their CIF file paths.
Train[['MOFname', 'cif_filepath']].head(5)

Unnamed: 0,MOFname,cif_filepath
1,mof_unit_1,tmlcc-2021/mof_cif_train/mof_unit_1.cif
2,mof_unit_2,tmlcc-2021/mof_cif_train/mof_unit_2.cif
3,mof_unit_3,tmlcc-2021/mof_cif_train/mof_unit_3.cif
4,mof_unit_4,tmlcc-2021/mof_cif_train/mof_unit_4.cif
5,mof_unit_5,tmlcc-2021/mof_cif_train/mof_unit_5.cif


# Test Adapter

## CIF

In [5]:
# Parse one sample structure: the row labelled 1 (the first row, because the
# index was shifted to start at 1 when the CSV was loaded).
adapted_cif: dict = cif2pandas_adapter.apply(Train.loc[1, "cif_filepath"])

In [6]:
# Render the `metadata` entry of the parsed CIF as a table.
pd.DataFrame(adapted_cif['metadata'])

Unnamed: 0,0,1
0,_audit_creation_date,2013-03-05T16:41:16-0500
1,_symmetry_space_group_name_H-M,P1
2,_symmetry_Int_Tables_number,1
3,_space_group_crystal_system,triclinic
4,_cell_length_a,10.609882
5,_cell_length_b,10.643578
6,_cell_length_c,9.890832
7,_cell_angle_alpha,89.904234
8,_cell_angle_beta,91.242355
9,_cell_angle_gamma,90.261935


In [7]:
# Render the first element of the parsed CIF's `loops` entry as a table.
pd.DataFrame(adapted_cif['loops'][0])

Unnamed: 0,_atom_site_label,_atom_site_type_symbol,_atom_site_description,_atom_site_fract_x,_atom_site_fract_y,_atom_site_fract_z,_atom_type_partial_charge
7,O1,O,O_R,0.500020,0.316790,0.614690,-0.628319
8,O2,O,O_R,0.500000,0.316760,0.385350,-0.604978
9,C1,C,C_R,0.511239,0.260617,0.499919,0.943344
10,O3,O,O_R,0.319080,0.500020,0.614640,-0.626555
11,O4,O,O_R,0.319070,0.499980,0.385300,-0.570235
...,...,...,...,...,...,...,...
77,H19,H,H_,0.720880,0.734379,0.913303,0.347912
78,C30,C,C_2,0.348847,0.101593,0.756535,0.967858
79,O20,O,O_2,0.293392,0.097996,0.863474,-0.568112
80,O21,O,O_R,0.284013,0.135186,0.637326,-0.818747


In [8]:
# Render the second element of the parsed CIF's `loops` entry as a table.
# NOTE(review): the recorded output is identical to the one for loops[0] —
# confirm the adapter is not returning the same loop twice.
pd.DataFrame(adapted_cif['loops'][1])

Unnamed: 0,_atom_site_label,_atom_site_type_symbol,_atom_site_description,_atom_site_fract_x,_atom_site_fract_y,_atom_site_fract_z,_atom_type_partial_charge
7,O1,O,O_R,0.500020,0.316790,0.614690,-0.628319
8,O2,O,O_R,0.500000,0.316760,0.385350,-0.604978
9,C1,C,C_R,0.511239,0.260617,0.499919,0.943344
10,O3,O,O_R,0.319080,0.500020,0.614640,-0.626555
11,O4,O,O_R,0.319070,0.499980,0.385300,-0.570235
...,...,...,...,...,...,...,...
77,H19,H,H_,0.720880,0.734379,0.913303,0.347912
78,C30,C,C_2,0.348847,0.101593,0.756535,0.967858
79,O20,O,O_2,0.293392,0.097996,0.863474,-0.568112
80,O21,O,O_R,0.284013,0.135186,0.637326,-0.818747


# Apply Adapters

In [None]:
def chuck_pickle(df, n, path='.', adapter=None):
    """Parse CIF files in row chunks and cache every chunk as a pickle file.

    ``df`` is walked in consecutive positional slices of ``n`` rows. For each
    slice whose pickle file is not already present in ``path``, every value of
    the ``cif_filepath`` column is passed to ``adapter.apply``, the results are
    stored in a new ``cif`` column of a copy of the slice, and that copy is
    written with ``DataFrame.to_pickle``. Slices whose file already exists are
    skipped, so an interrupted run can be resumed by calling the function again.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``cif_filepath`` column. It is never modified.
    n : int
        Number of rows per chunk; must be at least 1.
    path : str, default '.'
        Directory that receives the pickle files; created when missing.
    adapter : object, optional
        Object exposing ``apply(cif_filepath)``. When omitted, the
        notebook-level ``cif2pandas_adapter`` is used.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.

    Notes
    -----
    Files are named ``{start}_{start + n}.pickle``. The upper bound in the last
    file name is not clamped to the number of rows, which keeps names stable
    for caches written by earlier runs.
    """
    if n < 1:
        raise ValueError(f"n must be a positive chunk size, got {n!r}")
    if adapter is None:
        adapter = cif2pandas_adapter
    if path:
        # to_pickle does not create missing parent directories.
        os.makedirs(path, exist_ok=True)

    for start in range(0, df.shape[0], n):
        stop = start + n
        filepath = os.path.join(path, f'{start}_{stop}.pickle')
        if os.path.exists(filepath):
            continue
        print(f"Pickle file from {start} to {stop} not found, generating the new one")
        # Work on an explicit copy: adding a column to a bare slice of `df`
        # is chained assignment and triggers SettingWithCopyWarning.
        chunk = df.iloc[start:stop].copy()
        filepaths = chunk['cif_filepath']
        # Use the tqdm progress bar when `tqdm.pandas()` has been registered,
        # otherwise fall back to the plain pandas `apply`.
        apply_with_progress = getattr(filepaths, 'progress_apply', filepaths.apply)
        chunk['cif'] = apply_with_progress(adapter.apply)
        chunk.to_pickle(filepath)

# Directory that receives the pickled, CIF-augmented training chunks.
PREPROCESSING_TRAIN_PATH = 'preprocessing_results/Train'
# Rows per pickle file, named so the chunk size can be tuned in one place.
TRAIN_CHUNK_SIZE = 10000
chuck_pickle(Train, TRAIN_CHUNK_SIZE, path=PREPROCESSING_TRAIN_PATH)

Pickle file from 0 to 10000 not found, generating the new one
  2%|▏         | 226/10000 [00:03<02:18, 70.49it/s]

In [None]:
 # Inspect the training table after the chunking cell.
 # NOTE(review): the recorded output shows a 0-based index and a `cif` column,
 # unlike the earlier cells (1-based index) — it looks like output from a
 # different kernel state; re-run from a fresh kernel to confirm.
 Train

Unnamed: 0,MOFname,volume [A^3],weight [u],surface_area [m^2/g],void_fraction,void_volume [cm^3/g],functional_groups,metal_linker,organic_linker1,organic_linker2,topology,CO2/N2_selectivity,heat_adsorption_CO2_P0.15bar_T298K [kcal/mol],CO2_working_capacity [mL/g],cif_filepath,cif
0,mof_unit_1,1116.667429,875.240600,0.00,0.07899,0.0607,COOH-OEt,3,4,11,pcu,22.864166,6.786041,105.284502,tmlcc-2021/mof_cif_train/mof_unit_1.cif,1.0
1,mof_unit_2,2769.503842,2211.697211,603.61,0.13794,0.1040,F-OMe,10,44,57,etb,33.616780,7.147286,101.224774,tmlcc-2021/mof_cif_train/mof_unit_2.cif,
2,mof_unit_3,1089.818728,773.687960,788.50,0.14874,0.1262,OMe-COOH,2,22,24,pcu,19.263726,6.347967,118.987011,tmlcc-2021/mof_cif_train/mof_unit_3.cif,
3,mof_unit_4,2205.198301,1304.638720,1441.53,0.21814,0.2220,H-SO3H,9,17,24,sra,25.701377,6.190085,187.626004,tmlcc-2021/mof_cif_train/mof_unit_4.cif,
4,mof_unit_5,1137.800963,901.736120,0.00,0.07778,0.0591,NHMe-OH,2,1,22,pcu,30.001838,6.478063,79.210001,tmlcc-2021/mof_cif_train/mof_unit_5.cif,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68608,mof_unit_68609,1188.302573,1001.700216,0.00,0.00000,0.0000,Pr-F,3,4,24,pcu,24.131770,,-12.943652,tmlcc-2021/mof_cif_train/mof_unit_68609.cif,
68609,mof_unit_68610,1506.660363,1493.296496,0.00,0.01108,0.0000,SO3H,10,42,46,etb,6.071818,,-12.985582,tmlcc-2021/mof_cif_train/mof_unit_68610.cif,
68610,mof_unit_68611,2035.532738,1959.518320,0.00,0.00000,0.0000,OPr,4,14,22,acs,9.876134,,-13.187635,tmlcc-2021/mof_cif_train/mof_unit_68611.cif,
68611,mof_unit_68612,3985.426053,3638.677280,0.00,0.00000,0.0000,OPr-Me,4,4,15,acs,5.285051,inf,15.672698,tmlcc-2021/mof_cif_train/mof_unit_68612.cif,


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=cf8541de-dbc3-45f6-bc1e-4fa446cacbcd' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>