# Import modules

In [None]:
# OS
import os

# Typing
from typing import List

# Analyser
import numpy as np
import pandas as pd
from tqdm import tqdm

# Local utilities
from utils.classes.adapters import CIF2PandasAdapter
from utils.functions import (
    get_cif_filepath,
    merge_metadata,
    read_pandas_pickle,
    aggregate_loop_0,
    aggregate_loop_1
)

# Constants
from src.constants import (
    DATASET_PATH,
    CIF_TRAIN_PATH,
    CIF_PRETEST_PATH,
    PREPROCESSING_TRAIN_PATH,
    PREPROCESSING_PRETEST_PATH
)

# pickle
import pickle

# Register tqdm's `progress_apply` on pandas objects; it is used further down
# when the CIF files are converted chunk by chunk.
tqdm.pandas()

# Load Essentials

## Load Adapters

In [None]:
# Single adapter instance reused by every CIF conversion in this notebook
cif2pandas_adapter = CIF2PandasAdapter()

## Load CSVs

In [None]:
# Resolve both CSV locations up front; replace() collapses the doubled slash
# that appears when DATASET_PATH already ends with "/".
train_csv = f"{DATASET_PATH}/train.csv".replace("//", "/")
pretest_csv = f"{DATASET_PATH}/pretest.csv".replace("//", "/")

# Shift the default 0-based index so that it starts at 1.
Train = pd.read_csv(train_csv)
Train.index = Train.index + 1

Pretest = pd.read_csv(pretest_csv)
Pretest.index = Pretest.index + 1

# Join data with CIF file paths

In [None]:
# Attach the CIF location of every training structure, then preview the
# name/path pairing (head() shows the first five rows by default).
Train = get_cif_filepath(Train, CIF_TRAIN_PATH)
Train.loc[:, ['MOFname', 'cif_filepath']].head()

Unnamed: 0,MOFname,cif_filepath
1,mof_unit_1,tmlcc-2021/mof_cif_train/mof_unit_1.cif
2,mof_unit_2,tmlcc-2021/mof_cif_train/mof_unit_2.cif
3,mof_unit_3,tmlcc-2021/mof_cif_train/mof_unit_3.cif
4,mof_unit_4,tmlcc-2021/mof_cif_train/mof_unit_4.cif
5,mof_unit_5,tmlcc-2021/mof_cif_train/mof_unit_5.cif


In [None]:
# Attach the CIF location of every pretest structure, then preview the
# name/path pairing (head() shows the first five rows by default).
Pretest = get_cif_filepath(Pretest, CIF_PRETEST_PATH)
Pretest.loc[:, ['MOFname', 'cif_filepath']].head()

Unnamed: 0,MOFname,cif_filepath
1,mof_unit_pretest_1,tmlcc-2021/mof_cif_pretest/mof_unit_pretest_1.cif
2,mof_unit_pretest_2,tmlcc-2021/mof_cif_pretest/mof_unit_pretest_2.cif
3,mof_unit_pretest_3,tmlcc-2021/mof_cif_pretest/mof_unit_pretest_3.cif
4,mof_unit_pretest_4,tmlcc-2021/mof_cif_pretest/mof_unit_pretest_4.cif
5,mof_unit_pretest_5,tmlcc-2021/mof_cif_pretest/mof_unit_pretest_5.cif


# Test Adapter

## CIF

In [None]:
# Try the adapter on the training row labelled 1 before running it at scale.
first_cif_path = Train.loc[1, 'cif_filepath']
adapted_cif: dict = cif2pandas_adapter.apply(first_cif_path)

In [None]:
# Key/value metadata parsed from the CIF (creation date, space group, cell lengths and angles)
pd.DataFrame(adapted_cif['metadata'])

Unnamed: 0,0,1
0,_audit_creation_date,2013-03-05T16:41:16-0500
1,_symmetry_space_group_name_H-M,P1
2,_symmetry_Int_Tables_number,1
3,_space_group_crystal_system,triclinic
4,_cell_length_a,10.609882
5,_cell_length_b,10.643578
6,_cell_length_c,9.890832
7,_cell_angle_alpha,89.904234
8,_cell_angle_beta,91.242355
9,_cell_angle_gamma,90.261935


In [None]:
# First CIF loop: one row per atom site (label, type, fractional coordinates, partial charge)
pd.DataFrame(adapted_cif['loops'][0])

Unnamed: 0,_atom_site_label,_atom_site_type_symbol,_atom_site_description,_atom_site_fract_x,_atom_site_fract_y,_atom_site_fract_z,_atom_type_partial_charge
7,O1,O,O_R,0.500020,0.316790,0.614690,-0.628319
8,O2,O,O_R,0.500000,0.316760,0.385350,-0.604978
9,C1,C,C_R,0.511239,0.260617,0.499919,0.943344
10,O3,O,O_R,0.319080,0.500020,0.614640,-0.626555
11,O4,O,O_R,0.319070,0.499980,0.385300,-0.570235
...,...,...,...,...,...,...,...
77,H19,H,H_,0.720880,0.734379,0.913303,0.347912
78,C30,C,C_2,0.348847,0.101593,0.756535,0.967858
79,O20,O,O_2,0.293392,0.097996,0.863474,-0.568112
80,O21,O,O_R,0.284013,0.135186,0.637326,-0.818747


In [None]:
# Second CIF loop: one row per bond (the two atom labels, distance, bond type)
pd.DataFrame(adapted_cif['loops'][1])

Unnamed: 0,_geom_bond_atom_site_label_1,_geom_bond_atom_site_label_2,_geom_bond_distance,_ccdc_geom_bond_type
4,O4,Zn2,1.964878,S
5,Zn1,O7,1.964880,S
6,C10,C11,1.421566,A
7,C21,C22,1.527080,S
8,O19,H19,1.013968,S
...,...,...,...,...
83,O13,C25,1.411417,S
84,C21,H8,1.113555,S
85,C14,C15,1.497242,S
86,O6,C3,1.284366,A


# Apply Adapters

In [None]:
def chuck_pickle(df, n, path='.'):
    """Convert the CIF files referenced by ``df`` in chunks and cache each chunk as a pickle.

    The frame is walked in consecutive blocks of at most ``n`` rows. Each block
    is stored as ``<first index label>_<last index label>.pickle`` inside
    ``path``. A block whose pickle already exists is skipped, so the function
    can be re-run cheaply after an interruption.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``cif_filepath`` column. The index labels are used to
        name the chunk files.
    n : int
        Maximum number of rows per chunk (must be positive; ``range`` raises
        ``ValueError`` for a step of 0).
    path : str, default '.'
        Directory the chunk pickles are written to. It is created when missing.

    Returns
    -------
    None
        The function only prints progress and writes files.

    Notes
    -----
    Relies on the notebook-level ``cif2pandas_adapter`` and on
    ``tqdm.pandas()`` having been called so that ``progress_apply`` exists.
    """
    # Chunk by position rather than by label: the previous
    # range(1, df.shape[0], n) stopped one label short, so a one-row frame or a
    # frame with k*n + 1 rows silently lost its last row. For a contiguous
    # 1-based index this yields exactly the same chunks and file names.
    for offset in range(0, df.shape[0], n):
        # Work on a copy so adding the 'cif' column never writes into a slice
        # of the caller's frame (avoids pandas' SettingWithCopy warning).
        chuck = df.iloc[offset:offset + n].copy()
        start = chuck.index[0]
        end = chuck.index[-1]
        filename = f'{start}_{end}.pickle'
        filepath = (path + '/' + filename).replace('//', '/')

        if os.path.exists(filepath):
            print(f"found {filepath}")
            continue

        print(f"Pickle file from {start} to {end} not found, generating the new one")
        # Make sure the target directory exists before the long conversion
        # starts, so the work is not lost to a failed write at the very end.
        os.makedirs(os.path.dirname(filepath) or '.', exist_ok=True)
        chuck['cif'] = chuck.cif_filepath.progress_apply(cif2pandas_adapter.apply)
        chuck.to_pickle(filepath)


# Convert (or reuse the cached conversion of) both datasets, 10000 rows per pickle.
chunk_size = 10000
for frame, output_dir in (
    (Train, PREPROCESSING_TRAIN_PATH),
    (Pretest, PREPROCESSING_PRETEST_PATH),
):
    chuck_pickle(frame, chunk_size, path=output_dir)

found preprocessing_results/train/1_10000.pickle
found preprocessing_results/train/10001_20000.pickle
found preprocessing_results/train/20001_30000.pickle
found preprocessing_results/train/30001_40000.pickle
found preprocessing_results/train/40001_50000.pickle
found preprocessing_results/train/50001_60000.pickle
found preprocessing_results/train/60001_68613.pickle
found preprocessing_results/pretest/1_2000.pickle


# Merge

## metadata

In [None]:
# Reload the cached chunk pickles of each dataset and merge the CIF metadata
# into the frame (train first, then pretest).
Train = merge_metadata(read_pandas_pickle(PREPROCESSING_TRAIN_PATH))
Pretest = merge_metadata(read_pandas_pickle(PREPROCESSING_PRETEST_PATH))

60001_68613.pickle
40001_50000.pickle
10001_20000.pickle
20001_30000.pickle
50001_60000.pickle
1_10000.pickle
30001_40000.pickle
1_2000.pickle


## Loop 0

In [None]:
# Aggregate CIF loop 0 for the training set.
# NOTE(review): presumably summarises the per-atom table into per-structure columns; confirm in utils.functions.
Train = aggregate_loop_0(Train)

100%|██████████| 68613/68613 [02:15<00:00, 505.26it/s]


In [None]:
# Aggregate CIF loop 0 for the pretest set.
# NOTE(review): presumably summarises the per-atom table into per-structure columns; confirm in utils.functions.
Pretest = aggregate_loop_0(Pretest)

100%|██████████| 2000/2000 [00:04<00:00, 492.43it/s]


## Loop 1

In [None]:
# Aggregate CIF loop 1 for the training set.
# NOTE(review): presumably summarises the per-bond table into per-structure columns; confirm in utils.functions.
Train = aggregate_loop_1(Train)

100%|██████████| 68613/68613 [01:21<00:00, 838.91it/s]


In [None]:
# Aggregate CIF loop 1 for the pretest set.
# NOTE(review): presumably summarises the per-bond table into per-structure columns; confirm in utils.functions.
Pretest = aggregate_loop_1(Pretest)

100%|██████████| 2000/2000 [00:02<00:00, 840.50it/s]


# Finalize

## Sort columns

In [None]:
# Move the prediction target to the last column; every other column keeps
# its current relative order.
target = ['CO2_working_capacity [mL/g]']
feature_columns = Train.drop(columns=target).columns.tolist()
sorted_columns = feature_columns + target

Train = Train[sorted_columns]

## Save

In [None]:
# (frame, pickle destination, CSV destination) for each dataset.
merged_outputs = (
    (
        Train,
        "/work/preprocessing_results/train_merged_CIF/Train.pickle",
        "/work/preprocessing_results/train_merged_CIF/Train.csv",
    ),
    (
        Pretest,
        "/work/preprocessing_results/pretest_merged_CIF/Pretest.pickle",
        "/work/preprocessing_results/pretest_merged_CIF/Pretest.csv",
    ),
)

# Full frames, including the "cif" column, go to pickle first.
for frame, pickle_path, _ in merged_outputs:
    frame.to_pickle(pickle_path)

# The CSV export leaves out the "cif" column.
for frame, _, csv_path in merged_outputs:
    frame.drop(columns="cif").to_csv(csv_path)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=cf8541de-dbc3-45f6-bc1e-4fa446cacbcd' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>