In [None]:
import pandas as pd
from src.constants import (
    PREPROCESSING_PATH,
    TARGET
)

import numpy as np
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import power_transform
from utils.classes.extractor import FunctionalGroupToGramExtractor

from typing import (
    Optional,
    List,
    Any
)
from sklearn.model_selection import KFold, RepeatedKFold, RepeatedStratifiedKFold

import warnings
warnings.filterwarnings("ignore")

In [None]:
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

In [None]:
from xgboost import Booster, DMatrix


# Feature columns handed to each booster. They are passed positionally
# (as a bare ``.values`` array) by ``impute``, so the order of these lists
# presumably has to match the order used when the boosters were trained
# -- TODO confirm against the training notebook.
void_fraction_columns = [
    "heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]",
    "CO2/N2_selectivity",
    "_cell_volume",
    "_cell_length_a",
    "_cell_length_b",
    "_cell_length_c",
    "density",
]
surface_area_columns = [
    "void_fraction",
    'void_volume [cm^3/g]',
    "heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]",
    "CO2/N2_selectivity",
    "_cell_volume",
    "_cell_length_a",
    "_cell_length_b",
    "_cell_length_c",
    "density",
]

# Pre-trained boosters, restored from their JSON dumps on disk.
model_void_fraction = Booster()
model_void_fraction.load_model("imputer/xgboost/void_fraction.json")

model_surface_area = Booster()
model_surface_area.load_model("imputer/xgboost/surface_area.json")

In [None]:
# Finite stand-in written in place of +/- infinity.
INF_SENTINEL = 99999999
# 1 u/A^3 expressed in g/cm^3 (1 u = 1.66054e-24 g, 1 A^3 = 1e-24 cm^3).
U_PER_A3_TO_G_PER_CM3 = 1.66054


def _load_merged_frame(csv_path):
    """Read one merged CSV and apply the shared clean-up steps.

    Steps, in order:
      1. read the CSV and use its first column as the index;
      2. replace ``+inf`` with ``INF_SENTINEL`` and ``-inf`` with
         ``-INF_SENTINEL``;
      3. add a ``density`` column computed from the frame's own
         ``weight [u]`` and ``volume [A^3]`` columns.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to load.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame including the derived ``density`` column.
    """
    frame = pd.read_csv(csv_path)
    frame = frame.set_index(frame.columns[0])
    # The original replaced ``np.inf`` twice, so ``-inf`` was never mapped
    # to the negative sentinel; both signs are handled here.
    frame = frame.replace({np.inf: INF_SENTINEL, -np.inf: -INF_SENTINEL})
    # Density must come from the frame's own weight column. The Pretest
    # density used to be computed from Train's weights, which pandas aligns
    # by index label, yielding wrong or NaN values.
    frame['density'] = (
        frame["weight [u]"] / frame["volume [A^3]"]
    ) * U_PER_A3_TO_G_PER_CM3
    return frame


Train = _load_merged_frame(f"{PREPROCESSING_PATH}/train_merged_CIF/Train.csv")
Pretest = _load_merged_frame(f"{PREPROCESSING_PATH}/pretest_merged_CIF/Pretest.csv")

def impute(df):
    """Fill non-positive pore descriptors of ``df`` and flag the filled rows.

    Values ``<= 0`` are treated as missing. Three columns are processed in
    this order, because later steps read the results of earlier ones:

    1. ``void_fraction`` -- predicted by ``model_void_fraction`` from
       ``void_fraction_columns``.
    2. ``void_volume [cm^3/g]`` -- computed as ``void_fraction / density``
       using the (possibly just imputed) void fraction.
    3. ``surface_area [m^2/g]`` -- predicted by ``model_surface_area`` from
       ``surface_area_columns``.

    Side effect: the imputed ``void_fraction`` and ``void_volume [cm^3/g]``
    values are written back into ``df`` itself so that step 3 sees them.
    The imputed surface area is NOT written back into ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``MOFname``, ``density`` and the three descriptor
        columns. The feature columns listed in ``void_fraction_columns`` /
        ``surface_area_columns`` are read only when rows need a prediction.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df`` with columns ``MOFname``, ``void_fraction``,
        ``void_fraction_imputed``, ``void_volume [cm^3/g]``,
        ``void_volume_imputed``, ``surface_area [m^2/g]`` and
        ``surface_area_imputed``; each ``*_imputed`` flag is 1 where the
        value was filled in and 0 otherwise.
    """
    imputed = pd.DataFrame(index=df.index)
    imputed['MOFname'] = df['MOFname']

    # NOTE: all assignments use ``.loc`` with a boolean row mask. ``.at`` is
    # a scalar accessor and rejects array-valued assignment on current pandas.

    # Void fraction
    imputed['void_fraction'] = df['void_fraction']
    imputed['void_fraction_imputed'] = 0
    missing = df['void_fraction'] <= 0
    if missing.any():  # skip the booster entirely when nothing is missing
        imputed.loc[missing, 'void_fraction'] = model_void_fraction.predict(
            DMatrix(df.loc[missing, void_fraction_columns].values)
        )
        imputed.loc[missing, 'void_fraction_imputed'] = 1
        # Write back so the following steps use the imputed void fraction.
        df.loc[missing, 'void_fraction'] = imputed.loc[missing, 'void_fraction'].to_numpy()

    # Void volume
    imputed['void_volume [cm^3/g]'] = df['void_volume [cm^3/g]']
    imputed['void_volume_imputed'] = 0
    missing = df['void_volume [cm^3/g]'] <= 0
    if missing.any():
        imputed.loc[missing, 'void_volume [cm^3/g]'] = (
            imputed.loc[missing, 'void_fraction'].to_numpy()
            / df.loc[missing, 'density'].to_numpy()
        )
        # Write back so the surface-area model sees the imputed void volume.
        df.loc[missing, 'void_volume [cm^3/g]'] = (
            imputed.loc[missing, 'void_volume [cm^3/g]'].to_numpy()
        )
        imputed.loc[missing, 'void_volume_imputed'] = 1

    # Surface area
    imputed['surface_area [m^2/g]'] = df['surface_area [m^2/g]']
    imputed['surface_area_imputed'] = 0
    missing = df['surface_area [m^2/g]'] <= 0
    if missing.any():
        imputed.loc[missing, 'surface_area [m^2/g]'] = model_surface_area.predict(
            DMatrix(df.loc[missing, surface_area_columns].values)
        )
        imputed.loc[missing, 'surface_area_imputed'] = 1

    return imputed


# Impute the training frame. Note that impute() also overwrites the
# non-positive void_fraction / void_volume entries of Train in place.
imputed_train = impute(Train)

In [None]:
# Impute the pretest frame with the same boosters. As with Train, the
# non-positive void_fraction / void_volume entries of Pretest are
# overwritten in place by impute().
imputed_pretest = impute(Pretest)

In [None]:
# Persist both imputed frames in the same directory the boosters were
# loaded from. to_csv writes the index as the first column by default.
imputed_train.to_csv("imputer/xgboost/train.csv")
imputed_pretest.to_csv("imputer/xgboost/pretest.csv")

# Test results

In [None]:
# Attach the target column from Train (aligned on the shared index) so the
# imputed frame carries features and label together for the checks below.
imputed_train['CO2_working_capacity [mL/g]'] = Train['CO2_working_capacity [mL/g]']

In [None]:
# Inspect which columns the imputed training frame now holds.
imputed_train.columns

Index(['MOFname', 'void_fraction', 'void_fraction_imputed',
       'void_volume [cm^3/g]', 'void_volume_imputed', 'surface_area [m^2/g]',
       'surface_area_imputed', 'CO2_working_capacity [mL/g]'],
      dtype='object')

In [None]:
from xgboost import XGBRegressor

# Model A: default XGBoost regressor fitted on the imputed descriptors only.
# Dropping the name, the target and the three *_imputed flags leaves
# void_fraction, void_volume and surface_area as features.
regr = XGBRegressor()
regr.fit(
    imputed_train.drop(
        columns=[
            'MOFname',
            'CO2_working_capacity [mL/g]',
            'void_fraction_imputed',
            'void_volume_imputed',
            'surface_area_imputed',
        ]
    ).values,
    Train['CO2_working_capacity [mL/g]'].values,
)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [None]:
from sklearn.metrics import mean_absolute_error
import numpy as np

# Log of the in-sample MAE for the model fitted on the imputed descriptors.
# This is a training-set score (the model predicts the rows it was fitted
# on), not a validation score.
#
# The features are passed as a NumPy array, exactly as in the fit call, so
# the regressor never sees DataFrame column names it was not trained with
# (these names also contain '[' and ']', which XGBoost rejects as feature
# names). Arguments follow scikit-learn's (y_true, y_pred) order; MAE is
# non-negative, so no abs() is needed before the log.
np.log(mean_absolute_error(
    Train['CO2_working_capacity [mL/g]'].values,
    regr.predict(imputed_train.drop([
        'MOFname',
        'CO2_working_capacity [mL/g]',
        'void_fraction_imputed',
        'void_volume_imputed',
        'surface_area_imputed'
    ], axis=1).values),
))

3.686014059049829

In [None]:
# Model B: the same default regressor, fitted on the three descriptor
# columns as they currently stand in Train. If impute(Train) has already
# run, Train's void_fraction / void_volume were overwritten in place, so
# only surface_area still holds its original non-positive entries.
regr = XGBRegressor()
regr.fit(
    Train[
        ['void_fraction', 'void_volume [cm^3/g]', 'surface_area [m^2/g]']
    ].values,
    Train['CO2_working_capacity [mL/g]'].values,
)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [None]:
# Log of the in-sample MAE for the model fitted on Train's own descriptor
# columns, for comparison against the imputed-feature score.
np.log(
    np.abs(
        mean_absolute_error(
            regr.predict(
                Train[
                    ['void_fraction', 'void_volume [cm^3/g]', 'surface_area [m^2/g]']
                ].values
            ),
            Train['CO2_working_capacity [mL/g]'],
        )
    )
)

3.6986475376445243

In [None]:
# Sanity check: run the void-fraction booster over every training row (not
# only the rows flagged for imputation) and display the raw predictions.
model_void_fraction.predict(DMatrix(Train[void_fraction_columns].values))

array([0.14334765, 0.12290488, 0.16122892, ..., 0.35678   , 0.13348885,
       0.08541402], dtype=float32)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=cf8541de-dbc3-45f6-bc1e-4fa446cacbcd' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>