# Import modules

In [None]:
import pandas as pd
from src.constants import (
    PREPROCESSING_PATH,
    TARGET
)

import numpy as np
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import power_transform
from catboost import CatBoostRegressor
from utils.classes.extractor import FunctionalGroupToGramExtractor

from typing import (
    Optional,
    List,
    Any
)
from sklearn.model_selection import KFold, RepeatedKFold, RepeatedStratifiedKFold

import warnings
warnings.filterwarnings("ignore")

# Load essentials

In [None]:
# Read the merged CIF tables; the first CSV column becomes the row index.
Train = pd.read_csv(
    f"{PREPROCESSING_PATH}/train_merged_CIF/Train.csv"
).pipe(lambda frame: frame.set_index(frame.columns[0]))

Pretest = pd.read_csv(
    f"{PREPROCESSING_PATH}/pretest_merged_CIF/Pretest.csv"
).pipe(lambda frame: frame.set_index(frame.columns[0]))

# Notebook-level extractor instance, looked up by name inside
# PreprocessingPipeline.extract_functional_group.
funtional_group_extractor = FunctionalGroupToGramExtractor()

In [None]:
# train_imputation = pd.read_csv("imputer/xgboost/train.csv", index_col=0).drop(
#     [
#         'MOFname',
#         'void_fraction_imputed',
#         'void_volume_imputed',
#         'surface_area_imputed',
#         'surface_area [m^2/g]'
#     ],
#     axis=1
# )
# train_imputation.columns

In [None]:
# Load the imputer output for the training rows and strip the columns that
# must not be merged back (name, *_imputed columns, surface area).
train_imputation = pd.read_csv("imputer/xgboost/train.csv", index_col=0).drop(
    columns=[
        'MOFname',
        'void_fraction_imputed',
        'void_volume_imputed',
        'surface_area_imputed',
        'surface_area [m^2/g]',
    ]
)
# Train = Train.rename(columns={'surface_area [m^2/g]': 'surface_area_old'})

# Replace the original void columns with whatever the imputation file provides;
# rows are matched on the index.
Train = pd.concat(
    [
        Train.drop(columns=['void_fraction', 'void_volume [cm^3/g]']),
        train_imputation,
    ],
    axis=1,
)

In [None]:
    pretest_imputation = pd.read_csv("imputer/xgboost/pretest.csv", index_col=0).drop(
    [
        'MOFname',
        'void_fraction_imputed',
        'void_volume_imputed',
        'surface_area_imputed',
        'surface_area [m^2/g]'
    ],
    axis=1
)
Pretest = pd.concat(
    [
        Pretest.drop([
            # 'surface_area [m^2/g]',
            'void_fraction',
            'void_volume [cm^3/g]'
        ], axis=1),
        pretest_imputation
    ], axis=1
)

# Set up pipeline

In [None]:

class PreprocessingPipeline:
    """Feature-engineering steps shared by the train and test pipelines.

    Each DataFrame helper receives a frame and returns a new one; the frame
    passed in is left untouched, so re-running a notebook cell never operates
    on half-processed input.
    """

    # Ordinal label maps from an earlier encoding scheme.  No method of this
    # class fills them any more; they stay so that code copying them between
    # pipeline instances keeps working.
    funcgroup2num = None
    topology2num = None
    spacegroup2num = None

    # Topology values that receive their own 0/1 indicator column in label().
    _INDICATOR_TOPOLOGIES = ("pcu", "sra", "acs", "etb", "bcu", "nbo")

    def label(self, df):
        """Add 0/1 indicator columns for selected topologies and the triclinic system.

        Args:
            df: Frame with ``topology`` and ``_space_group_crystal_system`` columns.

        Returns:
            A new frame with one ``label_topology_<name>`` column per entry of
            ``_INDICATOR_TOPOLOGIES`` plus ``label_spacegroup_triclinic``.

        Raises:
            KeyError: If either source column is missing.
        """
        indicators = {
            f"label_topology_{name}": (df["topology"] == name).astype(int)
            for name in self._INDICATOR_TOPOLOGIES
        }
        indicators["label_spacegroup_triclinic"] = (
            df["_space_group_crystal_system"] == "triclinic"
        ).astype(int)
        return df.assign(**indicators)

    @staticmethod
    def get_density(df: pd.DataFrame):
        """Return a copy of ``df`` with a ``density`` column (weight / volume * 1.66054).

        Assuming the columns carry the units named in their headers (u and
        A^3), the factor 1.66054 converts u/A^3 to g/cm^3.
        """
        return df.assign(
            density=(df["weight [u]"] / df["volume [A^3]"]) * 1.66054
        )

    @staticmethod
    def replace_surface_area_equal_0_with_null(df: pd.DataFrame):
        """Return a copy of ``df`` where a surface area of exactly 0 becomes NaN."""
        column = "surface_area [m^2/g]"
        return df.assign(**{column: df[column].replace(0, np.nan)})

    @staticmethod
    def replace_inf_with_null(df: pd.DataFrame):
        """Return a copy of ``df`` with infinities replaced by finite sentinels.

        Despite the method name, the replacements are +/-999999, not NaN.
        """
        return df.replace(np.inf, 999999).replace(-np.inf, -999999)

    @staticmethod
    def drop_unused_columns(
        df, unused_columns: Optional[List[str]] = None
    ) -> pd.DataFrame:
        """Return ``df`` without the columns that are not used as features.

        Args:
            df: Input frame.
            unused_columns: Column names to remove.  ``None`` selects the
                built-in default list; an explicit empty list removes nothing.
                Names that are absent from ``df`` are skipped silently.
        """
        if unused_columns is None:
            unused_columns = [
                "MOFname",
                # 'functional_groups',
                "topology",
                "cif_filepath",
                "_audit_creation_date",
                "_symmetry_Int_Tables_number",
                "_symmetry_space_group_name_H-M",
                "_space_group_crystal_system",
                # "bond_type_countT",
                # 'metal_linker',
                # 'organic_linker1',
                # 'organic_linker2'
                "partial_charge_mean",
                "partial_charge_std",
                "_cell_volume",
                "surface_area_old"
            ]
        # errors="ignore" gives the same best-effort behaviour as dropping one
        # column at a time and skipping the missing ones.
        return df.drop(columns=list(unused_columns), errors="ignore")

    @staticmethod
    def set_imputer(X: pd.DataFrame) -> "KNNImputer":
        """Fit and return a 5-neighbour ``KNNImputer`` on ``X``."""
        imputer = KNNImputer(n_neighbors=5)
        imputer.fit(X)
        return imputer

    @staticmethod
    def impute_value(X: pd.DataFrame, imputer: "KNNImputer"):
        """Return ``imputer.transform(X)``; the result type is whatever the imputer yields."""
        return imputer.transform(X)

    @staticmethod
    def extract_functional_group(
        X: pd.DataFrame, fit: bool, extractor=None
    ) -> pd.DataFrame:
        """Append the extractor's output columns to ``X``.

        Args:
            X: Frame handed to the extractor.
            fit: Forwarded to ``extractor.transform`` as its second argument.
            extractor: Object exposing ``transform(X, fit)``.  Defaults to the
                notebook-level ``funtional_group_extractor`` instance.

        Returns:
            ``X`` and the extracted features concatenated column-wise.
            NOTE(review): this presumes ``transform`` returns a pandas object
            indexed like ``X`` -- confirm against the extractor class.
        """
        if extractor is None:
            extractor = funtional_group_extractor
        extracted = extractor.transform(X, fit)
        # Concatenate the extracted features, not the extractor object itself.
        return pd.concat([X, extracted], axis=1)


class TrainDataPreprocessingPipeline(PreprocessingPipeline):
    """Turn the raw training table into an imputed feature frame and a target frame.

    After ``run()``: ``X`` holds the features, ``y`` the single-column target
    frame, ``imputer`` the imputer that was applied and
    ``functional_group_extractor`` a freshly created extractor.

    ``run()`` overwrites ``self.df`` with the processed table (columns such as
    ``topology`` are gone afterwards), so call it once per instance.
    """

    def __init__(self, df: pd.DataFrame, imputer: Any = None):
        """
        Args:
            df: Raw training table, target column included.
            imputer: Object exposing ``transform(X)``.  When ``None``,
                ``set_imputer`` is used to fit one on the features.
        """
        self.df = df
        self.imputer = imputer
        # Filled in by run(); defined here so the attributes always exist.
        self.X = None
        self.y = None

    @staticmethod
    def drop_surface_area_equal_minus_1(df):
        """Return ``df`` without the rows whose surface area equals -1."""
        flagged = df[df['surface_area [m^2/g]'] == -1].index
        return df.drop(flagged)

    def impute(self):
        """Impute ``self.X`` and restore its original column and index labels."""
        # Compare against None explicitly: a supplied imputer must be used
        # even if its truth value happens to be False.
        if self.imputer is None:
            self.imputer = self.set_imputer(self.X)

        columns, index = self.X.columns, self.X.index
        # The imputer may hand back a bare array, so rebuild the frame and
        # re-attach the labels.
        imputed = pd.DataFrame(self.impute_value(self.X, self.imputer))
        imputed.columns = columns
        imputed.index = index
        self.X = imputed

    def run(self):
        """Clean the table, build the features, split off the target and impute."""
        self.functional_group_extractor = FunctionalGroupToGramExtractor()

        # Drop unusable rows and add features.
        print("Print droping and replace null")
        self.df = self.drop_surface_area_equal_minus_1(self.df)
        self.df = self.replace_surface_area_equal_0_with_null(self.df)
        self.df = self.replace_inf_with_null(self.df)
        self.df = self.label(self.df)
        self.df = self.drop_unused_columns(self.df)
        self.df = self.df.drop(self.df[self.df["CO2/N2_selectivity"] == 0].index)
        self.df = self.get_density(self.df)

        # Split features from the target.
        print("Split X, y")
        self.X = self.df.drop(TARGET, axis=1)
        self.y = self.df[[TARGET]]

        # Impute.  The raw functional_groups column is removed first; the
        # extractor created above is not applied to it here.
        print("Impute")
        self.X = self.X.drop("functional_groups", axis=1)
        self.impute()


In [None]:
class TestDataPreprocessingPipeline(PreprocessingPipeline):
    """Apply the shared preprocessing steps to a table that has no target column.

    After ``run()`` the features are available as ``X``.
    """

    def __init__(
        self,
        df: pd.DataFrame,
        imputer: Any = None,
        functional_group_extractor: "FunctionalGroupToGramExtractor" = None,
        columns: list = None
    ):
        """
        Args:
            df: Raw table to preprocess.
            imputer: Object exposing ``transform(X)``, normally the one used
                for the training data.  When ``None``, ``set_imputer`` fits
                one on this table's own features.
            functional_group_extractor: Stored on the instance; ``run()`` does
                not use it.
            columns: Column labels, in order, that ``X`` is restricted to at
                the end of ``run()``.  ``None`` keeps every column.
        """
        self.df = df
        self.imputer = imputer
        self.functional_group_extractor = functional_group_extractor
        self.columns = columns
        # Filled in by run(); defined here so the attribute always exists.
        self.X = None

    def impute(self):
        """Impute ``self.X`` and restore its original column and index labels."""
        # Compare against None explicitly: a supplied imputer must be used
        # even if its truth value happens to be False.
        if self.imputer is None:
            self.imputer = self.set_imputer(self.X)

        columns, index = self.X.columns, self.X.index
        # The imputer may hand back a bare array, so rebuild the frame and
        # re-attach the labels.
        imputed = pd.DataFrame(self.impute_value(self.X, self.imputer))
        imputed.columns = columns
        imputed.index = index
        self.X = imputed

    def run(self):
        """Clean the table, build the features, impute and align the columns."""
        # Replace placeholder values and add features.
        print("Print droping and replace null")
        self.X = self.replace_surface_area_equal_0_with_null(self.df)
        self.X = self.replace_inf_with_null(self.X)
        self.X = self.label(self.X)
        self.X = self.drop_unused_columns(self.X)
        self.X = self.get_density(self.X)

        # Impute.  The raw functional_groups column is removed first.
        print("Impute")
        self.X = self.X.drop('functional_groups', axis=1)
        self.impute()

        # Match the requested column set and order.  Skipped when no column
        # list was given: indexing with None would raise a KeyError.
        if self.columns is not None:
            self.X = self.X[self.columns]



# Run pipeline

In [None]:
class Imputer9999:
    """Minimal imputer stand-in that fills missing values with a constant."""

    def transform(self, df):
        """Return ``df`` with every NaN replaced by -9999999."""
        sentinel = -9999999
        filled = df.replace(np.nan, sentinel)
        return filled

# Preprocess the training table with the constant-fill imputer.
imputer = Imputer9999()
train = TrainDataPreprocessingPipeline(df=Train, imputer=imputer)
train.run()

Print droping and replace null
Split X, y
Impute


In [None]:
# Preview the preprocessed training features.
train.X

Unnamed: 0_level_0,volume [A^3],weight [u],surface_area [m^2/g],metal_linker,organic_linker1,organic_linker2,CO2/N2_selectivity,heat_adsorption_CO2_P0.15bar_T298K [kcal/mol],_cell_length_a,_cell_length_b,...,void_fraction,void_volume [cm^3/g],label_topology_pcu,label_topology_sra,label_topology_acs,label_topology_etb,label_topology_bcu,label_topology_nbo,label_spacegroup_triclinic,density
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1116.667429,875.240600,-9999999.00,3,4,11,22.864166,6.786041e+00,10.609882,10.643578,...,0.078990,0.060700,1,0,0,0,0,0,1,1.301526
2,2769.503842,2211.697211,603.61,10,44,57,33.616780,7.147286e+00,8.463295,17.684225,...,0.137940,0.104000,0,0,0,1,0,0,1,1.326090
3,1089.818728,773.687960,788.50,2,22,24,19.263726,6.347967e+00,10.732110,9.552271,...,0.148740,0.126200,1,0,0,0,0,0,1,1.178856
4,2205.198301,1304.638720,1441.53,9,17,24,25.701377,6.190085e+00,6.935530,17.504896,...,0.218140,0.222000,0,1,0,0,0,0,1,0.982408
5,1137.800963,901.736120,-9999999.00,2,1,22,30.001838,6.478063e+00,10.825925,9.699886,...,0.077780,0.059100,1,0,0,0,0,0,1,1.316020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68609,1188.302573,1001.700216,-9999999.00,3,4,24,24.131770,-9.999999e+06,10.718161,10.886490,...,0.257010,0.183607,1,0,0,0,0,0,1,1.399781
68610,1506.660363,1493.296496,-9999999.00,10,42,46,6.071818,-9.999999e+06,8.192620,12.576230,...,0.011080,0.006732,0,0,0,1,0,0,1,1.645811
68611,2035.532738,1959.518320,-9999999.00,4,14,22,9.876134,-9.999999e+06,11.237482,11.321902,...,0.356780,0.223193,0,0,1,0,0,0,1,1.598529
68612,3985.426053,3638.677280,-9999999.00,4,4,15,5.285051,9.999990e+05,19.396341,11.081428,...,0.133489,0.088049,0,0,1,0,0,0,1,1.516066


In [None]:
# Preprocess the pretest table with the imputer, extractor and column order
# taken from the training pipeline.
test = TestDataPreprocessingPipeline(
    df=Pretest,
    imputer=train.imputer,
    functional_group_extractor=train.functional_group_extractor,
    columns=train.X.columns,
)
# Carry over the label maps held by the training pipeline.
test.topology2num, test.spacegroup2num = train.topology2num, train.spacegroup2num
test.run()

Print droping and replace null
Impute


In [None]:
# Compare as plain lists: an elementwise == between two Index objects raises
# ValueError when their lengths differ, which hides the real mismatch.
assert list(train.X.columns) == list(test.X.columns), (
    "train/test feature columns differ: "
    f"{sorted(set(train.X.columns) ^ set(test.X.columns))}"
)

In [None]:
# Number of feature columns fed to the models.
train.X.shape[1]

30

In [None]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import (
    VotingRegressor,
    StackingRegressor,
    RandomForestRegressor,
    GradientBoostingRegressor
)
from sklearn.linear_model import Lasso

In [None]:


def log_mean_absolute_error(y_true, y_pred):
    """Return the natural logarithm of the mean absolute error."""
    mae = mean_absolute_error(y_true, y_pred)
    return np.log(mae)


def fit_catboost(X, y):
    """Fit a silent 500-iteration CatBoost regressor on (X, y) and return it."""
    settings = {
        "iterations": 500,
        "verbose": False,
        # "l2_leaf_reg": 0.001,
    }
    model = CatBoostRegressor(**settings)
    model.fit(X, y)
    return model


def fit_xgboost(X, y):
    """Fit a default XGBoost regressor on the raw arrays of X and y and return it."""
    model = XGBRegressor()
    features, target = X.values, y.values
    model.fit(features, target)
    return model


def fit_lightboost(X, y):
    """Fit a default LightGBM regressor on the raw arrays of X and y and return it."""
    model = LGBMRegressor()
    features, target = X.values, y.values
    model.fit(features, target)
    return model


def fit_voting(X, y):
    """Fit a voting ensemble of CatBoost and XGBoost on the raw arrays of X and y.

    Both members are built with a loss-guided grow policy.  The fitted
    ``VotingRegressor`` is returned.
    """
    cat_member = CatBoostRegressor(
        iterations=500,
        grow_policy='Lossguide',
        verbose=False,
        # l2_leaf_reg=0.001
    )
    xgb_member = XGBRegressor(grow_policy='lossguide')
    members = [
        ('cat', cat_member),
        ('xgb', xgb_member),
        # ('gb', GradientBoostingRegressor())
    ]
    ensemble = VotingRegressor(members)
    ensemble.fit(X.values, y.values)
    return ensemble


# 5-fold cross-validation over every fitter listed in fit_models.  For each
# fold the fitted model and its train / held-out log-MAE are stored under the
# fitter's name (function name without the leading "fit_").
fit_models = [fit_voting]
kf = KFold(n_splits=5, shuffle=True, random_state=1234)
# Alternatives tried earlier:
# kf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=1234)
# kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=1234)
fit_results = {}
for k, (train_index, test_index) in enumerate(kf.split(train.X)):
    print(f"K Fold: {k + 1}")
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, y_train = train.X.iloc[train_index], train.y.iloc[train_index]
    X_test, y_test = train.X.iloc[test_index], train.y.iloc[test_index]

    for fit_model in fit_models:
        model_name = '_'.join(fit_model.__name__.split('fit_')[1:])
        fold_records = fit_results.setdefault(model_name, [])

        model = fit_model(X_train, y_train)

        # Score on the rows the model was fitted on ...
        train_pred = model.predict(X_train)
        log_mean_error_train = log_mean_absolute_error(y_train, train_pred)

        # ... and on the held-out rows of this fold.
        test_pred = model.predict(X_test)
        log_mean_error_test = log_mean_absolute_error(y_test, test_pred)

        # Per-row absolute error on the held-out rows, largest first.
        test_pred = pd.DataFrame(
            test_pred, index=y_test.index, columns=y_test.columns
        )
        error_sort = (y_test - test_pred).abs().sort_values(
            TARGET, ascending=False
        )

        print(f"Log mean error train: {log_mean_error_train}")
        print(f"Log mean error test: {log_mean_error_test}")
        print(error_sort.head(10))

        fold_records.append({
            'model': model,
            'log_mean_error_train': log_mean_error_train,
            'log_mean_error_test': log_mean_error_test
        })

K Fold: 1
TRAIN: [    1     2     3 ... 66712 66713 66714] TEST: [    0     4    13 ... 66699 66708 66711]
Log mean error train: 2.7891317981801182
Log mean error test: 2.975818867322161
            CO2_working_capacity [mL/g]
Unnamed: 0                             
28723                        266.765117
9703                         205.284635
35888                        182.886156
11015                        181.345677
55868                        176.498780
20510                        171.504807
55158                        170.783859
13905                        170.582824
44212                        168.888997
62994                        165.184599
K Fold: 2
TRAIN: [    0     1     2 ... 66711 66713 66714] TEST: [   18    24    30 ... 66700 66703 66712]
Log mean error train: 2.7849041701267248
Log mean error test: 2.9894584623631424
            CO2_working_capacity [mL/g]
Unnamed: 0                             
53863                        228.798167
47904                    

In [None]:
"""
funcgroup2num = {c: i for i, c in enumerate(train['functional_groups'].unique())}
topology2num = {c: i for i, c in enumerate(train['topology'].unique())}
spacegroup2num = {c: i for i, c in enumerate(train['_space_group_crystal_system'].unique())}

train['label_funcgroup'] = train['functional_groups'].map(funcgroup2num)
train['label_topology'] = train['topology'].map(topology2num)
train['label_spacegroup'] = train['_space_group_crystal_system'].map(spacegroup2num)
"""

"\nfuncgroup2num = {c: i for i, c in enumerate(train['functional_groups'].unique())}\ntopology2num = {c: i for i, c in enumerate(train['topology'].unique())}\nspacegroup2num = {c: i for i, c in enumerate(train['_space_group_crystal_system'].unique())}\n\ntrain['label_funcgroup'] = train['functional_groups'].map(funcgroup2num)\ntrain['label_topology'] = train['topology'].map(topology2num)\ntrain['label_spacegroup'] = train['_space_group_crystal_system'].map(spacegroup2num)\n"

In [None]:
# Mean held-out log-MAE of the voting ensemble across the CV folds.
voting_test_scores = [
    score['log_mean_error_test'] for score in fit_results['voting']
]
np.mean(voting_test_scores)

2.9945073439803815

In [None]:
# Spread of the held-out log-MAE of the voting ensemble across the CV folds.
voting_test_scores = [
    score['log_mean_error_test'] for score in fit_results['voting']
]
np.std(voting_test_scores)

0.013642386317950879

In [None]:
# y_test and test_pred are the values left over from the final iteration of
# the cross-validation loop, so this table covers that fold only.
test_pred = pd.DataFrame(
    test_pred,
    index=y_test.index,
    columns=y_test.columns,
)
error_sort = (y_test - test_pred).abs().sort_values(TARGET, ascending=False)

In [None]:
# Average profile of the 500 rows with the largest absolute error.
# numeric_only=True is required on pandas >= 2.0, where .mean() raises on the
# string columns of Train instead of silently skipping them.
Train.loc[error_sort.index].head(500).mean(numeric_only=True)

volume [A^3]                                     2.272474e+03
weight [u]                                       1.493163e+03
surface_area [m^2/g]                             6.800387e+02
metal_linker                                     4.896000e+00
organic_linker1                                  1.023400e+01
organic_linker2                                  1.844200e+01
CO2/N2_selectivity                               5.143143e+01
heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]    7.440457e+00
_symmetry_Int_Tables_number                      1.000000e+00
_cell_length_a                                   1.363911e+01
_cell_length_b                                   1.187266e+01
_cell_length_c                                   1.522640e+01
_cell_angle_alpha                                9.124762e+01
_cell_angle_beta                                 9.208854e+01
_cell_angle_gamma                                9.130844e+01
_cell_volume                                     2.272474e+03
partial_

In [None]:
# NOTE(review): error_sort and test_pred come from the last cross-validation
# fold only, so just that fold's rows receive values here; every other row of
# Train gets NaN through index alignment.
# These columns are written onto Train itself: re-running the preprocessing
# cell afterwards without reloading Train would feed them in as features.
error_train = Train.loc[error_sort.index]
# Despite the column name, this stores the per-row absolute error.
Train["LMAE"] = error_sort.iloc[:, 0]
Train["prediction"] = test_pred.iloc[:, 0]

In [None]:
# Save the rows covered by error_sort, largest error first, for offline inspection.
Train.loc[error_sort.index].to_csv(path_or_buf="error_train.csv")

In [None]:
# Average profile of the 500 rows with the smallest absolute error.
# numeric_only=True is required on pandas >= 2.0, where .mean() raises on the
# string columns of Train instead of silently skipping them.
Train.loc[error_sort.index].tail(500).mean(numeric_only=True)

volume [A^3]                                     6.264446e+03
weight [u]                                       1.921994e+03
surface_area [m^2/g]                             2.823862e+03
metal_linker                                     3.388000e+00
organic_linker1                                  1.110200e+01
organic_linker2                                  2.039600e+01
CO2/N2_selectivity                               1.540580e+01
heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]    4.824315e+00
_symmetry_Int_Tables_number                      1.000000e+00
_cell_length_a                                   1.725140e+01
_cell_length_b                                   1.649214e+01
_cell_length_c                                   1.796945e+01
_cell_angle_alpha                                9.258016e+01
_cell_angle_beta                                 9.425136e+01
_cell_angle_gamma                                9.263996e+01
_cell_volume                                     6.264446e+03
partial_

In [None]:
# train_prediction = .predict(X)

In [None]:
# np.log(mean_absolute_error(train_prediction, y))

In [None]:
# feature_importances = pd.DataFrame(catboost.feature_importances_)
# feature_importances.index = Train.drop(TARGET, axis=1).columns

In [None]:
# feature_importances.sort_values(by=0, ascending=False)

In [None]:
# feature_importances[0].nlargest(20).plot(kind='barh')

In [None]:
# feature_importances[0].nlargest(20).index

In [None]:
# Predict the pretest set with every fold's model and average the predictions.
fold_predictions = [
    entry['model'].predict(test.X) for entry in fit_results['voting']
]
sub = np.mean(fold_predictions, axis=0)

In [None]:
# Wrap the averaged predictions in a DataFrame so they can be labelled and saved.
sub = pd.DataFrame(data=sub)

In [None]:
# Pretest = pd.read_csv(f"{PREPROCESSING_PATH}/pretest_merged_CIF/Pretest.csv")
# Pretest = Pretest.set_index(Pretest.columns[0])
# Row ids are positional: pretest_1 ... pretest_N in the order of the predictions.
row_ids = [f"pretest_{position}" for position in range(1, sub.shape[0] + 1)]
sub.index = pd.Index(row_ids, name='id')
sub.columns = ['CO2_working_capacity [mL/g]']

In [None]:
# Preview the submission table before writing it to disk.
sub

Unnamed: 0_level_0,CO2_working_capacity [mL/g]
id,Unnamed: 1_level_1
pretest_1,103.466330
pretest_2,123.235101
pretest_3,192.207003
pretest_4,65.542557
pretest_5,95.884323
...,...
pretest_1996,1.368971
pretest_1997,3.157281
pretest_1998,-1.186327
pretest_1999,-7.830633


In [None]:
# Write the submission file; the 'id' index becomes the first CSV column.
sub.to_csv(path_or_buf="submission.csv")

In [None]:
import zipfile
import hashlib


# SHA-256 of the submission file, streamed in 4 KiB blocks.
sha256_hash = hashlib.sha256()
with open("submission.csv", "rb") as f:
    for byte_block in iter(lambda: f.read(4096), b""):
        sha256_hash.update(byte_block)
hash_str = sha256_hash.hexdigest()

print(hash_str)

# The archive is named after the hash.  The with-block closes it explicitly,
# which is what finalises the zip file on disk; the previous one-liner left
# that to garbage collection.
with zipfile.ZipFile(f'{hash_str}.zip', mode='w') as archive:
    archive.write("submission.csv")

ab65fee4d74b7f540baa83eeeb463edcbfd44ef08a8c9aa5d460a90c04cb9d3e


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=cf8541de-dbc3-45f6-bc1e-4fa446cacbcd' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>