# Import modules

In [None]:
import pandas as pd
from src.constants import (
    PREPROCESSING_PATH,
    TARGET
)

import numpy as np
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import power_transform
from catboost import CatBoostRegressor
from utils.classes.extractor import FunctionalGroupToGramExtractor

from typing import (
    Optional,
    List,
    Any
)
from sklearn.model_selection import KFold, RepeatedKFold, RepeatedStratifiedKFold

import warnings
warnings.filterwarnings("ignore")

# Load essentials

In [None]:
# Read the merged training table, then promote its first column to the index.
Train = pd.read_csv(f"{PREPROCESSING_PATH}/train_merged_CIF/Train.csv")
Train = Train.set_index(Train.columns[0])

# Same treatment for the pretest table.
Pretest = pd.read_csv(f"{PREPROCESSING_PATH}/pretest_merged_CIF/Pretest.csv")
Pretest = Pretest.set_index(Pretest.columns[0])

# Module-level extractor instance. PreprocessingPipeline.extract_functional_group
# looks this exact (misspelled) name up at call time, so it must stay as is.
funtional_group_extractor = FunctionalGroupToGramExtractor()

# Set up pipeline

In [None]:

class PreprocessingPipeline:
    """Preprocessing steps shared by the train and test pipelines.

    Every step takes a DataFrame and returns a DataFrame, so subclasses can
    chain them in their ``run`` methods. Steps documented as "in place"
    modify the frame they are given and return that same object.
    """

    # Category-to-integer lookup tables. No method of this class populates
    # them (``label`` builds 0/1 indicator columns instead); they remain
    # declared because code outside the class still reads and assigns them.
    funcgroup2num = None
    topology2num = None
    spacegroup2num = None

    # Topologies that get a dedicated indicator column, in output order.
    _LABELLED_TOPOLOGIES = ("pcu", "sra", "acs", "etb", "bcu", "nbo")

    def label(self, df):
        """Add 0/1 indicator columns for topology and crystal system.

        Adds one ``label_topology_<name>`` column per entry of
        ``_LABELLED_TOPOLOGIES`` and a single ``label_spacegroup_triclinic``
        column. Rows with any other topology get 0 in every topology column.

        Args:
            df: Frame with ``topology`` and ``_space_group_crystal_system``
                columns.

        Returns:
            The same ``df`` object, with the new columns added in place.
        """
        for topology in self._LABELLED_TOPOLOGIES:
            df[f"label_topology_{topology}"] = (
                df["topology"] == topology
            ).astype(int)

        df["label_spacegroup_triclinic"] = (
            df["_space_group_crystal_system"] == "triclinic"
        ).astype(int)

        return df

    @staticmethod
    def replace_surface_area_equal_0_with_null(df: pd.DataFrame):
        """Turn surface-area values of exactly 0 into NaN (in place).

        Returns:
            The same ``df`` object with its ``surface_area [m^2/g]`` column
            reassigned.
        """
        df["surface_area [m^2/g]"] = df["surface_area [m^2/g]"].replace(0, np.nan)
        return df

    @staticmethod
    def replace_inf_with_null(df: pd.DataFrame):
        """Replace infinities with large finite sentinels.

        Despite the name, no nulls are produced: ``+inf`` becomes ``999999``
        and ``-inf`` becomes ``-999999``.

        Returns:
            A new DataFrame; the input is not modified.
        """
        return df.replace(np.inf, 999999).replace(-np.inf, -999999)

    @staticmethod
    def drop_unused_columns(
        df, unused_columns: Optional[List[str]] = None
    ) -> pd.DataFrame:
        """Drop columns that are not used as features (in place).

        Args:
            df: Frame to prune.
            unused_columns: Column names to drop. ``None`` or an empty list
                selects the built-in default list below.

        Returns:
            The same ``df`` object. Names that are not present in ``df`` are
            skipped silently.
        """
        if not unused_columns:
            unused_columns = [
                "MOFname",
                "topology",
                "cif_filepath",
                "_audit_creation_date",
                "_symmetry_space_group_name_H-M",
                "_space_group_crystal_system",
                "partial_charge_mean",
                "partial_charge_std",
                "_cell_volume",
            ]
        # errors="ignore" skips absent columns without a try/except per name.
        df.drop(columns=list(unused_columns), errors="ignore", inplace=True)
        return df

    @staticmethod
    def set_imputer(X: pd.DataFrame) -> "KNNImputer":
        """Fit and return a 5-neighbour ``KNNImputer`` on ``X``."""
        imputer = KNNImputer(n_neighbors=5)
        imputer.fit(X)
        return imputer

    @staticmethod
    def impute_value(X: pd.DataFrame, imputer: Any):
        """Return ``imputer.transform(X)``.

        ``imputer`` only needs a ``transform`` method; it does not have to be
        a ``KNNImputer``.
        """
        return imputer.transform(X)

    @staticmethod
    def extract_functional_group(
        X: pd.DataFrame, fit: bool, extractor: Optional[Any] = None
    ) -> pd.DataFrame:
        """Append extracted functional-group features to ``X``.

        Args:
            X: Frame passed to the extractor's ``transform``.
            fit: Forwarded as the second argument of ``transform``.
            extractor: Object exposing ``transform(X, fit)``. Defaults to the
                module-level ``funtional_group_extractor`` instance.

        Returns:
            ``X`` and the extractor output concatenated column-wise.
        """
        if extractor is None:
            extractor = funtional_group_extractor
        extracted = extractor.transform(X, fit)
        # Concatenate the extracted features, not the extractor object itself.
        return pd.concat([X, extracted], axis=1)


class TrainDataPreprocessingPipeline(PreprocessingPipeline):
    """Turn the labelled training frame into features ``X`` and target ``y``."""

    def __init__(self, df: pd.DataFrame, imputer: Any = None):
        """Store the raw frame and an optional, already usable imputer."""
        self.df = df
        self.imputer = imputer

    @staticmethod
    def drop_surface_area_equal_minus_1(df):
        """Return ``df`` without the rows whose surface area equals -1."""
        flagged_rows = df[df["surface_area [m^2/g]"] == -1]
        return df.drop(flagged_rows.index)

    def impute(self):
        """Fill missing values in ``self.X``, keeping its row/column labels.

        When no imputer was supplied, one is fitted on ``self.X`` first.
        """
        column_labels, row_labels = self.X.columns, self.X.index

        if not self.imputer:
            self.imputer = self.set_imputer(self.X)

        # The imputer may hand back a bare array, so the labels captured
        # above are put back explicitly.
        filled = pd.DataFrame(self.impute_value(self.X, self.imputer))
        filled.columns = column_labels
        filled.index = row_labels
        self.X = filled

    def run(self):
        """Clean ``self.df``, split it into ``self.X`` / ``self.y``, impute."""
        # Created for downstream consumers; extraction itself is not applied
        # here, the raw "functional_groups" column is dropped before imputing.
        self.functional_group_extractor = FunctionalGroupToGramExtractor()

        print("Print droping and replace null")
        cleaning_steps = (
            self.drop_surface_area_equal_minus_1,
            self.replace_surface_area_equal_0_with_null,
            self.replace_inf_with_null,
            self.label,
            self.drop_unused_columns,
        )
        for step in cleaning_steps:
            self.df = step(self.df)
        zero_selectivity_rows = self.df[self.df["CO2/N2_selectivity"] == 0]
        self.df = self.df.drop(zero_selectivity_rows.index)

        print("Split X, y")
        self.X = self.df.drop(TARGET, axis=1)
        self.y = self.df[[TARGET]]

        print("Impute")
        self.X = self.X.drop("functional_groups", axis=1)
        self.impute()


In [None]:
class TestDataPreprocessingPipeline(PreprocessingPipeline):
    """Turn an unlabelled frame into a feature matrix ``X``.

    Mirrors the training pipeline but drops no rows and produces no target.
    """

    def __init__(
        self,
        df: pd.DataFrame,
        imputer: Any = None,
        functional_group_extractor: Optional[FunctionalGroupToGramExtractor] = None,
        columns: Optional[Any] = None
    ):
        """Store the inputs.

        Args:
            df: Raw frame to preprocess.
            imputer: Object with a ``transform`` method. When falsy, a
                ``KNNImputer`` is fitted on this frame's own features.
            functional_group_extractor: Stored on the instance; ``run`` does
                not use it.
            columns: List-like of column labels used to select and order the
                final ``X``. ``None`` keeps every column as produced.
        """
        self.df = df
        self.imputer = imputer
        self.functional_group_extractor = functional_group_extractor
        self.columns = columns

    def impute(self):
        """Fill missing values in ``self.X``, keeping its row/column labels."""
        temp_columns = self.X.columns
        temp_index = self.X.index

        if not self.imputer:
            self.imputer = self.set_imputer(self.X)

        self.X = self.impute_value(self.X, self.imputer)

        # The imputer may return a bare array; restore the original labels.
        self.X = pd.DataFrame(self.X)
        self.X.columns = temp_columns
        self.X.index = temp_index

    def run(self):
        """Clean ``self.df`` into ``self.X`` and impute missing values."""
        print("Print droping and replace null")
        # Work on a copy: the surface-area step writes into the frame it is
        # given, which would otherwise modify the caller's DataFrame.
        self.X = self.replace_surface_area_equal_0_with_null(self.df.copy())
        self.X = self.replace_inf_with_null(self.X)
        self.X = self.label(self.X)
        self.X = self.drop_unused_columns(self.X)

        print("Impute")
        self.X = self.X.drop('functional_groups', axis=1)
        self.impute()

        # ``columns`` is optional; indexing with None would raise a KeyError.
        if self.columns is not None:
            self.X = self.X[self.columns]



# Run pipeline

In [None]:
class Imputer9999:
    """Stateless imputer that substitutes -9999 for every NaN."""

    def transform(self, df):
        """Return a copy of ``df`` with NaN replaced by -9999."""
        return df.replace(to_replace=np.nan, value=-9999)

# Preprocess the training table, filling gaps with the -9999 sentinel imputer.
imputer = Imputer9999()
train = TrainDataPreprocessingPipeline(df=Train, imputer=imputer)
train.run()

Print droping and replace null
Split X, y
Impute


In [None]:
train.X

Unnamed: 0_level_0,volume [A^3],weight [u],surface_area [m^2/g],void_fraction,void_volume [cm^3/g],metal_linker,organic_linker1,organic_linker2,CO2/N2_selectivity,heat_adsorption_CO2_P0.15bar_T298K [kcal/mol],...,bond_type_countD,bond_type_countT,bond_type_countAm,label_topology_pcu,label_topology_sra,label_topology_acs,label_topology_etb,label_topology_bcu,label_topology_nbo,label_spacegroup_triclinic
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1116.667429,875.240600,-9999.00,0.07899,0.0607,3,4,11,22.864166,6.786041,...,8.0,0.0,0.0,1,0,0,0,0,0,1
2,2769.503842,2211.697211,603.61,0.13794,0.1040,10,44,57,33.616780,7.147286,...,8.0,1.0,0.0,0,0,0,1,0,0,1
3,1089.818728,773.687960,788.50,0.14874,0.1262,2,22,24,19.263726,6.347967,...,2.0,0.0,0.0,1,0,0,0,0,0,1
4,2205.198301,1304.638720,1441.53,0.21814,0.2220,9,17,24,25.701377,6.190085,...,2.0,0.0,0.0,0,1,0,0,0,0,1
5,1137.800963,901.736120,-9999.00,0.07778,0.0591,2,1,22,30.001838,6.478063,...,0.0,0.0,0.0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68609,1188.302573,1001.700216,-9999.00,0.00000,0.0000,3,4,24,24.131770,-9999.000000,...,0.0,0.0,0.0,1,0,0,0,0,0,1
68610,1506.660363,1493.296496,-9999.00,0.01108,0.0000,10,42,46,6.071818,-9999.000000,...,10.0,0.0,0.0,0,0,0,1,0,0,1
68611,2035.532738,1959.518320,-9999.00,0.00000,0.0000,4,14,22,9.876134,-9999.000000,...,0.0,0.0,0.0,0,0,1,0,0,0,1
68612,3985.426053,3638.677280,-9999.00,0.00000,0.0000,4,4,15,5.285051,999999.000000,...,0.0,0.0,0.0,0,0,1,0,0,0,1


In [None]:
# Reuse the training imputer, extractor and column order for the pretest set.
test = TestDataPreprocessingPipeline(
    df=Pretest,
    imputer=train.imputer,
    functional_group_extractor=train.functional_group_extractor,
    columns=train.X.columns,
)
# Carry the lookup-table attributes across from the training pipeline.
test.topology2num = train.topology2num
test.spacegroup2num = train.spacegroup2num
test.run()

Print droping and replace null
Impute


In [None]:
assert all(train.X.columns == test.X.columns)

In [None]:
len(train.X.columns)

30

In [None]:
test.X[train.X.columns]

Unnamed: 0_level_0,volume [A^3],weight [u],surface_area [m^2/g],void_fraction,void_volume [cm^3/g],metal_linker,organic_linker1,organic_linker2,CO2/N2_selectivity,heat_adsorption_CO2_P0.15bar_T298K [kcal/mol],...,bond_type_countD,bond_type_countT,bond_type_countAm,label_topology_pcu,label_topology_sra,label_topology_acs,label_topology_etb,label_topology_bcu,label_topology_nbo,label_spacegroup_triclinic
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,6288.293858,2271.687140,4148.48,0.41225,0.6872,4,7,27,14.048791,5.604779,...,0.0,3.0,0.0,0,0,1,0,0,0,1
2,1790.506437,887.747320,2191.34,0.30231,0.3672,2,4,26,20.217222,6.148776,...,6.0,0.0,0.0,1,0,0,0,0,0,1
3,2348.969203,1239.765880,2030.88,0.28533,0.3256,3,18,22,33.108662,6.164397,...,8.0,0.0,0.0,1,0,0,0,0,0,1
4,2941.571525,1147.951400,3587.13,0.41963,0.6475,2,8,15,12.800562,5.164957,...,2.0,0.0,0.0,1,0,0,0,0,0,1
5,705.397601,643.270740,-9999.00,0.07060,0.0466,3,10,22,23.395617,7.090687,...,2.0,0.0,0.0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1996,5111.109714,1578.082220,3630.31,0.58981,1.1504,2,1,14,4.778698,3.675003,...,3.0,0.0,0.0,0,0,0,0,0,1,1
1997,911.269336,481.279680,2546.02,0.36132,0.4120,2,10,20,6.883387,3.732121,...,2.0,5.0,0.0,1,0,0,0,0,0,1
1998,4236.596494,1127.792600,4296.42,0.60298,1.3641,2,7,20,5.270734,3.354425,...,12.0,5.0,0.0,1,0,0,0,0,0,1
1999,22861.645381,3492.712720,6252.01,0.75732,2.9852,2,6,11,3.172914,2.643592,...,8.0,16.0,0.0,0,0,0,0,0,1,1


In [None]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import (
    VotingRegressor,
    StackingRegressor,
    RandomForestRegressor,
    GradientBoostingRegressor
)
from sklearn.linear_model import Lasso

In [None]:


def log_mean_absolute_error(y_true, y_pred):
    """Return the natural logarithm of the mean absolute error."""
    mae = mean_absolute_error(y_true, y_pred)
    return np.log(mae)


def fit_catboost(X, y):
    """Fit a silent, 500-iteration CatBoostRegressor on (X, y) and return it."""
    settings = {"iterations": 500, "verbose": False}
    regressor = CatBoostRegressor(**settings)
    regressor.fit(X, y)
    return regressor


def fit_xgboost(X, y):
    """Fit a default XGBRegressor on the raw arrays of X and y; return it."""
    booster = XGBRegressor()
    features, target = X.values, y.values
    booster.fit(features, target)
    return booster


def fit_lightboost(X, y):
    """Fit a default LGBMRegressor on the raw arrays of X and y; return it."""
    booster = LGBMRegressor()
    features, target = X.values, y.values
    booster.fit(features, target)
    return booster


def fit_voting(X, y):
    """Fit a CatBoost + XGBoost voting ensemble on (X, y) and return it.

    The members are a silent 500-iteration CatBoostRegressor and a default
    XGBRegressor; fitting uses the raw arrays of X and y.
    """
    cat_member = CatBoostRegressor(iterations=500, verbose=False)
    xgb_member = XGBRegressor()
    ensemble = VotingRegressor([('cat', cat_member), ('xgb', xgb_member)])
    ensemble.fit(X.values, y.values)
    return ensemble


# Cross-validate every fitter in `fit_models` and keep each fold's fitted
# model and scores in `fit_results`, keyed by model name.
fit_models = [fit_voting]
# kf = KFold(n_splits=5, shuffle=True, random_state=1234)
# 5 splits repeated 10 times -> 50 train/validation folds in total.
kf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=1234)
# kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=1234)
fit_results = {}
for k, (train_index, test_index) in enumerate(kf.split(train.X)):
    print(f"K Fold: {k + 1}")
    print("TRAIN:", train_index, "TEST:", test_index)
    # The splitter yields positional indices, hence .iloc.
    X_train, X_test = train.X.iloc[train_index], train.X.iloc[test_index]
    y_train, y_test = train.y.iloc[train_index], train.y.iloc[test_index]

    for fit_model in fit_models:
        # Result key is the function name without its "fit_" prefix,
        # e.g. fit_voting -> "voting".
        model_name = '_'.join(fit_model.__name__.split('fit_')[1:])
        if model_name not in fit_results:
            fit_results[model_name] = []

        model = fit_model(X_train, y_train)
        train_pred = model.predict(X_train)
        log_mean_error_train = log_mean_absolute_error(y_train, train_pred)

        # The model fitted on the training split is scored on the held-out
        # split; it is not refitted.
        test_pred = model.predict(X_test)
        log_mean_error_test = log_mean_absolute_error(y_test, test_pred)

        # Rewrap the predictions with y_test's labels so the subtraction
        # below lines up row by row and column by column.
        test_pred = pd.DataFrame(
            test_pred,
            index= y_test.index,
            columns=y_test.columns
        )
        # Per-row absolute error on the held-out split, largest first.
        error_sort = np.abs(y_test - test_pred).sort_values(TARGET, ascending=False)
        

        print(f"Log mean error train: {log_mean_error_train}")
        print(f"Log mean error test: {log_mean_error_test}")
        print(error_sort.head(10))
        

        fit_results[model_name].append({
            'model': model,
            'log_mean_error_train': log_mean_error_train,
            'log_mean_error_test': log_mean_error_test
        })
# NOTE: after the loop, y_test, test_pred and error_sort still refer to the
# last fold; later cells read them.

K Fold: 1
TRAIN: [    1     2     3 ... 66712 66713 66714] TEST: [    0     4    13 ... 66699 66708 66711]
Log mean error train: 2.830729637327026
Log mean error test: 2.959877678486929
            CO2_working_capacity [mL/g]
Unnamed: 0                             
28723                        245.345026
13905                        211.465785
17031                        201.485206
26786                        197.109078
11015                        184.487758
20510                        182.813665
9703                         178.451673
44212                        176.864697
55158                        167.463218
55868                        162.222568
K Fold: 2
TRAIN: [    0     1     2 ... 66711 66713 66714] TEST: [   18    24    30 ... 66700 66703 66712]
Log mean error train: 2.824259667309154
Log mean error test: 2.9805359261070077
            CO2_working_capacity [mL/g]
Unnamed: 0                             
47904                        229.046227
53863                      

In [None]:
"""
funcgroup2num = {c: i for i, c in enumerate(train['functional_groups'].unique())}
topology2num = {c: i for i, c in enumerate(train['topology'].unique())}
spacegroup2num = {c: i for i, c in enumerate(train['_space_group_crystal_system'].unique())}

train['label_funcgroup'] = train['functional_groups'].map(funcgroup2num)
train['label_topology'] = train['topology'].map(topology2num)
train['label_spacegroup'] = train['_space_group_crystal_system'].map(spacegroup2num)
"""

"\nfuncgroup2num = {c: i for i, c in enumerate(train['functional_groups'].unique())}\ntopology2num = {c: i for i, c in enumerate(train['topology'].unique())}\nspacegroup2num = {c: i for i, c in enumerate(train['_space_group_crystal_system'].unique())}\n\ntrain['label_funcgroup'] = train['functional_groups'].map(funcgroup2num)\ntrain['label_topology'] = train['topology'].map(topology2num)\ntrain['label_spacegroup'] = train['_space_group_crystal_system'].map(spacegroup2num)\n"

In [None]:
np.mean([score['log_mean_error_test'] for score in fit_results['voting']])

2.9844776476974983

In [None]:
np.std([score['log_mean_error_test'] for score in fit_results['voting']])

0.008564641532892311

In [None]:
# Recompute the last fold's per-row absolute errors, largest first.
# test_pred and y_test are whatever the cross-validation loop left bound
# after its final iteration.
test_pred = pd.DataFrame(test_pred, index= y_test.index, columns=y_test.columns)
error_sort = np.abs(y_test - test_pred).sort_values(TARGET, ascending=False)

In [None]:
Train.loc[error_sort.index].head(500).mean()

volume [A^3]                                     2.294609e+03
weight [u]                                       1.470077e+03
surface_area [m^2/g]                             7.099383e+02
void_fraction                                    1.522479e-01
void_volume [cm^3/g]                             1.543830e-01
metal_linker                                     5.026000e+00
organic_linker1                                  1.056200e+01
organic_linker2                                  1.901200e+01
CO2/N2_selectivity                               4.951897e+01
heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]    7.351905e+00
_symmetry_Int_Tables_number                      1.000000e+00
_cell_length_a                                   1.446708e+01
_cell_length_b                                   1.187437e+01
_cell_length_c                                   1.505692e+01
_cell_angle_alpha                                9.150984e+01
_cell_angle_beta                                 9.192422e+01
_cell_an

In [None]:
# Attach the last fold's errors and predictions to the raw training table.
error_train = Train.loc[error_sort.index]
# NOTE: despite the column name, these values are the plain absolute errors
# held in error_sort, not a log-MAE.
# NOTE(review): only rows present in error_sort / test_pred receive a value;
# presumably the remaining rows end up NaN via index alignment - confirm.
Train["LMAE"] = error_sort.iloc[:, 0]
Train["prediction"] = test_pred.iloc[:, 0]

In [None]:
Train.loc[error_sort.index].to_csv("error_train.csv")

In [None]:
Train.loc[error_sort.index].tail(500).mean()

volume [A^3]                                     6.262009e+03
weight [u]                                       1.989694e+03
surface_area [m^2/g]                             2.776634e+03
void_fraction                                    4.034569e-01
void_volume [cm^3/g]                             8.045564e-01
metal_linker                                     3.502000e+00
organic_linker1                                  1.102400e+01
organic_linker2                                  2.038400e+01
CO2/N2_selectivity                               1.699629e+01
heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]    4.920585e+00
_symmetry_Int_Tables_number                      1.000000e+00
_cell_length_a                                   1.708530e+01
_cell_length_b                                   1.667149e+01
_cell_length_c                                   1.855528e+01
_cell_angle_alpha                                9.264591e+01
_cell_angle_beta                                 9.565678e+01
_cell_an

In [None]:
# train_prediction = .predict(X)

In [None]:
# np.log(mean_absolute_error(train_prediction, y))

In [None]:
# feature_importances = pd.DataFrame(catboost.feature_importances_)
# feature_importances.index = Train.drop(TARGET, axis=1).columns

In [None]:
# feature_importances.sort_values(by=0, ascending=False)

In [None]:
# feature_importances[0].nlargest(20).plot(kind='barh')

In [None]:
# feature_importances[0].nlargest(20).index

In [None]:
# Average the pretest predictions of every stored 'voting' fold model.
sub = np.asarray(
    [entry['model'].predict(test.X) for entry in fit_results['voting']]
).mean(axis=0)

In [None]:
sub = pd.DataFrame(sub)

In [None]:
# Pretest = pd.read_csv(f"{PREPROCESSING_PATH}/pretest_merged_CIF/Pretest.csv")
# Pretest = Pretest.set_index(Pretest.columns[0])
# Row ids are "pretest_1" ... "pretest_N"; the index is named 'id'.
sub.index = pd.Index(
    [f"pretest_{position}" for position in range(1, sub.shape[0] + 1)],
    name='id',
)
sub.columns = ['CO2_working_capacity [mL/g]']

In [None]:
sub

Unnamed: 0_level_0,CO2_working_capacity [mL/g]
id,Unnamed: 1_level_1
pretest_1,99.998438
pretest_2,124.138334
pretest_3,191.226678
pretest_4,64.744444
pretest_5,93.375602
...,...
pretest_1996,0.615915
pretest_1997,6.403723
pretest_1998,0.923610
pretest_1999,-8.484995


In [None]:
sub.to_csv("submission.csv")

In [None]:
import zipfile
import hashlib

# Name the archive after the SHA-256 digest of the submission file.
sha256_hash = hashlib.sha256()
with open("submission.csv", "rb") as f:
    # Read and update hash string value in blocks of 4K
    for byte_block in iter(lambda: f.read(4096), b""):
        sha256_hash.update(byte_block)
hash_str = sha256_hash.hexdigest()

print(hash_str)
# Use a context manager so the archive is closed deterministically; a ZipFile
# only writes its closing records on close().
with zipfile.ZipFile(f'{hash_str}.zip', mode='w') as archive:
    archive.write("submission.csv")

8f12a781b0dc31999595f4946a02db24b46e2490d0e9b7e27a67b872d245e1ee


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=cf8541de-dbc3-45f6-bc1e-4fa446cacbcd' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>