In [408]:
#import all necessary libraries
import pandas as pd
import numpy as np

In [409]:
#import cleaned training dataset
train_df = pd.read_csv('../data/train_improved.csv')
train_df.head()

Unnamed: 0,id,SMILES,Tm,Group 1,Group 2,Group 3,Group 4,Group 5,Group 6,Group 7,...,Group 327,Group 328,Group 331,Group 365,Group 367,Group 372,Group 374,Group 401,Group 402,Group 403
0,2175,FC1=C(F)C(F)(F)C1(F)F,213.15,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1222,c1ccc2c(c1)ccc3Nc4ccccc4c23,407.15,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,2,0,0
2,2994,CCN1C(C)=Nc2ccccc12,324.15,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1704,CC#CC(=O)O,351.15,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2526,CCCCC(S)C,126.15,2,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [410]:
#drop 'id' and 'SMILES' columns and separate features and target variable
TARGET = 'Tm'
X = train_df.drop(columns=['id', TARGET])
y = train_df[TARGET]

In [411]:
#generate basic + intermediate + advanced smiles features and append at the dataframe
import sys 
sys.path.append("..")
from features.basic_smiles_feature_generator import BasicSmilesFeatureGenerator
from features.intermediate_smiles_feature_generator import IntermediateSmilesFeatureGenerator
from features.advanced_smiles_feature_generator import AdvancedSmilesFeatureGenerator

basic_feature_generator = BasicSmilesFeatureGenerator(smiles_col="SMILES")
basic_smiles_features = basic_feature_generator.generate(X)

intermediate_feature_generator = IntermediateSmilesFeatureGenerator(smiles_col="SMILES")
intermediate_smiles_features = intermediate_feature_generator.generate(X)

advanced_feature_generator = AdvancedSmilesFeatureGenerator(smiles_col="SMILES")
advanced_smiles_features = advanced_feature_generator.generate(X)

X = pd.concat(
    [basic_smiles_features, intermediate_smiles_features, advanced_smiles_features,X.drop(columns=['SMILES'])], 
    axis=1
)



In [412]:
#split data into training and validation sets
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [413]:
#create a light gbm regressor

from lightgbm import LGBMRegressor


model1 = LGBMRegressor(
        objective="regression",      # continuous target
        metric="rmse",               # change to "mae" if Kaggle uses MAE
        
        n_estimators=2000,           # large, use early stopping
        learning_rate=0.03,          # small LR = stable learning
        
        num_leaves=64,               # good balance for tabular data
        max_depth=-1,                # let leaves control complexity
        
        min_child_samples=20,        # prevents overfitting
        min_child_weight=1e-3,
        
        subsample=0.8,               # row sampling
        subsample_freq=1,
        colsample_bytree=0.8,        # feature sampling
        
        reg_alpha=0.1,               # L1 regularization
        reg_lambda=0.1,              # L2 regularization
        
        random_state=42,
        n_jobs=-1,
        verbose=-1
    )


In [414]:
#train model1 on training data
model1.fit(X_train, y_train)

0,1,2
,boosting_type,'gbdt'
,num_leaves,64
,max_depth,-1
,learning_rate,0.03
,n_estimators,2000
,subsample_for_bin,200000
,objective,'regression'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [415]:
#predict using model1 on validation set
y_pred = model1.predict(X_val)

In [416]:
#judge model1 on mean absolute error
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(y_val, y_pred)
print(f"Mean Data Value: {y_val.mean()}")
print(f"Mean Absolute Error: {mae}")
print(f"MAE Percentage: {mae / y_val.mean() * 100:.2f}%")

Mean Data Value: 280.14812382739217
Mean Absolute Error: 30.95824012261052
MAE Percentage: 11.05%


In [417]:
#create pipeline with standard scaler, pca and light gbm regressor
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

model2 = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95)),
    ('lgbm', LGBMRegressor(
        objective="regression",      # continuous target
        metric="rmse",               # change to "mae" if Kaggle uses MAE
        
        n_estimators=2000,           # large, use early stopping
        learning_rate=0.03,          # small LR = stable learning
        
        num_leaves=64,               # good balance for tabular data
        max_depth=-1,                # let leaves control complexity
        
        min_child_samples=20,        # prevents overfitting
        min_child_weight=1e-3,
        
        subsample=0.8,               # row sampling
        subsample_freq=1,
        colsample_bytree=0.8,        # feature sampling
        
        reg_alpha=0.1,               # L1 regularization
        reg_lambda=0.1,              # L2 regularization
        
        random_state=42,
        n_jobs=-1,
        verbose=-1
    ))
])

In [418]:
#train model2 on training data
model2.fit(X_train, y_train)

0,1,2
,"steps  steps: list of tuples List of (name of step, estimator) tuples that are to be chained in sequential order. To be compatible with the scikit-learn API, all steps must define `fit`. All non-last steps must also define `transform`. See :ref:`Combining Estimators ` for more details.","[('scaler', ...), ('pca', ...), ...]"
,"transform_input  transform_input: list of str, default=None The names of the :term:`metadata` parameters that should be transformed by the pipeline before passing it to the step consuming it. This enables transforming some input arguments to ``fit`` (other than ``X``) to be transformed by the steps of the pipeline up to the step which requires them. Requirement is defined via :ref:`metadata routing `. For instance, this can be used to pass a validation set through the pipeline. You can only set this if metadata routing is enabled, which you can enable using ``sklearn.set_config(enable_metadata_routing=True)``. .. versionadded:: 1.6",
,"memory  memory: str or object with the joblib.Memory interface, default=None Used to cache the fitted transformers of the pipeline. The last step will never be cached, even if it is a transformer. By default, no caching is performed. If a string is given, it is the path to the caching directory. Enabling caching triggers a clone of the transformers before fitting. Therefore, the transformer instance given to the pipeline cannot be inspected directly. Use the attribute ``named_steps`` or ``steps`` to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. See :ref:`sphx_glr_auto_examples_neighbors_plot_caching_nearest_neighbors.py` for an example on how to enable caching.",
,"verbose  verbose: bool, default=False If True, the time elapsed while fitting each step will be printed as it is completed.",False

0,1,2
,"copy  copy: bool, default=True If False, try to avoid a copy and do inplace scaling instead. This is not guaranteed to always work inplace; e.g. if the data is not a NumPy array or scipy.sparse CSR matrix, a copy may still be returned.",True
,"with_mean  with_mean: bool, default=True If True, center the data before scaling. This does not work (and will raise an exception) when attempted on sparse matrices, because centering them entails building a dense matrix which in common use cases is likely to be too large to fit in memory.",True
,"with_std  with_std: bool, default=True If True, scale the data to unit variance (or equivalently, unit standard deviation).",True

0,1,2
,"n_components  n_components: int, float or 'mle', default=None Number of components to keep. if n_components is not set all components are kept::  n_components == min(n_samples, n_features) If ``n_components == 'mle'`` and ``svd_solver == 'full'``, Minka's MLE is used to guess the dimension. Use of ``n_components == 'mle'`` will interpret ``svd_solver == 'auto'`` as ``svd_solver == 'full'``. If ``0 < n_components < 1`` and ``svd_solver == 'full'``, select the number of components such that the amount of variance that needs to be explained is greater than the percentage specified by n_components. If ``svd_solver == 'arpack'``, the number of components must be strictly less than the minimum of n_features and n_samples. Hence, the None case results in::  n_components == min(n_samples, n_features) - 1",0.95
,"copy  copy: bool, default=True If False, data passed to fit are overwritten and running fit(X).transform(X) will not yield the expected results, use fit_transform(X) instead.",True
,"whiten  whiten: bool, default=False When True (False by default) the `components_` vectors are multiplied by the square root of n_samples and then divided by the singular values to ensure uncorrelated outputs with unit component-wise variances. Whitening will remove some information from the transformed signal (the relative variance scales of the components) but can sometime improve the predictive accuracy of the downstream estimators by making their data respect some hard-wired assumptions.",False
,"svd_solver  svd_solver: {'auto', 'full', 'covariance_eigh', 'arpack', 'randomized'}, default='auto' ""auto"" :  The solver is selected by a default 'auto' policy is based on `X.shape` and  `n_components`: if the input data has fewer than 1000 features and  more than 10 times as many samples, then the ""covariance_eigh""  solver is used. Otherwise, if the input data is larger than 500x500  and the number of components to extract is lower than 80% of the  smallest dimension of the data, then the more efficient  ""randomized"" method is selected. Otherwise the exact ""full"" SVD is  computed and optionally truncated afterwards. ""full"" :  Run exact full SVD calling the standard LAPACK solver via  `scipy.linalg.svd` and select the components by postprocessing ""covariance_eigh"" :  Precompute the covariance matrix (on centered data), run a  classical eigenvalue decomposition on the covariance matrix  typically using LAPACK and select the components by postprocessing.  This solver is very efficient for n_samples >> n_features and small  n_features. It is, however, not tractable otherwise for large  n_features (large memory footprint required to materialize the  covariance matrix). Also note that compared to the ""full"" solver,  this solver effectively doubles the condition number and is  therefore less numerical stable (e.g. on input data with a large  range of singular values). ""arpack"" :  Run SVD truncated to `n_components` calling ARPACK solver via  `scipy.sparse.linalg.svds`. It requires strictly  `0 < n_components < min(X.shape)` ""randomized"" :  Run randomized SVD by the method of Halko et al. .. versionadded:: 0.18.0 .. versionchanged:: 1.5  Added the 'covariance_eigh' solver.",'auto'
,"tol  tol: float, default=0.0 Tolerance for singular values computed by svd_solver == 'arpack'. Must be of range [0.0, infinity). .. versionadded:: 0.18.0",0.0
,"iterated_power  iterated_power: int or 'auto', default='auto' Number of iterations for the power method computed by svd_solver == 'randomized'. Must be of range [0, infinity). .. versionadded:: 0.18.0",'auto'
,"n_oversamples  n_oversamples: int, default=10 This parameter is only relevant when `svd_solver=""randomized""`. It corresponds to the additional number of random vectors to sample the range of `X` so as to ensure proper conditioning. See :func:`~sklearn.utils.extmath.randomized_svd` for more details. .. versionadded:: 1.1",10
,"power_iteration_normalizer  power_iteration_normalizer: {'auto', 'QR', 'LU', 'none'}, default='auto' Power iteration normalizer for randomized SVD solver. Not used by ARPACK. See :func:`~sklearn.utils.extmath.randomized_svd` for more details. .. versionadded:: 1.1",'auto'
,"random_state  random_state: int, RandomState instance or None, default=None Used when the 'arpack' or 'randomized' solvers are used. Pass an int for reproducible results across multiple function calls. See :term:`Glossary `. .. versionadded:: 0.18.0",

0,1,2
,boosting_type,'gbdt'
,num_leaves,64
,max_depth,-1
,learning_rate,0.03
,n_estimators,2000
,subsample_for_bin,200000
,objective,'regression'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [419]:
#predict using model2 on validation set
y_pred = model2.predict(X_val)



In [420]:
#judge model2 based on mean absolute error
mae = mean_absolute_error(y_val, y_pred)
print(f"Mean Data Value: {y_val.mean()}")
print(f"Mean Absolute Error: {mae}")
print(f"MAE Percentage: {mae / y_val.mean() * 100:.2f}%")

Mean Data Value: 280.14812382739217
Mean Absolute Error: 32.530620526332115
MAE Percentage: 11.61%


In [421]:
#create a pipeline with TruncatedSVD applied to group numerical features and light GBM regressor
from sklearn.decomposition import TruncatedSVD
from sklearn.compose import ColumnTransformer

#apply truncated SVD to numerical features with group in their name
numeric_group_cols = [col for col in X.columns if 'group' in col.lower()]
group_col_transformer = ColumnTransformer(
    transformers=[
        ('tsvd', TruncatedSVD(n_components=50), numeric_group_cols)
    ], remainder='passthrough'
)

model3 = Pipeline([
    ('tsvd', group_col_transformer),
    ('lgbm', LGBMRegressor(
        objective="regression",      # continuous target
        metric="rmse",               # change to "mae" if Kaggle uses MAE
        
        n_estimators=2000,           # large, use early stopping
        learning_rate=0.03,          # small LR = stable learning
        
        num_leaves=64,               # good balance for tabular data
        max_depth=-1,                # let leaves control complexity
        
        min_child_samples=20,        # prevents overfitting
        min_child_weight=1e-3,
        
        subsample=0.8,               # row sampling
        subsample_freq=1,
        colsample_bytree=0.8,        # feature sampling
        
        reg_alpha=0.1,               # L1 regularization
        reg_lambda=0.1,              # L2 regularization
        
        random_state=42,
        n_jobs=-1,
        verbose=-1
    ))
])

In [422]:
#fit model3 on training data
model3.fit(X_train, y_train)

0,1,2
,"steps  steps: list of tuples List of (name of step, estimator) tuples that are to be chained in sequential order. To be compatible with the scikit-learn API, all steps must define `fit`. All non-last steps must also define `transform`. See :ref:`Combining Estimators ` for more details.","[('tsvd', ...), ('lgbm', ...)]"
,"transform_input  transform_input: list of str, default=None The names of the :term:`metadata` parameters that should be transformed by the pipeline before passing it to the step consuming it. This enables transforming some input arguments to ``fit`` (other than ``X``) to be transformed by the steps of the pipeline up to the step which requires them. Requirement is defined via :ref:`metadata routing `. For instance, this can be used to pass a validation set through the pipeline. You can only set this if metadata routing is enabled, which you can enable using ``sklearn.set_config(enable_metadata_routing=True)``. .. versionadded:: 1.6",
,"memory  memory: str or object with the joblib.Memory interface, default=None Used to cache the fitted transformers of the pipeline. The last step will never be cached, even if it is a transformer. By default, no caching is performed. If a string is given, it is the path to the caching directory. Enabling caching triggers a clone of the transformers before fitting. Therefore, the transformer instance given to the pipeline cannot be inspected directly. Use the attribute ``named_steps`` or ``steps`` to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. See :ref:`sphx_glr_auto_examples_neighbors_plot_caching_nearest_neighbors.py` for an example on how to enable caching.",
,"verbose  verbose: bool, default=False If True, the time elapsed while fitting each step will be printed as it is completed.",False

0,1,2
,"transformers  transformers: list of tuples List of (name, transformer, columns) tuples specifying the transformer objects to be applied to subsets of the data. name : str  Like in Pipeline and FeatureUnion, this allows the transformer and  its parameters to be set using ``set_params`` and searched in grid  search. transformer : {'drop', 'passthrough'} or estimator  Estimator must support :term:`fit` and :term:`transform`.  Special-cased strings 'drop' and 'passthrough' are accepted as  well, to indicate to drop the columns or to pass them through  untransformed, respectively. columns : str, array-like of str, int, array-like of int, array-like of bool, slice or callable  Indexes the data on its second axis. Integers are interpreted as  positional columns, while strings can reference DataFrame columns  by name. A scalar string or int should be used where  ``transformer`` expects X to be a 1d array-like (vector),  otherwise a 2d array will be passed to the transformer.  A callable is passed the input data `X` and can return any of the  above. To select multiple columns by name or dtype, you can use  :obj:`make_column_selector`.","[('tsvd', ...)]"
,"remainder  remainder: {'drop', 'passthrough'} or estimator, default='drop' By default, only the specified columns in `transformers` are transformed and combined in the output, and the non-specified columns are dropped. (default of ``'drop'``). By specifying ``remainder='passthrough'``, all remaining columns that were not specified in `transformers`, but present in the data passed to `fit` will be automatically passed through. This subset of columns is concatenated with the output of the transformers. For dataframes, extra columns not seen during `fit` will be excluded from the output of `transform`. By setting ``remainder`` to be an estimator, the remaining non-specified columns will use the ``remainder`` estimator. The estimator must support :term:`fit` and :term:`transform`. Note that using this feature requires that the DataFrame columns input at :term:`fit` and :term:`transform` have identical order.",'passthrough'
,"sparse_threshold  sparse_threshold: float, default=0.3 If the output of the different transformers contains sparse matrices, these will be stacked as a sparse matrix if the overall density is lower than this value. Use ``sparse_threshold=0`` to always return dense. When the transformed output consists of all dense data, the stacked result will be dense, and this keyword will be ignored.",0.3
,"n_jobs  n_jobs: int, default=None Number of jobs to run in parallel. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"transformer_weights  transformer_weights: dict, default=None Multiplicative weights for features per transformer. The output of the transformer is multiplied by these weights. Keys are transformer names, values the weights.",
,"verbose  verbose: bool, default=False If True, the time elapsed while fitting each transformer will be printed as it is completed.",False
,"verbose_feature_names_out  verbose_feature_names_out: bool, str or Callable[[str, str], str], default=True - If True, :meth:`ColumnTransformer.get_feature_names_out` will prefix  all feature names with the name of the transformer that generated that  feature. It is equivalent to setting  `verbose_feature_names_out=""{transformer_name}__{feature_name}""`. - If False, :meth:`ColumnTransformer.get_feature_names_out` will not  prefix any feature names and will error if feature names are not  unique. - If ``Callable[[str, str], str]``,  :meth:`ColumnTransformer.get_feature_names_out` will rename all the features  using the name of the transformer. The first argument of the callable is the  transformer name and the second argument is the feature name. The returned  string will be the new feature name. - If ``str``, it must be a string ready for formatting. The given string will  be formatted using two field names: ``transformer_name`` and ``feature_name``.  e.g. ``""{feature_name}__{transformer_name}""``. See :meth:`str.format` method  from the standard library for more info. .. versionadded:: 1.0 .. versionchanged:: 1.6  `verbose_feature_names_out` can be a callable or a string to be formatted.",True
,"force_int_remainder_cols  force_int_remainder_cols: bool, default=False This parameter has no effect. .. note::  If you do not access the list of columns for the remainder columns  in the `transformers_` fitted attribute, you do not need to set  this parameter. .. versionadded:: 1.5 .. versionchanged:: 1.7  The default value for `force_int_remainder_cols` will change from  `True` to `False` in version 1.7. .. deprecated:: 1.7  `force_int_remainder_cols` is deprecated and will be removed in 1.9.",'deprecated'

0,1,2
,"n_components  n_components: int, default=2 Desired dimensionality of output data. If algorithm='arpack', must be strictly less than the number of features. If algorithm='randomized', must be less than or equal to the number of features. The default value is useful for visualisation. For LSA, a value of 100 is recommended.",50
,"algorithm  algorithm: {'arpack', 'randomized'}, default='randomized' SVD solver to use. Either ""arpack"" for the ARPACK wrapper in SciPy (scipy.sparse.linalg.svds), or ""randomized"" for the randomized algorithm due to Halko (2009).",'randomized'
,"n_iter  n_iter: int, default=5 Number of iterations for randomized SVD solver. Not used by ARPACK. The default is larger than the default in :func:`~sklearn.utils.extmath.randomized_svd` to handle sparse matrices that may have large slowly decaying spectrum.",5
,"n_oversamples  n_oversamples: int, default=10 Number of oversamples for randomized SVD solver. Not used by ARPACK. See :func:`~sklearn.utils.extmath.randomized_svd` for a complete description. .. versionadded:: 1.1",10
,"power_iteration_normalizer  power_iteration_normalizer: {'auto', 'QR', 'LU', 'none'}, default='auto' Power iteration normalizer for randomized SVD solver. Not used by ARPACK. See :func:`~sklearn.utils.extmath.randomized_svd` for more details. .. versionadded:: 1.1",'auto'
,"random_state  random_state: int, RandomState instance or None, default=None Used during randomized svd. Pass an int for reproducible results across multiple function calls. See :term:`Glossary `.",
,"tol  tol: float, default=0.0 Tolerance for ARPACK. 0 means machine precision. Ignored by randomized SVD solver.",0.0

0,1,2
,boosting_type,'gbdt'
,num_leaves,64
,max_depth,-1
,learning_rate,0.03
,n_estimators,2000
,subsample_for_bin,200000
,objective,'regression'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [423]:
#predict using model3 on validation set
y_pred = model3.predict(X_val)



In [424]:
#judge model3 based on mean absolute error
mae = mean_absolute_error(y_val, y_pred)
print(f"Mean Data Value: {y_val.mean()}")
print(f"Mean Absolute Error: {mae}")
print(f"MAE Percentage: {mae / y_val.mean() * 100:.2f}%")

Mean Data Value: 280.14812382739217
Mean Absolute Error: 29.962143379559272
MAE Percentage: 10.70%


In [425]:
#import test dataset
test_df = pd.read_csv('../data/test.csv')
test_df.head()

Unnamed: 0,id,SMILES,Group 1,Group 2,Group 3,Group 4,Group 5,Group 6,Group 7,Group 8,...,Group 415,Group 416,Group 417,Group 418,Group 419,Group 420,Group 421,Group 422,Group 423,Group 424
0,1022,CCOC(=O)c1ccc(O)cc1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1146,CCCCCCc1ccc(O)cc1O,1,4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,79,ClCBr,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2279,C=CCCCCCCCC,1,7,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1342,Fc1ccc(cc1)C(F)(F)F,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [426]:
#save ids for submission file
test_ids = test_df['id']

#drop 'id' column from test set and generate features
X_test = test_df.drop(columns=['id'])

basic_feature_generator = BasicSmilesFeatureGenerator(smiles_col="SMILES")
basic_smiles_features = basic_feature_generator.generate(X_test)
intermediate_feature_generator = IntermediateSmilesFeatureGenerator(smiles_col="SMILES")
intermediate_smiles_features = intermediate_feature_generator.generate(X_test)
advanced_feature_generator = AdvancedSmilesFeatureGenerator(smiles_col="SMILES")
advanced_smiles_features = advanced_feature_generator.generate(X_test)

X_test = pd.concat(
    [basic_smiles_features, intermediate_smiles_features, advanced_smiles_features,X_test.drop(columns=['SMILES'])], 
    axis=1
)



In [427]:
#apply model3 to predict on test set and save to ../output/submission.csv
test_preds = model3.predict(X_test)
submission_df = pd.DataFrame({'id': test_ids, 'Tm': test_preds})
submission_df.to_csv('../output/submission.csv', index=False)

