In [25]:
from pathlib import Path

import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [26]:
# Single place to point the notebook at the data folder (edit for your machine).
# Named DATA_DIR rather than `dir` so the built-in dir() is not shadowed.
DATA_DIR = Path(r"C:\Users\arnab\OneDrive\Jupyter Notebook\Retention_Time_Pred")


def _require_parsed(frame, split_name):
    """Raise ValueError naming the SMILES in `frame` whose 'molecule' entry is missing."""
    unparsed = frame.loc[frame['molecule'].isna(), 'SMILES']
    if not unparsed.empty:
        raise ValueError(
            f"{len(unparsed)} SMILES in the {split_name} set could not be parsed "
            f"by RDKit, e.g. {unparsed.head(5).tolist()}"
        )


# Load datasets
train_data = pd.read_csv(DATA_DIR / 'train.csv')
test_data = pd.read_csv(DATA_DIR / 'test.csv')

# Convert SMILES to RDKit Molecule objects
train_data['molecule'] = train_data['SMILES'].apply(Chem.MolFromSmiles)
test_data['molecule'] = test_data['SMILES'].apply(Chem.MolFromSmiles)

# Chem.MolFromSmiles returns None instead of raising when a SMILES string cannot
# be parsed. Stop here with the offending strings rather than letting a None
# reach the fingerprint step, where the failure would be much harder to read.
_require_parsed(train_data, 'train')
_require_parsed(test_data, 'test')

In [27]:
# Generate ECFP fingerprints
def generate_fingerprint(mol, radius=2, n_bits=1024):
    """Return the Morgan (ECFP-style) bit-vector fingerprint of a molecule.

    Args:
        mol: RDKit molecule object, e.g. the result of Chem.MolFromSmiles.
        radius: Morgan radius passed to RDKit (default 2, as before).
        n_bits: Length of the folded bit vector (default 1024, as before).

    Returns:
        The bit vector produced by AllChem.GetMorganFingerprintAsBitVect.

    Raises:
        ValueError: If `mol` is None, which is what Chem.MolFromSmiles yields
            for a SMILES string it cannot parse.
    """
    if mol is None:
        # Fail with an actionable message instead of an opaque RDKit argument error.
        raise ValueError(
            "Cannot fingerprint a missing molecule (None); "
            "the source SMILES probably failed to parse."
        )
    return AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)

# Store one fingerprint per molecule in a new 'fingerprint' column of each split.
train_data['fingerprint'] = train_data['molecule'].map(generate_fingerprint)
test_data['fingerprint'] = test_data['molecule'].map(generate_fingerprint)

In [28]:
# Convert fingerprints to DataFrame
def fp_to_array(fp):
    """Copy an RDKit fingerprint into a NumPy integer array and return that array.

    Args:
        fp: Fingerprint object accepted by DataStructs.ConvertToNumpyArray.

    Returns:
        The NumPy array that RDKit filled from `fp`.
    """
    # One-element integer buffer handed to RDKit as the destination.
    # NOTE(review): RDKit presumably resizes it to the fingerprint length, so the
    # starting size is only a placeholder - confirm against the RDKit docs.
    bits = np.zeros(1, dtype=int)
    Chem.DataStructs.ConvertToNumpyArray(fp, bits)
    return bits

# Turn every stored fingerprint into an array and collect them into one NumPy
# array per split (one entry per molecule, in row order).
X_train_fingerprints = np.array([fp_to_array(fp) for fp in train_data['fingerprint']])
X_test_fingerprints = np.array([fp_to_array(fp) for fp in test_data['fingerprint']])

# Wrap the arrays as DataFrames so further feature columns can be appended later.
X_train, X_test = pd.DataFrame(X_train_fingerprints), pd.DataFrame(X_test_fingerprints)

In [29]:
# One-hot encode the 'Lab' column
X_train_labs = pd.get_dummies(train_data['Lab'], prefix='lab')
X_test_labs = pd.get_dummies(test_data['Lab'], prefix='lab')

# get_dummies only creates columns for the labs present in the frame it is given,
# so the two splits can end up with different (or differently ordered) lab
# columns. Force the test encoding onto the training layout: labs missing from
# the test set become all-zero columns, and labs the model never saw are dropped.
X_test_labs = X_test_labs.reindex(columns=X_train_labs.columns, fill_value=0)

# Concatenate fingerprints and one-hot encoded labs.
# The feature frames are rebuilt from the fingerprint matrices (rather than
# appended to X_train / X_test) so re-running this cell does not add the lab
# columns a second time. The explicit index keeps rows aligned with the source
# frames during the column-wise concat.
X_train = pd.concat(
    [pd.DataFrame(X_train_fingerprints, index=train_data.index), X_train_labs],
    axis=1,
)
X_test = pd.concat(
    [pd.DataFrame(X_test_fingerprints, index=test_data.index), X_test_labs],
    axis=1,
)

# scikit-learn requires feature names to be of a single type; the fingerprint
# columns are integers and the lab columns are strings, so cast them all to str.
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

# Train and test must expose exactly the same features in the same order.
assert list(X_train.columns) == list(X_test.columns), "train/test feature columns differ"

# Prepare the target variable
y_train = train_data['RT']

In [30]:
# Fit a gradient-boosting regressor on the training features. Only the number of
# estimators and the random seed are set explicitly; all other hyperparameters
# are left at the library defaults and can be tuned further.
model = GradientBoostingRegressor(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict RT for the test set and store it alongside the test inputs
test_data['RT'] = model.predict(X_test)

# Write the SMILES, Lab and predicted RT columns to a new CSV file (no index column)
test_data.to_csv('test_predictions.csv', columns=['SMILES', 'Lab', 'RT'], index=False)