In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from preprocessing import preprocessing
from rdkit import Chem
from rdkit.Chem import AllChem, RDKFingerprint


#Ensemble learning and random forest

df = preprocessing("C:\\Users\Gilbert\Documents\BCB_Research\Kcat_Benchmark_ML_Models\kcat_transferase.csv")

In [2]:
d = df.copy()

d.head()

data = d

In [3]:
df.head()

Unnamed: 0,EC_number,Species,Compound,Compound_name,Amino_encoding,Kcat,unit
0,2.1.1.1,Homo sapiens,C1=CC(=CN=C1)C(=O)N,Nicotinamide,MESGFTSKDTYLSHFNPRDFLEKYYKFGSRHSAESQILKHLLKNLF...,0.041,s^(-1)
1,2.1.1.1,Homo sapiens,C1=CC(=CN=C1)C(=O)N,Nicotinamide,MESGFTSKDTYLSHFNPRDYLEKYYKFGSRHSAESQILKHLLKNLF...,1.02,s^(-1)
2,2.1.1.1,Homo sapiens,C1=CC(=CN=C1)C(=O)N,Nicotinamide,MESGFTSKDTYLSHFNPRDYLEKYYKFGSRHSAESQILKHLLKNLF...,0.083,s^(-1)
3,2.1.1.10,Brassica oleracea,C(CS)C(C(=O)O)N,L-Homocysteine,MGLEKKSALLEDLIEKCGGCAVVDGGFATQLEIHGAAINDPLWSAV...,0.0375,s^(-1)
4,2.1.1.10,Escherichia coli,C(CS)C(C(=O)O)N,L-Homocysteine,MSQNNPLRALLDKQDILLLDGAMATELEARGCNLADSLWSAKVLVE...,0.38,s^(-1)


In [4]:
#preprocessing
#lets encode the data using label encoder 
label_encoder = LabelEncoder()
data["EC_number"] = label_encoder.fit_transform(data["EC_number"])
data["Species"] = label_encoder.fit_transform(data["Species"])

amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
amino_to_index = {amino: i for i, amino in enumerate(amino_acids)}

# Convert amino acid sequences to one-hot encoding
def convert_to_one_hot(sequence):
    one_hot_sequence = np.zeros((len(sequence), len(amino_acids)))
    
    for i, amino in enumerate(sequence):
        if amino in amino_to_index:
            index = amino_to_index[amino]
            one_hot_sequence[i, index] = 1
            
    return one_hot_sequence

# Apply the conversion to the DataFrame column

data["Amino"] = data["Amino_encoding"].apply(convert_to_one_hot)

data.drop(columns=["Amino_encoding"], inplace = True)

# convert compound name into numbers.
compound = data["Compound"]

data["smiles"] = [Chem.MolFromSmiles(smiles) for smiles in compound]

mol = data["smiles"]

data["Fingerprint_rdk"] = [RDKFingerprint(i) for i in mol]






In [5]:
data.head()

Unnamed: 0,EC_number,Species,Compound,Compound_name,Kcat,unit,Amino,smiles,Fingerprint_rdk
0,0,99,C1=CC(=CN=C1)C(=O)N,Nicotinamide,0.041,s^(-1),"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",<rdkit.Chem.rdchem.Mol object at 0x000001B849B...,"[1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, ..."
1,0,99,C1=CC(=CN=C1)C(=O)N,Nicotinamide,1.02,s^(-1),"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",<rdkit.Chem.rdchem.Mol object at 0x000001B849B...,"[1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, ..."
2,0,99,C1=CC(=CN=C1)C(=O)N,Nicotinamide,0.083,s^(-1),"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",<rdkit.Chem.rdchem.Mol object at 0x000001B849B...,"[1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, ..."
3,1,39,C(CS)C(C(=O)O)N,L-Homocysteine,0.0375,s^(-1),"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",<rdkit.Chem.rdchem.Mol object at 0x000001B849B...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
4,1,81,C(CS)C(C(=O)O)N,L-Homocysteine,0.38,s^(-1),"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",<rdkit.Chem.rdchem.Mol object at 0x000001B849B...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."


In [6]:
# train and split the data
# EC_number, Species, Amino, and fingerprint_rdk

from sklearn.model_selection import train_test_split

# main features
# Flatten fingerprint_rdk in to a flatten data list 
fingerprints = np.array(data['Fingerprint_rdk'].tolist())

# fix this part lol, 
amino_features = np.array(data['Amino'].tolist())  # Modify this depending on the format of 'Amino'


# Select relevant columns
selected_features = ['EC_number', 'Species']  # Add other columns as needed

# Convert selected columns to a NumPy array
other_features = data[selected_features].values

# Combine features into numpy array
x = np.hstack((other_features, fingerprints, amino_features))

# target variables
y = data["Kcat"]

x_train, y_train, x_test, y_test = train_test_split(x, y, test_size=0.2, random_state= 0)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (4136,) + inhomogeneous part.

In [None]:
# this initialization of the regression model
rf_regressor = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)

rf_regressor.fit(x_train, y_train)

y_pred = rf_regressor.predict(x_test)


ValueError: setting an array element with a sequence.