In [1]:
import pandas as pd
df = pd.read_csv('acetylcholinesterase_bioactivity_data_pIC50_pubchem_fp.csv')
df.head()

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880,pIC50
0,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.124939
1,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,7.0
2,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,4.30103
3,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.522879
4,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.09691


In [2]:
X = df.drop(['pIC50'], axis=1)
Y = df.iloc[:,-1]

In [3]:
from sklearn.feature_selection import VarianceThreshold

def remove_low_variance(input_data, threshold=0.1):
    selection = VarianceThreshold(threshold)
    selection.fit(input_data)
    return input_data[input_data.columns[selection.get_support(indices=True)]]

X = remove_low_variance(X, threshold=0.1)

In [6]:
cat_features = list(range(0, X.shape[1]))

In [7]:
from catboost import Pool
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split

data = train_test_split(X, Y, test_size=0.2, random_state=0)
X_train, X_validation, y_train, y_validation = data

train_pool = Pool(
    data=X_train, 
    label=y_train, 
    cat_features=cat_features
)

validation_pool = Pool(
    data=X_validation, 
    label=y_validation, 
    cat_features=cat_features
)

In [8]:
model = CatBoostRegressor(loss_function='RMSE')
model.fit(train_pool, eval_set=validation_pool, verbose=False)

<catboost.core.CatBoostRegressor at 0x7fe34a7d7be0>

In [10]:
Y_pred = model.predict(X)
Y_pred

array([5.94716841, 6.61813858, 4.91465913, ..., 5.54215127, 5.61465885,
       5.60192842])

In [11]:
from sklearn.metrics import mean_squared_error, r2_score
print('Mean squared error (MSE): %.2f'
      % mean_squared_error(Y, Y_pred))
print('Coefficient of determination (R^2): %.2f'
      % r2_score(Y, Y_pred))

Mean squared error (MSE): 1.05
Coefficient of determination (R^2): 0.56


In [13]:
from sklearn.ensemble import RandomForestRegressor
model2 = RandomForestRegressor(n_estimators=500, random_state=42)
model2.fit(X, Y)
r2 = model2.score(X, Y)
r2

0.7956074743528752

In [14]:
Y_pred2 = model2.predict(X)
print('Mean squared error (MSE): %.2f'
      % mean_squared_error(Y, Y_pred2))
print('Coefficient of determination (R^2): %.2f'
      % r2_score(Y, Y_pred2))

Mean squared error (MSE): 0.49
Coefficient of determination (R^2): 0.80


In [15]:
import pickle
pickle.dump(model2, open('acetylcholinesterase_model.pkl', 'wb'))