# Импорты и данные

In [72]:
import pandas as pd
from pandas import array
from pandas import DataFrame

import pickle

import numpy as np
from numpy import zeros, array

from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, mean_squared_error, balanced_accuracy_score
from sklearn.model_selection import KFold, cross_val_predict, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.linear_model import LogisticRegression

In [2]:
data = pd.read_csv('data_ML.csv')

In [3]:
smiles = data['smiles'].to_list()
labels = data['labels'].to_list()

In [4]:
del data["smiles"]
del data["labels"]
del data["Unnamed: 0"]

In [5]:
y = labels
X = data

In [6]:
kf = KFold(n_splits=5, random_state=1, shuffle=True)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.2, random_state=0)

# RandomForestClassifier

In [9]:
rfc = RandomForestClassifier(n_estimators = 10, random_state = 1)
rfc.fit(X_train, y_train)
display(rfc.score(X_train, y_train))

0.8972397738609911

In [10]:
pred_rfc = cross_val_predict(rfc, X_test, y_test, cv=kf)

In [11]:
balanced_accuracy_score(y_test, pred_rfc)

0.6720645456626231

In [12]:
mean_squared_error(y_test, pred_rfc)

0.25398936170212766

In [13]:
r2_score(y_test, pred_rfc)

-0.174116339142661

In [14]:
rfc1 = RandomForestClassifier(n_estimators = 50, random_state = 1)
rfc1.fit(X_train, y_train)
display(rfc1.score(X_train, y_train))

0.9035583638177586

In [15]:
pred_rfc1 = cross_val_predict(rfc1, X_test, y_test, cv=kf)

In [16]:
balanced_accuracy_score(y_test, pred_rfc1)

0.6856423503253442

In [17]:
mean_squared_error(y_test, pred_rfc1)

0.24468085106382978

In [18]:
r2_score(y_test, pred_rfc1)

-0.13108589739397702

In [19]:
rfc2 = RandomForestClassifier(n_estimators = 100, random_state = 1)
rfc2.fit(X_train, y_train)
display(rfc2.score(X_train, y_train))

0.9038909211839042

In [20]:
pred_rfc2 = cross_val_predict(rfc2, X_test, y_test, cv=kf)

In [21]:
balanced_accuracy_score(y_test, pred_rfc2)

0.6877431906614786

In [22]:
mean_squared_error(y_test, pred_rfc2)

0.24335106382978725

In [23]:
r2_score(y_test, pred_rfc2)

-0.12493869142987934

In [24]:
rfc3 = RandomForestClassifier(n_estimators = 150, random_state = 1)
rfc3.fit(X_train, y_train)
display(rfc3.score(X_train, y_train))

0.9038909211839042

In [25]:
pred_rfc3 = cross_val_predict(rfc3, X_test, y_test, cv=kf)

In [26]:
balanced_accuracy_score(y_test, pred_rfc3)

0.6921001863780532

In [27]:
mean_squared_error(y_test, pred_rfc3)

0.24202127659574468

In [28]:
r2_score(y_test, pred_rfc3)

-0.11879148546578167

In [29]:
rfc4 = RandomForestClassifier(n_estimators = 200, random_state = 1)
rfc2.fit(X_train, y_train)
display(rfc2.score(X_train, y_train))

0.9038909211839042

In [30]:
pred_rfc4 = cross_val_predict(rfc4, X_test, y_test, cv=kf)

In [31]:
balanced_accuracy_score(y_test, pred_rfc4)

0.6866151129712585

In [32]:
mean_squared_error(y_test, pred_rfc4)

0.24335106382978725

In [33]:
r2_score(y_test, pred_rfc4)

-0.12493869142987934

In [34]:
import seaborn as sns
import matplotlib.pyplot as plt

feats = {}
for feature, importance in zip(data.columns, rfc3.feature_importances_):
    feats[feature] = importance
importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Gini-Importance'})
importances = importances.sort_values(by='Gini-Importance', ascending=False)
importances = importances.reset_index()
importances = importances.rename(columns={'index': 'Features'})
display(importances)

Unnamed: 0,Features,Gini-Importance
0,MW,0.025162
1,LogP,0.017079
2,TPSA,0.014351
3,NumHAcceptors,0.011944
4,NumHeteroatoms,0.011722
...,...,...
2051,1652,0.000000
2052,1914,0.000000
2053,1913,0.000000
2054,175,0.000000


# GradientBoostingClassifier

In [35]:
gbc1 = GradientBoostingClassifier(n_estimators = 50)

In [36]:
gbc1.fit(X_train, y_train)

In [37]:
pred_gbc1 = cross_val_predict(gbc1, X_test, y_test, cv=kf)

In [38]:
balanced_accuracy_score(y_test, pred_gbc1)

0.6720645456626231

In [39]:
mean_squared_error(y_test, pred_gbc1)

0.25398936170212766

In [40]:
r2_score(y_test, pred_gbc1)

-0.174116339142661

In [41]:
gbc2 = GradientBoostingClassifier(n_estimators = 100)

In [42]:
gbc2.fit(X_train, y_train)

In [43]:
pred_gbc2 = cross_val_predict(gbc2, X_test, y_test, cv=kf)

In [44]:
balanced_accuracy_score(y_test, pred_gbc2)

0.6749419612202858

In [45]:
mean_squared_error(y_test, pred_gbc2)

0.25930851063829785

In [46]:
r2_score(y_test, pred_gbc2)

-0.19870516299905172

In [53]:
gbc3 = GradientBoostingClassifier(n_estimators = 200)

In [54]:
gbc3.fit(X_train, y_train)

In [55]:
pred_gbc3 = cross_val_predict(gbc3, X_test, y_test, cv=kf)

In [56]:
balanced_accuracy_score(y_test, pred_gbc3)

0.6959503645816303

In [57]:
mean_squared_error(y_test, pred_gbc3)

0.24601063829787234

In [58]:
r2_score(y_test, pred_gbc3)

-0.1372331033580747

In [59]:
gbc4 = GradientBoostingClassifier(n_estimators = 300)

In [60]:
gbc4.fit(X_train, y_train)

In [61]:
pred_gbc4 = cross_val_predict(gbc4, X_test, y_test, cv=kf)

In [62]:
balanced_accuracy_score(y_test, pred_gbc4)

0.7024082006343393

In [63]:
mean_squared_error(y_test, pred_gbc4)

0.24335106382978725

In [64]:
r2_score(y_test, pred_gbc4)

-0.12493869142987934

In [85]:
feats = {}
for feature, importance in zip(data.columns, gbc4.feature_importances_):
    feats[feature] = importance
importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Gini-Importance'})
importances = importances.sort_values(by='Gini-Importance', ascending=False)
importances = importances.reset_index()
importances = importances.rename(columns={'index': 'Features'})
display(importances)

Unnamed: 0,Features,Gini-Importance
0,694,0.101835
1,1959,0.093201
2,MW,0.063397
3,1019,0.033721
4,1357,0.032127
...,...,...
2051,791,0.000000
2052,790,0.000000
2053,788,0.000000
2054,787,0.000000


# LogisticRegression

In [77]:
logreg = LogisticRegression()

In [78]:
logreg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [79]:
pred_logreg = cross_val_predict(logreg, X_test, y_test, cv=kf)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [80]:
balanced_accuracy_score(y_test, pred_logreg)

0.6866886832554033

In [81]:
mean_squared_error(y_test, pred_logreg)

0.2632978723404255

In [82]:
r2_score(y_test, pred_logreg)

-0.21714678089134498

# Save

In [83]:
rfc3_filename = 'rfc3_model.sav'
pickle.dump(rfc3, open(rfc3_filename, 'wb'))

In [84]:
gbc4_filename = 'gbc4_model.sav'
pickle.dump(gbc4, open(gbc4_filename, 'wb'))