# Molecule

In [None]:
from rdkit import Chem
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import AllChem

In [None]:
# SMILES string of the molecule to render.
smi = 'c1(C#Cc2ccc(C3CCC(CC)CC3)cc2)ccc(CCC)cc1'
mol = Chem.MolFromSmiles(smi)
# MolFromSmiles signals a parse failure by returning None instead of raising,
# so fail here with a clear message rather than inside the drawing call.
if mol is None:
    raise ValueError(f"Could not parse SMILES string: {smi!r}")
# Render the molecule to an image file.
Draw.MolToFile(mol,'.png')

# Pearson correlation coefficient (PCC) code

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr

In [None]:
# Read the feature table, compute the pairwise Pearson correlation matrix of
# its columns and save that matrix to an Excel file.
dataSet = pd.read_excel(".xlsx")
pearson_result = dataSet.corr(method="pearson")
pearson_result.to_excel(".xlsx")

In [None]:
def excuteFilter(x,pearson_result,dataSet,path):
    """Drop strongly correlated columns and write the remaining ones to Excel.

    The lower triangle of the correlation matrix is scanned row by row. A
    pair (i, j) with j < i counts as correlated when its coefficient is
    strictly greater than ``x`` or strictly less than ``-x``.

    Args:
        x: Correlation threshold applied to both signs.
        pearson_result: Square correlation DataFrame; its column labels name
            the features and its values are the coefficients.
        dataSet: Object indexable by a list of column labels whose result
            provides ``to_excel`` (a DataFrame in this notebook).
        path: Destination passed to ``to_excel``.

    Returns:
        None. The retained column labels are printed one per line and the
        retained columns of ``dataSet`` are written to ``path`` without the
        index.
    """
    header = pearson_result.columns
    data = pearson_result.values
    rows, cols = data.shape

    kept = set()     # columns that have caused another column to be dropped
    dropped = set()  # columns excluded from the output

    for i in range(rows):
        col_i = header[i]
        # A column dropped while processing an earlier row is not examined.
        if col_i in dropped:
            continue
        # Only the strict lower triangle (j < i) is inspected.
        for j in range(min(i, cols)):
            value = data[i, j]
            if not (value > x or value < -x):
                continue
            col_j = header[j]
            if col_j in kept and col_i not in dropped:
                # col_j is already a representative, so col_i is the one to go.
                dropped.add(col_i)
                continue
            # NOTE(review): this point is also reached when col_i was dropped
            # earlier in this same row; col_i is then still recorded as kept
            # and col_j is dropped on its behalf. Behaviour is preserved
            # as-is -- confirm whether the scan should stop once col_i is
            # dropped.
            kept.add(col_i)
            dropped.add(col_j)

    retainList = [col for col in header if col not in dropped]
    for col in retainList:
        print(col)
    dataSet[retainList].to_excel(path, index=False)

In [None]:
# Filter with |r| > 0.7 and write the retained columns to an Excel file.
excuteFilter(
    x=0.7,
    pearson_result=pearson_result,
    dataSet=dataSet,
    path=".xlsx",
)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Correlation matrix of the table to visualise.
dataframe = pd.read_excel(".xlsx")
dcorr = dataframe.corr()

# Font configuration for the plot; unicode minus is disabled so negative tick
# labels use the ASCII hyphen.
plt.rcParams.update({
    'font.sans-serif': ['FangSong'],
    'axes.unicode_minus': False,
})

# One large figure whose single axes receives the heatmap.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(30, 30))

# Boolean mask that is True on and above the diagonal, so only the strict
# lower triangle of the matrix is drawn.
upper_triangle = np.triu(np.ones_like(dcorr, dtype=bool))

fig = sns.heatmap(
    dcorr,
    cmap='YlGnBu',
    annot=False,
    fmt=".2f",
    mask=upper_triangle,
    ax=heatmap_ax,
)

# Save the figure that owns the heatmap axes.
fig_path = ".jpg"
heatmap = fig.get_figure()
heatmap.savefig(fig_path, dpi=400)

# Standardization of feature values

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the feature table; X is every row and every column of it.
data=pd.read_excel(".xlsx")
X = data.iloc[:,:]

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit a StandardScaler on X and transform it in one step; X1 holds the
# scaled values.
X1 = StandardScaler().fit_transform(X)

In [None]:
# Wrap the standardised values (X1, produced by StandardScaler.fit_transform)
# in a DataFrame and write them, together with the row index, to Excel.
dataframe = pd.DataFrame(X1)
dataframe.to_excel(".xlsx",index=True)

# Positive-unlabeled (PU) learning code

In [None]:
from pulearn import BaggingPuClassifier
from sklearn.ensemble import RandomForestClassifier
import pandas as pd                    
import numpy as np       

In [None]:
# Read the sheet without treating any row as a header.
df = pd.read_excel(".xlsx",header = None)
# Every column except the last one.
X1 = df.iloc[:, :-1]
# The last column.
# NOTE(review): presumably the 0/1 PU label -- confirm against the data file.
y = df.iloc[:,-1]

In [None]:
# Collect the column labels of the table, leaving out the first and the last
# column, and show the resulting list.
data = pd.read_excel('.xlsx')
column_names = data.columns.tolist()
X_d = column_names[1:]
X_d.pop()
X_d

In [None]:
# Scan the number of bagged estimators (10..99) and print the out-of-bag
# score obtained with each setting.
for i in range(10,100):
    bc = BaggingPuClassifier(
        RandomForestClassifier(
            n_estimators = 100,
            random_state = 0
        ),
        n_estimators = i,
        # NOTE(review): presumably y is a 0/1 label so this is the number of
        # positive samples -- confirm.
        max_samples = sum(y),
        n_jobs = -1,
        random_state = 0,
        oob_score=True
    )
    # NOTE(review): this section defines X1 and y; X comes from a different
    # cell. Confirm that X is the intended feature matrix.
    bc.fit(X, y)
    score = bc.oob_score_
    print("i=" +str(i)+"score",score)

In [None]:
# Final PU model: 36 bagged random forests with out-of-bag scoring enabled.
bc = BaggingPuClassifier(
    RandomForestClassifier(
        n_estimators = 100,
        random_state = 0
    ),
    n_estimators = 36,
    # NOTE(review): presumably y is a 0/1 label so this is the number of
    # positive samples -- confirm.
    max_samples = sum(y),
    n_jobs = -1,
    random_state = 0,
    oob_score=True
)
# Run the PU classification training.
# NOTE(review): this section defines X1 and y; X comes from a different cell.
# Confirm that X is the intended feature matrix.
bc.fit(X, y)
print("score:",bc.oob_score_)

In [None]:
# Print the out-of-bag decision function stored on the fitted classifier.
print(bc.oob_decision_function_)

In [None]:
# Class probabilities predicted by the fitted PU classifier for the rows of X.
y_pp=bc.predict_proba(X)

# kNN code

In [None]:
from sklearn.naive_bayes import BernoulliNB
import numpy as np
import pandas as pd
from sklearn.svm import NuSVC
from sklearn.neighbors import KNeighborsClassifier
# Display settings: show every row and column, allow very wide cells and
# format floats with five decimals.
# Option names are fully qualified because short patterns such as
# 'max_columns' or 'max_row' can match more than one registered option in
# recent pandas releases, which makes set_option raise OptionError.
pd.set_option('display.float_format', lambda x:' %.5f' % x)
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)
pd.set_option('display.max_colwidth',10000)
import sys
# Never abbreviate NumPy arrays when printing them.
np.set_printoptions(threshold=sys.maxsize)

In [None]:
# Load the table used for the kNN model.
data = pd.read_excel('.xlsx')
# Features: every column except the first and the last.
X = data.iloc[:,1:-1]
# Target: the last column.
Y = data.iloc[:,-1]

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; random_state fixes the split.
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size = 0.3,random_state = 0)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Try every neighbour count from 1 to 49 with both weighting schemes and
# print the resulting accuracies.
for n_neighbors in range(1,50,1):
    for weights in ['uniform', 'distance']:
        nei = KNeighborsClassifier(n_neighbors, weights=weights)
        nei.fit(x_train,y_train)
        # Both lines carry the same label: the first is the score on the
        # test split, the second the score on the training split.
        print(f'{n_neighbors},{weights}score', nei.score(x_test,y_test))
        print(f'{n_neighbors},{weights}score', nei.score(x_train,y_train))

In [None]:
# kNN classifier with 22 neighbours and distance weighting, fitted on the
# training split.
clf = KNeighborsClassifier(n_neighbors = 22,weights='distance')
clf.fit(x_train,y_train)

In [None]:
# Predict labels for the test split and show them.
y_pred = clf.predict(x_test)
y_pred

In [None]:
# Score of the fitted classifier on the training and on the test split.
train_score = clf.score(x_train, y_train)
test_score = clf.score(x_test, y_test)
print(f"train score:{train_score};test score:{test_score}")

In [None]:
import matplotlib.pyplot as plt
# One x position per test sample. The count is derived from the predictions
# so the plot keeps working when the size of the test split changes.
sample_index = range(len(y_pred))
# Predictions as large red markers, with the reference labels drawn on top
# as smaller blue markers.
plt.scatter(sample_index,y_pred,c='red',s=200,label='ML.predicted')
plt.scatter(sample_index,y_test,c='blue',label='Experiment')
plt.legend()
plt_path=".jpg"
plt.savefig(plt_path, dpi = 600)

# Confusion matrix code

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the test predictions with the label order fixed to
# [0, 1].
C2 = confusion_matrix(y_test,y_pred,labels = [0,1])
print(C2)

In [None]:
import seaborn as sns

# Annotated heatmap of the confusion matrix, saved as a JPEG.
sns.set(font_scale=1.01)
f, ax = plt.subplots()
sns.heatmap(C2, annot=True, ax=ax)
ax.set(title='confusion_matrix', xlabel='predict', ylabel='true')

ax_path = "confusion_matrix.jpg"
heatmap = ax.get_figure()
heatmap.savefig(ax_path, dpi=600)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
# Print each metric explicitly: as bare expressions only the value of the
# last one would be echoed by a notebook cell, and none when run as a script.
print("accuracy:", accuracy_score(y_test,y_pred))
print("precision:", precision_score(y_test,y_pred))
print("recall:", recall_score(y_test,y_pred))
print("f1:", f1_score(y_test,y_pred))

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
# Scores for the ROC curve: column 1 of predict_proba on the test split.
y_score = clf.predict_proba(x_test)
fpr,tpr,threshold = roc_curve(y_test, y_score[:, 1])
roc_auc = auc(fpr,tpr)
lw = 2
# A single 10x10 figure; creating a default-sized figure first would leave an
# extra empty figure behind.
plt.figure(figsize=(10,10))
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt_path="ROC.jpg"
# Save before show(), while the figure is still the current one.
plt.savefig(plt_path, dpi = 600)
plt.show()

In [None]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.metrics import average_precision_score

# Precision-recall curve of the test split, using column 1 of y_score.
positive_scores = y_score[:, 1]
average_precision = average_precision_score(y_test, positive_scores)
prec, recall, _ = precision_recall_curve(y_test, positive_scores)

pr_display = PrecisionRecallDisplay(precision=prec, recall=recall).plot()
pr_display.ax_.set_title(
    f'2-class Precision-Recall curve: AP={average_precision:0.2f}'
)

plt_path = "jingdu.jpg"
plt.savefig(plt_path, dpi=600)
plt.show()

In [None]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation on the full X / Y; the mean fold score is shown.
# NOTE(review): this rebinds clf to a new, unfitted classifier and uses
# weights='uniform', whereas the classifier fitted on the training split used
# weights='distance' -- confirm this difference is intended.
clf = KNeighborsClassifier(n_neighbors = 22, weights='uniform')
scores = cross_val_score(clf, X, Y, cv=10)
scores.mean()

# SHAP code

In [None]:
import shap
# Load SHAP's JavaScript support so interactive plots can render in the notebook.
shap.initjs()

In [None]:
# Collect the column labels of the table, leaving out the first and the last
# column, and show the resulting list.
data = pd.read_excel('.xlsx')
column_names = data.columns.tolist()
X_d = column_names[1:]
X_d.pop()
X_d

In [None]:
from pandas.core.frame import DataFrame
# Put the feature names into a single-column frame ...
X_dis = DataFrame(X_d)
# ... and transpose it, so X_display has one row holding the names, one name
# per column.
X_display= pd.DataFrame(X_dis.values.T,columns = X_dis.index,index = X_dis.columns)

In [None]:
# Explain column 1 of bc.predict_proba (presumably the positive class --
# confirm) with a KernelExplainer whose background is a single row holding
# the per-feature medians of X.
f = lambda X: bc.predict_proba(X)[:,1]
med = X.median().values.reshape((1,X.shape[1]))
explainer = shap.KernelExplainer(f, med)
# SHAP values for the first row of X, shown as a force plot. The third
# argument is the row of feature names built in X_display.
shap_values_single = explainer.shap_values(X.iloc[0,:], nsamples=100)
shap.force_plot(explainer.expected_value, shap_values_single, X_display.iloc[0,:])

In [None]:
# SHAP values for the first 100 rows of X (fewer if X is shorter).
shap_values = explainer.shap_values(X.iloc[0:100,:], nsamples=100)
# Force plot over those rows; the last argument supplies the feature names.
shap.force_plot(explainer.expected_value, shap_values, X.iloc[0:100,:], X_display.iloc[0,:])

In [None]:
# shap_values was computed for X.iloc[0:100,:], so both summary plots are
# given that same slice as their feature matrix.
explained_rows = X.iloc[0:100,:]
shap.summary_plot(shap_values, explained_rows, X_display.iloc[0,:])
shap.summary_plot(shap_values, explained_rows, X_display.iloc[0,:], plot_type="bar")

In [None]:
# shap_values was computed for X.iloc[0:100,:]; the feature matrix passed to
# dependence_plot must have the same number of rows, so the same slice is used.
explained_rows = X.iloc[0:100,:]
# Dependence plot for the feature at column index 20.
shap.dependence_plot(20, shap_values, explained_rows.values, feature_names=X.columns)
# Dependence plot for the feature selected by SHAP's "rank(1)" specifier.
shap.dependence_plot("rank(1)", shap_values, explained_rows.values, feature_names=X.columns)

In [None]:
# Rebuild the explainer on the median background row and save the force plot
# of row 26 of shap_values as an image.
def f(X):
    return bc.predict_proba(X)[:,1]

feature_medians = X.median().values
med = feature_medians.reshape((1, X.shape[1]))
explainer = shap.KernelExplainer(f, med)
fig4 = shap.force_plot(
    explainer.expected_value,
    shap_values[26],
    X_display.iloc[0,:],
    matplotlib=True,
    show=False,
)
plt.savefig("index4.jpg", dpi=400)

In [None]:
# Rebuild the explainer on the median background row and save the force plot
# of row 65 of shap_values as an image. Only explainer.expected_value and the
# precomputed shap_values are needed, so no SHAP values are recomputed here.
f = lambda X: bc.predict_proba(X)[:,1]
med = X.median().values.reshape((1,X.shape[1]))
explainer = shap.KernelExplainer(f, med)
shap.force_plot(explainer.expected_value, shap_values[65], X_display.iloc[0,:],matplotlib=True,show = False)
plt.savefig("index5.jpg", dpi = 300)

In [None]:
# Rebuild the explainer on the median background row and save the force plot
# of row 83 of shap_values as an image. Only explainer.expected_value and the
# precomputed shap_values are needed, so no SHAP values are recomputed here.
f = lambda X: bc.predict_proba(X)[:,1]
med = X.median().values.reshape((1,X.shape[1]))
explainer = shap.KernelExplainer(f, med)
shap.force_plot(explainer.expected_value, shap_values[83], X_display.iloc[0,:],matplotlib=True,show = False)
plt.savefig("index6.jpg", dpi = 300)