In [1]:
import duckdb
import pandas as pd

In [2]:
# Locations of the test and training sets.
test_path = 'D:/Taggle/test.parquet'
train_path = 'D:/Taggle/train.parquet'

# Number of rows drawn for each class (binds = 0 and binds = 1).
SAMPLES_PER_CLASS = 30000

con = duckdb.connect()  # open a DuckDB connection

# Read a class-balanced random sample from the training Parquet file.
#   * f"""...""" is an f-string: {train_path} and {SAMPLES_PER_CLASS} are
#     substituted into the SQL text before it is sent to DuckDB.
#   * SELECT * keeps every column of the source.
#   * parquet_scan('<path>') reads rows from the Parquet file at that path.
#   * WHERE binds = 0 / binds = 1 restricts each sub-query to one class.
#   * ORDER BY random() shuffles the matching rows; LIMIT keeps the first N.
#   * UNION ALL concatenates both sub-queries, keeping duplicate rows.
#   * .df() converts the query result into a pandas DataFrame.
# NOTE(review): random() is not seeded here, so every run draws a different
# sample and downstream scores will vary between runs.
try:
    df = con.query(f"""(SELECT *
                        FROM parquet_scan('{train_path}')
                        WHERE binds = 0
                        ORDER BY random()
                        LIMIT {SAMPLES_PER_CLASS})
                        UNION ALL
                        (SELECT *
                        FROM parquet_scan('{train_path}')
                        WHERE binds = 1
                        ORDER BY random()
                        LIMIT {SAMPLES_PER_CLASS})""").df()
finally:
    # Close the connection even when the query fails (e.g. missing file).
    con.close()

In [3]:
df.head(n=5)  # preview the first five rows of the sampled DataFrame

Unnamed: 0,id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,protein_name,binds
0,140040997,O=C(Nc1c(C(=O)O)cccc1C(=O)O)OCC1c2ccccc2-c2ccc...,Cl.NCCCCF,COc1cccc(N)n1,COc1cccc(Nc2nc(NCCCCF)nc(Nc3c(C(=O)O)cccc3C(=O...,HSA,0
1,249760641,O=C(O)C[C@@H](Cc1ccc(I)cc1)NC(=O)OCC1c2ccccc2-...,COc1ccc(O)c(N)c1,Cl.NCc1nc(-c2ccco2)n[nH]1,COc1ccc(O)c(Nc2nc(NCc3nc(-c4ccco4)n[nH]3)nc(N[...,BRD4,0
2,244876262,O=C(O)C[C@@H](Cc1ccc(C(F)(F)F)cc1)NC(=O)OCC1c2...,Nc1c(F)cccc1F,Nc1ccc2ncccc2c1,O=C(C[C@@H](Cc1ccc(C(F)(F)F)cc1)Nc1nc(Nc2ccc3n...,sEH,0
3,244177558,O=C(O)C[C@@H](Cc1ccc(C(F)(F)F)cc1)NC(=O)OCC1c2...,CN(C)c1nc(Cl)c(CN)s1.Cl.Cl,Nc1ccc(F)cc1Cl,CN(C)c1nc(Cl)c(CNc2nc(Nc3ccc(F)cc3Cl)nc(N[C@@H...,HSA,0
4,256993266,O=C(O)C[C@@H](NC(=O)OCC1c2ccccc2-c2ccccc21)c1c...,Nc1ccc2cn[nH]c2c1,Nc1n[nH]c2ncccc12,O=C(C[C@@H](Nc1nc(Nc2ccc3cn[nH]c3c2)nc(Nc2n[nH...,BRD4,0


In [4]:
from rdkit import Chem  # RDKit的核心模块，用于分子对象的创建和基本操作
from rdkit.Chem import AllChem  # RDKit的拓展模块，提供高级化学功能
from sklearn.ensemble import RandomForestClassifier  # 随机森林分类器
from sklearn.model_selection import train_test_split  # 数据集划分
from sklearn.metrics import average_precision_score  # 评估指标
from sklearn.preprocessing import OneHotEncoder  # 一热编码


In [5]:
# Convert each SMILES string into an RDKit molecule.
# Series.apply calls Chem.MolFromSmiles once per value of the molecule_smiles
# column; the results are stored in a new 'molecule' column of the DataFrame.
smiles_column = df['molecule_smiles']
df['molecule'] = smiles_column.apply(Chem.MolFromSmiles)

# ECFP generation: Morgan fingerprint with radius 2 and 1024 bits by default.
def generate_ecfp(molecule, radius=2, bits=1024):
    """Return the Morgan (ECFP) bit fingerprint of ``molecule`` as a list.

    Args:
        molecule: RDKit molecule, or None when no molecule is available.
        radius: Morgan radius passed to RDKit (default 2).
        bits: Length of the bit vector, passed as ``nBits`` (default 1024).

    Returns:
        A list with one entry per fingerprint bit, or None when ``molecule``
        is None.
    """
    if molecule is None:
        return None
    fingerprint = AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits=bits)
    # ToList() builds the Python list inside RDKit; list(fingerprint) would
    # fetch every bit through a separate Python-level call.
    # NOTE(review): recent RDKit releases flag GetMorganFingerprintAsBitVect as
    # deprecated in favour of rdFingerprintGenerator - consider migrating.
    return fingerprint.ToList()
    
# Compute one fingerprint per molecule with generate_ecfp and store the
# results in a new 'ecfp' column of the DataFrame.
fingerprints = df['molecule'].apply(generate_ecfp)
df['ecfp'] = fingerprints

In [7]:
df.head(n=5)  # two new columns are now present: molecule and ecfp

Unnamed: 0,id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,protein_name,binds,molecule,ecfp
0,140040997,O=C(Nc1c(C(=O)O)cccc1C(=O)O)OCC1c2ccccc2-c2ccc...,Cl.NCCCCF,COc1cccc(N)n1,COc1cccc(Nc2nc(NCCCCF)nc(Nc3c(C(=O)O)cccc3C(=O...,HSA,0,<rdkit.Chem.rdchem.Mol object at 0x00000249CAB...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,249760641,O=C(O)C[C@@H](Cc1ccc(I)cc1)NC(=O)OCC1c2ccccc2-...,COc1ccc(O)c(N)c1,Cl.NCc1nc(-c2ccco2)n[nH]1,COc1ccc(O)c(Nc2nc(NCc3nc(-c4ccco4)n[nH]3)nc(N[...,BRD4,0,<rdkit.Chem.rdchem.Mol object at 0x00000249CAB...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,244876262,O=C(O)C[C@@H](Cc1ccc(C(F)(F)F)cc1)NC(=O)OCC1c2...,Nc1c(F)cccc1F,Nc1ccc2ncccc2c1,O=C(C[C@@H](Cc1ccc(C(F)(F)F)cc1)Nc1nc(Nc2ccc3n...,sEH,0,<rdkit.Chem.rdchem.Mol object at 0x00000249CAB...,"[0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
3,244177558,O=C(O)C[C@@H](Cc1ccc(C(F)(F)F)cc1)NC(=O)OCC1c2...,CN(C)c1nc(Cl)c(CN)s1.Cl.Cl,Nc1ccc(F)cc1Cl,CN(C)c1nc(Cl)c(CNc2nc(Nc3ccc(F)cc3Cl)nc(N[C@@H...,HSA,0,<rdkit.Chem.rdchem.Mol object at 0x00000249CAB...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,256993266,O=C(O)C[C@@H](NC(=O)OCC1c2ccccc2-c2ccccc21)c1c...,Nc1ccc2cn[nH]c2c1,Nc1n[nH]c2ncccc12,O=C(C[C@@H](Nc1nc(Nc2ccc3cn[nH]c3c2)nc(Nc2n[nH...,BRD4,0,<rdkit.Chem.rdchem.Mol object at 0x00000249CAB...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
# Keep only rows that have a fingerprint: generate_ecfp returns None when the
# molecule is None, and `None + list` in the feature concatenation below
# would raise a TypeError.
featurized = df[df['ecfp'].notna()]

# One-hot encode the protein_name (dense output so each row converts to a plain list)
onehot_encoder = OneHotEncoder(sparse_output=False)
protein_onehot = onehot_encoder.fit_transform(featurized['protein_name'].values.reshape(-1, 1))

# Combine ECFPs and one-hot encoded protein_name into one feature list per row
X = [ecfp + protein for ecfp, protein in zip(featurized['ecfp'].tolist(), protein_onehot.tolist())]
y = featurized['binds'].tolist()

# Split the data into train and test sets (80% / 20%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the random forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_proba = rf_model.predict_proba(X_test)[:, 1]  # Probability of the positive class

# Calculate the average precision of the predicted probabilities on the held-out split
map_score = average_precision_score(y_test, y_pred_proba)
print(f"Mean Average Precision (mAP): {map_score:.2f}")