In [9]:
import numpy as np 
import pandas as pd 

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('./kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

./kaggle/input\siatprotein2023\Rhla.xlsx
./kaggle/input\siatprotein2023\sample_submission.csv
./kaggle/input\siatprotein2023\test.csv
./kaggle/input\siatprotein2023\training.csv


In [10]:
train_data = pd.read_csv("./kaggle/input/siatprotein2023/training.csv")
print("训练数据集：")
train_data.head()

训练数据集：


Unnamed: 0,SequenceID,Sequence,Activity,Selectivity
0,1,RAQLSQ,1.0,1.0
1,2,AAQLSQ,3.228,1.837
2,3,CAQLSQ,2.17,2.445
3,4,DAQLSQ,1.759,1.061
4,5,EAQLSQ,1.531,1.032


In [11]:
test_data = pd.read_csv("./kaggle/input/siatprotein2023/test.csv")
print("测试数据集：")
test_data.head()

测试数据集：


Unnamed: 0,SequenceID,Sequence
0,1594,AACCAD
1,1595,AACCTQ
2,1596,AADCSM
3,1597,AADQTQ
4,1598,AADTLQ


In [12]:
sample_data = pd.read_csv("./kaggle/input/siatprotein2023/sample_submission.csv")
print("最后提交的数据格式：")
sample_data.head()

最后提交的数据格式：


Unnamed: 0,SequenceID,Activity,Selectivity
0,1594,2.043824,0.616243
1,1595,2.641935,2.026498
2,1596,0.956439,0.506145
3,1597,2.279373,0.427398
4,1598,0.056524,1.903379


In [13]:
def assign_score(df:pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    def _assign_score(percentile):
        if percentile <= 0.005:
            return 5
        elif 0.005 < percentile <= 0.02:
            return 1
        elif 0.02 < percentile <= 0.1:
            return 0.1
        else:
            return 0
    df['Fitness'] = df['Activity'] * df['Selectivity']
    percentiles = df['Fitness'].rank(pct=True, ascending=False)
    df['score'] = percentiles.apply(_assign_score)
    return df

In [17]:
df.head()

Unnamed: 0,SequenceID,Sequence,Activity,Selectivity,Fitness,score
0,1,RAQLSQ,1.0,1.0,1.0,0.0
1,2,AAQLSQ,3.228,1.837,5.929836,5.0
2,3,CAQLSQ,2.17,2.445,5.30565,1.0
3,4,DAQLSQ,1.759,1.061,1.866299,0.0
4,5,EAQLSQ,1.531,1.032,1.579992,0.0


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error
from scipy.stats import spearmanr
import pandas as pd
import numpy as np
import time

# 读取excel文件
df = pd.read_csv('./kaggle/input/siatprotein2023/training.csv')
df = assign_score(df)


# 提取氨基酸序列
sequences = df.iloc[:, 1].tolist()

# 为预测的目标指标创建标签
activity_labels = df.iloc[:, 2].tolist()
selectivity_labels = df.iloc[:, 3].tolist()
fitness_labels = df.iloc[:, 4].tolist()
score_labels = df.iloc[:, 5].tolist()



# 定义Pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer='char')),
    ('rf', RandomForestRegressor(random_state=40))
])

# 定义网格搜索参数
params = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'rf__n_estimators': [50, 100],
    'rf__max_depth': [None, 10]
}
# 定义评分函数
scorer = make_scorer(mean_squared_error, greater_is_better=False)

# 划分训练集和测试集
X_train, X_test, y_train_activity, y_test_activity = train_test_split(sequences, activity_labels, test_size=0.2, random_state=42)
_, _, y_train_selectivity, y_test_selectivity = train_test_split(sequences, selectivity_labels, test_size=0.2, random_state=42)
_, _, y_train_fitness, y_test_fitness = train_test_split(sequences, fitness_labels, test_size=0.2, random_state=42)
_, _, y_train_score, y_test_score = train_test_split(sequences, score_labels, test_size=0.2, random_state=42)



# 为活性和选择性分别进行网格搜索和模型训练
trained_models = {}  # 存储训练好的模型

for y_train, y_test, label in [
    # (y_train_activity, y_test_activity, "Activity"), 
    # (y_train_selectivity, y_test_selectivity, "Selectivity"),
    # (y_train_fitness, y_test_fitness, "Fitness"),
    (y_train_score, y_test_score, "Score"),
]:
    start_time = time.time()
    grid_search = GridSearchCV(pipeline, params, cv=5, scoring=scorer)
    grid_search.fit(X_train, y_train)
    end_time = time.time()

    # 输出运算时间
    execution_time = end_time - start_time
    print(f"代码执行时间: {execution_time} 秒")


    # 输出最佳参数
    print(f"Best parameters for {label}: ", grid_search.best_params_)
    # 使用最佳模型进行预测
    preds = grid_search.predict(X_test)
    # 输出预测结果的RMSE
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    print(f"{label} RMSE: {rmse}")
    # 计算平均绝对误差
    mae = mean_absolute_error(y_test, preds)
    print(f"{label} MAE: {mae}")
    # 计算Spearman秩相关系数
    spearman = spearmanr(y_test, preds)[0]
    print(f"{label} Spearman: {spearman}\n")

    # 存储训练好的模型
    trained_models[label] = grid_search.best_estimator_


代码执行时间: 31.271180629730225 秒
Best parameters for Score:  {'rf__max_depth': None, 'rf__n_estimators': 100, 'tfidf__ngram_range': (1, 2)}
Score RMSE: 0.3329270774080615
Score MAE: 0.07200626959247648
Score Spearman: 0.009831393080815862



In [15]:
# 读取测试数据
test_data = pd.read_csv("./kaggle/input/siatprotein2023/test.csv")
# 提取测试数据的氨基酸序列
test_sequences = test_data['Sequence'].tolist()

# 使用训练好的模型进行预测
result_df = pd.DataFrame()
result_df['SequenceID'] = test_data['SequenceID']

for label in ["Activity", "Selectivity"]:
    preds = trained_models[label].predict(test_sequences)
    result_df[label] = preds

# 将结果保存为csv文件
result_df.to_csv('predictions.csv', index=False)
print("Your submission was successfully saved!")
#查看结果，确保格式正确后提交
print(result_df)

KeyError: 'Activity'