In [1]:
import pandas as pd
import numpy as np
import sys
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from tabpfn import TabPFNRegressor

# Make the project root importable from this notebook.
# Start from the directory the kernel is currently running in.
current_working_dir = os.getcwd()

# The root is taken to be two levels above the working directory.
project_root = os.path.abspath(os.path.join(current_working_dir, '../..'))

# Only append when missing, so re-running this cell does not keep adding
# duplicate entries to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
# Load the cleaned dataset.
df = pd.read_csv('../../data/dataset_cleaned.csv')

# One-hot encode the listed columns.
columns_to_encode = ['CM_type', 'CM_morph', 'MS2_morph', 'CP_morph', 'Cation', 'Anion']
df_encoded = pd.get_dummies(df, columns=columns_to_encode)

# The log transform is only defined for strictly positive values; fail loudly
# here rather than letting -inf/NaN targets flow silently into training.
if (df_encoded['Cs'] <= 0).any():
    raise ValueError("Column 'Cs' contains non-positive values; cannot apply log transform.")

# Log-transform the target (vectorised call instead of a per-row lambda).
df_encoded['Cs_log'] = np.log(df_encoded['Cs'])

# Features are every column except the raw and the log-transformed target.
X = df_encoded.drop(['Cs', 'Cs_log'], axis=1)
y = df_encoded['Cs_log']

# Split into training and test sets (20% held out, fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=6
)

In [3]:
# Project-local metrics helper; imported first so a missing module fails
# before the model is fitted.
from function import metrics_to_dataframe

# Initialise the TabPFN regression model with its default settings.
model = TabPFNRegressor()

# Fit on the log-scale target.
model.fit(X_train, y_train)

# Predict in log space and map straight back to the original scale.
y_train_pred = np.exp(model.predict(X_train))
y_test_pred = np.exp(model.predict(X_test))

# Back-transform the true targets into NEW names. Overwriting y_train / y_test
# would make this cell non-idempotent: a second run would fit on targets that
# were already exponentiated and then exponentiate them again.
y_train_orig = np.exp(y_train)
y_test_orig = np.exp(y_test)

# Evaluate on the original scale; the last expression displays the metrics table.
tabpfn_metrics = metrics_to_dataframe(y_train_orig, y_train_pred, y_test_orig, y_test_pred, 'TabPFN')
tabpfn_metrics

Unnamed: 0,model,R2_train,MAE_train,MAPE_train,RMSE_train,R2_test,MAE_test,MAPE_test,RMSE_test
0,TabPFN,0.998049,13.133877,3.002197,24.441127,0.971334,42.858348,8.231612,90.435344
