In [None]:
# The SISSO modeling and parameter-tuning procedure follows the reference implementation at https://github.com/rouyang2017/SISSO

In [None]:
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import ticker  
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from matplotlib.colors import ListedColormap
import matplotlib.ticker as ticker
from matplotlib.colors import LinearSegmentedColormap

In [None]:
# Load the SISSO training table. Later cells read its 'descriptor_1' and
# 'descriptor_2' columns and treat the first 20 rows as CPL samples and all
# remaining rows as no-CPL samples, so the row order of the sheet matters.
df = pd.read_excel("DES-139-15000.xlsx")

In [None]:
#2D projection map visualizing the classification results obtained using the SISSO algorithm.

In [None]:
# Global figure styling: Arial text, square 7x7 inch canvas, white axes.
plt.rcParams['font.family'] = 'Arial'
plt.rcParams['figure.figsize'] = (7, 7)
plt.rcParams['axes.facecolor'] = 'white'
fig, ax = plt.subplots()

# Split by row position: the first 20 rows are plotted as CPL samples and
# every remaining row as no-CPL.
cpl_data = df.iloc[:20]
no_cpl_data = df.iloc[20:]

ax.scatter(cpl_data['descriptor_1'], cpl_data['descriptor_2'],
           alpha=0.8, c="#FF0D57", label='CPL', marker="o", s=100)
ax.scatter(no_cpl_data['descriptor_1'], no_cpl_data['descriptor_2'],
           alpha=1, c="#008BFB", label='No CPL', marker="x", s=100)

# Fixed axis window, shared with the decision-boundary figure.
ax.set_xlim(-5, 22)
ax.set_ylim(-15.0, 200.0)

# Hide tick marks and tick labels entirely. Because no ticks are drawn, no
# tick formatter or tick-label styling is configured here (the original cell
# built a ScalarFormatter that was never attached to an axis).
ax.set_xticks([])
ax.set_yticks([])

ax.legend(
    loc='upper right',
    bbox_to_anchor=(0.98, 0.98),
    framealpha=1,
    fontsize=16,
    borderaxespad=0.5,
    frameon=True
)

ax.set_title('SISSO Classification Results', fontsize=20, weight="bold")
ax.set_xlabel('Descriptor 1', fontsize=20)
ax.set_ylabel('Descriptor 2', fontsize=20)

plt.savefig("2D-DES-139-15000.png", dpi=300, format="png", bbox_inches='tight')

In [None]:
#Decision boundary corresponding to the SISSO classification results.

In [None]:
# Feature matrix: the two SISSO descriptors, one row per sample.
X = df[['descriptor_1', 'descriptor_2']].values

# Labels follow the row order of the table: the first `n_cpl` rows are CPL (1)
# and every remaining row is no-CPL (0). The number of negatives is derived
# from the data instead of being hard-coded, so the labels always match the
# `df.iloc[:20]` / `df.iloc[20:]` split used for plotting.
n_cpl = 20
if len(X) < n_cpl:
    raise ValueError(
        f"Expected at least {n_cpl} rows (the CPL samples), got {len(X)}"
    )
y = np.concatenate([
    np.ones(n_cpl, dtype=int),
    np.zeros(len(X) - n_cpl, dtype=int),
])

# Fit a linear classifier in descriptor space; its decision function is
# w1*x_1 + w2*x_2 + w0.
model = LogisticRegression()
model.fit(X, y)

w0 = model.intercept_[0]
w1 = model.coef_[0][0]
w2 = model.coef_[0][1]

def format_term(coef, variable):
    """Format one signed term of a linear expression.

    Parameters
    ----------
    coef : float
        Coefficient of the term.
    variable : str
        Text appended directly after the number (may be empty for a constant).

    Returns
    -------
    str
        ``"+ <|coef|>{variable}"`` when ``coef >= 0`` and
        ``"- <|coef|>{variable}"`` otherwise, with the magnitude printed to
        four decimal places.
    """
    sign = '+' if coef >= 0 else '-'
    # Always print the magnitude: formatting the raw value would render a
    # negative zero as "+ -0.0000".
    return f"{sign} {abs(coef):.4f}{variable}"

# Render each term of w1*x_1 + w2*x_2 + w0 with an explicit sign.
term1, term2, term3 = (
    format_term(coefficient, symbol)
    for coefficient, symbol in ((w1, 'x_1'), (w2, 'x_2'), (w0, ''))
)

# A leading "+ " on the first term is redundant, so strip it; a leading "- "
# is kept.
term1 = term1[2:] if term1.startswith('+') else term1

# Mathtext string that is later drawn on the decision-boundary figure.
formula = "Decision boundary: $" + " ".join([term1, term2, term3]) + " = 0$"
print(formula)

In [None]:
# Build a regular grid over the plotting window to evaluate the classifier on.
h = 1000  # number of grid points along each axis
x_min, x_max = -5, 22
y_min, y_max = -15.0, 200.0

# np.linspace yields exactly `h` points per axis and includes both end points,
# so the grid spans the full axis window. np.arange with a fractional step can
# produce an inconsistent number of points and stops short of the upper limit.
xx, yy = np.meshgrid(
    np.linspace(x_min, x_max, h),
    np.linspace(y_min, y_max, h)
)

# Predicted probability of class 1 (CPL) at every grid node, reshaped back to
# the grid layout for contour plotting.
Z = model.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
Z = Z.reshape(xx.shape)

# Closed-form decision boundary w1*x + w2*y + w0 = 0 evaluated at the two ends
# of the x window (for drawing the line as a straight segment).
x_line = np.array([x_min, x_max])
y_line = (-w0 - w1 * x_line) / w2

In [None]:
# Global figure styling: Arial text, square 7x7 inch canvas, white axes.
plt.rcParams['font.family'] = 'Arial'
plt.rcParams['figure.figsize'] = (7, 7)
plt.rcParams['axes.facecolor'] = 'white'
fig, ax = plt.subplots()

# Shade the two predicted regions. The levels are pinned to [0, 0.5, 1] so the
# colour change falls exactly on the p = 0.5 boundary drawn below; with
# automatic levels the two-colour map may switch at a different probability.
cmap_light = ListedColormap(['#c6d9f1', '#f4cccc'])
ax.contourf(xx, yy, Z, levels=[0.0, 0.5, 1.0], alpha=0.3, cmap=cmap_light)

# Decision boundary: the p = 0.5 contour of the predicted probability.
ax.contour(xx, yy, Z, levels=[0.5], colors='k', linewidths=2, linestyles='dashed')

# Training samples, split by label.
ax.scatter(X[y==1, 0], X[y==1, 1], alpha=0.8, c="#FF0D57", label='CPL', marker="o", s=100)
ax.scatter(X[y==0, 0], X[y==0, 1], alpha=1, c="#008BFB", label='No CPL', marker="x", s=100)

# Fixed axis window.
ax.set_xlim(-5, 22)
ax.set_ylim(-15.0, 200.0)

# Hide tick marks and tick labels entirely.
ax.set_xticks([])
ax.set_yticks([])

# Scientific-notation tick formatting. Each axis gets its own formatter
# instance, because a formatter is bound to the axis it is attached to.
for axis in (ax.xaxis, ax.yaxis):
    formatter = ticker.ScalarFormatter(useMathText=True)
    formatter.set_scientific(True)
    formatter.set_powerlimits((0, 0))
    axis.set_major_formatter(formatter)

# Nudge the exponent (offset) text position.
ax.xaxis.get_offset_text().set_x(0.05)
ax.yaxis.get_offset_text().set_x(-0.05)

# Print the fitted boundary equation in the lower-left corner of the axes.
ax.text(0.05, 0.05, formula, transform=ax.transAxes, fontsize=12,
        bbox=dict(facecolor='white', alpha=0.8))

ax.legend(
    loc='upper right',
    bbox_to_anchor=(0.98, 0.98),
    framealpha=1,
    fontsize=16,
    borderaxespad=0.5,
    frameon=True
)

ax.set_title('Classification Boundary', fontsize=20, weight="bold")
ax.set_xlabel('Descriptor 1', fontsize=20)
ax.set_ylabel('Descriptor 2', fontsize=20)

# Tick-label size (only visible if the ticks above are re-enabled).
ax.tick_params(axis='both', labelsize=12)

# Save, then display.
plt.savefig("Decision boundary-DES-139-15000.png", dpi=300, format="png", bbox_inches='tight')
plt.show()

In [None]:
# 2D visualization map of the prediction library (n=1242) generated using the SISSO model

In [None]:
# Use Arial for all figure text.
plt.rcParams['font.family'] = 'Arial'

# Load the prediction library. The listed spreadsheet markers are parsed as
# missing values, and any row missing a descriptor or its Number is dropped.
required_columns = ['descriptor 1', 'descriptor 2', 'Number']
data = (
    pd.read_excel(
        "SISSO Decision boundary.xlsx",
        na_values=['#VALUE!', 'error', 'NaN'],
    )
    .dropna(subset=required_columns)
)

In [None]:
# Pull the plotted columns by name. (The original positional `iloc` reads
# were immediately overwritten by these and have been removed.)
x = data['descriptor 1'].values
y = data['descriptor 2'].values
numbers = data['Number'].values

# Canvas and fixed axis window.
plt.figure(figsize=(12, 8))
plt.xlim(-10, 27)
plt.ylim(-30, 230)

# Library points: orange markers without an edge.
plt.scatter(x, y, color='#F4B722', edgecolor='none', s=70, alpha=0.8)

# Decision boundary 1.7365*x1 - 0.1219*x2 - 5.3818 = 0, solved for x2 and
# drawn slightly beyond the data range in x1.
x_range = np.linspace(min(x) - 1, max(x) + 1, 1000)
y_boundary = (1.7365 * x_range - 5.3818) / 0.1219
plt.plot(x_range, y_boundary, 'k--', linewidth=2.5, label='Decision boundary')

plt.title('Prediction Result', fontsize=20, weight="bold")
plt.xlabel('Descriptor 1', fontsize=20)
plt.ylabel('Descriptor 2', fontsize=20)

# Boundary equation as mathtext. Subscripts use `_` markup (as in the other
# figures) rather than Unicode subscript glyphs, which depend on the math
# font providing those characters.
formula_text = r'Decision boundary: $1.7365x_1 - 0.1219x_2 - 5.3818 = 0$'
plt.text(0.05, 0.08, formula_text, transform=plt.gca().transAxes,
         fontsize=16, verticalalignment='top', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

# Hide tick marks and tick labels entirely.
plt.xticks([])
plt.yticks([])

plt.legend(loc="lower right", fontsize=16)

# Save, then display.
plt.tight_layout()
plt.savefig("Prediction Result.png", dpi=300, format="png", bbox_inches='tight')
plt.show()

In [None]:
# Signed value of the boundary expression for every library point; rows are
# identified by their Number.
decision_values = 1.7365 * x - 0.1219 * y - 5.3818
tolerance = 1e-6  # half-width of the band treated as "on the boundary"

# Side-of-boundary masks.
left_mask = decision_values < -tolerance
right_mask = decision_values > tolerance
boundary_mask = np.abs(decision_values) <= tolerance

# Negative side -> 'no CPL', positive side -> 'CPL', anything else -> 'Median'.
region_labels = np.select(
    [left_mask, right_mask],
    ['no CPL', 'CPL'],
    default='Median',
)

# Assemble the per-point result table.
result_columns = {
    'Number': numbers,
    'descriptor 1': x,
    'descriptor 2': y,
    '决策边界值': decision_values,
    '区域分类': region_labels,
}
result_df = pd.DataFrame(result_columns)

# Write the table to Excel.
output_file = '决策边界分类结果.xlsx'
result_df.to_excel(output_file, index=False)

# Report where the file went and how many points fall in each region.
print(f"分类结果已保存至 {output_file}")
print("数据点分布统计：")
print(result_df['区域分类'].value_counts())

In [None]:
# Library coordinates and identifiers.
x = data['descriptor 1'].values
y = data['descriptor 2'].values
numbers = data['Number'].values

# Canvas and fixed axis window.
plt.figure(figsize=(12, 8))
plt.xlim(-10, 27)
plt.ylim(-30, 230)

# All library points as orange markers without an edge.
plt.scatter(x, y, color='#F4B722', edgecolor='none', s=70, alpha=0.8, label='Data points')

# Decision boundary solved for descriptor 2, drawn slightly beyond the data
# range in descriptor 1.
x_range = np.linspace(min(x) - 1, max(x) + 1, 1000)
y_boundary = (1.7365 * x_range - 5.3818) / 0.1219
plt.plot(x_range, y_boundary, 'k--', linewidth=2.5, label='Decision boundary')

# Experimental outcomes, read from the first sheet of the results workbook.
exp_data = pd.read_excel("20-final.xlsx", sheet_name='Sheet1')

# Keep rows whose outcome is one of the two recognised marks, then attach the
# descriptor coordinates by matching on Number.
outcome_col = '实际结果'
valid_data = (
    exp_data[['Number', outcome_col]]
    .copy()
    .dropna(subset=[outcome_col])
    .loc[lambda frame: frame[outcome_col].isin(['√', '×'])]
    .merge(data[['Number', 'descriptor 1', 'descriptor 2']],
           on='Number', how='inner')
)

# Split by outcome mark.
CPL_points = valid_data.loc[valid_data[outcome_col].eq('√')]
NoCPL_points = valid_data.loc[valid_data[outcome_col].eq('×')]

# '×' outcomes: blue crosses.
plt.scatter(NoCPL_points['descriptor 1'], NoCPL_points['descriptor 2'],
            marker='x', s=120, color='#008BFB', linewidth=2.5,
            label='No CPL prediction')

# '√' outcomes: red stars.
plt.scatter(CPL_points['descriptor 1'], CPL_points['descriptor 2'],
            marker='*', s=200, color='#FF0D57',
            label='CPL prediction')

plt.title('Prediction Result with Experimental Data', fontsize=20, weight="bold")
plt.xlabel('Descriptor 1', fontsize=20)
plt.ylabel('Descriptor 2', fontsize=20)

# Boundary equation shown in a rounded box near the lower-left corner.
formula_text = r'Decision boundary: $1.7365x_1 - 0.1219x_2 - 5.3818 = 0$'
formula_box = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
plt.text(0.05, 0.08, formula_text, transform=plt.gca().transAxes,
         fontsize=16, verticalalignment='top', bbox=formula_box)

# Hide tick marks and tick labels.
plt.xticks([])
plt.yticks([])

plt.legend(loc="lower right", fontsize=16)

# Save, then display.
plt.tight_layout()
plt.savefig("20-Prediction Result with Experimental Data.tif", dpi=300, format="tif", bbox_inches='tight')
plt.show()

In [None]:
# Parameter tuning process diagram for the SISSO model

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
import matplotlib.ticker as ticker

In [None]:
# SISSO tuning results: one overlap size per tested nf_sis value.
# NOTE: this rebinds `data` and `df`, which earlier cells use for the
# prediction library and the training table; re-run those loading cells
# before re-running any earlier analysis cell.
nf_sis_values = [3000, 4000, 5000, 6000, 8000, 10000, 12000, 15000, 18000]
overlap_sizes = [
    -10574.7183, -10574.7183, -10770.57864, -10770.57864,
    -10574.7183, -11370.00123, -11370.00123, -11423.09374, -11423.09374,
]

data = {
    'nf_sis': nf_sis_values,
    'Size of Overlap': overlap_sizes,
}
df = pd.DataFrame(data)

In [None]:
# Global styling: Arial text and white axes background. Both are set before
# any axes exist, so they apply to this figure.
plt.rcParams['font.family'] = 'Arial'
plt.rcParams['axes.facecolor'] = 'white'

# Canvas with a dashed background grid.
plt.figure(figsize=(13, 8))
plt.grid(True, linestyle='--', alpha=0.7)

nf_sis_series = df['nf_sis']
overlap_series = df['Size of Overlap']

# Blue line joining the tuning points, drawn beneath the markers.
plt.plot(nf_sis_series, overlap_series,
         '#008BFB', linewidth=4, alpha=1,
         label='Trend Line', zorder=1)

# Red markers for the individual tuning runs.
plt.scatter(nf_sis_series, overlap_series,
            c='#FF0D57', s=200, alpha=1, label='Data Points', zorder=2)

plt.xlabel('nf_sis', fontsize=20, weight='bold')
plt.ylabel('Size of Overlap', fontsize=20, weight='bold')
plt.title('SISSO Parameter Tuning Process', fontsize=20, weight='bold')
plt.legend(loc='upper right', fontsize=16, frameon=True, framealpha=0.9)

# Scientific notation (mathtext exponent) on the y axis.
plt.gca().yaxis.set_major_formatter(ticker.ScalarFormatter(useMathText=True))
plt.gca().ticklabel_format(axis='y', style='sci', scilimits=(0,0))

# Larger tick labels on both axes.
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)

# Larger exponent (offset) text on the y axis.
ax = plt.gca()
ax.yaxis.get_offset_text().set_fontsize(14)

# Label every point with its overlap value, 10 points above the marker.
for nf_value, overlap_value in zip(nf_sis_series, overlap_series):
    plt.annotate(f"{overlap_value:.1f}",
                 (nf_value, overlap_value),
                 textcoords="offset points",
                 xytext=(0,10),
                 ha='center',
                 fontsize=11)

plt.tight_layout()

# Force a draw before saving.
plt.gcf().canvas.draw()
plt.savefig("SISSO_Parameter_Tuning.png", dpi=300, format="png", bbox_inches='tight')