In [15]:
#Interface feature pairing
import pandas as pd
import numpy as np
from sklearn.neighbors import KernelDensity
from scipy import stats
from matplotlib import pyplot as plt
from scipy.stats import gaussian_kde
from scipy.integrate import dblquad  
from sklearn.model_selection import GridSearchCV
from scipy.integrate import quad 
import warnings
warnings.filterwarnings("ignore")

# Load the dataset from the CSV file.
df = pd.read_csv('PRODIGY dataset.csv')

# Names of the selected ICs feature columns.
selected_features_list = [
    'Ics_polar-apolar',
    'Ics_charg-apolar',
    'Ics_charg-polar',
    'Ics_charg-charg',
    'Ics_apolar-apolar',
]

# Working column set: Binding_affinity, the selected ICs columns (same order
# as above), then the two BSA columns.
selected_features_list_y_BSAs = [
    'Binding_affinity',
    *selected_features_list,
    'BSAapolar',
    'BSApolar',
]

# Keep only the working columns and show the resulting table.
data1 = df[selected_features_list_y_BSAs]
print(data1)

# Compute the mutual information between each selected ICs feature and a BSA
# column. This part only gathers the sample arrays used by that computation.
feature = selected_features_list
y1_name = 'BSApolar'  # may be replaced with 'BSAapolar' to get the corresponding pairing result
Y1 = data1[y1_name].values

# Raw sample vector of every selected feature, keyed by column name.
xi_dict = {feature_name: data1[feature_name].values for feature_name in feature}

# For each feature, a 2-row array: row 0 is the feature, row 1 is Y1.
# Keys have the form '<feature>_<y1_name>'.
data_values = {
    f'{feature_name}_{y1_name}': np.array([x_values, Y1])
    for feature_name, x_values in xi_dict.items()
}

# One-dimensional Gaussian KDE of the Y1 column, with the bandwidth chosen by
# cross-validated grid search around a rule-of-thumb starting value.
m = 4  # number of cross-validation folds handed to GridSearchCV
n0 = len(Y1)
values0 = Y1.reshape(-1, 1)  # column vector: one sample per row

# Rule-of-thumb bandwidth: 1.06 * min(std, IQR / 1.34) * n^(-1/5).
y_column0 = data1[y1_name]
iqr0 = y_column0.quantile(0.75) - y_column0.quantile(0.25)
min_std0 = min(np.std(y_column0), iqr0 / 1.34)
better_bandwidth0 = 1.06 * min_std0 * n0 ** (-1 / 5)

# Candidate bandwidths: 1000 evenly spaced values from 0.1x to 2x the
# rule-of-thumb value.
bandwidths0 = np.linspace(0.1 * better_bandwidth0, 2 * better_bandwidth0, 1000)
grid0 = GridSearchCV(
    KernelDensity(kernel='gaussian'),
    {'bandwidth': bandwidths0},
    cv=m,
).fit(values0)
best_bandwidth0 = grid0.best_params_['bandwidth']

# Final density model for Y1, fitted on all samples with the selected bandwidth.
kde0 = KernelDensity(kernel='gaussian', bandwidth=best_bandwidth0).fit(values0)

def pdf(y):
    """Evaluate the fitted density model ``kde0`` at the single point ``y``.

    ``kde0.score_samples`` yields the log-density; it is exponentiated here.
    The result is returned as a one-element array, not a bare float.
    """
    query_point = [[y]]  # score_samples takes a 2-D (n_samples, n_features) input
    return np.exp(kde0.score_samples(query_point))

# Mutual information between every selected ICs feature X and the chosen BSA
# column Y, estimated from kernel density estimates:
#     I(X; Y) = integral of p(x, y) * ln( p(x, y) / (p(x) * p(y)) ) dx dy
# Results are stored in MI_withy_dict, keyed by feature name.
MI_withy_dict = {}
for feature_name in feature:
    # Paired samples: row 0 holds the feature values, row 1 holds Y1.
    xy_samples = data_values[f'{feature_name}_{y1_name}']
    covariance_matrix = np.cov(xy_samples)
    std_devs = np.sqrt(np.diag(covariance_matrix))
    n = xy_samples.shape[1]  # number of samples
    d = xy_samples.shape[0]  # number of dimensions (X and Y)
    # Per-dimension rule-of-thumb bandwidth:
    #     h_i = (4 / (d + 2))^(1 / (d + 4)) * n^(-1 / (d + 4)) * sigma_i
    bandwidths = ((4 / (d+2)) ** (1 / (d+4)) * n ** (-1 / (d+4))) * std_devs

    # Joint density: product-Gaussian KDE with the diagonal bandwidth matrix
    # diag(bandwidths), evaluated directly with NumPy.
    # The earlier version passed a callable returning np.diag(bandwidths) as
    # gaussian_kde's bw_method. SciPy expects such a callable to return a
    # *scalar* factor that multiplies the data covariance, so the matrix was
    # broadcast element-wise and the kernel width ended up proportional to
    # the variance instead of h_i (and differed between SciPy versions).
    x_samples, y_samples = xy_samples
    h_x, h_y = bandwidths
    joint_norm = n * 2.0 * np.pi * h_x * h_y

    def integrand(y, x):
        """Return the joint density estimate p(x, y) as a float."""
        zx = (x - x_samples) / h_x
        zy = (y - y_samples) / h_y
        return float(np.exp(-0.5 * (zx * zx + zy * zy)).sum() / joint_norm)

    # Marginal density of the feature: 1-D Gaussian KDE whose bandwidth is
    # picked by cross-validated grid search around the rule-of-thumb value
    # 1.06 * min(std, IQR / 1.34) * n^(-1/5).
    valuesi = xi_dict[feature_name].reshape(-1, 1)
    min_stdi = min(
        np.std(data1[feature_name]),
        (data1[feature_name].quantile(0.75) - data1[feature_name].quantile(0.25)) / 1.34,
    )
    better_bandwidthi = 1.06 * min_stdi * n ** (-1 / 5)
    bandwidthsi = np.linspace(0.1 * better_bandwidthi, 2 * better_bandwidthi, 1000)
    gridi = GridSearchCV(KernelDensity(kernel='gaussian'),
                        {'bandwidth': bandwidthsi},
                        cv=m)
    gridi.fit(valuesi)
    best_bandwidthi = gridi.best_estimator_.bandwidth
    kdei = KernelDensity(kernel='gaussian', bandwidth=best_bandwidthi)
    kdei.fit(valuesi)

    def pdfi(x):
        """Return the marginal density estimate p(x) as a float."""
        log_prob = kdei.score_samples([[x]])
        return float(np.exp(log_prob[0]))

    def mutual_information_integrandi(y, x):
        """Return p(x, y) * ln(p(x, y) / (p(x) * p(y))), or 0.0 where any density is negligible.

        The argument order (y, x) is the one dblquad uses for its integrand.
        """
        eps = 1e-10  # density floor; also keeps the logarithm finite
        pxy = integrand(y, x)
        px = pdfi(x)
        # pdf() may hand back a one-element array; reduce it to a plain float.
        py = np.asarray(pdf(y)).item()

        if pxy > eps and px > eps and py > eps:
            return pxy * np.log((pxy + eps) / ((px + eps) * (py + eps)))
        return 0.0

    # Integrate over mean +/- 6 standard deviations in each variable.
    x_range = np.std(xi_dict[feature_name]) * 6
    y_range = np.std(Y1) * 6

    mi, mi_error = dblquad(
        mutual_information_integrandi,
        np.mean(xi_dict[feature_name]) - x_range,  # lower x limit
        np.mean(xi_dict[feature_name]) + x_range,  # upper x limit
        lambda x: np.mean(Y1) - y_range,  # lower y limit (constant in x)
        lambda x: np.mean(Y1) + y_range,  # upper y limit (constant in x)
    )

    MI_withy_dict[feature_name] = mi

# Report the (feature, mutual information) pairs, then pick the top-ranked
# feature(s) as the pairing for the chosen BSA column.
items = list(MI_withy_dict.items())
#-------------------------------------
print(f"筛选的各ICs特征与{y1_name}的互信息结果为：",items)

# Rank by mutual information, largest first.
sorted_items = sorted(items, key=lambda pair: pair[1], reverse=True)

feature_number = 1  # how many top-ranked features to keep
selection_features = [name for name, _ in sorted_items[:feature_number]]
print(f'与{y1_name}配对的ICs界面特征是:',selection_features)

    Binding_affinity  Ics_polar-apolar  Ics_charg-apolar  Ics_charg-polar  \
0                9.3                12                20                4   
1               13.1                19                19                2   
2                6.4                12                14                7   
3                5.3                18                 7               20   
4               12.1                15                20                8   
..               ...               ...               ...              ...   
76              10.7                28                12               14   
77               9.6                15                 6                7   
78               8.8                15                14                7   
79              14.5                20                14                5   
80              11.3                10                16                3   

    Ics_charg-charg  Ics_apolar-apolar  BSAapolar  BSApolar  
0            