In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler

In [2]:
# Descriptor workbook to analyse; a relative path resolves against the
# notebook's working directory.
file_path = "1.xlsx"

# Load the "Sheet1" worksheet into a DataFrame.
df = pd.read_excel(io=file_path, sheet_name="Sheet1")

In [3]:
# Discard every column that contains at least one missing value
# (rows are left untouched).
df = df.dropna(axis="columns")

# Split the table: the "name" column is set aside as mol_names and all
# remaining columns form the feature matrix X.
mol_names = df.loc[:, "name"]
X = df.drop("name", axis=1)

In [4]:
# Fit the variance filter (threshold 0.01) on X, then apply it to X.
var_thresholder = VarianceThreshold(threshold=0.01)
var_thresholder.fit(X)
X_var = var_thresholder.transform(X)

# Column labels that survived the filter, recovered through the selector's
# support mask.
var_selected_features = X.columns[var_thresholder.get_support()]

In [5]:
# Rebuild a labelled DataFrame from the output of the variance filter.
# Reusing X's index keeps every row aligned with mol_names, so the later
# index-based pd.concat cannot mis-pair names and descriptors if the index is
# ever something other than the default 0..n-1 range.
X_var_df = pd.DataFrame(X_var, columns=var_selected_features, index=X.index)

# Absolute pairwise correlation between the remaining features.
corr_matrix = X_var_df.corr().abs()

In [6]:
# Boolean mask selecting the strict upper triangle (diagonal excluded), so
# each feature pair is inspected only once.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# A column is dropped when its absolute correlation with any earlier column
# exceeds 0.9 (masked cells are NaN and never compare as greater).
exceeds_limit = (upper > 0.9).any(axis=0).to_numpy()
to_drop = upper.columns[exceeds_limit].tolist()

# Remove the flagged columns and record the labels that remain.
X_corr = X_var_df.drop(columns=to_drop)
final_features = X_corr.columns.tolist()

In [7]:
# Standardise the retained features; the fitted scaler stays available as
# `scaler`.
# NOTE(review): the export cell writes X_corr, not X_scaled — confirm that
# X_scaled is consumed further down the notebook.
scaler = StandardScaler().fit(X_corr)
X_scaled = scaler.transform(X_corr)

In [8]:
# Count the columns of the untouched sheet. Only the header row is parsed
# (nrows=0): the column labels are all that is needed here, so the data rows
# of the workbook do not have to be loaded a second time.
original_columns = pd.read_excel(file_path, sheet_name="Sheet1", nrows=0).columns
print("The original characteristic number:", len(original_columns) - 1)  # minus the "name" column
print("After deleting the blank column:", X.shape[1])
print("After low variance filtering:", X_var_df.shape[1])
print("After correlation filtering:", X_corr.shape[1])
print("Preserved descriptors:", final_features)

# Re-attach the names to the retained (unscaled) descriptors; concat with
# axis=1 aligns the two objects on their row index.
df_selected = pd.concat([mol_names, X_corr], axis=1)

# Write the filtered table to a new workbook without the index column.
output_file = "filtered_descriptors1.xlsx"
df_selected.to_excel(output_file, index=False)
print(f"The filtered descriptor data has been saved to: {output_file}")

The original characteristic number: 1444
After deleting the blank column: 1407
After low variance filtering: 877
After correlation filtering: 302
Preserved descriptors: ['nAcid', 'ALogP', 'ALogp2', 'AMR', 'apol', 'naAromAtom', 'nO', 'nS', 'nF', 'AATS0m', 'AATS4m', 'AATS5m', 'AATS6m', 'AATS0v', 'AATS4v', 'AATS3e', 'AATS4e', 'AATS5e', 'AATS6e', 'AATS7e', 'AATS8e', 'AATS0p', 'AATS0i', 'AATS1i', 'AATS2i', 'AATS3i', 'AATS4i', 'AATS5i', 'AATS4s', 'AATS5s', 'AATS6s', 'AATS7s', 'AATS8s', 'ATSC0c', 'ATSC1c', 'ATSC2c', 'ATSC3c', 'ATSC4c', 'ATSC5c', 'ATSC6c', 'ATSC7c', 'ATSC1m', 'ATSC2m', 'ATSC3m', 'ATSC4m', 'ATSC5m', 'ATSC6m', 'ATSC7m', 'ATSC8m', 'ATSC1v', 'ATSC2v', 'ATSC3v', 'ATSC4v', 'ATSC5v', 'ATSC6v', 'ATSC7v', 'ATSC8v', 'ATSC1e', 'ATSC2e', 'ATSC3e', 'ATSC4e', 'ATSC5e', 'ATSC6e', 'ATSC7e', 'ATSC8e', 'ATSC1p', 'ATSC2p', 'ATSC3p', 'ATSC4p', 'ATSC5p', 'ATSC6p', 'ATSC7p', 'ATSC8p', 'ATSC2i', 'ATSC3i', 'ATSC4i', 'ATSC5i', 'ATSC6i', 'ATSC7i', 'ATSC8i', 'ATSC1s', 'AATSC0m', 'AATSC1m', 'AATSC2m', 'A