# Filtering the Features

Remove features whose variance is less than 0.2; then, for every pair of remaining features whose absolute correlation is larger than 0.95, drop one feature of the pair.

In [1]:
from sklearn import datasets
from sklearn.feature_selection import VarianceThreshold
import pandas as pd
import numpy as np
import glob
import os

In [2]:
# Load the full regression training split (all descriptor columns) and key
# the rows by molecule identifier.
train_reg_raw = pd.read_csv('../../3_train_test_split/train_reg.csv')
dataset = train_reg_raw.set_index('Molecule ChEMBL ID')
print("The original data size is ", dataset.shape)

The original data size is  (555, 1008)


## Remove Descriptors with Variance < 0.2

In [3]:
# Fit the variance filter on the training descriptors. Only the fitted support
# mask is needed, so the transformed NumPy array is not materialised (the
# original stored it in `seldataset` and immediately overwrote it).
sel = VarianceThreshold(threshold=0.2)
sel.fit(dataset)

# Select the surviving columns from the original DataFrame so that the column
# names and the molecule index are preserved.
seldataset = dataset.loc[:, sel.get_support()]

print("after filtering with variance > 0.2, the data size is", seldataset.shape)

after filtering with variance > 0.2, the data size is (555, 1004)


## Drop Features with Absolute Correlation > 0.95

In [4]:
def drop_highcorr(df, threshold=0.95):
    """
    Drop the later column of every highly correlated column pair.

    The absolute pairwise correlation of all columns is computed with
    ``df.corr()``. A column is dropped when its absolute correlation with any
    column positioned before it in ``df`` is strictly greater than
    ``threshold``; the earlier column may itself have been dropped.

    Parameters:
        df (pd.DataFrame): The input DataFrame with features as columns.
        threshold (float): Absolute-correlation cutoff; values strictly above
            it mark the later column of the pair for removal.

    Returns:
        pd.DataFrame: A new DataFrame without the flagged columns. ``df``
        itself is not modified.
    """
    abs_corr = df.corr().abs()

    # Keep only the entries strictly above the diagonal, so each pair is seen
    # once and a column is never compared with itself. Everything else is NaN.
    above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    pairwise = abs_corr.where(above_diagonal)

    # Column j now holds its correlations with columns 0..j-1 only; NaN
    # entries compare as False and therefore never trigger a drop.
    exceeds = (pairwise > threshold).any(axis=0)
    redundant = [name for name, flagged in exceeds.items() if flagged]

    return df.drop(columns=redundant)


# Prune highly correlated descriptor columns from the variance-filtered training set
filtered_df = drop_highcorr(df=seldataset, threshold=0.95)

print("after filtering correlations > 0.95, the data size is", filtered_df.shape)

after filtering correlations > 0.95, the data size is (555, 564)


## Save Train, Validation, and Test Sets with Reduced Features

In [5]:
input_dir = "../../3_train_test_split/"
output_dir = "."
os.makedirs(output_dir, exist_ok=True)  # Ensure the output directory exists

# Load descriptors_all.csv to get -logIC50
descriptor_file = os.path.join(input_dir, "descriptors_all.csv")
descriptor_df = pd.read_csv(descriptor_file)[["Molecule ChEMBL ID", "-logIC50"]]

# Columns to keep (from filtered_df, the correlation-filtered training set).
# The loop below deliberately never rebinds `filtered_df`, so re-running this
# cell always selects the same columns.
selected_columns = filtered_df.columns

# Define the dummy variable threshold and the target column
threshold = 1e-6
target_column = "-logIC50"

# Target value that flags a row as imputed; hoisted because it is loop-invariant.
# NOTE(review): np.log is the natural logarithm. If -logIC50 was computed with
# log10 upstream, this must be np.log10 or no row will ever be flagged --
# confirm against the step that builds descriptors_all.csv.
# Presumably 10000 * 1e-9 is 10000 nM expressed in M -- TODO confirm.
imputed_target_value = -np.log(10000 * 1e-9)

# Loop over train, test, and val CSV files
for file_type in ["train", "test", "val"]:
    # Find all matching CSV files for this type. The pattern is built with
    # os.path.join so it does not depend on a trailing slash in input_dir, and
    # the result is sorted because glob returns files in arbitrary order.
    files = sorted(glob.glob(os.path.join(input_dir, f"{file_type}*.csv")))

    for file_path in files:
        file_name = os.path.basename(file_path)  # Extract file name

        split_df = pd.read_csv(file_path).set_index('Molecule ChEMBL ID')  # Read the CSV file
        reduced_df = split_df[selected_columns]  # Keep only selected columns

        # Add 'is_imputed' dummy variable for regression files. Only the file
        # name is tested, so a directory containing "reg" cannot cause
        # classification files to be treated as regression files.
        if "reg" in file_name:
            # Merge with descriptor_df to include -logIC50. With how="inner",
            # molecules absent from descriptor_df are dropped and the row
            # order follows descriptor_df rather than the input file.
            merged_df = pd.merge(
                descriptor_df, reduced_df, on="Molecule ChEMBL ID", how="inner"
            )

            # 1 when the target equals the imputed value (within `threshold`), else 0
            merged_df['is_imputed'] = (
                np.abs(merged_df[target_column] - imputed_target_value) < threshold
            ).astype(int)

            # Note: interaction terms (each selected feature multiplied by
            # 'is_imputed') were tried and didn't help with the predictions,
            # so they are not generated.

            # The target is only needed to derive 'is_imputed'; drop it again
            # and restore the molecule ID as the index before saving.
            reduced_df = merged_df.drop(columns=[target_column]).set_index('Molecule ChEMBL ID')

        # Construct the output file path
        output_path = os.path.join(output_dir, file_name)

        # Save the filtered DataFrame
        reduced_df.to_csv(output_path)
        print(f"Processed file saved to {output_path}")

Processed file saved to ./train_class_5.csv
Processed file saved to ./train_class_4.csv
Processed file saved to ./train_reg.csv
Processed file saved to ./train_class_3.csv
Processed file saved to ./train_class_2.csv
Processed file saved to ./train_class_1.csv
Processed file saved to ./train_reg_5.csv
Processed file saved to ./train_reg_4.csv
Processed file saved to ./train_reg_1.csv
Processed file saved to ./train_reg_3.csv
Processed file saved to ./train_reg_2.csv
Processed file saved to ./train_class.csv
Processed file saved to ./test_reg.csv
Processed file saved to ./test_class.csv
Processed file saved to ./val_class_5.csv
Processed file saved to ./val_class_4.csv
Processed file saved to ./val_class_1.csv
Processed file saved to ./val_class_3.csv
Processed file saved to ./val_class_2.csv
Processed file saved to ./val_reg_4.csv
Processed file saved to ./val_reg_5.csv
Processed file saved to ./val_reg_1.csv
Processed file saved to ./val_reg_2.csv
Processed file saved to ./val_reg_3.cs