In [None]:
import os
os.environ['OPENBLAS_NUM_THREADS'] = '1'  # keep this ahead of the numeric imports below
from mordred import Calculator, descriptors
# Stable alias for the mordred descriptor module: a later cell rebinds the bare
# name `descriptors` to a DataFrame.
from mordred import descriptors as mordred_descriptors
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from rdkit import Chem
from tqdm import tqdm

# read in qm9 data

In [None]:
# Load the cleaned QM9 table; the path is resolved relative to the notebook's working directory.
qm9_csv_path = '../Computing Emin for QM9/data/clean/qm9.csv'
data = pd.read_csv(filepath_or_buffer=qm9_csv_path)

# get rdkit molecule objects and calculate Mordred descriptors

In [None]:
# Parse each SMILES string into an RDKit molecule and drop the rows that failed
# to parse (left as None in the 'mol' column).
data['mol'] = data['smiles_1'].apply(Chem.MolFromSmiles)
# reset_index: after filtering, renumber the rows 0..n-1 so that `data` can be
# matched row-for-row against the descriptor table computed below.
data = data[data['mol'].notna()].reset_index(drop=True)

# flag_3D == True  -> also compute mordred's 3D descriptors
# flag_3D == False -> 2D descriptors only
# mordred's keyword is phrased the other way round (ignore_3D), hence the `not`.
# NOTE(review): molecules built from SMILES carry no conformer, so 3D
# descriptors presumably need coordinates to be generated first -- confirm
# before switching the flag on.
flag_3D = False
# `mordred_descriptors` is the mordred descriptor module; the bare name
# `descriptors` is rebound to the result table on the last line, so using the
# alias keeps this cell re-runnable.
calc = Calculator(mordred_descriptors, ignore_3D=not flag_3D)
descriptors = calc.pandas(data['mol'])
# One descriptor row per molecule, in order: give both tables the same labels
# so later label-based joins/assignments pair the right rows.
descriptors.index = data.index

# add back Emin and other scores to feature list

In [None]:
# Columns carried over from the QM9 table and appended to the mordred descriptors.
columns_to_keep = ['Emin','smiles_1 sa_score','smiles_1 sc_score','smiles_1 syba_score','smiles_1 ra_score']

# add some zpe features: the raw zpe and its ratio to Emin
zpe_to_u = data['zpe']/data['Emin']
data['zpe_to_u'] = zpe_to_u
columns_to_keep += ['zpe','zpe_to_u']

# Join by row position rather than by index label. The two tables are not
# guaranteed to share labels (e.g. once rows have been filtered out of `data`),
# and a label-based concat would then pair descriptors with the wrong molecule
# and pad the result with NaN rows.
if len(descriptors) != len(data):
    raise ValueError(
        f'descriptors has {len(descriptors)} rows but data has {len(data)}; '
        'expected exactly one descriptor row per molecule.'
    )
extra_columns = data[columns_to_keep].reset_index(drop=True)
# Drop copies of the appended columns left behind by an earlier run of this
# cell, so re-running it does not create duplicate column names.
descriptor_columns = descriptors.drop(columns=columns_to_keep, errors='ignore').reset_index(drop=True)

descriptors = pd.concat([descriptor_columns, extra_columns], axis=1)
print(f'We begin with {descriptors.shape[1]} features.')

# remove features with ANY NaNs

In [None]:
# Coerce every column to a numeric dtype; entries that cannot be converted become NaN.
column_progress = tqdm(descriptors.columns)
for column_name in column_progress:
    coerced_column = pd.to_numeric(descriptors[column_name], errors='coerce')
    descriptors[column_name] = coerced_column

# Boolean mask over columns: True where the column holds at least one NaN.
missing_values_bools = descriptors.isna().any(axis=0).to_numpy()
keep_feature_names = descriptors.columns[np.logical_not(missing_values_bools)]

# Keep only the columns that are completely free of NaNs.
desc = descriptors.loc[:, keep_feature_names]
print(f'Removed {missing_values_bools.sum()} NaN-containing features (now there are {desc.shape[1]} features).')


# remove features that have zero variation

In [None]:
# Summary statistics, one row per feature, for the NaN-free table `desc`.
# (Working from `descriptors` here would silently bring back every
# NaN-containing column that was just filtered out.)
# describe() summarises only numeric columns by default, so any column that is
# not numeric is left out of feature_stats and therefore of keep_features.
feature_stats = desc.describe().transpose()

# Names of the features whose standard deviation is non-zero. A NaN std
# compares as "!= 0", so such a feature is kept.
keep_features_names = feature_stats.index[feature_stats['std'] != 0]

# The feature table restricted to the features that actually vary.
keep_features = desc[keep_features_names]

print(f'Removed {len(feature_stats)-keep_features.shape[1]} useless features (now there are {keep_features.shape[1]} features).')

# remove highly correlated features

In [None]:
# Square matrix of pairwise correlations between the remaining features.
pairwise_correlations = keep_features.corr().to_numpy()

threshold = 0.90 # correlation threshold above which a feature will be removed

# For every pair (i, j) with i < j whose correlation exceeds the threshold, the
# later feature j is dropped and the earlier feature i is kept.
# np.triu(..., k=1) keeps only the strictly-upper triangle, which excludes the
# diagonal (self-correlation) and stops each pair from being counted twice.
# The previous loop sliced the list of hits by position (hits[i+1:]) rather
# than selecting hits with index > i, so for most rows nothing was removed.
# NOTE(review): only positive correlations are tested; strongly
# anti-correlated features are kept -- use the absolute value if that is not
# intended.
# NOTE(review): every column of keep_features takes part in this filter,
# including any non-descriptor columns appended earlier -- confirm that is wanted.
redundant_pairs = np.triu(pairwise_correlations > threshold, k=1)
is_redundant = redundant_pairs.any(axis=0)

indices_to_remove = np.flatnonzero(is_redundant)
all_indices = np.arange(len(pairwise_correlations))
indices_to_keep = all_indices[~is_redundant]

# .copy() so that columns can be added to keep_features later without writing
# into a slice of the previous frame.
keep_features = keep_features.iloc[:,indices_to_keep].copy()

print(f'Removed {len(indices_to_remove)} highly correlated features (now there are {keep_features.shape[1]} features).')

# Save features to csv

In [None]:
# File-name suffix that records whether 3D descriptors were requested.
feature_3D = '_3D' if flag_3D else ''

# Attach the label and SMILES columns by row position. Assigning the Series
# directly would align on index labels and fill NaN wherever the labels of
# keep_features and data differ.
if len(keep_features) != len(data):
    raise ValueError(
        f'keep_features has {len(keep_features)} rows but data has {len(data)}; '
        'the feature table must have exactly one row per molecule.'
    )
# assign() returns a new frame, so this cell can be re-run safely and never
# writes into a slice of an earlier frame.
keep_features = keep_features.assign(
    Reported=data['Reported'].to_numpy(),  # add back reported label
    smiles_1=data['smiles_1'].to_numpy(),  # add back smiles strings
)
keep_features.to_csv(f'qm9_mordred{feature_3D}.csv',index=False)
