# Make a quick and simple ML model that predicts the synthesizability of materials using Mordred features, a synthesizability score (SA, SC, RA, or SYBA), and Emin as features

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from collections import defaultdict
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import glob

In [None]:
# Load the QM9 table restricted to the columns used below: the selected Mordred
# descriptors, the 'Reported' label, the four synthesizability scores, and Emin.

# The first entry of the ranked feature list is skipped; per the original author's
# note it is Emin (the most important feature), which is added back explicitly
# only for the runs that should include it.
feature_ranking = pd.read_csv('../reproduce_figure_3/ML_mordred_feature_list/12_iteration_feature_33_features.csv')
mordred_features = list(feature_ranking['Name'].to_numpy()[1:])

score_names = ['smiles_1 sa_score','smiles_1 sc_score','smiles_1 ra_score','smiles_1 syba_score']
# NOTE(review): run_type_names is not referenced anywhere in the visible code, and its
# first entry repeats 'ra_score' although score_names starts with 'sa_score' -- confirm
# whether this list is still needed before relying on it.
run_type_names = ['mordred_smiles 1 ra_score','mordred_smiles 1 sc_score','mordred_smiles 1 ra_score','mordred_smiles 1 syba_score']

columns_to_load = mordred_features + ['Reported'] + score_names + ['Emin']
data = pd.read_csv('../../Computing Mordred Features for QM9/qm9_mordred.csv',usecols=columns_to_load)

# Define helper functions that train and evaluate the model

In [None]:
def run_ML(training_features,training_labels,n_train):
    """Fit a random forest on a stratified split and predict on the held-out part.

    Parameters
    ----------
    training_features
        Feature matrix, one row per sample.
    training_labels
        Labels aligned with the rows of ``training_features``; also used to
        stratify the split.
    n_train
        Passed to ``train_test_split`` as ``train_size``.

    Returns
    -------
    tuple
        ``(y_predicted, y_ground_truth)`` where ``y_predicted`` is the
        ``predict_proba`` output for the held-out samples and
        ``y_ground_truth`` holds their labels.
    """
    classifier = RandomForestClassifier(n_jobs=-1)

    # everything that is not drawn into the training part becomes the test part
    features_fit, features_eval, labels_fit, labels_eval = train_test_split(
        training_features,
        training_labels,
        train_size=n_train,
        stratify=training_labels,
    )

    classifier.fit(features_fit, labels_fit)
    class_probabilities = classifier.predict_proba(features_eval)

    return class_probabilities, labels_eval

def perform_ML_for_feature_set(feature_names,run_type):
    """Measure ROC AUC versus training-set size for one feature set and save it.

    For every size in the module-level ``n_train_sizes`` the model is trained and
    evaluated ``n_trials`` times via ``run_ML``; the per-trial ROC AUC values
    together with their mean and standard deviation are written to
    ``results_{run_type}.csv`` in the current working directory.

    Parameters
    ----------
    feature_names
        Column names of the module-level ``data`` frame to use as features.
    run_type
        Tag inserted into the output file name.

    Returns
    -------
    None
        The results are only written to disk.
    """
    training_features = data[feature_names].to_numpy()
    training_labels = data['Reported'].to_numpy()

    # scramble ordering of features and labels to get a more homogeneously distributed dataset
    # NOTE: this seeds NumPy's *global* RNG, so any later unseeded draw from that
    # generator in this call is affected by it as well.
    np.random.seed(123) # do not change this seed from 123
    scramble_indices = np.random.choice(range(len(training_labels)),size=len(training_labels),replace=False)

    training_features = training_features[scramble_indices]
    training_labels = training_labels[scramble_indices]

    # column name -> list with one entry per training size
    df_data = defaultdict(list)

    for n_train in tqdm(n_train_sizes,desc="Train Size", position=0,leave=True):

        df_data['Training Size'].append(n_train)

        for trial in tqdm(range(n_trials),desc="Trial", position=1,leave=False):
            y_predicted, y_ground_truth = run_ML(training_features,training_labels,n_train)

            # column 1 of predict_proba is used as the score of the positive class
            roc_auc = roc_auc_score(y_ground_truth,y_predicted[:,1])

            df_data[f'Trial {trial+1}'].append(roc_auc)

    results = pd.DataFrame(data = df_data)

    # Select the per-trial columns by their literal prefix. The previous
    # str.contains('Trial*') treated the pattern as a regular expression
    # ("Tria" followed by any number of "l", anywhere in the name), which only
    # happened to pick the right columns.
    trials_mask = results.columns.str.startswith('Trial ')
    trials_data = results.loc[:,trials_mask].to_numpy()
    results['Mean AUC'] = np.mean(trials_data,axis=1)
    results['Std AUC'] = np.std(trials_data,axis=1)
    results.to_csv(f'results_{run_type}.csv',index=False)

# Run the experiments for each feature set

In [None]:
# Experiment settings; perform_ML_for_feature_set reads both as module-level globals.
n_train_sizes = [50,250,1250,6250,31250,100000]
n_trials = 30

for score_index, score in enumerate(score_names):
    features_with_score = [score, *mordred_features]
    features_with_score_and_emin = ['Emin', *features_with_score]

    if score_index == 0:
        # the Mordred-only baseline does not depend on the score, so run it just once
        perform_ML_for_feature_set(mordred_features,'mordred')

    perform_ML_for_feature_set(features_with_score,f'mordred_{score}')
    perform_ML_for_feature_set(features_with_score_and_emin,f'mordred_emin_{score}')

# plot all 4 subplots for paper

In [None]:
# Paper figure: one panel per synthesizability score (SA, SC, SYBA, RA), each
# showing mean ROC AUC versus training-set size with the standard deviation
# over trials as error bars, for three feature sets.
# The result files are read from the current working directory.

# plt.subplots already returns the four panels (with shared x and y axes), so
# they are used directly instead of being looked up again via plt.subplot.
fig, axs = plt.subplots(1, 4, sharex=True, sharey=True, figsize=(15, 3))
ax1, ax2, ax3, ax4 = axs
axs = [ax1,ax2,ax3,ax4]

all_results_1 = ['results_mordred_emin_smiles_1 sa_score.csv','results_mordred_smiles_1 sa_score.csv','results_mordred.csv']
all_labels_1 = [r'Mordred + SA + $E_{min}$', 'Mordred + SA', 'Mordred']
all_results_2 = ['results_mordred_emin_smiles_1 sc_score.csv','results_mordred_smiles_1 sc_score.csv','results_mordred.csv']
all_labels_2 = [r'Mordred + SC + $E_{min}$', 'Mordred + SC', 'Mordred']
all_results_3 = ['results_mordred_emin_smiles_1 syba_score.csv','results_mordred_smiles_1 syba_score.csv','results_mordred.csv']
all_labels_3 = [r'Mordred + SYBA + $E_{min}$', 'Mordred + SYBA', 'Mordred']
all_results_4 = ['results_mordred_emin_smiles_1 ra_score.csv','results_mordred_smiles_1 ra_score.csv','results_mordred.csv']
all_labels_4 = [r'Mordred + RA + $E_{min}$', 'Mordred + RA', 'Mordred']

all_results_list = [all_results_1,all_results_2,all_results_3,all_results_4]
all_labels_list = [all_labels_1,all_labels_2,all_labels_3,all_labels_4]

# panel titles, in the same order as the result/label lists above
score_names_title = ['SA Score','SC Score','SYBA Score','RA Score']

for subplot, all_results, all_labels, panel_title in zip(axs, all_results_list, all_labels_list, score_names_title):

    # one error-bar curve per feature set
    for results_file, curve_label in zip(all_results, all_labels):
        results = pd.read_csv(results_file)
        subplot.errorbar(results['Training Size'], results['Mean AUC'], yerr = results['Std AUC'].to_numpy(),marker='d',label=curve_label,capsize=5,alpha=0.7)

    # per-panel decoration only needs to happen once, after all curves are drawn
    subplot.set_xscale('log')
    subplot.legend(loc='lower right')
    subplot.set_xlabel('Training Set Size')
    subplot.set_title(panel_title)

# the y axis is shared, so only the left-most panel gets a label
ax1.set_ylabel('ROC AUC')