# Make a quick and simple ML model that predicts the synthesizability of materials using Mordred features

In [16]:
import numpy as np
from tqdm import *
import pandas as pd
import pickle
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GroupKFold
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score, plot_roc_curve
from collections import defaultdict
import os

# read in QM9 data with computed Mordred features


In [3]:
# Read in QM9 data with computed Mordred features.
# The feature list CSV has one feature per row in its 'Name' column.
feature_table = pd.read_csv('ML_mordred_feature_list/12_iteration_feature_33_features.csv')

# Start at index 1: the first (most important) feature is emin, which we want to skip.
mordred_most_important_features = list(feature_table['Name'].to_numpy()[1:])

# Only load the selected feature columns plus the 'Reported' label column.
columns_to_load = mordred_most_important_features + ['Reported']
data = pd.read_csv(
    '../../Computing Mordred Features for QM9/qm9_mordred.csv',
    usecols=columns_to_load,
)

# Gather features and labels

In [5]:
# Pull the label column and the selected feature columns out as NumPy arrays.
training_labels = data['Reported'].to_numpy()
training_features = data[mordred_most_important_features].to_numpy()

# Scramble the ordering of features and labels to get a more homogeneously
# distributed dataset. The same index permutation is applied to both arrays
# so that every row stays paired with its label.
np.random.seed(123) # do not change this seed from 123
n_samples = len(training_labels)
scramble_indices = np.random.choice(range(n_samples), size=n_samples, replace=False)

training_features, training_labels = (
    training_features[scramble_indices],
    training_labels[scramble_indices],
)

# Run model

In [13]:
# 10-fold cross-validation: every sample is predicted exactly once by a model
# that was fitted on the other nine folds.
splits = 10
model = RandomForestClassifier(max_depth=20, n_jobs=-1)

kf = KFold(n_splits=splits, shuffle=False) # DO NOT SHUFFLE
kf_splits = kf.split(training_features, training_labels)

# Per-fold class probabilities and the matching true labels, in fold order.
training_predicted, ground_truth = [], []

for train_index, test_index in tqdm(kf_splits, total=splits):

    # define training and testing sets
    X_train, y_train = training_features[train_index], training_labels[train_index]
    X_test, y_test = training_features[test_index], training_labels[test_index]

    # fit on the training folds, then predict class probabilities for the held-out fold
    y_predicted = model.fit(X_train, y_train).predict_proba(X_test)

    training_predicted.append(y_predicted)
    ground_truth.append(y_test)

# Stack the per-fold results into single arrays covering the whole dataset.
all_predicted = np.concatenate(training_predicted, axis=0)
all_ground_truth = np.concatenate(ground_truth, axis=0)

100%|██████████| 10/10 [00:20<00:00,  2.06s/it]


In [14]:
def get_accuracies(ground_truth, prediction_value, c):
    """Compute precision, recall and confusion-matrix counts at cutoff ``c``.

    A sample is predicted positive when its score is greater than or equal
    to ``c``.

    Parameters
    ----------
    ground_truth : array-like of bool or 0/1
        True class of each sample (truthy/1 = positive).
    prediction_value : array-like of float
        Predicted score for the positive class, one per sample.
    c : float
        Decision cutoff applied to ``prediction_value``.

    Returns
    -------
    tuple
        ``(precision, recall, tp, tn, fp, fn)``. ``precision`` is NaN when
        nothing is predicted positive, and ``recall`` is NaN when there are
        no positive samples (the ratio is undefined in those cases).
    """
    # np.asarray makes plain lists/tuples work as well: `1 * some_list` would
    # only copy the list, and `some_list >= c` raises TypeError.
    ground_truth = 1 * np.asarray(ground_truth)  # change bools to numbers
    prediction = 1 * (np.asarray(prediction_value) >= c)

    # count false positives
    fp = np.sum((ground_truth == 0) & (prediction == 1))

    # count false negatives
    fn = np.sum((ground_truth == 1) & (prediction == 0))

    # count true positives
    tp = np.sum((ground_truth == 1) & (prediction == 1))

    # count true negatives
    tn = np.sum((ground_truth == 0) & (prediction == 0))

    # Guard the 0/0 cases explicitly so they yield NaN without triggering a
    # NumPy "invalid value encountered" RuntimeWarning.
    predicted_positives = tp + fp
    actual_positives = tp + fn
    precision = tp / predicted_positives if predicted_positives else np.nan
    recall = tp / actual_positives if actual_positives else np.nan

    return precision, recall, tp, tn, fp, fn

# Evaluate Accuracy

In [17]:
# process prediction probabilities [False, True] for reported/synthesizable

# Candidate decision cutoffs: 0.00, 0.01, ..., 0.99
cutoffs = np.arange(0,1,0.01)

# Score the held-out predictions at every cutoff. Column 1 of all_predicted
# is the probability of the True (reported) class.
positive_probability = all_predicted[:,1]
metrics_per_cutoff = [
    get_accuracies(all_ground_truth, positive_probability, c)
    for c in cutoffs
]

# Transpose the per-cutoff tuples into one list per metric.
precision_list, recall_list, tp_list, tn_list, fp_list, fn_list = (
    list(values) for values in zip(*metrics_per_cutoff)
)

precision_recalls = pd.DataFrame(
    data={
        'Cutoff': cutoffs,
        'Precision': precision_list,
        'Recall': recall_list,
        'TP': tp_list,
        'TN': tn_list,
        'FP': fp_list,
        'FN': fn_list,
    }
)

# Write the table next to the notebook, creating the output folder if needed.
os.makedirs('precision_recalls',exist_ok=True)
precision_recalls.to_csv('precision_recalls/mordred_ML_precision_recalls.csv', index=False)