In [1]:
import pandas as pd
import numpy as np

def load_data(csv_file):
    loaded_data = pd.read_csv(csv_file)
    samples , feat_count = loaded_data.shape
    label_idx = '%s' %(feat_count - 2)
    features = np.zeros((samples , (feat_count - 2)))
    n_files = len(loaded_data)
    print(loaded_data.shape)
    Ids = loaded_data['0']
    for feat in range(1,(feat_count - 2)):
        features[:, feat] = loaded_data['%s' %feat]
    labels = loaded_data[label_idx]
    labels = labels.to_numpy()
    return Ids, features, labels

In [10]:
import numpy as np
from numpy.random import random_sample as rs, randint as r
from sklearn.neighbors import KNeighborsRegressor as KNR
from sklearn.model_selection import cross_val_predict as CV
from sklearn.metrics import r2_score as r2

def KnnLooCV(features, labels):
    X = features
    y = labels
    model = KNR(n_neighbors=1)
    pred = CV(model, X, y, cv=len(X))
    r2_score = r2(pred,y)
    print(r2_score)
    return r2_score

In [3]:
Ids_Mac, features_Mac, labels_Mac = load_data('featured_MACKeys_train_data.csv')
Ids_CFP, features_CFP, labels_CFP = load_data('featured_CircularFP_train_data.csv')
Ids_RD, features_RD, labels_RD = load_data('featured_RDKit_train_data.csv')
Ids_PCFP, features_PCFP, labels_PCFP = load_data('featured_PubChemFP_train_data.csv')
features_RD_noNaN = np.nan_to_num(features_RD, copy=False, nan=0.0, posinf=0.0, neginf=0.0)

(9451, 170)
(9451, 2051)
(9451, 203)


In [11]:
r2_score_Mac = KnnLooCV(features_Mac, labels_Mac)
r2_score_CFP = KnnLooCV(features_CFP, labels_CFP)
r2_score_RD = KnnLooCV(features_RD_noNaN, labels_RD)
r2_score_PCFP = KnnLooCV(features_PCFP, labels_PCFP)

0.11829874685430508
0.051502964157197684
0.043100933471762026
0.15083629222973693


In [18]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn import datasets, model_selection

def feature_importance(feature_arrays, labels):
    y = labels
    x = feature_arrays
    _, n_features = feature_arrays.shape
    importance_values = np.zeros(n_features)
    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        x, y, test_size = 0.2, random_state = 1)
    kf = model_selection.KFold(n_splits = 5)
    for train_idx, val_idx in kf.split(x_train):
        feature_imp_model = RandomForestRegressor(
            n_estimators = 1000, criterion="mse",
            min_samples_leaf = 1, max_features = "sqrt")
        feature_imp_model.fit(x_train[train_idx], y_train[train_idx])
        fold_feat_imp = feature_imp_model.feature_importances_
        importance_values += fold_feat_imp
    return importance_values

In [77]:
import numpy as np

def feature_rearrage(feature_arrays, importance_values):
    samples, n_features = feature_arrays.shape
    ranked_feature_array = np.zeros((samples, n_features))
    new_order = np.argsort(importance_values)
    count = n_features - 1
    for new_ind in new_order:
        ranked_feature_array[:,count] = feature_arrays[:,new_ind]
        count -= 1
    ranked_features = pd.DataFrame(data=ranked_feature_array)
    ranked_features.to_csv('ranked_featured_PCFP_fin_data.csv')
    return ranked_feature_array

In [65]:
import pandas as pd
import numpy as np
from sklearn import svm, model_selection
from sklearn import metrics
import matplotlib.pyplot as plt
import time

def SVM_optimization(train_feats, train_labels, val_feats, val_labels):
    train_features = train_feats
    y_train = train_labels
    val_features = val_feats
    y_val = val_labels
    _, total_features = train_features.shape
    kernel = 'rbf'
    lmda = 0.0010
    MSEs = np.zeros(total_features)
    R2s = np.zeros(total_features)
    times = np.zeros(total_features)
    for feats_used in range(1,total_features):
        x_train = train_features[: , 0:feats_used]
        x_val = val_features[: , 0:feats_used]
        prev_time = time.time()
        svc = svm.SVR(kernel=kernel, C=1/lmda)
        svc.fit(x_train, y_train)
        curr_time = time.time()
        y_pred = svc.predict(x_val)
        MSEs[feats_used] = metrics.mean_squared_error(y_val, y_pred)
        R2s[feats_used] = metrics.r2_score(y_val, y_pred)
        times[feats_used] = curr_time - prev_time
    return MSEs, R2s, times

In [None]:
importance_values = feature_importance(features_PCFP, labels_PCFP)
print(importance_values)
ranked_train_feature_array = feature_rearrage(features_PCFP, importance_values)


In [76]:
Ids_PCFP_val, features_PCFP_val, labels_PCFP_val = load_data('featured_PubChemFP_validation_data.csv')
ranked_val_feature_array = feature_rearrage(features_PCFP_val, importance_values)

(1242, 884)


In [90]:
Ids_PCFP_fin, features_PCFP_fin, labels_PCFP_fin = load_data('featured_PubChemFP_final_data.csv')
ranked_fin_feature_array = feature_rearrage(features_PCFP_fin, importance_values)

(1247, 884)


In [88]:
import pandas as pd
import numpy as np
from sklearn import svm, model_selection
from sklearn import metrics
import matplotlib.pyplot as plt
import time

def final_model(train_feats, train_labels, fin_feats, fin_labels):
    train_features = train_feats
    y_train = train_labels
    test_features = fin_feats
    y_test = fin_labels
    _, total_features = train_features.shape
    kernel = 'rbf'
    lmda = 0.0010
    x_train = train_features[: , 0:311]
    x_test = test_features[: , 0:311]
    prev_time = time.time()
    svc = svm.SVR(kernel=kernel, C=1/lmda)
    svc.fit(x_train, y_train)
    curr_time = time.time()
    y_pred = svc.predict(x_test)
    MSE = metrics.mean_squared_error(y_test, y_pred)
    R2 = metrics.r2_score(y_test, y_pred)
    timed = curr_time - prev_time
    return MSE, R2, timed

In [89]:
ranked_val_feature_array = feature_rearrage(features_PCFP_val, importance_values)

In [None]:
    #removed graphing of svm experiment. data was exported and ploted in excel.
    #plt.subplot(311)
    #plt.plot(MSEs, 'o')
    #plt.xlabel('Features Used')
    #plt.ylabel('Model MSE')
    #plt.subplot(312)
    #plt.plot(R2s, 'o')
    #plt.xlabel('Features Used')
    #plt.ylabel('Model R2')
    #plt.subplot(313)
    #plt.plot(times, 'o')
    #plt.xlabel('Features Used')
    #plt.ylabel('Time to Train Model')

In [66]:
MSEs, R2s, times = SVM_optimization(ranked_train_feature_array, labels_PCFP, ranked_val_feature_array, labels_PCFP_val)

In [91]:
MSE, R2, time = final_model(ranked_train_feature_array, labels_PCFP, ranked_fin_feature_array, labels_PCFP_fin)

In [92]:
print(MSE, R2, time)

2.5611860228015355 0.3327654542817339 8.672730207443237
