In [3]:
import pandas as pd
import numpy as np

import itertools

from scipy.spatial.distance import pdist
from sklearn.svm import SVC, SVR
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import GridSearchCV, StratifiedKFold, KFold
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the descriptor table and the response table; in both files the first
# CSV column is used as the row index.
tested_odor_desc = pd.read_csv(
    "./compiled_desc_resp/filtered_tested_desc.csv",
    index_col=0,
)
tested_odor_resp = pd.read_csv(
    "./compiled_desc_resp/compiled_odor_sigResp_wide.csv",
    index_col=0,
)

In [6]:
# Select descriptors that (a) contain at least one zero and (b) do not take a
# distinct value in every row. The row count is read from the table instead of
# being hard-coded (the original used the literal 52, which is the row count
# shown in the shapes printed by the following cells).
# Columns are selected with one boolean mask rather than by growing a frame
# with pd.concat inside a loop, which re-copies the frame on every iteration.
n_odors = tested_odor_desc.shape[0]
has_zero = tested_odor_desc.eq(0).any(axis=0)
has_repeat = tested_odor_desc.nunique() < n_odors
feature_df = tested_odor_desc.loc[:, has_zero & has_repeat].copy(deep=True)

# Binarise the selected descriptors: every entry that is not equal to 0
# (including NaN, since NaN != 0 is True) becomes 1; zeros are left as they are.
feature_df_binary = feature_df.mask(feature_df != 0, 1)

In [7]:
# Keep only columns whose column sum is greater than 1 ...
feature_df_binary = feature_df_binary.loc[:, lambda b: b.sum(axis=0) > 1]
print(feature_df_binary.shape)
#(52, 822)
# ... and, of those, only columns whose sum is below (number of rows - 1).
feature_df_binary = feature_df_binary.loc[:, lambda b: b.sum(axis=0) < (b.shape[0] - 1)]
print(feature_df_binary.shape)
#(52, 763)

(52, 822)
(52, 763)


In [8]:
# Take the original (non-binarised) values of the columns that survived the
# binary filtering above.
tested_odor_desc_for_ohe = tested_odor_desc[feature_df_binary.columns]
print(tested_odor_desc_for_ohe.shape)

(52, 763)


In [10]:
def remove_corr_feats(input_df, tol=0.0):
    """Drop features that are perfectly positively correlated with an earlier feature.

    Columns of ``input_df`` are compared pairwise with SciPy's ``correlation``
    distance (1 - Pearson r). For every pair ``(i, j)`` with ``i < j`` whose
    distance is within ``tol`` of 0, the later column ``j`` is removed. The
    check is repeated on the reduced table until no such pair remains.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Table whose columns are the features to compare.
    tol : float, optional
        Absolute tolerance on the distance. The default ``0.0`` reproduces the
        original exact ``distance == 0`` test; a small positive value (e.g.
        ``1e-12``) also catches pairs that miss exact zero only through
        floating-point rounding.

    Returns
    -------
    output_df : pandas.DataFrame
        Deep copy of the reduced table. If anything was dropped, the frame is
        rebuilt from ``np.array(input_df)`` and therefore carries that array's
        single dtype; if nothing was dropped it is a copy of ``input_df``.
    dropped_features : numpy.ndarray
        1-D object array with the names of the removed columns (sorted within
        each pass).
    dropped_correlates_concat : numpy.ndarray
        ``(k, 2)`` object array; each row is ``[earlier_feature, dropped_feature]``.
    """
    feature_names = np.array(input_df.columns)
    odor_cid = np.array(input_df.index)
    input_arr = np.array(input_df)
    dropped_features = np.empty(shape=(0,), dtype=object)
    dropped_correlates_concat = np.empty(shape=(0, 2), dtype=object)
    output_df = input_df

    while True:
        feature_corr_dist = pdist(input_arr.T, metric='correlation')
        # NaN distances (e.g. from constant columns) never satisfy the test.
        hits = np.where(np.abs(feature_corr_dist) <= tol)[0]
        if len(hits) == 0:
            break

        # pdist's condensed ordering matches the strict upper triangle.
        first_idx, second_idx = np.triu_indices(len(feature_names), 1)
        first_names = feature_names[first_idx[hits]]
        second_names = feature_names[second_idx[hits]]

        features_to_drop = np.unique(second_names)
        dropped_features = np.concatenate((dropped_features, features_to_drop))
        dropped_correlates_concat = np.concatenate(
            (dropped_correlates_concat,
             np.stack((first_names, second_names), axis=1))
        )

        # np.isin replaces np.in1d, which is deprecated since NumPy 2.0.
        keep = ~np.isin(feature_names, features_to_drop)
        input_arr = input_arr[:, keep]
        feature_names = feature_names[keep]
        output_df = pd.DataFrame(input_arr, index=odor_cid, columns=feature_names)

    return output_df.copy(deep=True), dropped_features, dropped_correlates_concat

In [15]:
# First decorrelation pass, on the raw descriptor values. The three results are
# the reduced table, the names of the dropped columns, and the
# [earlier_feature, dropped_feature] name pairs that triggered each drop.
tested_odor_desc_decorr, tested_odor_desc_dropped, tested_odor_desc_correlated = remove_corr_feats(tested_odor_desc_for_ohe)
print(tested_odor_desc_decorr.shape)
#(52, 684)

(52, 684)


In [16]:
# Cast every value to its string form so that get_dummies treats each distinct
# value of a descriptor as its own category, then discard dummy columns whose
# column sum is exactly 1.
tested_odor_desc_decorr = tested_odor_desc_decorr.astype(str)
tested_odor_desc_decorr_ohe = (
    pd.get_dummies(tested_odor_desc_decorr)
    .loc[:, lambda ohe: ohe.sum(axis=0) != 1]
)
print(tested_odor_desc_decorr_ohe.shape)
#(52, 2193)

(52, 2193)


In [17]:
# Second decorrelation pass, this time on the one-hot encoded columns; returns
# the reduced table, the dropped column names, and the
# [earlier_feature, dropped_feature] name pairs.
tested_odor_desc_decorr2, tested_odor_desc_dropped2, tested_odor_desc_correlated2 = remove_corr_feats(tested_odor_desc_decorr_ohe)
print(tested_odor_desc_decorr2.shape)

(52, 1050)


In [126]:
# Show every recorded pair whose dropped (second) member is 'nSH_1.0'.
tested_odor_desc_correlated2[tested_odor_desc_correlated2[:, 1] == 'nSH_1.0']

array([['P_VSA_LogP_4_10.8184292306614', 'nSH_1.0'],
       ['P_VSA_s_6_38.563628990611', 'nSH_1.0'],
       ['P_VSA_charge_10_10.8184292306614', 'nSH_1.0']], dtype=object)

In [21]:
# Plain NumPy copies of the decorrelated one-hot table and the response table.
tested_odor_desc_decorr2_arr = np.array(tested_odor_desc_decorr2)
tested_odor_resp_arr = np.array(tested_odor_resp)
print(tested_odor_desc_decorr2_arr.shape, tested_odor_resp_arr.shape, sep="\n")

(52, 1050)
(52, 375)


In [30]:
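# Disabled: leave-one-out linear SVC that predicts each one-hot feature from the
# responses. Its outputs are written to ./ohe_features/ and reloaded in the next cell.
#x = tested_odor_resp_arr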
#features = np.array(tested_odor_desc_decorr2.columns)
#full_preds = pd.DataFrame()
#pred_auroc = pd.DataFrame()
#
#for i in range(tested_odor_desc_decorr2_arr.shape[1]):
#    print(i)
#    y = tested_odor_desc_decorr2_arr.T[i]
#    yout_compiled = np.empty(shape=(y.shape[0],2))
#    current_feat = features[i]
#    for j in range(x.shape[0]):
#        x_test = x[j,:].reshape(1,-1)
#        x_train = np.delete(arr = x, obj = j, axis = 0)
#        zero_var = np.where(x_train.var(axis = 0) == 0)[0]
#        x_train = np.delete(arr = x_train, obj = zero_var, axis = 1)
#        x_test = np.delete(arr = x_test, obj = zero_var, axis = 1)
#        scaler = StandardScaler()
#        scaler.fit(x_train)
#        x_train_scaled = scaler.transform(x_train)
#        x_test_scaled = scaler.transform(x_test)
#        y_test = y[j]
#        y_train = np.delete(arr = y, obj = j, axis = 0)
#        classifier = SVC(C=0.01,
#                         random_state=42,
#                         probability=True,
#                         kernel='linear')
#        classifier.fit(x_train_scaled, y_train)
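#        # NOTE(review): the classifier is fit on x_train_scaled but scored on the
#        # unscaled x_test; x_test_scaled was probably intended. Correcting this
#        # changes the cached predictions/AUROCs, so both CSVs must be regenerated.
#        y_pred = classifier.predict_proba(x_test)[0][1]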
#        yout_compiled[j] = y_test, y_pred
#    #Calculate AUROC
#    fpr, tpr, thresholds = roc_curve(yout_compiled.T[0], yout_compiled.T[1])
#    roc_auc = auc(fpr, tpr)
#    print(i, current_feat, roc_auc)
#    temp = pd.DataFrame(yout_compiled)
#    temp.columns = [current_feat+'*true',current_feat+'*pred']
#    temp2 = pd.DataFrame([current_feat, roc_auc]).T
#    full_preds = pd.concat([full_preds, temp], axis = 1)
#    pred_auroc = pd.concat([pred_auroc, temp2], axis = 0)
#    
#full_preds.to_csv("./ohe_features/ohe_svc_preds.csv")
#pred_auroc.to_csv("./ohe_features/ohe_svc_auroc.csv")

In [70]:
# Reload the cached per-feature SVC predictions and AUROC scores, then keep the
# features whose AUROC is above 0.5.
full_preds = pd.read_csv("./ohe_features/ohe_svc_preds.csv", index_col=0)
pred_auroc = pd.read_csv("./ohe_features/ohe_svc_auroc.csv", index_col=0)
pred_auroc.columns = ['feature', 'auroc']
sig_pred_auroc = pred_auroc.loc[pred_auroc['auroc'].gt(0.5)]
print(sig_pred_auroc.shape)
#(448, 2)

(448, 2)


In [71]:
# One-hot columns whose names appear in the AUROC > 0.5 table.
opt_ohe = tested_odor_desc_decorr2.loc[:, sig_pred_auroc['feature'].to_numpy()]

In [72]:
# Group the retained one-hot column names by descriptor: everything before the
# last underscore is the descriptor name, everything after it is the level.
opt_ohe_dict = {}

for feature in opt_ohe.columns:
    feature_name, _sep, feature_value = feature.rpartition('_')
    opt_ohe_dict.setdefault(feature_name, []).append(feature_value)

In [73]:
# Copy the level lists, except for descriptors that kept exactly two levels of
# which one is '0.0': for those, keep only the non-'0.0' level.
opt_ohe_dict2 = {}

for feature, levels in opt_ohe_dict.items():
    if len(levels) == 2 and '0.0' in levels:
        non_zero_levels = [level for level in levels if level != '0.0']
        if non_zero_levels:
            opt_ohe_dict2[feature] = non_zero_levels[-1:]
    else:
        # Same list object as in opt_ohe_dict (no copy), as before.
        opt_ohe_dict2[feature] = levels

In [74]:
# Rebuild the one-hot column names ("<descriptor>_<level>") in dictionary order
# and pull them from the decorrelated table in one selection. The original
# appended one column per iteration with pd.concat, which re-copies the growing
# frame every time.
ohe_columns = [
    '_'.join([f, str(f_value)])
    for f in opt_ohe_dict2
    for f_value in opt_ohe_dict2[f]
]
ohe_df = tested_odor_desc_decorr2.loc[:, ohe_columns].copy()

In [75]:
def remove_corr_feats2(input_df):
    dropped_features = np.empty(shape = (0,), dtype = object)
    dropped_correlates_concat = np.empty(shape = (0,2), dtype = object)
    input_arr = np.array(input_df)
    feature_names = np.array(input_df.columns)
    odor_cid = np.array(input_df.index)
    feature_corr_dist = pdist(input_arr.T, metric='correlation')
    corr_feat_count = np.where(feature_corr_dist==2)[0]
    while len(corr_feat_count) != 0:
        pw_comparisons = np.triu_indices(len(feature_names), 1)
        indices_to_drop = pw_comparisons[1][np.where(feature_corr_dist == 2)[0]]
        features_to_drop = np.unique(feature_names[indices_to_drop])
        dropped_features = np.concatenate((dropped_features, features_to_drop))
        dropped_correlates = np.stack((feature_names[pw_comparisons[0][np.where(feature_corr_dist == 2)]], 
                                       feature_names[pw_comparisons[1][np.where(feature_corr_dist == 2)]]),
                                       axis = 1)
        dropped_correlates_concat = np.concatenate((dropped_correlates_concat, dropped_correlates))
        unique_indices_to_drop = np.where(np.in1d(feature_names, features_to_drop))[0]
        input_arr = np.delete(input_arr, unique_indices_to_drop, axis = 1)
        feature_names = np.delete(feature_names, unique_indices_to_drop)
        input_df = pd.DataFrame(input_arr)
        input_df.columns = feature_names
        input_df.index = odor_cid
        input_arr = np.array(input_df)
        feature_names = np.array(input_df.columns)
        odor_cid = np.array(input_df.index)
        feature_corr_dist = pdist(input_arr.T, metric='correlation')
        corr_feat_count = np.where(feature_corr_dist ==2)[0]
    output_df = input_df.copy(deep=True)
    return output_df, dropped_features, dropped_correlates_concat

In [76]:
# Remove columns at correlation distance 0 (duplicates of an earlier column)
# from the selected one-hot features.
opt_desc, dropped_corr, corrs = remove_corr_feats(ohe_df)

In [77]:
# Then remove columns at correlation distance 2 (perfectly anti-correlated with
# an earlier column).
opt_desc2, dropped_anti_corr, anti_corrs = remove_corr_feats2(opt_desc)
print(opt_desc2.shape)
#(52, 367)

(52, 367)


In [78]:
# Long-form table of pairwise column correlations: one row per (index, variable) pair.
corr_mat = opt_desc2.corr().reset_index().melt('index')

# Keep off-diagonal pairs whose correlation coefficient is exactly 1.
perf_corrs = (
    corr_mat
    .loc[lambda m: (m['value'] == 1) & (m['index'] != m['variable'])]
    .reset_index(drop=True)
)

In [79]:
# Every perfectly correlated pair occurs twice in the melted matrix, once as
# (a, b) and once as (b, a). Sort the two names within each row so both
# orientations become the same record, then de-duplicate.
# The row sort is vectorised; the original built the frame one row at a time
# with pd.concat and raised KeyError on column 1 when there were no pairs.
perf_corrs_noV = perf_corrs.drop('value', axis = 1)
perf_corrs_noV2 = pd.DataFrame(
    np.sort(perf_corrs_noV.to_numpy(dtype=object), axis=1),
    index=perf_corrs_noV.index,
    columns=[0, 1],
)

perf_corrs_noV2['value'] = perf_corrs['value']
perf_corrs = perf_corrs_noV2.drop_duplicates()
# Column 1 holds the name that sorts later in each pair; those columns are dropped.
opt_desc2 = opt_desc2.drop(perf_corrs[1].unique(), axis = 1)

In [80]:
# Shape of the descriptor table after the remaining perfectly correlated columns were dropped.
print(opt_desc2.shape)
#(52, 321)

(52, 321)


In [130]:
#opt_desc2.to_csv("./ohe_features/optimized_desc_svc_ohe.csv")

# Set up SVC outcomes for plotting

In [10]:
#svc_predictions = pd.read_csv("./ohe_features/ohe_svc_preds.csv", index_col = 0)
#svc_auroc = pd.read_csv("./ohe_features/ohe_svc_auroc.csv", index_col = 0)
#
#compile_feat_predictions = pd.DataFrame()
#for i in range(0,svc_predictions.shape[1],2):
#    feat_name = svc_predictions.iloc[:,i].name
#    feat_name = feat_name.removesuffix("*true")
#    true_val = svc_predictions.iloc[:,i].values
#    pred_val = svc_predictions.iloc[:,i+1].values
#    temp_df = pd.DataFrame([true_val, pred_val]).transpose()
#    temp_df['feature'] = feat_name
#    compile_feat_predictions = pd.concat([compile_feat_predictions, temp_df])
#compile_feat_predictions.columns = ['true','prediction','feature']
#compile_feat_predictions = compile_feat_predictions.reset_index(drop=True)
#compile_feat_predictions.to_csv("./ohe_features/ohe_svc_preds_long.csv")

# Calculate distances with optimized descriptors

In [15]:
#Center-scale function for pandas
def normalize(x):
    """Return ``x`` minus its mean, divided by the std of the centred values.

    ``x`` must provide ``mean()`` and ``std()`` and support arithmetic with
    their results (e.g. a pandas Series or DataFrame). No guard is applied
    when the standard deviation is 0.
    """
    centered = x - x.mean()
    return centered / centered.std()

In [16]:
##Load data
#tested_odor_desc = pd.read_csv("./ohe_features/optimized_desc_svc_ohe.csv", index_col = 0)
#tested_odor_resp = pd.read_csv("./compiled_desc_resp/compiled_odor_sigResp_wide.csv", index_col = 0)
##Normalize data
#norm_tested_odor_desc = tested_odor_desc.transform(normalize)
#norm_tested_odor_resp = tested_odor_resp.transform(normalize)
##Calculate distances
#odor_combns2 = pd.DataFrame(itertools.combinations(list(tested_odor_resp.index), 2))
#norm_tested_resp_dist_euc = pd.DataFrame(pdist(norm_tested_odor_resp, 'euclidean'))
#norm_tested_desc_dist_euc = pd.DataFrame(pdist(norm_tested_odor_desc, 'euclidean'))
#norm_tested_resp_dist_corr = pd.DataFrame(pdist(norm_tested_odor_resp, 'correlation'))
#norm_tested_desc_dist_corr = pd.DataFrame(pdist(norm_tested_odor_desc, 'correlation'))
#norm_tested_resp_dist_cos = pd.DataFrame(pdist(norm_tested_odor_resp, 'cosine'))
#norm_tested_desc_dist_cos = pd.DataFrame(pdist(norm_tested_odor_desc, 'cosine'))
##Combine distances
#compiled_distances = pd.concat([odor_combns2, norm_tested_resp_dist_euc,
#                                norm_tested_desc_dist_euc,norm_tested_resp_dist_corr,
#                                norm_tested_desc_dist_corr,norm_tested_resp_dist_cos,
#                                norm_tested_desc_dist_cos], axis = 1)
#compiled_distances.columns = ['odor1','odor2',
#                              'response_euc_distance','feature_euc_distance',
#                              'response_corr_distance','feature_corr_distance',
#                              'response_cos_distance','feature_cos_distance']
#compiled_distances.to_csv("./ohe_features/ohe_optimized_descriptors_distances.csv")
# Load the cached pairwise-distance table; the commented-out code above is what
# writes this file. The first CSV column is used as the row index.
opt_compiled_distances = pd.read_csv("./ohe_features/ohe_optimized_descriptors_distances.csv", index_col = 0)
