In [None]:
from time import time
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import recall_score as rs
from sklearn.metrics import precision_score as ps
from sklearn.metrics import roc_auc_score as roc_auc_s
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn import svm
from sklearn.svm import SVR
import matplotlib.pyplot as plt
from sklearn.cross_decomposition import PLSRegression
from sklearn.cross_decomposition import PLSCanonical
from sklearn.model_selection import LeaveOneOut
from sklearn.svm import SVC
import seaborn as sns
from itertools import product

In [None]:
import warnings
warnings.filterwarnings('ignore')

# Import and clean relevant data

In [None]:
# Load the three input tables, one read_csv call per file, in this order:
#   meta          - per-profile metadata (id, cell_id, pert_time, pert_dose,
#                   pert_iname, vDILIConcern are the columns used below)
#   GEXP_data     - expression table; its column labels are matched against
#                   meta['id'] further down
#   external_test - external compound list (its 'Compound Name' column is used)
meta, GEXP_data, external_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./CAMDA_Model_Data/LINCS_metadata_Peter_July19.csv",
        "./CAMDA_Model_Data/L1000_Data_uncollapsed_all_conditionsv2_new.csv",
        "./CAMDA_Model_Data/external_ambiguous_smiles.csv",
    )
)

In [None]:
# Show the distinct values that occur in the vDILIConcern label column.
pd.unique(meta['vDILIConcern'])

In [None]:
# Names of the externally supplied compounds, one entry per row of the
# 'Compound Name' column.
ambiguous = external_test['Compound Name'].tolist()

# Metadata rows whose label is exactly 'Ambiguous DILI-concern'. The explicit
# .copy() makes this an independent frame: a column is added to it later, and
# writing into a boolean-mask slice of `meta` would otherwise raise
# SettingWithCopyWarning and depend on pandas' view-vs-copy semantics.
ambiguos_meta = meta[meta['vDILIConcern'].isin(['Ambiguous DILI-concern'])].copy()

In [None]:
# Add a 'combined_name' key of the form
# '<cell_id>_<pert_time>_<pert_dose>_<pert_iname>' to every ambiguous row.
# Each part is passed through str() exactly as before; walking the four
# columns with zip avoids building a Series per row as iterrows() does.
# .assign() returns a new frame, so nothing is written into a slice of `meta`
# and re-running this cell gives the same result.
ambiguos_meta = ambiguos_meta.assign(
    combined_name=[
        '_'.join(map(str, key_parts))
        for key_parts in zip(
            ambiguos_meta['cell_id'],
            ambiguos_meta['pert_time'],
            ambiguos_meta['pert_dose'],
            ambiguos_meta['pert_iname'],
        )
    ]
)

In [None]:
# Add a 'combined_name' key of the form
# '<cell_id>_<pert_time>_<pert_dose>_<pert_iname>' to every metadata row.
# Each part is passed through str() exactly as before; walking the four
# columns with zip avoids building a Series per row as iterrows() does.
meta['combined_name'] = [
    '_'.join(map(str, key_parts))
    for key_parts in zip(
        meta['cell_id'],
        meta['pert_time'],
        meta['pert_dose'],
        meta['pert_iname'],
    )
]

In [None]:
# Preview the first five metadata rows, including the new combined_name column.
meta.head(n=5)

In [None]:
# Split into separate 'experiment' dataframes: one entry per
# (cell_id, pert_time, pert_dose) combination, keyed by that tuple.
# Approach from https://datascience.stackexchange.com/questions/29825/create-new-data-frames-from-existing-data-frame-based-on-unique-column-values
# iter() is required because a GroupBy object exposes a `keys` attribute,
# which would make dict() try to treat it as a mapping.
dict_of_dfs = dict(iter(meta.groupby(['cell_id','pert_time','pert_dose'])))
ambiguous_dict_of_dfs = dict(iter(ambiguos_meta.groupby(['cell_id','pert_time','pert_dose'])))

In [None]:
# Register the un-split frames as an extra pooled entry under the key
# 'All_Cell_Lines', next to the per-experiment entries.
dict_of_dfs.update({'All_Cell_Lines': meta})
ambiguous_dict_of_dfs.update({'All_Cell_Lines': ambiguos_meta})

In [None]:
# Not used in this cell; presumably filled in by later cells - verify.
dict_of_best_params = {}

# Write one training CSV per entry of dict_of_dfs. Each file holds the
# transposed expression columns of that experiment (one row per profile id)
# plus three annotation columns: pert_meta, pert_name and vDILIConcern.
for dic, experiment_meta in dict_of_dfs.items():

    print(dic)

    # Keep only profiles that actually have a column in GEXP_data. Without
    # this, the positional label assignment below would fail on a length
    # mismatch.
    has_expression = experiment_meta['id'].isin(GEXP_data.columns)
    if not has_expression.all():
        print("  dropping", int((~has_expression).sum()),
              "profile(s) that have no column in GEXP_data")
    experiment_meta = experiment_meta[has_expression]

    # Select the expression columns with an explicit id list, in metadata
    # order. The previous `GEXP_data.columns & id_list` relied on Index `&`
    # acting as a set intersection, which pandas deprecated and then removed,
    # and whose result order was not guaranteed to match the label lists.
    data = GEXP_data[experiment_meta['id'].tolist()].T # cell-time-dose specific data

    # The rows of `data` now follow experiment_meta row-for-row, so the
    # annotations can be attached positionally.
    data['pert_meta'] = experiment_meta['combined_name'].tolist() # meta to discern replicates
    data['pert_name'] = experiment_meta['pert_iname'].tolist()
    data['vDILIConcern'] = experiment_meta['vDILIConcern'].tolist() # output label

    # Turn the profile ids (the transposed column labels) into a regular column.
    data = data.reset_index()

    data.to_csv("./Cell_line_Training_Data/"+str(dic)+".csv")

(2) External ambiguous compounds

In [None]:
# Write one CSV per experiment with the expression profiles of the external
# ambiguous compounds: the transposed expression columns (indexed by profile
# id) plus the pert_meta and pert_name annotation columns.
for dic in dict_of_dfs:
    print(dic)

    # ambiguous_dict_of_dfs only has keys for experiments that contain at
    # least one 'Ambiguous DILI-concern' row, so direct indexing with a key
    # taken from dict_of_dfs can raise KeyError.
    experiment_ambiguous = ambiguous_dict_of_dfs.get(dic)
    if experiment_ambiguous is None:
        print("  no 'Ambiguous DILI-concern' profiles for this experiment; skipping")
        continue

    # Restrict to the compounds named in the external list.
    external_meta = experiment_ambiguous[experiment_ambiguous['pert_iname'].isin(ambiguous)]

    # Keep only profiles that actually have a column in GEXP_data. Without
    # this, the positional annotation below would fail on a length mismatch.
    has_expression = external_meta['id'].isin(GEXP_data.columns)
    if not has_expression.all():
        print("  dropping", int((~has_expression).sum()),
              "profile(s) that have no column in GEXP_data")
    external_meta = external_meta[has_expression]

    # Select the expression columns with an explicit id list, in metadata
    # order. The previous `GEXP_data.columns & id_list` relied on Index `&`
    # acting as a set intersection, which pandas deprecated and then removed.
    data = GEXP_data[external_meta['id'].tolist()].T # cell-time-dose specific data
    data['pert_meta'] = external_meta['combined_name'].tolist() # meta to discern replicates
    data['pert_name'] = external_meta['pert_iname'].tolist()

    # Report the size only, rather than dumping the frame on every iteration.
    print(data.shape)
    data.to_csv("./Cell_line_External_Data/External_"+str(dic)+".csv")