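How well do chemical-genetic interactions predict drug classes? Here, the previously generated, cleaned data is used to train and test a logistic regression model. Testing uses a leave-one-drug-out scheme: in each iteration, all measurements of one drug are held out for testing and the model is trained on the remaining drugs.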

In [1]:
import itertools
import pathlib
import re
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xlsxwriter
from imblearn.over_sampling import SMOTE
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression

%matplotlib inline



## Functions

In [2]:
def get_unique_drugs(drug_list):
    """Return the distinct entries of ``drug_list`` in order of first appearance.

    Parameters
    ----------
    drug_list : iterable
        Drug identifiers, possibly repeated.

    Returns
    -------
    list
        Every distinct entry exactly once, ordered by first occurrence.
    """
    # Materialise once so the fallback below can re-iterate the same items.
    items = list(drug_list)
    try:
        # dicts keep insertion order, so this de-duplicates in one O(n) pass
        # instead of rescanning the output list for every entry (O(n^2)).
        return list(dict.fromkeys(items))
    except TypeError:
        # Unhashable entries cannot be dict keys: use the equality-based scan.
        out = []
        for drug in items:
            if drug not in out:
                out.append(drug)
        return out

In [3]:
def collapse_confusion(confusion, drug_class, dir_name):
    """Collapse a per-drug confusion matrix to per-class and save it as a heatmap.

    Parameters
    ----------
    confusion : pandas.DataFrame
        Indexed by drug, one column per predicted class ([ndrugs, nclasses]).
    drug_class : pandas.DataFrame
        Must contain the columns 'Drug' and 'Class' (the true class of each drug).
    dir_name : str or path-like
        Directory the figure is written to, resolved relative to the current
        working directory. Its last component is used as the figure title and
        as the file-name prefix.

    Returns
    -------
    None
        The figure is saved as '<name>_conf_pp_collapsed.png' inside ``dir_name``.
    """
    # normpath strips a trailing separator; without it basename('a/b/') is ''
    # and the output would be titled '' and named '_conf_pp_collapsed.png'.
    name = os.path.basename(os.path.normpath(dir_name))

    # Join only the label column so that any additional columns present in
    # drug_class are not averaged into the class-level matrix.
    class_by_drug = drug_class.set_index('Drug')[['Class']]
    labelled = confusion.join(class_by_drug, how='inner')
    # Mean over the drugs of each true class -> [nclasses, nclasses].
    confusion_classes = labelled.groupby('Class').mean()

    # Draw on a dedicated figure so the heatmap never lands on axes left over
    # from an earlier plot, and so exactly this figure is closed afterwards.
    fig_c, ax = plt.subplots()
    try:
        sns.heatmap(confusion_classes, annot=True, cmap="YlGnBu", fmt='1.2f',
                    square=True, ax=ax)
        ax.set(xlabel='Predicted Class', ylabel='True Class', title=name)

        fig_path = pathlib.Path.cwd().joinpath(dir_name, name + '_conf_pp_collapsed.png')
        fig_c.savefig(fig_path, bbox_inches='tight', dpi=300)
    finally:
        # Release the figure even if plotting or saving fails.
        plt.close(fig_c)

In [4]:
def make_coef_df(coef_df, strain_list, y, dir_name):
    """Save the model weights, transposed and labelled, as two CSV files.

    Parameters
    ----------
    coef_df : pandas.DataFrame
        Weights of shape [nclasses, nstrains].
    strain_list : sequence
        Row labels for the transposed table, one per strain (feature).
    y : iterable
        Class labels; their sorted distinct values become the column names.
    dir_name : str or path-like
        Output directory, resolved relative to the current working directory.
        Its last component is used as the file-name prefix.

    Returns
    -------
    None
        Writes '<name>_coefs.csv' (signed weights) and '<name>_coefs_abs.csv'
        (absolute values) into ``dir_name``.
    """
    # normpath strips a trailing separator; without it basename('a/b/') is ''
    # and the files would be named '_coefs.csv' / '_coefs_abs.csv'.
    name = os.path.basename(os.path.normpath(dir_name))
    out_dir = pathlib.Path.cwd().joinpath(dir_name)

    # transpose() returns a new frame, so the caller's coef_df is not relabelled.
    coef_df = coef_df.transpose()
    coef_df.columns = sorted(set(y))
    coef_df.index = strain_list

    coef_df.to_csv(out_dir.joinpath(name + '_coefs.csv'))
    coef_df.abs().to_csv(out_dir.joinpath(name + '_coefs_abs.csv'))


In [5]:
def fit_model(X, y, drug_list, C, correct_imbal, random_state=None):
    """Leave-one-drug-out evaluation of an L1-penalised logistic regression.

    For every distinct drug, all rows of that drug are held out, a model is
    fitted on the remaining rows, and the predicted class probabilities of the
    held-out rows are averaged.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, one row per measurement.
    y : array-like
        Class label of every row of ``X``.
    drug_list : array-like
        Drug identifier of every row of ``X``.
    C : float
        Inverse regularisation strength passed to ``LogisticRegression``.
    correct_imbal : {None, 'SMOTE'}
        ``'SMOTE'`` oversamples the training rows with SMOTE before fitting;
        ``None`` trains on the rows as they are.
    random_state : int, RandomState instance or None, optional
        Forwarded to SMOTE and LogisticRegression so a run can be reproduced.
        The default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    confusion_df : pandas.DataFrame
        [ndrugs, nclasses]; row = held-out drug, column = class, value = mean
        predicted probability over that drug's rows.
    coef_df : pandas.DataFrame
        [nclasses, nfeatures]; model weights averaged over all folds.

    Raises
    ------
    ValueError
        If ``correct_imbal`` is neither ``None`` nor ``'SMOTE'``.
    """
    # Fail early: previously an unknown value left X_train undefined (or stale
    # from the preceding fold) and surfaced as a confusing error at fit time.
    if correct_imbal is not None and correct_imbal != 'SMOTE':
        raise ValueError(
            "correct_imbal must be None or 'SMOTE', got {!r}".format(correct_imbal))

    print("Starting predictions")

    # Boolean masking below needs arrays; plain lists would compare as a whole.
    y = np.asarray(y)
    drug_list = np.asarray(drug_list)

    classes = sorted(set(y))
    indexes = np.arange(len(y))
    unique_drugs = get_unique_drugs(drug_list)
    # Every row is overwritten in the loop, so uninitialised memory is fine.
    confusion = np.empty((len(unique_drugs), len(classes)))
    coef_df = pd.DataFrame(np.zeros((len(classes), X.shape[1])))

    #Loop
    for j, drug in enumerate(unique_drugs):
        print(drug, end=',')
        #Get train and test indices
        held_out = drug_list == drug
        train_ind = indexes[~held_out]
        test_ind = indexes[held_out]

        #Correct imbalance
        if correct_imbal == 'SMOTE':
            sampler = SMOTE(random_state=random_state)
            # fit_sample was renamed to fit_resample and later removed from
            # imbalanced-learn; prefer the new name, fall back for old installs.
            resample = getattr(sampler, 'fit_resample', None) or sampler.fit_sample
            X_train, y_train = resample(X[train_ind], y[train_ind])
        else:
            X_train, y_train = X[train_ind], y[train_ind]

        #fit
        model = LogisticRegression(penalty='l1', C=C, solver='liblinear',
                                   multi_class='ovr', random_state=random_state)
        model.fit(X_train, y_train)

        #add the weights to a dataframe.These get added every iteration
        # NOTE(review): this and the row assignment below assume each training
        # fold still contains every class, so shapes match `classes` - confirm.
        coef_df = coef_df.add(model.coef_)
        #calculate predict probabilities for each test. Yields matrix of size [ntests, nclasses]
        confusion_drug = model.predict_proba(X[test_ind])
        #collapse this matrix by calculating mean over all replicates. Yields a matrix of size[1,nclasses]. Append to confusion
        confusion[j] = confusion_drug.mean(axis=0)

    confusion_df = pd.DataFrame(confusion, columns=classes, index=unique_drugs)
    # Turn the summed weights into the mean over folds.
    coef_df = coef_df / len(unique_drugs)
    print("Predictions finished")

    return confusion_df, coef_df
   

In [6]:
def do_LR(df, dir_name, drug_class, C, correct_imbal=None):
    """Prepare the inputs, run the logistic regression and write the results.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Class' and 'Drug'. Every column whose name
        contains a '.' is used as a feature.
    dir_name : str or path-like
        Results directory, resolved relative to the current working directory;
        created (including parents) if it does not exist.
    drug_class : pandas.DataFrame
        Drug-to-class table forwarded to ``collapse_confusion``.
    C : float
        Regularisation parameter forwarded to ``fit_model``.
    correct_imbal : {None, 'SMOTE'}, optional
        Imbalance correction forwarded to ``fit_model``.

    Returns
    -------
    None
    """
    # Raw string: '\.' in a normal literal is an invalid escape sequence and
    # triggers a warning on recent Python versions. Select the features once.
    features = df.filter(regex=r'\.')
    X = features.values
    y = df['Class'].values
    drug_list = np.array(df['Drug'])
    print(X.shape, len(y))

    #Create directory for results
    # exist_ok=True tolerates an existing directory but, unlike the previous
    # blanket `except FileExistsError: pass`, still raises if the path is a file.
    pathlib.Path.cwd().joinpath(dir_name).mkdir(parents=True, exist_ok=True)

    #Run
    confusion, coef = fit_model(X, y, drug_list, C, correct_imbal)

    #Generate a csv file with the coefficients of the logistic regression. This allows identification of possible important predictors of the model.
    make_coef_df(coef, features.columns, y, dir_name)
    #Generates a confusion matrix
    collapse_confusion(confusion, drug_class, dir_name)
    return None

## HypoIII

In [15]:
# Results directory for the HypoIII run, relative to the working directory.
# Its last path component is reused by the helpers as file-name prefix and plot title.
dir_name = './Analysis/9-Logistic_regression/HypoIII'

In [16]:
# Load the encoded HypoIII table from the cleaning step's output folder.
hypoIII_path = pathlib.Path.cwd() / 'Analysis' / '8-Clean_for_ML' / 'HypoIII_for_ML_encoded.csv'
hypoIII = pd.read_csv(hypoIII_path)

In [17]:
# Preview the first rows: metadata columns (Drug, Conc, Replicate), the 'strain.*' columns and the Class label.
hypoIII.head()

Unnamed: 0,Drug,Conc,Replicate,strain.0,strain.1,strain.10,strain.100,strain.101,strain.102,strain.103,...,strain.91,strain.92,strain.93,strain.94,strain.95,strain.96,strain.97,strain.98,strain.99,Class
0,drug_0,2xMIC,1,0.359612,-0.911401,1.554297,0.069421,-2.97296,1.185445,0.421721,...,0.176432,0.449538,0.131855,-0.794105,1.381503,-0.938048,-2.939168,1.49205,-0.38757,Class 4
1,drug_0,2xMIC,2,0.92268,0.156397,1.587909,0.587814,0.183369,0.565838,1.133654,...,0.702362,0.783339,0.747126,0.113741,1.673887,-0.256643,-0.483839,1.684535,1.228144,Class 4
2,drug_0,2xMIC,3,1.003276,0.171781,1.870643,0.381407,-0.678374,1.030591,1.469478,...,0.911449,0.94432,0.48079,-0.326351,1.69171,0.168192,-0.8347,1.609109,1.397204,Class 4
3,drug_0,2xMIC,4,1.82545,0.914839,1.97614,1.693167,0.974058,2.142402,2.025873,...,1.264043,1.414149,1.483434,0.8336,2.156906,1.275261,1.326924,2.090184,1.211598,Class 4
4,drug_0,2xMIC,5,1.862371,0.871132,1.93745,1.371979,0.936343,1.996058,2.160803,...,1.460437,1.147487,1.533885,1.618346,2.086125,0.782927,0.709278,2.169011,1.499092,Class 4


In [18]:
# List the distinct class labels; the recorded output includes nan, i.e. some rows have no class.
hypoIII['Class'].unique()

array(['Class 4', 'Class 1', 'Class 6', nan, 'Class 5', 'Class 3',
       'Class 2'], dtype=object)

In [19]:
# Discard rows that have no class label, then order the table by class and
# drug so that drugs are visited class by class later on.
hypoIII = (
    hypoIII
    .dropna(subset=['Class'])
    .sort_values(by=['Class', 'Drug'])
)
hypoIII.head()

Unnamed: 0,Drug,Conc,Replicate,strain.0,strain.1,strain.10,strain.100,strain.101,strain.102,strain.103,...,strain.91,strain.92,strain.93,strain.94,strain.95,strain.96,strain.97,strain.98,strain.99,Class
32,drug_12,1xMIC,1,0.157042,-0.266909,0.778431,-0.742298,-1.403845,-0.119274,-0.121111,...,0.248958,0.932941,0.60717,-0.821963,0.601467,-0.789908,-1.460669,0.122685,0.123163,Class 1
33,drug_12,1xMIC,2,0.708403,0.006475,1.13962,-0.038633,-0.329288,0.379601,0.358161,...,0.603231,0.170081,1.12157,0.065348,0.88481,-0.419019,-1.035201,0.685974,-0.275755,Class 1
34,drug_12,1xMIC,3,0.39804,0.012963,1.16,-0.41828,-1.001709,0.449131,0.061164,...,0.540565,-0.076217,1.090796,-0.544073,1.135033,-0.384275,-0.99274,0.700167,-0.416506,Class 1
35,drug_12,1xMIC,4,0.525875,0.079063,1.317477,0.219213,-0.382533,0.427696,0.341746,...,0.680785,0.199835,1.306156,-0.056335,1.093339,-0.090207,-0.782503,0.515622,-0.228918,Class 1
36,drug_12,1xMIC,5,0.592619,0.178949,1.161345,-0.299858,-0.651762,0.184864,0.167023,...,0.534262,0.038167,1.13653,-0.312592,0.814637,-0.620377,-0.86542,0.348924,0.005737,Class 1


In [20]:
# Number of rows per class; the recorded output shows the classes are strongly imbalanced (527 vs. 32 rows).
hypoIII['Class'].value_counts()

Class 1    527
Class 4    143
Class 5    125
Class 6    115
Class 3     56
Class 2     32
Name: Class, dtype: int64

In [21]:
# Lookup table with one row per distinct (Drug, Class) pair.
drug_class = hypoIII.loc[:, ['Drug', 'Class']].drop_duplicates()
drug_class.head()

Unnamed: 0,Drug,Class
32,drug_12,Class 1
49,drug_13,Class 1
53,drug_14,Class 1
94,drug_2,Class 1
108,drug_20,Class 1


In [23]:
# SMOTE generates its synthetic samples at random. With no explicit
# random_state the estimators draw from NumPy's global generator, so seed it
# here to make a Restart & Run All reproduce the same results.
np.random.seed(42)
do_LR(hypoIII, dir_name, drug_class=drug_class, C=1, correct_imbal='SMOTE')

(998, 425) 998
Starting predictions
drug_12,drug_13,drug_14,drug_2,drug_20,drug_21,drug_22,drug_24,drug_25,drug_26,drug_28,drug_29,drug_31,drug_32,drug_33,drug_34,drug_41,drug_42,drug_47,drug_48,drug_49,drug_50,drug_51,drug_59,drug_63,drug_64,drug_65,drug_67,drug_68,drug_71,drug_73,drug_77,drug_8,drug_81,drug_82,drug_83,drug_85,drug_70,drug_78,drug_46,drug_53,drug_62,drug_69,drug_0,drug_1,drug_30,drug_4,drug_55,drug_56,drug_57,drug_58,drug_61,drug_66,drug_84,drug_19,drug_3,drug_36,drug_37,drug_38,drug_44,drug_74,drug_75,drug_76,drug_17,drug_39,drug_40,drug_45,drug_5,drug_52,drug_54,drug_72,drug_79,drug_80,drug_9,Predictions finished
