Purpose: Figure out how to balance all stressors without errors when drought is the random forest test set.<br>
Author: Anna Pardo<br>
Date initiated: July 20, 2023

In [1]:
import os
import argparse
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import ClusterCentroids
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import normalize
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import StandardScaler

In [2]:
def load_clean_data(path_to_tpm,single_stress="none"):
    """
    Args:
        path_to_tpm = full path to file containing raw TPM, columns for Sample, BioProject, & Treatment
        single_stress = a single stressor to which the data must be subsetted, or "none" (default)
    """
    # load the TPM data
    raw_tpm = pd.read_csv(path_to_tpm,sep="\t",header="infer")
    # return the dataframe
    return raw_tpm

In [3]:
def variance_threshold_selector(data):
    selector = VarianceThreshold()
    selector.fit(data)
    return data[data.columns[selector.get_support(indices=True)]]

In [4]:
def sampling_strategy(X,y,n_samples, t='majority'):
    target_classes = ''
    if t == 'majority':
        target_classes = y.value_counts() > n_samples
    elif t == 'minority':
        target_classes = y.value_counts() < n_samples
    tc = target_classes[target_classes == True].index
    sampling_strategy = {}
    for target in tc:
        sampling_strategy[target] = n_samples
    return sampling_strategy

In [5]:
def median_sample(X,y):
    count = y.value_counts()
    n_samples = count.median().astype(np.int64)

    # downsample majority classes
    under_sampler = ClusterCentroids(sampling_strategy=sampling_strategy(X,y,n_samples,t='majority'))
    X_under, y_under = under_sampler.fit_resample(X, y)

    # upsample minority classes
    over_sampler = SMOTE(sampling_strategy=sampling_strategy(X_under, y_under,n_samples, t='minority'),k_neighbors=2)
    X_bal, y_bal = over_sampler.fit_resample(X_under, y_under)

    return X_bal,y_bal

In [6]:
def resample(train,sampling):
    """
    Args:
        train = TPM dataframe with columns for Sample, Treatment, BioProject, and Label (training set only!)
        sampling = "up" or "median" (note, should be lowercase)
    """
    # drop control samples
    traintreat = train[train["Treatment"]!="Control"]
    # pull out control samples and save for later
    cont = train[train["Treatment"]=="Control"]

    # generate list of stressors in dataframe
    stressors = list(traintreat["Treatment"].unique())

    # set up 5-way class labeling
    cl = []
    for i in range(len(stressors)):
        cl.append(i)
    cldf = pd.DataFrame(list(zip(stressors,cl)),columns=["Treatment","Class"])

    # merge with traintreat
    ttclass = cldf.merge(traintreat,how="right")

    # drop BioProject & Treatment
    ttclass = ttclass.drop(["BioProject","Treatment"],axis=1)
    # set Sample as index
    ttclass = ttclass.set_index("Sample")
    # split into X and y
    X = ttclass.drop("Class",axis=1)
    y = ttclass["Class"]

    if sampling=="up":
        sm = SMOTE(random_state=42)
        X_res, y_res = sm.fit_resample(X,y)
    elif sampling=="median":
        X_res, y_res = median_sample(X,y)

    # stick X and y back together
    upsamp = pd.concat([y_res,X_res],axis=1)
    # set a Label column
    upsamp["Label"] = 1
    # drop the Class column
    upsamp = upsamp.drop("Class",axis=1)

    # drop Sample, BioProject, and Treatment from control data
    cont = cont.drop(["Sample","BioProject","Treatment"],axis=1)
    # set Label column for controls
    cont["Label"] = 0

    # append cont to upsamp
    alldf = pd.concat([upsamp,cont],axis=0)

    # split into X and y again
    X_all = alldf.drop("Label",axis=1)
    y_all = alldf["Label"]
    # upsample the controls to match the stressed samples
    sm2 = SMOTE(random_state=42)
    X_all_res,y_all_res = sm2.fit_resample(X_all,y_all)

    return X_all_res,y_all_res

In [7]:
def pre_split_transform(raw_tpm):
    """
    Args:
        raw_tpm = dataframe containing raw TPM values, columns for Sample, BioProject, Treatment
    """
    # temporarily, drop BioProject & Treatment columns
    blt = raw_tpm[["Sample","BioProject","Treatment"]]
    tpmi = raw_tpm.set_index("Sample").drop(["BioProject","Treatment"],axis=1)
    # remove zero-variance genes
    vttpm = variance_threshold_selector(tpmi)
    # log-transform TPM
    vttpm_log = vttpm.apply(lambda x: np.log2(x+1))
    # add treatment, labels, and BioProject back in
    labeled = blt.merge(vttpm_log.reset_index().rename(columns={"index":"Sample"}))
    # drop rows containing NaN values
    labeled = labeled.dropna(axis=0)
    # return dataframe
    return labeled

In [15]:
def split_prep_stressor(stressor,dataframe,sampling):
    """
    Args:
        stressor = stressor to hold out for testing (all BioProjects)
        dataframe = log TPM dataframe with Sample, BioProject, Treatment columns (or Sample as index)
        sampling = str: "median","up" (should be lowercase; to be fed into resample())
    """
    # replace DroughtRepeat with Drought in TPM matrix (if exists)
    if "DroughtRepeat" in dataframe["Treatment"].unique():
        dataframe["Treatment"].mask(dataframe["Treatment"]=="DroughtRepeat","Drought",inplace=True)
    # generate list of unique BioProjects containing the test stressor
    sbp = dataframe[dataframe["Treatment"]==stressor]["BioProject"].unique()
    # split test from train data
    test = dataframe[dataframe["BioProject"].isin(sbp)]
    test = test[test["Treatment"].isin([stressor,"Control"])]
    # add Label column to test data
    label = []
    for i in test["Treatment"].unique():
        if i=="Control":
            label.append(0)
        else:
            label.append(1)
    labdf = pd.DataFrame(list(zip(test["Treatment"].unique(),label)),columns=["Treatment","Label"])
    test = test.merge(labdf,how="right")
    # pull out training data
    train = dataframe[~dataframe["Sample"].isin(test["Sample"])]
    # resample training data
    train_X, y_train = resample(train,sampling)
    # for test set, make Sample the index again
    test = test.set_index("Sample")
    # drop BioProject and Treatment columns from test set
    test = test.drop(["BioProject","Treatment"],axis=1)
    # generate X_train, X_test, y_train, and y_test
    ## where X = gene expression values and y = class labels
    test_X = test.drop("Label",axis=1)
    y_test = test["Label"]
    # for X_train and X_test: scale data to a z-score
    scalar = StandardScaler()
    X_train = scalar.fit_transform(train_X)
    X_test = scalar.fit_transform(test_X)
    # return training and test data
    return X_train, y_train, X_test, y_test

In [9]:
cleaned_tpm = load_clean_data("../../data/rawtpm_bptreat_noPEG.tsv")

In [10]:
log_tpm = pre_split_transform(cleaned_tpm)

In [11]:
stressor = "Drought"
single_stress = "Drought"

In [12]:
sampling = "median"

In [13]:
dataframe = log_tpm

In [16]:
X_train, y_train, X_test, y_test = split_prep_stressor(single_stress,log_tpm,sampling)

  upsamp["Label"] = 1


In [18]:
# break down the function into little pieces to find out where it went wrong

# generate list of unique BioProjects containing the test stressor
sbp = dataframe[dataframe["Treatment"]==stressor]["BioProject"].unique()

In [19]:
# split test from train data
test = dataframe[dataframe["BioProject"].isin(sbp)]
test = test[test["Treatment"].isin([stressor,"Control"])]

In [20]:
# add Label column to test data
label = []
for i in test["Treatment"].unique():
    if i=="Control":
        label.append(0)
    else:
        label.append(1)
labdf = pd.DataFrame(list(zip(test["Treatment"].unique(),label)),columns=["Treatment","Label"])
test = test.merge(labdf,how="right")

In [21]:
# pull out training data
train = dataframe[~dataframe["Sample"].isin(test["Sample"])]

In [22]:
## from here we are breaking down chunks of the resample() function

# drop control samples
traintreat = train[train["Treatment"]!="Control"]

In [23]:
# pull out control samples and save for later
cont = train[train["Treatment"]=="Control"]

In [24]:
# generate list of stressors in dataframe
stressors = list(traintreat["Treatment"].unique())
stressors

['Heat', 'Cold', 'Salt', 'DroughtRepeat', 'Low_Nitrogen', 'Flooding']