Purpose: Run a random forest classifier on heat-stress samples and their associated control samples only, holding out one BioProject at a time as the test set.<br>
Author: Anna Pardo<br>
Date initiated: August 10, 2023

In [1]:
# load modules
import pandas as pd
import statistics
import scipy.stats as stats
import matplotlib.pyplot as plt
import os
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import normalize
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import StandardScaler
import json
from sklearn.metrics import f1_score

In [2]:
# define functions
def load_clean_data(path_to_tpm,single_stress="none"):
    """
    Load a raw TPM table, optionally subset it to one stressor, and add binary class labels.

    Args:
        path_to_tpm = full path to tab-separated file containing raw TPM, columns for Sample, BioProject, & Treatment
        single_stress = a single stressor to which the data must be subsetted, or "none" (default)
    Returns:
        dataframe of the (optionally subsetted) data with an added integer Label column
        (0 = Control, 1 = any other Treatment)
    """
    # load the TPM data
    raw_tpm = pd.read_csv(path_to_tpm,sep="\t",header="infer")
    # if there is a single stress to subset to, subset to that stress
    if single_stress != "none":
        keep_treatments = [single_stress,"Control"]
        # repeated-drought samples count as drought samples
        if single_stress == "Drought":
            keep_treatments.append("DroughtRepeat")
        raw_tpm = raw_tpm[raw_tpm["Treatment"].isin(keep_treatments)]
        # remove BioProjects that are left with a single treatment after subsetting
        # (e.g. only control samples), since they cannot contribute both classes
        treatments_per_bp = raw_tpm.groupby("BioProject")["Treatment"].nunique()
        single_treatment_bps = treatments_per_bp.index[treatments_per_bp == 1]
        raw_tpm = raw_tpm[~raw_tpm["BioProject"].isin(single_treatment_bps)]
        # start from a fresh 0..n-1 index after filtering
        raw_tpm = raw_tpm.reset_index(drop=True)
    # labeling: set Control to 0 and any stress to 1
    raw_tpm["Label"] = (raw_tpm["Treatment"] != "Control").astype("int64")
    # return the dataframe
    return raw_tpm

In [3]:
def variance_threshold_selector(data):
    """
    Keep only the columns of `data` that pass a default VarianceThreshold filter.

    Args:
        data = dataframe of numeric features (one column per feature)
    Returns:
        dataframe containing only the columns retained by the fitted selector
    """
    # fit the selector, then look up the names of the columns it supports
    fitted = VarianceThreshold().fit(data)
    kept_positions = fitted.get_support(indices=True)
    kept_columns = data.columns[kept_positions]
    return data[kept_columns]

def check_if_balanced(labeled_tpm):
    """
    Report whether the two classes (Label 0 and Label 1) have the same number of samples.

    Args:
        labeled_tpm = raw TPM with columns for Sample, BioProject, Label, Treatment
    Returns:
        True if the counts of Label==0 and Label==1 are equal, otherwise False.
        A class that is absent counts as 0 samples (so data with only one class is unbalanced).
    """
    # count each class once; .get() avoids a KeyError when a class is missing
    counts = labeled_tpm["Label"].value_counts()
    return bool(counts.get(0, 0) == counts.get(1, 0))

def downsample(dataframe):
    """
    Downsample the stress samples (Label 1) to the number of control samples (Label 0).

    Args:
        dataframe = a log TPM dataframe with a Label column and Sample set as the index
    Returns:
        dataframe of the randomly retained stress samples followed by all control samples.
        If there are already no more stress samples than controls, every stress sample is kept.
    """
    # count the samples in each class (a missing class counts as 0)
    vc = dataframe["Label"].value_counts()
    n_control = int(vc.get(0, 0))
    n_stress = int(vc.get(1, 0))

    # subset data to only samples labeled 1
    ones_only = dataframe[dataframe["Label"]==1]

    # downsample the stress samples to the number of controls; the min() keeps
    # sample() from failing when there are fewer stress samples than controls
    ds = ones_only.sample(n=min(n_control,n_stress),random_state=42)

    # subset original data to control samples
    zeroes = dataframe[dataframe["Label"]==0]

    # concatenate downsampled stress samples and controls
    return pd.concat([ds,zeroes])

In [4]:
def _downsample_labeled(labeled_log_tpm):
    # Inside pre_split_transform the name `downsample` is the Boolean argument, so the
    # module-level downsample() function has to be reached through this wrapper.
    return downsample(labeled_log_tpm)

def pre_split_transform(raw_tpm,balanced,downsample=False):
    """
    Filter and log-transform raw TPM before the train/test split, optionally downsampling.

    Args:
        raw_tpm = dataframe containing raw TPM values, columns for Sample, BioProject, Treatment, Label
        balanced = Boolean variable, True or False (result of check_if_balanced())
        downsample = Boolean variable, True or False, default False (set manually outside function)
    Returns:
        dataframe indexed by Sample with BioProject, Treatment, and Label columns followed by
        log2(TPM+1) values for the genes kept by variance_threshold_selector()
    """
    # set aside the metadata columns, then set index to Sample and drop BioProject, Label, & Treatment
    blt = raw_tpm[["Sample","BioProject","Treatment","Label"]]
    tpmi = raw_tpm.set_index("Sample").drop(["BioProject","Treatment","Label"],axis=1)
    # remove zero-variance genes
    vttpm = variance_threshold_selector(tpmi)
    # log-transform TPM
    vttpm_log = np.log2(vttpm+1)
    # downsample data if requested and needed (i.e. the classes are not already balanced)
    if downsample and not balanced:
        # add back labels
        vttpm_log = blt[["Sample","Label"]].merge(vttpm_log.reset_index().rename(columns={"index":"Sample"}))
        # set Sample as index
        vttpm_log = vttpm_log.set_index("Sample")
        # downsample the data
        vttpm_log = _downsample_labeled(vttpm_log)
    # add treatment, labels, and BioProject back in (samples removed by downsampling drop out
    # of this inner merge), then set Sample as the index again
    labeled = blt.merge(vttpm_log.reset_index().rename(columns={"index":"Sample"}))
    labeled.set_index("Sample",inplace=True)
    # return dataframe
    return labeled

In [5]:
def split_prep_bioproject(bioproject,dataframe,balance="Up"):
    """
    Hold out one BioProject as the test set and prepare scaled training/test matrices.

    Args:
        bioproject = BioProject to hold out for testing (PRJNAXXXXXX)
        dataframe = starting dataframe of log TPM with labels
        balance = str: "none","up" (case-insensitive; downsampling will be done before
                  splitting, outside of this function)
    Returns:
        X_train, y_train, X_test, y_test, where the X values are z-scored expression
        matrices and the y values are the class labels
    Raises:
        ValueError if no samples belong to the held-out BioProject
    """
    # if data have treatment column, drop it (it is a text annotation, not a feature)
    dataframe = dataframe.drop(columns=["Treatment"],errors="ignore")
    # split training and testing sets, dropping the BioProject column from both
    held_out = dataframe["BioProject"]==bioproject
    test = dataframe[held_out].drop("BioProject",axis=1)
    train = dataframe[~held_out].drop("BioProject",axis=1)
    if test.empty:
        raise ValueError("no samples found for BioProject "+str(bioproject))
    # generate X_train, X_test, y_train, and y_test
    ## where X = gene expression values and y = class labels
    train_X = train.drop("Label",axis=1)
    y_train = train["Label"]
    test_X = test.drop("Label",axis=1)
    y_test = test["Label"]
    # if upsampling: do the upsampling using SMOTE (training data only)
    if str(balance).lower()=="up":
        sm = SMOTE(random_state=42)
        train_X, y_train = sm.fit_resample(train_X,y_train)
    # scale data to a z-score: fit the scaler on the training data only and apply the
    # same transformation to the test data, so the test set does not influence scaling
    scalar = StandardScaler()
    X_train = scalar.fit_transform(train_X)
    X_test = scalar.transform(test_X)
    # return training and test data
    return X_train, y_train, X_test, y_test

In [3]:
# load the heat-stress subset and list the treatments it contains
heattpm = load_clean_data(path_to_tpm="../../data/rawtpm_bptreat_noPEG.tsv",single_stress="Heat")
heattpm["Treatment"].unique()

array(['Control', 'Heat'], dtype=object)

In [4]:
# list the BioProjects present in the heat-stress subset
heattpm["BioProject"].unique()

array(['PRJNA506720', 'PRJNA244661', 'PRJNA747925', 'PRJNA520822',
       'PRJNA548548', 'PRJNA791560', 'PRJNA349117', 'PRJNA646054'],
      dtype=object)

In [8]:
# write the heat-stress BioProjects to a file, one per line
with open("../../data/bioprojects_heat.tsv","w+") as outfile:
    outfile.writelines(bp+"\n" for bp in heattpm["BioProject"].unique())

In [3]:
# Oct. 13, 2023
# make BioProject files for each of the other stressors

# write a function to do this since I'm doing it 5 times
def write_bp_file(tpmfile,stressor,bpfile):
    """
    Write the BioProjects available for one stressor to a text file, one per line.

    Args:
        tpmfile = path to the raw TPM file passed to load_clean_data()
        stressor = single stressor to subset the data to
        bpfile = path of the BioProject list file to write
    """
    # subset the TPM data to the requested stressor
    stress_tpm = load_clean_data(tpmfile,single_stress=stressor)
    # one line per unique BioProject, in order of first appearance
    with open(bpfile,"w+") as handle:
        handle.writelines(bp+"\n" for bp in stress_tpm["BioProject"].unique())

In [5]:
# BioProject list for the drought subset
write_bp_file(
    tpmfile="../../data/rawtpm_bptreat_noPEG.tsv",
    stressor="Drought",
    bpfile="../../data/bioprojects_drought.tsv",
)

In [6]:
# BioProject list for the cold subset
write_bp_file(
    tpmfile="../../data/rawtpm_bptreat_noPEG.tsv",
    stressor="Cold",
    bpfile="../../data/bioprojects_cold.tsv",
)

In [7]:
# BioProject list for the salt subset
write_bp_file(
    tpmfile="../../data/rawtpm_bptreat_noPEG.tsv",
    stressor="Salt",
    bpfile="../../data/bioprojects_salt.tsv",
)

In [8]:
# BioProject list for the flooding subset
write_bp_file(
    tpmfile="../../data/rawtpm_bptreat_noPEG.tsv",
    stressor="Flooding",
    bpfile="../../data/bioprojects_flooding.tsv",
)

In [9]:
# BioProject list for the low-nitrogen subset
write_bp_file(
    tpmfile="../../data/rawtpm_bptreat_noPEG.tsv",
    stressor="Low_Nitrogen",
    bpfile="../../data/bioprojects_low_nitrogen.tsv",
)

In [3]:
# load the salt subset (Salt and Control samples)
salt = load_clean_data(path_to_tpm="../../data/rawtpm_bptreat_noPEG.tsv",single_stress="Salt")

In [8]:
# count the BioProjects in the low-nitrogen subset
# NOTE: `ln` is only assigned in a later cell of this file (the load_clean_data call
# with "Low_Nitrogen"), so that cell must be run before this one
len(ln["BioProject"].unique())

5

In [5]:
# load the cold subset (Cold and Control samples)
cold = load_clean_data(path_to_tpm="../../data/rawtpm_bptreat_noPEG.tsv",single_stress="Cold")

In [7]:
# load the low-nitrogen subset (Low_Nitrogen and Control samples)
ln = load_clean_data(path_to_tpm="../../data/rawtpm_bptreat_noPEG.tsv",single_stress="Low_Nitrogen")