# Functions from random_forest.py

In [1]:
import os
import numpy as np
import pandas as pd
import random
import json
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import normalize
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance

In [2]:
def load_clean_data(path_to_tpm,single_stress="none"):
    """Read a tab-separated TPM table and add a binary ``Label`` column.

    Parameters
    ----------
    path_to_tpm : str or path-like
        Tab-separated file with a header row. It must contain a
        ``Treatment`` column, and a ``BioProject`` column when
        ``single_stress`` is used.
    single_stress : str, default "none"
        When not "none", keep only rows whose Treatment is this stressor or
        "Control" ("Drought" additionally keeps "DroughtRepeat"), then drop
        every BioProject that is left with exactly one distinct Treatment.

    Returns
    -------
    pandas.DataFrame
        The (optionally filtered) table with ``Label`` set to 0 for
        "Control" rows and 1 for every other row. After filtering the index
        is a fresh RangeIndex.
    """
    raw_tpm = pd.read_csv(path_to_tpm, sep="\t", header="infer")
    if single_stress != "none":
        if single_stress == "Drought":
            wanted = ["Drought", "DroughtRepeat", "Control"]
        else:
            wanted = [single_stress, "Control"]
        raw_tpm = raw_tpm[raw_tpm["Treatment"].isin(wanted)]
        # A BioProject with a single remaining treatment has nothing to
        # contrast against, so it is removed. Filtering with isin (rather
        # than merging a key table back in) cannot duplicate rows.
        treatments_per_project = raw_tpm.groupby("BioProject")["Treatment"].nunique()
        one_sided = treatments_per_project[treatments_per_project == 1].index
        raw_tpm = raw_tpm[~raw_tpm["BioProject"].isin(one_sided)].reset_index(drop=True)
    # Vectorised labelling: Control -> 0, anything else -> 1.
    raw_tpm["Label"] = (raw_tpm["Treatment"] != "Control").astype(int)
    return raw_tpm

In [3]:
def variance_threshold_selector(data):
    """Return ``data`` restricted to the columns kept by ``VarianceThreshold``.

    The selector is built with its default settings (scikit-learn documents
    the default threshold as 0.0, i.e. constant columns are dropped), fitted
    on ``data``, and the surviving columns are selected by name so the result
    is still a DataFrame with the original index.
    """
    vt = VarianceThreshold()
    vt.fit(data)
    kept_positions = vt.get_support(indices=True)
    kept_columns = data.columns[kept_positions]
    return data[kept_columns]

In [4]:
def check_if_balanced(labeled_tpm):
    """Report whether the ``Label`` column has as many 0s as 1s.

    Parameters
    ----------
    labeled_tpm : pandas.DataFrame
        Table with an integer ``Label`` column (0 / 1).

    Returns
    -------
    bool
        True when both classes have the same number of rows. A class that is
        absent counts as zero rows, so a single-class table is reported as
        unbalanced instead of raising KeyError (an empty table is trivially
        balanced).
    """
    counts = labeled_tpm["Label"].value_counts()
    return bool(counts.get(0, 0) == counts.get(1, 0))

In [5]:
def downsample(dataframe):
    """Balance the two ``Label`` classes by downsampling the larger one.

    The previous implementation drew ``vc[1]`` rows from the class that
    already had ``vc[1]`` rows, so it only shuffled the positives and never
    reduced anything. Here the majority class is sampled down to the size of
    the minority class.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table with a ``Label`` column holding 0 / 1.

    Returns
    -------
    pandas.DataFrame
        Label-1 rows followed by label-0 rows, with equal counts. Sampling
        uses ``random_state=42`` so the result is reproducible. If one class
        is absent there is nothing to balance against and the rows are
        returned unsampled.
    """
    ones_only = dataframe[dataframe["Label"] == 1]
    zeroes = dataframe[dataframe["Label"] == 0]
    target = min(len(ones_only), len(zeroes))
    if target == 0:
        # Only one class present: balancing is impossible, keep the data.
        return pd.concat([ones_only, zeroes])
    ones_only = ones_only.sample(n=target, random_state=42)
    if len(zeroes) > target:
        zeroes = zeroes.sample(n=target, random_state=42)
    return pd.concat([ones_only, zeroes])

In [6]:
def pre_split_transform(raw_tpm,balanced,downsample=False):
    """Filter, log-transform and re-label the TPM table before splitting.

    Steps: split off the metadata columns, run the expression columns through
    ``variance_threshold_selector``, apply ``log2(x + 1)``, optionally
    downsample, then merge the metadata back on.

    Parameters
    ----------
    raw_tpm : pandas.DataFrame
        Table with ``Sample``, ``BioProject``, ``Treatment``, ``Label`` and
        the expression columns.
    balanced : bool
        Gate for the downsampling step (see the note in the body).
    downsample : bool, default False
        When True (and ``balanced`` is True) the module-level ``downsample``
        function is applied to the transformed table.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Sample``; columns are ``BioProject``, ``Treatment``,
        ``Label`` followed by the retained, log-transformed expression
        columns.
    """
    meta = raw_tpm[["Sample","BioProject","Treatment","Label"]]
    expression = raw_tpm.set_index("Sample").drop(["BioProject","Treatment","Label"],axis=1)
    expression = variance_threshold_selector(expression)
    log_expression = np.log2(expression + 1)
    # NOTE(review): downsampling only runs when `balanced` is True. If
    # `balanced` means "the data is already balanced" this gate looks
    # inverted; if it means "balancing was requested" it is fine. Kept
    # as-is -- confirm the intended meaning with the caller.
    if balanced == True and downsample == True:
        with_label = meta[["Sample","Label"]].merge(
            log_expression.reset_index().rename(columns={"index":"Sample"})
        ).set_index("Sample")
        # The boolean parameter `downsample` shadows the module-level
        # function of the same name, so calling `downsample(...)` here would
        # call a bool and raise TypeError. Look the function up explicitly.
        downsample_rows = globals()["downsample"]
        log_expression = downsample_rows(with_label)
    labeled = meta.merge(log_expression.reset_index().rename(columns={"index":"Sample"}))
    return labeled.set_index("Sample")

In [7]:
def split_prep_stressor(stressor,dataframe,balance="Up"):
    """Hold out one stressor's BioProjects as the test set and scale features.

    Test set: rows from every BioProject that contains ``stressor``,
    restricted to Treatment in {``stressor``, "Control"}. Training set: every
    sample not in the test set.

    Parameters
    ----------
    stressor : str
        Treatment name to hold out.
    dataframe : pandas.DataFrame
        Table with ``BioProject``, ``Treatment``, ``Label`` and feature
        columns, with ``Sample`` either as a column or as the index.
    balance : str, default "Up"
        "up" (any capitalisation) oversamples the training set with SMOTE.
        The comparison used to be case-sensitive against "up", so the
        default "Up" never triggered it.

    Returns
    -------
    X_train, y_train, X_test, y_test
        Scaled feature arrays and label vectors. The scaler is fitted on the
        training features only and then applied to the test features, so
        both sets share the same transformation.

    Raises
    ------
    ValueError
        If the split leaves the training or the test set empty.
    """
    if "Sample" not in dataframe.columns:
        dataframe = dataframe.reset_index().rename(columns={"index":"Sample"})
    stress_projects = dataframe.loc[dataframe["Treatment"]==stressor, "BioProject"].unique()
    test = dataframe[dataframe["BioProject"].isin(stress_projects)]
    test = test[test["Treatment"].isin([stressor,"Control"])]
    train = dataframe[~dataframe["Sample"].isin(test["Sample"])]
    # Fail early with a readable message instead of an opaque scaler error.
    if test.empty:
        raise ValueError(
            "No test samples: treatment %r does not occur in the data." % (stressor,)
        )
    if train.empty:
        raise ValueError(
            "No training samples left: every sample belongs to a BioProject "
            "containing %r and is labelled %r or 'Control'." % (stressor, stressor)
        )
    meta_columns = ["BioProject","Treatment"]
    test = test.set_index("Sample").drop(meta_columns,axis=1)
    train = train.set_index("Sample").drop(meta_columns,axis=1)
    train_X = train.drop("Label",axis=1)
    y_train = train["Label"]
    test_X = test.drop("Label",axis=1)
    y_test = test["Label"]
    if str(balance).lower()=="up":
        sm = SMOTE(random_state=42)
        train_X,y_train = sm.fit_resample(train_X,y_train)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(train_X)
    # transform, not fit_transform: reuse the training mean/variance.
    X_test = scaler.transform(test_X)
    return X_train,y_train,X_test,y_test

In [8]:
def get_tuned_rf(X_train, y_train, random_grid):
    """Run a randomized hyper-parameter search and return a fresh forest.

    Parameters
    ----------
    X_train, y_train
        Training features and labels handed to ``RandomizedSearchCV.fit``.
    random_grid : dict
        Parameter distributions for ``RandomForestClassifier``.

    Returns
    -------
    RandomForestClassifier
        A new, *unfitted* classifier configured with the best parameters
        found (10 sampled candidates, 5-fold CV, search seeded with 42).
        Every parameter in ``random_grid`` is carried over. The previous
        version copied six hard-coded keys, which raised KeyError when the
        grid omitted one of them and silently dropped any other tuned
        parameter.
    """
    search = RandomizedSearchCV(estimator=RandomForestClassifier(),
                                param_distributions=random_grid,
                                n_iter=10,
                                cv=5,
                                verbose=2,
                                random_state=42,
                                n_jobs=-1)
    search.fit(X_train, y_train)
    # best_params_ only holds keys from random_grid, all of which are valid
    # RandomForestClassifier arguments (the search itself set them).
    return RandomForestClassifier(**search.best_params_)

## Low nitrogen: test run to track down an error

In [20]:
# Load the TPM table without single-stressor filtering (single_stress keeps its default).
ln = load_clean_data("../../data/rawtpm_bptreat_noPEG.tsv")

In [10]:
# ds is passed below as the `downsample` argument of pre_split_transform.
ds = False
# us is passed below as the `balance` argument of split_prep_stressor.
us = "Up"

In [21]:
# Whether the Label column holds equally many 0s and 1s.
bal = check_if_balanced(ln)

In [22]:
# Variance-filter and log2(x+1)-transform the expression columns; result is indexed by Sample.
log_ln = pre_split_transform(ln,bal,ds)

In [23]:
# Preview the first rows of the transformed table.
log_ln.head()

Unnamed: 0_level_0,BioProject,Treatment,Label,Zm00001eb000010,Zm00001eb000020,Zm00001eb000050,Zm00001eb000060,Zm00001eb000070,Zm00001eb000080,Zm00001eb000100,...,Zm00001eb442810,Zm00001eb442820,Zm00001eb442840,Zm00001eb442850,Zm00001eb442870,Zm00001eb442890,Zm00001eb442910,Zm00001eb442960,Zm00001eb442980,Zm00001eb443030
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SRR11933261,PRJNA637522,Drought,1,3.760627,1.731651,0.060075,3.8004,2.644723,3.597631,0.494946,...,0.227968,0.0,0.0,0.0,0.0,0.389017,0.0,0.0,0.0,0.0
SRR11933272,PRJNA637522,Drought,1,4.109013,2.039269,0.490802,3.038092,1.537277,1.742518,0.226885,...,0.148026,0.173611,0.0,0.0,0.0,2.945473,0.0,0.0,0.0,0.0
SRR11933250,PRJNA637522,Drought,1,3.326079,1.994391,0.0,2.032962,1.842599,4.103174,0.0,...,0.0,0.0,0.0,0.0,0.0,0.503415,0.0,0.326679,0.0,1.146231
SRR11933029,PRJNA637522,Control,0,3.201844,1.759475,0.0,1.447213,1.549133,4.364607,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.244933
SRR11933040,PRJNA637522,Drought,1,3.507319,1.925657,0.0,1.189432,2.254362,4.372791,0.193789,...,0.236712,0.017435,0.0,0.0,0.0,3.409421,0.0,0.0,0.0,1.745435


In [24]:
# Hold out the BioProjects that contain Low_Nitrogen samples as the test set.
X_train, y_train, X_test, y_test = split_prep_stressor("Low_Nitrogen",log_ln,us)

In [16]:
# do split_prep_stressor but line by line
# Guarded exactly like the function: without the check, re-running this cell
# would call reset_index() on the RangeIndex and add a second "Sample" column.
if "Sample" not in log_ln.columns:
    log_ln = log_ln.reset_index().rename(columns={"index":"Sample"})
log_ln.head()

Unnamed: 0,Sample,BioProject,Treatment,Label,Zm00001eb000010,Zm00001eb000020,Zm00001eb000050,Zm00001eb000060,Zm00001eb000070,Zm00001eb000080,...,Zm00001eb442780,Zm00001eb442800,Zm00001eb442810,Zm00001eb442820,Zm00001eb442840,Zm00001eb442870,Zm00001eb442890,Zm00001eb442960,Zm00001eb442980,Zm00001eb443030
0,SRR6335599,PRJNA420600,Control,0,2.763903,1.225628,0.243199,4.099231,0.365753,3.301563,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,SRR6335600,PRJNA420600,Control,0,0.0,0.0,0.0,0.0,0.0,4.389291,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,SRR6335602,PRJNA420600,Control,0,2.568461,1.758329,0.0,4.09035,0.0,2.832657,...,0.0,0.0,0.088404,0.044152,0.0,0.0,0.0,0.0,0.0,0.0
3,SRR6335603,PRJNA420600,Control,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,SRR6335605,PRJNA420600,Low_Nitrogen,1,2.960973,1.761872,0.0,3.994822,0.787914,3.064413,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# BioProjects that contain at least one Low_Nitrogen sample (same step as inside split_prep_stressor).
sbp = log_ln[log_ln["Treatment"]=="Low_Nitrogen"]["BioProject"].unique()

In [18]:
# Display the BioProjects selected above.
sbp

array(['PRJNA420600', 'PRJNA436973', 'PRJNA304223', 'PRJNA587226',
       'Brandon_Webster'], dtype=object)

In [19]:
# All BioProjects present in the frame, for comparison with sbp.
# NOTE(review): in the saved output this list is identical to sbp, i.e. every
# BioProject contains Low_Nitrogen samples, so the train split only keeps rows
# whose Treatment is neither Low_Nitrogen nor Control -- possibly none. This
# may be the error under investigation; confirm.
log_ln["BioProject"].unique()

array(['PRJNA420600', 'PRJNA436973', 'PRJNA304223', 'PRJNA587226',
       'Brandon_Webster'], dtype=object)