Purpose: Preliminary random forest (no dimensional reduction) to classify stressed vs. control for all stressors.<br>
Author: Anna Pardo<br>
Date initiated: June 14, 2023

In [1]:
# import modules
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from imblearn.over_sampling import SMOTE

In [2]:
# load log TPM matrix (low-variance features already removed with VarianceThreshold)

vttpm_log = pd.read_csv("../../data/logTPM_allsamples_16-May-2023.txt",sep="\t",header="infer")
vttpm_log.head

<bound method NDFrame.head of                    Sample  Zm00001eb000010  Zm00001eb000020  Zm00001eb000050  \
0     B73xMO17_HII_1_BRB1         1.638809         2.220060         0.000000   
1     B73xMO17_HII_2_BRB3         1.411726         1.015722         0.000000   
2      B73xMO17_HI_2_BRB4         1.525312         0.585677         0.000000   
3     B73xMO17_HII_3_BRB4         1.856313         1.782950         0.000000   
4      B73xMO17_HI_3_BRB6         1.212664         1.257071         0.000000   
...                   ...              ...              ...              ...   
2033          SRR11933084         4.594970         2.251670         0.999934   
2034          SRR11932980         3.527514         0.300106         0.000000   
2035           SRR4431530         1.555617         0.000000         0.000000   
2036          SRR11933278         2.220928         0.821438         0.106208   
2037          SRR11933248         3.721930         1.909847         0.000000   

      Zm0

In [3]:
# load metadata (only columns Sample, BioProject, and Treatment are required)
md = pd.read_csv("../../data/srr_numbers_with_metadata_12-May-2023.csv",sep=",",header="infer",usecols=["BioProject","Sample","Treatment"])
md.head()

Unnamed: 0,BioProject,Sample,Treatment
0,PRJNA637522,SRR11933261,Drought
1,PRJNA637522,SRR11933272,Drought
2,PRJNA637522,SRR11933250,Drought
3,PRJNA637522,SRR11933029,Control
4,PRJNA637522,SRR11933040,Drought


In [10]:
# from md, remove treatments ending in Rec (recovery samples)
# also remove UV samples
mds = md[~md["Treatment"].str.contains("Rec")]
mds = mds[mds["Treatment"]!="UV"]

In [12]:
len(mds.index)

2042

In [13]:
# remove all BioProjects that only have control samples
for b in mds["BioProject"].unique():
    df = mds[mds["BioProject"]==b]
    if df["Treatment"].unique()[0] == "Control":
        mds = mds[mds["BioProject"] != b]

In [14]:
len(mds.index)

1338

In [11]:
mds["Treatment"].unique()

array(['Drought', 'Control', 'Heat', 'Cold', 'Salt', 'DroughtSalt',
       'ColdDrought', 'DroughtRepeat', 'Low_Nitrogen', 'Flooding',
       'PEG6000'], dtype=object)

In [None]:
# set Control to 0 and any stress to 1
proxy = []
for i in range(len(mddc.index)):
    if mddc.iloc[i,2] == "Control":
        proxy.append(0)
    else:
        proxy.append(1)
mddc["Label"] = proxy