Purpose: Subset the samples in the core stress project to a list of only those suitable for differential expression.<br>
Author: Anna Pardo<br>
Date initiated: June 5, 2023

In [1]:
# load modules
import pandas as pd

In [2]:
# load sample metadata sheet
md = pd.read_csv("../../data/srr_numbers_with_metadata_12-May-2023.csv",sep=",",header="infer")
md.head()

Unnamed: 0,BioProject,Sample,sample_name,Replicate_num,Genotype,Technology,Library_layout,Treatment,Duration_hours,Time_after_treatment,...,Developmental_stage,Tissue,Day_length_hours,Day_temp_C,Night_temp_C,Relative humidity (%),Light (umol/m2/s),Growth Env.,Media,Notes
0,PRJNA637522,SRR11933261,238_WS2,,238,Illumina HiSeq 2500,PAIRED,Drought,312.0,13 days,...,V3,Leaf,16.0,25.0,,,,Greenhouse,,
1,PRJNA637522,SRR11933272,238_WS1,,238,Illumina HiSeq 2500,PAIRED,Drought,216.0,9 days,...,V3,Leaf,16.0,25.0,,,,Greenhouse,,
2,PRJNA637522,SRR11933250,268_WS1,,268,Illumina HiSeq 2500,PAIRED,Drought,216.0,9 days,...,V3,Leaf,16.0,25.0,,,,Greenhouse,,
3,PRJNA637522,SRR11933029,268_WW,,268,Illumina HiSeq 2500,PAIRED,Control,216.0,9 days,...,V3,Leaf,16.0,25.0,,,,Greenhouse,,
4,PRJNA637522,SRR11933040,268_WS2,,268,Illumina HiSeq 2500,PAIRED,Drought,312.0,13 days,...,V3,Leaf,16.0,25.0,,,,Greenhouse,,


In [3]:
# generate a list of BioProjects NOT to be included (all samples are not suitable for DE)
# Brandon_Webster is included because it did not pass through tximport with the other samples and must be studied separately
bpremove = [
    "PRJNA172724",
    "PRJNA210356",
    "PRJNA255755",
    "PRJNA290180",
    "PRJNA291064",
    "PRJNA300830",
    "PRJNA304223",
    "PRJNA349117",
    "PRJNA378609",
    "PRJNA379712",
    "PRJNA436973",
    "PRJNA545969",
    "PRJNA646054",
    "PRJNA647980",
    "PRJNA689935",
    "PRJNA906711",
    "Brandon_Webster"
]

In [4]:
# since the list is hard-coded from a handwritten list: double-check that all these BioProjects are present in md
for b in bpremove:
    if b in md["BioProject"].unique():
        print(b,": check!")
    else:
        print(b,": not present")

PRJNA172724 : check!
PRJNA210356 : check!
PRJNA255755 : check!
PRJNA290180 : check!
PRJNA291064 : check!
PRJNA300830 : check!
PRJNA304223 : check!
PRJNA349117 : check!
PRJNA378609 : check!
PRJNA379712 : check!
PRJNA436973 : check!
PRJNA545969 : check!
PRJNA646054 : check!
PRJNA647980 : check!
PRJNA689935 : check!
PRJNA906711 : check!
Brandon_Webster : check!


In [5]:
# remove from md all BioProjects in this list
mdb = md[~md["BioProject"].isin(bpremove)]

In [6]:
# the dataframe mdb now contains only BioProjects with at least some samples suitable for differential expression
# certain samples need to be removed; this will require a long if/else statement

dflist = []
for i in mdb["BioProject"].unique():
    df = mdb[mdb["BioProject"]==i]
    if i=="PRJNA244661":
        df = df[~df["Treatment"].isin(["Salt","UV"])]
    elif i=="PRJNA267717":
        df = df[df["Duration_hours"]!=72]
    elif i=="PRJNA378714":
        df = df[df["Duration_hours"]==72]
    elif i=="PRJNA520822":
        df["proxy"] = df["Developmental_stage"]+"_"+str(df["Duration_hours"])+"_"+df["Tissue"]
        df = df[df["proxy"]!="R1_2_Ear"]
        df = df[df["proxy"]!="V3_48_Stalk"]
        df.drop("proxy",axis=1,inplace=True)
    elif i=="PRJNA747925":
        l = ["B73","Mo17","W22","B73xMo17","W22xB73","W22xMo17"]
        df = df[df["Genotype"].isin(l)]
        df = df[df["Duration_hours"].isin([1,25])]
    elif i=="PRJNA877073":
        df = df[df["Duration_hours"]!=0]
    dflist.append(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["proxy"] = df["Developmental_stage"]+"_"+str(df["Duration_hours"])+"_"+df["Tissue"]


In [7]:
mdsub = pd.concat(dflist)

In [8]:
# extract a list of the samples from mdsub
samples = list(mdsub["Sample"].unique())

In [14]:
# save the subsetted metadata as a csv file
mdsub.to_csv("../../data/metadata_for_DESeq_samples.csv",sep=",",header=True,index=False)

In [18]:
# save the list of samples to a file
with open("../../data/samples_for_de.txt","w+") as outfile:
    for i in samples:
        outfile.write(i+"\n")

In [9]:
# subset the master tximport table input file to only the samples in the list
## load master tximport table
t = pd.read_csv("../../data/master_tximport_table.csv",sep=",",header="infer")
t.head()

Unnamed: 0,File,Sample,Percent_Mapped
0,/mnt/scratch/haberan2/Core_Stress_Response/01_...,SRR8857799,86.555924
1,/mnt/scratch/haberan2/Core_Stress_Response/01_...,SRR8857793,86.877209
2,/mnt/scratch/haberan2/Core_Stress_Response/01_...,SRR15241108,87.659877
3,/mnt/scratch/haberan2/Core_Stress_Response/01_...,SRR15241156,89.027615
4,/mnt/scratch/haberan2/Core_Stress_Response/01_...,SRR8857749,84.932979


In [10]:
len(t.index)

1823

In [11]:
# subset the table
ts = t[t["Sample"].isin(samples)]
len(ts.index)

1451

In [13]:
# output the new table
ts.to_csv("../../data/master_tximport_table_subsetted.csv",sep=",",header=True,index=False)

In [12]:
# June 7, 2023: Split up the data by BioProject so I can run tximport and DESeq individually on each BioProject's samples
# split up master table for tximport
## first add BioProject column
mdbp = mdsub[["Sample","BioProject"]]
mdbp.head()

Unnamed: 0,Sample,BioProject
0,SRR11933261,PRJNA637522
1,SRR11933272,PRJNA637522
2,SRR11933250,PRJNA637522
3,SRR11933029,PRJNA637522
4,SRR11933040,PRJNA637522


In [13]:
# merge mdbp with ts
tsb = ts.merge(mdbp)
tsb.head()

Unnamed: 0,File,Sample,Percent_Mapped,BioProject
0,/mnt/scratch/haberan2/Core_Stress_Response/01_...,SRR8857799,86.555924,PRJNA520822
1,/mnt/scratch/haberan2/Core_Stress_Response/01_...,SRR8857793,86.877209,PRJNA520822
2,/mnt/scratch/haberan2/Core_Stress_Response/01_...,SRR15241108,87.659877,PRJNA747925
3,/mnt/scratch/haberan2/Core_Stress_Response/01_...,SRR15241156,89.027615,PRJNA747925
4,/mnt/scratch/haberan2/Core_Stress_Response/01_...,SRR8857749,84.932979,PRJNA520822


In [18]:
# loop through BioProjects and for each one, save a csv for tximport of the samples in that BioProject
directory = "../../data/bioproject_tximport_tables/"
for b in tsb["BioProject"].unique():
    df = tsb[tsb["BioProject"]==b]
    filename = directory+b+"_tximport.csv"
    df.to_csv(filename,sep=",",header=True,index=False)

In [14]:
# troubleshooting on 6/9/23: subset tsb to just PRJNA420600
tsbp = tsb[tsb["BioProject"]=="PRJNA420600"]
tsbp

Unnamed: 0,File,Sample,Percent_Mapped,BioProject
105,/mnt/scratch/haberan2/Core_Stress_Response/01_...,SRR6335608,80.407266,PRJNA420600
143,/mnt/scratch/haberan2/Core_Stress_Response/01_...,SRR6335607,61.619388,PRJNA420600
145,/mnt/scratch/haberan2/Core_Stress_Response/01_...,SRR6335606,58.689937,PRJNA420600
146,/mnt/scratch/haberan2/Core_Stress_Response/01_...,SRR6335609,62.826718,PRJNA420600
154,/mnt/scratch/haberan2/Core_Stress_Response/01_...,SRR6335602,84.838812,PRJNA420600
192,/mnt/scratch/haberan2/Core_Stress_Response/01_...,SRR6335599,74.425878,PRJNA420600
194,/mnt/scratch/haberan2/Core_Stress_Response/01_...,SRR6335600,58.25305,PRJNA420600
197,/mnt/scratch/haberan2/Core_Stress_Response/01_...,SRR6335605,73.399203,PRJNA420600
223,/mnt/scratch/haberan2/Core_Stress_Response/01_...,SRR6335603,60.973505,PRJNA420600
251,/mnt/scratch/haberan2/Core_Stress_Response/01_...,SRR6335610,64.516129,PRJNA420600


In [15]:
# subset metadata to just PRJNA420600
md4 = mdsub[mdsub["BioProject"]=="PRJNA420600"]
md4

Unnamed: 0,BioProject,Sample,sample_name,Replicate_num,Genotype,Technology,Library_layout,Treatment,Duration_hours,Time_after_treatment,...,Developmental_stage,Tissue,Day_length_hours,Day_temp_C,Night_temp_C,Relative humidity (%),Light (umol/m2/s),Growth Env.,Media,Notes
401,PRJNA420600,SRR6335599,4mM nitrogen (HN),1.0,B73,Illumina HiSeq 2500,PAIRED,Control,,,...,V6,Leaf,14.0,28.0,22.0,,250-300,Chamber,Nutrient solution,
402,PRJNA420600,SRR6335600,4mM nitrogen (HN),2.0,B73,Illumina HiSeq 2500,PAIRED,Control,,,...,V6,Leaf,14.0,28.0,22.0,,250-300,Chamber,Nutrient solution,
403,PRJNA420600,SRR6335601,4mM nitrogen (HN),3.0,B73,Illumina HiSeq 2500,PAIRED,Control,,,...,V6,Leaf,14.0,28.0,22.0,,250-300,Chamber,Nutrient solution,
404,PRJNA420600,SRR6335602,4mM nitrogen (HN),4.0,B73,Illumina HiSeq 2500,PAIRED,Control,,,...,V6,Leaf,14.0,28.0,22.0,,250-300,Chamber,Nutrient solution,
405,PRJNA420600,SRR6335603,4mM nitrogen (HN),5.0,B73,Illumina HiSeq 2500,PAIRED,Control,,,...,V6,Leaf,14.0,28.0,22.0,,250-300,Chamber,Nutrient solution,
406,PRJNA420600,SRR6335604,4mM nitrogen (HN),6.0,B73,Illumina HiSeq 2500,PAIRED,Control,,,...,V6,Leaf,14.0,28.0,22.0,,250-300,Chamber,Nutrient solution,
407,PRJNA420600,SRR6335605,0.04mM nitrogen (LN),1.0,B73,Illumina HiSeq 2500,PAIRED,Low_Nitrogen,,,...,V6,Leaf,14.0,28.0,22.0,,250-300,Chamber,Nutrient solution,
408,PRJNA420600,SRR6335606,0.04mM nitrogen (LN),2.0,B73,Illumina HiSeq 2500,PAIRED,Low_Nitrogen,,,...,V6,Leaf,14.0,28.0,22.0,,250-300,Chamber,Nutrient solution,
409,PRJNA420600,SRR6335607,0.04mM nitrogen (LN),3.0,B73,Illumina HiSeq 2500,PAIRED,Low_Nitrogen,,,...,V6,Leaf,14.0,28.0,22.0,,250-300,Chamber,Nutrient solution,
410,PRJNA420600,SRR6335608,0.04mM nitrogen (LN),4.0,B73,Illumina HiSeq 2500,PAIRED,Low_Nitrogen,,,...,V6,Leaf,14.0,28.0,22.0,,250-300,Chamber,Nutrient solution,


In [18]:
#md4 contains the correct number of samples, tsbp is incorrect (missing 2 out of 12)
list(tsbp["File"])[0]

'/mnt/scratch/haberan2/Core_Stress_Response/01_pipeline_outputs/B73other_Apr24_nf-core/salmon/SRR6335608/quant.sf'