Purpose: Subset the samples in the core stress project to a list of only those suitable for differential expression.<br>
Author: Anna Pardo<br>
Date initiated: June 5, 2023

In [1]:
# load modules
import pandas as pd

In [2]:
# load sample metadata sheet
md = pd.read_csv("../../data/srr_numbers_with_metadata_12-May-2023.csv",sep=",",header="infer")
md.head()

Unnamed: 0,BioProject,Sample,sample_name,Replicate_num,Genotype,Technology,Library_layout,Treatment,Duration_hours,Time_after_treatment,...,Developmental_stage,Tissue,Day_length_hours,Day_temp_C,Night_temp_C,Relative humidity (%),Light (umol/m2/s),Growth Env.,Media,Notes
0,PRJNA637522,SRR11933261,238_WS2,,238,Illumina HiSeq 2500,PAIRED,Drought,312.0,13 days,...,V3,Leaf,16.0,25.0,,,,Greenhouse,,
1,PRJNA637522,SRR11933272,238_WS1,,238,Illumina HiSeq 2500,PAIRED,Drought,216.0,9 days,...,V3,Leaf,16.0,25.0,,,,Greenhouse,,
2,PRJNA637522,SRR11933250,268_WS1,,268,Illumina HiSeq 2500,PAIRED,Drought,216.0,9 days,...,V3,Leaf,16.0,25.0,,,,Greenhouse,,
3,PRJNA637522,SRR11933029,268_WW,,268,Illumina HiSeq 2500,PAIRED,Control,216.0,9 days,...,V3,Leaf,16.0,25.0,,,,Greenhouse,,
4,PRJNA637522,SRR11933040,268_WS2,,268,Illumina HiSeq 2500,PAIRED,Drought,312.0,13 days,...,V3,Leaf,16.0,25.0,,,,Greenhouse,,


In [4]:
# generate a list of BioProjects NOT to be included (all samples are not suitable for DE)
# Brandon_Webster is included because it did not pass through tximport with the other samples and must be studied separately
bpremove = [
    "PRJNA172724",
    "PRJNA210356",
    "PRJNA255755",
    "PRJNA290180",
    "PRJNA291064",
    "PRJNA300830",
    "PRJNA304223",
    "PRJNA349117",
    "PRJNA378609",
    "PRJNA379712",
    "PRJNA436973",
    "PRJNA545969",
    "PRJNA646054",
    "PRJNA647980",
    "PRJNA689935",
    "PRJNA906711",
    "Brandon_Webster"
]

In [5]:
# since the list is hard-coded from a handwritten list: double-check that all these BioProjects are present in md
for b in bpremove:
    if b in md["BioProject"].unique():
        print(b,": check!")
    else:
        print(b,": not present")

PRJNA172724 : check!
PRJNA210356 : check!
PRJNA255755 : check!
PRJNA290180 : check!
PRJNA291064 : check!
PRJNA300830 : check!
PRJNA304223 : check!
PRJNA349117 : check!
PRJNA378609 : check!
PRJNA379712 : check!
PRJNA436973 : check!
PRJNA545969 : check!
PRJNA646054 : check!
PRJNA647980 : check!
PRJNA689935 : check!
PRJNA906711 : check!
Brandon_Webster : check!


In [6]:
# remove from md all BioProjects in this list
mdb = md[~md["BioProject"].isin(bpremove)]

In [7]:
# the dataframe mdb now contains only BioProjects with at least some samples suitable for differential expression
# certain samples need to be removed; this will require a long if/else statement

dflist = []
for i in mdb["BioProject"].unique():
    df = mdb[mdb["BioProject"]==i]
    if i=="PRJNA244661":
        df = df[~df["Treatment"].isin(["Salt","UV"])]
    elif i=="PRJNA267717":
        df = df[df["Duration_hours"]!=72]
    elif i=="PRJNA378714":
        df = df[df["Duration_hours"]==72]
    elif i=="PRJNA520822":
        df["proxy"] = df["Developmental_stage"]+"_"+str(df["Duration_hours"])+"_"+df["Tissue"]
        df = df[df["proxy"]!="R1_2_Ear"]
        df = df[df["proxy"]!="V3_48_Stalk"]
        df.drop("proxy",axis=1,inplace=True)
    elif i=="PRJNA747925":
        l = ["B73","Mo17","W22","B73xMo17","W22xB73","W22xMo17"]
        df = df[df["Genotype"].isin(l)]
        df = df[df["Duration_hours"].isin([1,25])]
    elif i=="PRJNA877073":
        df = df[df["Duration_hours"]!=0]
    dflist.append(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["proxy"] = df["Developmental_stage"]+"_"+str(df["Duration_hours"])+"_"+df["Tissue"]


In [8]:
mdsub = pd.concat(dflist)

In [9]:
# extract a list of the samples from mdsub
samples = list(mdsub["Sample"].unique())

In [28]:
# save the subsetted metadata as a csv file
mdsub.to_csv("../data/metadata_for_DESeq_samples.csv",sep=",",header=True,index=False)

In [18]:
# save the list of samples to a file
with open("../../data/samples_for_de.txt","w+") as outfile:
    for i in samples:
        outfile.write(i+"\n")

In [10]:
# subset the master tximport table input file to only the samples in the list
## load master tximport table
t = pd.read_csv("../../data/master_tximport_table.csv",sep=",",header="infer")
t.head()

Unnamed: 0,File,Sample,Percent_Mapped
0,/mnt/scratch/haberan2/Core_Stress_Response/01_...,SRR8857799,86.555924
1,/mnt/scratch/haberan2/Core_Stress_Response/01_...,SRR8857793,86.877209
2,/mnt/scratch/haberan2/Core_Stress_Response/01_...,SRR15241108,87.659877
3,/mnt/scratch/haberan2/Core_Stress_Response/01_...,SRR15241156,89.027615
4,/mnt/scratch/haberan2/Core_Stress_Response/01_...,SRR8857749,84.932979


In [12]:
len(t.index)

1823

In [11]:
# subset the table
ts = t[t["Sample"].isin(samples)]
len(ts.index)

1451

In [13]:
# output the new table
ts.to_csv("../../data/master_tximport_table_subsetted.csv",sep=",",header=True,index=False)