Purpose: Generate lists of SRR numbers for each genotype of maize for the core stress transcriptome project.<br>
Author: Anna Pardo<br>
Date initiated: April 14, 2023

In [1]:
# import modules
import pandas as pd

In [2]:
# load metadata sheet
md = pd.read_csv("srr_numbers_with_metadata_14-Apr-2023.tsv",sep="\t",header="infer")
md.head()

Unnamed: 0,BioProject,SRA_number,sample_name,Replicate_num,Genotype,Technology,Library_layout,Treatment,Duration_hours,Time_after_treatment,...,Developmental_stage,Tissue,Day_length_hours,Day_temp_C,Night_temp_C,Relative humidity (%),Light (umol/m2/s),Growth Env.,Media,Notes
0,PRJNA637522,SRR11933261,238_WS2,,238,Illumina HiSeq 2500,PAIRED,Drought,312.0,13 days,...,V3,Leaf,16.0,25.0,,,,Greenhouse,,
1,PRJNA637522,SRR11933272,238_WS1,,238,Illumina HiSeq 2500,PAIRED,Drought,216.0,9 days,...,V3,Leaf,16.0,25.0,,,,Greenhouse,,
2,PRJNA637522,SRR11933250,268_WS1,,268,Illumina HiSeq 2500,PAIRED,Drought,216.0,9 days,...,V3,Leaf,16.0,25.0,,,,Greenhouse,,
3,PRJNA637522,SRR11933029,268_WW,,268,Illumina HiSeq 2500,PAIRED,Control,216.0,9 days,...,V3,Leaf,16.0,25.0,,,,Greenhouse,,
4,PRJNA637522,SRR11933040,268_WS2,,268,Illumina HiSeq 2500,PAIRED,Drought,312.0,13 days,...,V3,Leaf,16.0,25.0,,,,Greenhouse,,


In [3]:
md.columns

Index(['BioProject', 'SRA_number', 'sample_name', 'Replicate_num', 'Genotype',
       'Technology', 'Library_layout', 'Treatment', 'Duration_hours',
       'Time_after_treatment', 'Concentration_mM', 'Concentration',
       'Developmental_stage', 'Tissue', 'Day_length_hours', 'Day_temp_C',
       'Night_temp_C', 'Relative humidity (%)', 'Light (umol/m2/s)',
       'Growth Env.', 'Media', 'Notes'],
      dtype='object')

In [3]:
# clean up the metadata sheet - remove unused columns
mdclean = md[["BioProject","SRA_number","Replicate_num","Genotype","Technology","Library_layout","Treatment","Duration_hours",
                "Concentration_mM","Developmental_stage","Tissue","Day_length_hours","Day_temp_C","Night_temp_C",
                "Relative humidity (%)","Light (umol/m2/s)","Growth Env.","Media"]]
mdclean.head()

Unnamed: 0,BioProject,SRA_number,Replicate_num,Genotype,Technology,Library_layout,Treatment,Duration_hours,Concentration_mM,Developmental_stage,Tissue,Day_length_hours,Day_temp_C,Night_temp_C,Relative humidity (%),Light (umol/m2/s),Growth Env.,Media
0,PRJNA637522,SRR11933261,,238,Illumina HiSeq 2500,PAIRED,Drought,312.0,,V3,Leaf,16.0,25.0,,,,Greenhouse,
1,PRJNA637522,SRR11933272,,238,Illumina HiSeq 2500,PAIRED,Drought,216.0,,V3,Leaf,16.0,25.0,,,,Greenhouse,
2,PRJNA637522,SRR11933250,,268,Illumina HiSeq 2500,PAIRED,Drought,216.0,,V3,Leaf,16.0,25.0,,,,Greenhouse,
3,PRJNA637522,SRR11933029,,268,Illumina HiSeq 2500,PAIRED,Control,216.0,,V3,Leaf,16.0,25.0,,,,Greenhouse,
4,PRJNA637522,SRR11933040,,268,Illumina HiSeq 2500,PAIRED,Drought,312.0,,V3,Leaf,16.0,25.0,,,,Greenhouse,


In [6]:
# load the list of SRR numbers that have already been downloaded (B73 heat and salt stress and their controls)
predown = pd.read_csv("heat_salt_B73.txt",sep="\t",header=None)
predown.head()

Unnamed: 0,0
0,SRR2106180
1,SRR2106182
2,SRR2106184
3,SRR2106186
4,SRR2106196


In [9]:
# remove the SRR numbers that have already been downloaded from the metadata
unmdc = mdclean[~mdclean["SRA_number"].isin(predown[0])]

In [25]:
# remove rows where Genotype is NA
unmdc = unmdc[~unmdc["Genotype"].isna()]

In [24]:
# write a function to take a genotype and output a txt file containing that genotype's SRR numbers
## note, for B73 this will NOT include the 126 samples already downloaded ##

def genotype_srr(genotype,df):
    # remove whitespace from genotype
    g = genotype.replace(" ","")
    # set up output filenames
    if g == "B73":
        filename = g+"_other_conditions.txt"
    else:
        filename = g+"_all.txt"
        
    # subset metadata dataframe to just this genotype
    subdf = df[df["Genotype"]==genotype]
    # write the txt file
    with open(filename,"w+") as outfile:
        for i in list(subdf["SRA_number"]):
            outfile.write(i+"\n")

In [26]:
# loop through the genotypes and write the files
for g in list(unmdc["Genotype"].unique()):
    genotype_srr(g,unmdc)

In [28]:
len(list(unmdc["Genotype"].unique()))

305

In [4]:
# the above is the total number of genotypes in the dataset - this includes hybrids, inbreds, and doubled haploids
# let's pivot back into Basic Data Exploration mode now. For this we need the mdclean dataframe.

## how many samples do we have from each treatment?

for t in list(mdclean["Treatment"].unique()):
    l = len(mdclean[mdclean["Treatment"]==t].index)
    print("Number of samples for treatment",t,"=",l)

Number of samples for treatment Drought = 552
Number of samples for treatment Control = 734
Number of samples for treatment Heat = 193
Number of samples for treatment Cold = 225
Number of samples for treatment Salt = 29
Number of samples for treatment UV = 10
Number of samples for treatment ControlRec = 4
Number of samples for treatment DroughtRec = 5
Number of samples for treatment DroughtSalt = 2
Number of samples for treatment SaltRec = 2
Number of samples for treatment DroughtSaltRec = 2
Number of samples for treatment ColdRec = 1
Number of samples for treatment ColdDroughtRec = 1
Number of samples for treatment ColdDrought = 1
Number of samples for treatment DroughtRepeat = 2
Number of samples for treatment Low_Nitrogen = 27
Number of samples for treatment Flooding = 48
Number of samples for treatment PEG6000 = 24
Number of samples for treatment nan = 0


In [30]:
# what are the top 10 genotypes with the most samples?
## first remove NAs
mdclean = mdclean[~mdclean["Genotype"].isna()]

In [32]:
gtlist = []
nsamplist = []
for g in list(mdclean["Genotype"].unique()):
    gtlist.append(g)
    nsamplist.append(len(mdclean[mdclean["Genotype"]==g].index))
    
# make dataframe
gtsamp = pd.DataFrame(list(zip(gtlist,nsamplist)),columns=["Genotype","N_samples"])

In [36]:
gtsamp.sort_values(by="N_samples",axis=0,ascending=False,inplace=True)

In [37]:
gtsamp.head(n=10)

Unnamed: 0,Genotype,N_samples
23,B73,385
284,W22,70
220,Mo17,56
265,Shen5003,23
82,CIMBL55,21
285,W22xB73,21
286,W22xMo17,21
24,B73xMo17,21
17,B104,20
152,DKC 6664,14


New work May 1, 2023: generate sample list for all genotypes except the following:<br>
- B73
- Oh43 (& OH43)
- P39
- 05W002
- CML69
- W22
- Mo17 (but need to include "inbred line Mo17")

In [1]:
import pandas as pd

In [2]:
# load metadata sheet
md = pd.read_csv("srr_numbers_with_metadata_14-Apr-2023.tsv",sep="\t",header="infer")
md.head()

Unnamed: 0,BioProject,SRA_number,sample_name,Replicate_num,Genotype,Technology,Library_layout,Treatment,Duration_hours,Time_after_treatment,...,Developmental_stage,Tissue,Day_length_hours,Day_temp_C,Night_temp_C,Relative humidity (%),Light (umol/m2/s),Growth Env.,Media,Notes
0,PRJNA637522,SRR11933261,238_WS2,,238,Illumina HiSeq 2500,PAIRED,Drought,312.0,13 days,...,V3,Leaf,16.0,25.0,,,,Greenhouse,,
1,PRJNA637522,SRR11933272,238_WS1,,238,Illumina HiSeq 2500,PAIRED,Drought,216.0,9 days,...,V3,Leaf,16.0,25.0,,,,Greenhouse,,
2,PRJNA637522,SRR11933250,268_WS1,,268,Illumina HiSeq 2500,PAIRED,Drought,216.0,9 days,...,V3,Leaf,16.0,25.0,,,,Greenhouse,,
3,PRJNA637522,SRR11933029,268_WW,,268,Illumina HiSeq 2500,PAIRED,Control,216.0,9 days,...,V3,Leaf,16.0,25.0,,,,Greenhouse,,
4,PRJNA637522,SRR11933040,268_WS2,,268,Illumina HiSeq 2500,PAIRED,Drought,312.0,13 days,...,V3,Leaf,16.0,25.0,,,,Greenhouse,,


In [3]:
# clean up the metadata sheet - remove unused columns
mdclean = md[["BioProject","SRA_number","Replicate_num","Genotype","Technology","Library_layout","Treatment","Duration_hours",
                "Concentration_mM","Developmental_stage","Tissue","Day_length_hours","Day_temp_C","Night_temp_C",
                "Relative humidity (%)","Light (umol/m2/s)","Growth Env.","Media"]]
mdclean.head()

Unnamed: 0,BioProject,SRA_number,Replicate_num,Genotype,Technology,Library_layout,Treatment,Duration_hours,Concentration_mM,Developmental_stage,Tissue,Day_length_hours,Day_temp_C,Night_temp_C,Relative humidity (%),Light (umol/m2/s),Growth Env.,Media
0,PRJNA637522,SRR11933261,,238,Illumina HiSeq 2500,PAIRED,Drought,312.0,,V3,Leaf,16.0,25.0,,,,Greenhouse,
1,PRJNA637522,SRR11933272,,238,Illumina HiSeq 2500,PAIRED,Drought,216.0,,V3,Leaf,16.0,25.0,,,,Greenhouse,
2,PRJNA637522,SRR11933250,,268,Illumina HiSeq 2500,PAIRED,Drought,216.0,,V3,Leaf,16.0,25.0,,,,Greenhouse,
3,PRJNA637522,SRR11933029,,268,Illumina HiSeq 2500,PAIRED,Control,216.0,,V3,Leaf,16.0,25.0,,,,Greenhouse,
4,PRJNA637522,SRR11933040,,268,Illumina HiSeq 2500,PAIRED,Drought,312.0,,V3,Leaf,16.0,25.0,,,,Greenhouse,


In [4]:
# make a list of genotypes NOT to be included - making sure any trailing spaces or whatnot are on the list
gtinit = ["B73","Mo17","W22","05W002","Oh43","OH43","P39","CML69"]
gt = []
for i in list(mdclean["Genotype"].unique()):
    if type(i) == str:
        g = i.replace(" ","")
        if g in gtinit:
            gt.append(i)
        
gt

['05W002', 'B73', 'CML69', 'Mo17', 'OH43', 'Oh43', 'P39', 'W22']

In [5]:
# subset the dataframe
## first drop NaN in the Genotype column
mdclean.dropna(subset="Genotype",inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mdclean.dropna(subset="Genotype",inplace=True)


In [6]:
other = mdclean[~mdclean["Genotype"].isin(gt)]
len(other.index)

1311

In [11]:
len(mdclean.index)

1850

In [21]:
# split other into two dataframes
o1 = other.iloc[:328]

In [18]:
o2 = other.iloc[655:]

In [22]:
# write txt file
with open("./firstqtr_remaining_SRR_01-May-23.txt","w+") as outfile:
    for i in list(o1["SRA_number"]):
        outfile.write(i+"\n")

In [20]:
with open("./secondhalf_remaining_SRR_01-May-23.txt","w+") as outfile:
    for i in list(o2["SRA_number"]):
        outfile.write(i+"\n")

In [7]:
# May 2, 2023
# how many fastq files will be downloaded from the dataframe other?
## count up number of PE & SE
se = other[other["Library_layout"]=="SINGLE"]
pe = other[other["Library_layout"]=="PAIRED"]

In [8]:
len(se.index)+(len(pe.index)*2)

2514

In [9]:
# load list of downloaded fastqs and output file of SRR numbers still needing to be downloaded
fastq = pd.read_csv("fastq_list.txt",sep="\t",header=None)
fastq.head()

Unnamed: 0,0
0,/mnt/scratch/haberan2/Core_Stress_Response/00_...
1,/mnt/scratch/haberan2/Core_Stress_Response/00_...
2,/mnt/scratch/haberan2/Core_Stress_Response/00_...
3,/mnt/scratch/haberan2/Core_Stress_Response/00_...
4,/mnt/scratch/haberan2/Core_Stress_Response/00_...


In [13]:
# pull out unique SRR numbers
SRR = []
for i in list(fastq[0]):
    SRR.append(i.strip().split("/")[7].split("_")[0])

In [14]:
fastq[1] = SRR

In [16]:
uniqSRR = list(fastq[1].unique())

In [17]:
uniqSRR[0:5]

['SRR10253751', 'SRR10253752', 'SRR10253753', 'SRR10253754', 'SRR10253755']

In [19]:
for i in list(other["SRA_number"].unique()):
    if i not in uniqSRR:
        print(i)

SRR8239716
SRR8239717
SRR8239718
SRR8239725
SRR8239726
SRR8239727
SRR9841917
SRR9841918
SRR9841942
SRR9841959
SRR9841997
SRR9841998
SRR9842058
SRR9842059
SRR1685963
SRR1819625
SRR1819627
SRR1819630
SRR1685964
SRR1819632
SRR1685982
SRR1819622
SRR1819624
SRR1663693
SRR1663694
SRR6179041
SRR6179042
SRR6179043
SRR6179044
SRR6179047
SRR6179048
SRR15241121
SRR15241132
SRR15241239
SRR15241110
SRR15241347
SRR15241358
SRR15241314
SRR15241325
SRR15241336
SRR15241309
SRR15241310
SRR15241311
SRR15241306
SRR15241307
SRR15241308
SRR15241303
SRR15241304
SRR15241305
SRR15241299
SRR15241300
SRR15241302
SRR15241394
SRR15241393
SRR15241392
SRR15241391
SRR12300529
SRR12300530
SRR12300531
SRR12300532
SRR12300533
SRR12300534
SRR12300543
SRR12300544
SRR13299751
SRR13299752
SRR13299753
SRR13299754
SRR13299759
SRR13299760
SRR6671824
SRR6671825
SRR6671826
SRR6671827
SRR6671828
SRR6671829
SRR6671830
SRR6671831
SRR6671832
SRR6671833
SRR6671846
SRR6671847
SRR6671848
SRR6671849
SRR6671850
SRR6671851
SRR6671852
SRR6

In [20]:
# go back to other and make a list of SRR numbers not in uniqSRR, then write to a file
with open("./srr_remaining_02-May-2023.txt","w+") as outfile:
    for i in list(other["SRA_number"].unique()):
        if i not in uniqSRR:
            outfile.write(i+"\n")