Purpose: To make sample tables to input to Jeremy's RNA-seq processing pipeline (https://github.com/pardojer23/RNAseqV2) for the core stress transcriptomic response project.<br>
Author: Anna Pardo<br>
Date initiated: March 24, 2023

In [1]:
# import modules
import pandas as pd
import numpy as np

In [2]:
# Pipeline constraints: a single run can only take a batch of 50-100 samples,
# and every read file has to sit in the same directory.
## There are 126 samples to begin with; the plan is 2 batches of 63 samples.

# read the headerless, tab-separated sample list and name its first column
srr = pd.read_csv("heat_salt_B73.txt", sep="\t", header=None).rename(
    columns={0: "SRA_number"}
)
srr.head()

Unnamed: 0,SRA_number
0,SRR2106180
1,SRR2106182
2,SRR2106184
3,SRR2106186
4,SRR2106196


In [3]:
# On Bob's suggestion, the SRR number is kept as the sample name for now; the
## real metadata will be filled back in later (this is how he has been doing
## things lately).

# Directory prefix for the read files. The FASTQ paths below are built by plain
# string concatenation, so the trailing "/" is required.
path_to_dir = "/mnt/scratch/haberan2/Core_Stress_Response/00_data/00_B73_Heat_Salt/"

In [4]:
# read only the two metadata columns used below: SRA number and library layout
md = pd.read_csv(
    "srr_numbers_with_metadata_27-Mar-2023.tsv",
    sep="\t",
    usecols=["SRA_number", "Library_layout"],
)
md.head()

Unnamed: 0,SRA_number,Library_layout
0,SRR5344453,PAIRED
1,SRR5344560,PAIRED
2,SRR5344568,PAIRED
3,SRR5344570,PAIRED
4,SRR5344571,PAIRED


In [5]:
# keep only the metadata rows whose SRA number appears in the sample list
# (inner join on the column(s) the two frames have in common)
mdsub = pd.merge(md, srr, how="inner")
mdsub.head()

Unnamed: 0,SRA_number,Library_layout
0,SRR15241117,PAIRED
1,SRR15241201,PAIRED
2,SRR15241212,PAIRED
3,SRR15241285,PAIRED
4,SRR15241116,PAIRED


In [6]:
# Build the Read1 / Read2 file-path lists, one entry per row of mdsub.
# Columns are accessed by name rather than by position: read_csv(usecols=[...])
# returns columns in file order, so positional indexing would silently depend
# on how the metadata file happens to be laid out.
r1 = []
r2 = []
for sra, layout in zip(mdsub["SRA_number"], mdsub["Library_layout"]):
    if layout == "SINGLE":
        # single-end run: one FASTQ file and no mate file
        r1.append(path_to_dir + sra + ".fastq")
        r2.append(np.nan)
    else:
        # any other layout is treated as paired-end: <SRR>_1.fastq / <SRR>_2.fastq
        r1.append(path_to_dir + sra + "_1.fastq")
        r2.append(path_to_dir + sra + "_2.fastq")

In [7]:
# Placeholder (all-NaN) lists for the sample-table columns that are not filled
# in yet. Their length is taken from mdsub instead of a hard-coded 126 so they
# always match the Read1/Read2 lists; a mismatch would make the DataFrame
# construction fail.
n_samples = len(mdsub.index)
r = [np.nan] * n_samples    # Replicate
tis = [np.nan] * n_samples  # Tissue
tim = [np.nan] * n_samples  # Time
d = [np.nan] * n_samples    # Date
con = [np.nan] * n_samples  # not used for the Condition column, which is filled from the SRA numbers
col = [np.nan] * n_samples  # Collector
l = [np.nan] * n_samples    # Location
p = [np.nan] * n_samples    # Platform
gt = [np.nan] * n_samples   # Genotype

In [8]:
# assemble the sample table; the dict order sets the column order.
# The Condition column carries the SRA number of each sample.
samptable = pd.DataFrame(
    {
        "Read1": r1,
        "Read2": r2,
        "Replicate": r,
        "Tissue": tis,
        "Time": tim,
        "Date": d,
        "Genotype": gt,
        "Condition": mdsub["SRA_number"].tolist(),
        "Collector": col,
        "Location": l,
        "Platform": p,
    }
)

In [9]:
# look at the last five rows of the assembled table
samptable.tail(n=5)

Unnamed: 0,Read1,Read2,Replicate,Tissue,Time,Date,Genotype,Condition,Collector,Location,Platform
121,/mnt/scratch/haberan2/Core_Stress_Response/00_...,,,,,,,SRR2106186,,,
122,/mnt/scratch/haberan2/Core_Stress_Response/00_...,,,,,,,SRR2106196,,,
123,/mnt/scratch/haberan2/Core_Stress_Response/00_...,,,,,,,SRR2106198,,,
124,/mnt/scratch/haberan2/Core_Stress_Response/00_...,,,,,,,SRR2106200,,,
125,/mnt/scratch/haberan2/Core_Stress_Response/00_...,,,,,,,SRR2106202,,,


In [10]:
# write the full sample table as a comma-separated file with a header row,
# leaving out the row index
samptable.to_csv("B73HS_SampTable_27-Mar-2023.csv", index=False)

Work on March 28th: I forgot to split the table in two, which the pipeline needs in order to run. Here I split it into two tables of 63 samples each.

In [11]:
# the first 63 rows go to one table, all remaining rows to the other
st1, st2 = samptable.iloc[:63], samptable.iloc[63:]

In [12]:
# write each half to its own CSV (header row kept, row index dropped)
st1.to_csv("B73HS_SampTable1_28-Mar-2023.csv", index=False)
st2.to_csv("B73HS_SampTable2_28-Mar-2023.csv", index=False)

In [13]:
# cut the full table into three pieces: rows 0-49, rows 50-99, and whatever is left
t1, t2, t3 = (
    samptable.iloc[rows]
    for rows in (slice(None, 50), slice(50, 100), slice(100, None))
)

In [14]:
# write the three pieces to CSV (header row kept, row index dropped).
# NOTE: the SampTable1 / SampTable2 filenames are the same ones used for the
# 63-row tables st1 / st2, so those files get overwritten here.
for table, filename in (
    (t1, "B73HS_SampTable1_28-Mar-2023.csv"),
    (t2, "B73HS_SampTable2_28-Mar-2023.csv"),
    (t3, "B73HS_SampTable3_28-Mar-2023.csv"),
):
    table.to_csv(filename, index=False)

In [15]:
# break the first 100 rows (the ranges covered by t1 and t2) into four
# consecutive batches of 25 rows, sliced straight from the full table
t1a, t1b, t2a, t2b = (
    samptable.iloc[start:start + 25] for start in range(0, 100, 25)
)

In [16]:
# write the two 25-row batches taken from rows 0-49 (header row kept, row index dropped)
for table, filename in (
    (t1a, "B73HS_SampTable1A_28-Mar-2023.csv"),
    (t1b, "B73HS_SampTable1B_28-Mar-2023.csv"),
):
    table.to_csv(filename, index=False)

In [17]:
# write the two 25-row batches taken from rows 50-99 (header row kept, row index dropped)
for table, filename in (
    (t2a, "B73HS_SampTable2A_28-Mar-2023.csv"),
    (t2b, "B73HS_SampTable2B_28-Mar-2023.csv"),
):
    table.to_csv(filename, index=False)