# Metadata Generation

This notebook illustrates how the metadata stored in the `data` object of the Snakefile is generated.
Each cell adds one column to `data`, following the instructions and conditions.

In [41]:
import os
import  pandas as pd
import numpy as np
import glob

In [42]:
## Load the tab-separated sample table; the cells below add columns to it
data = pd.read_csv(
    "config/table_siNipbl_cluster.txt",
    sep="\t",
)

In [43]:
## Bowtie2 index prefix for each genome build.
## The "-" key maps to an empty prefix (presumably the placeholder for "no genome" -- confirm).
genome_path = dict([
    ("mm9", "/storage/scratch01/users/dgimenezl/genomes/mouse/mm9/mm9"),
    ("mm10", "/storage/scratch01/users/dgimenezl/genomes/mouse/mm10/mm10"),
    ("hg19", "/storage/scratch01/users/dgimenezl/genomes/human/hg19/hg19"),
    ("hg38", "/storage/scratch01/users/dgimenezl/genomes/human/hg38/hg38"),
    ("-", ""),
])

## RefSeq curated genes BED file for each genome build.
## Only hg19 has a file configured; every other build maps to an empty string.
refSeq_genes_path = dict.fromkeys(["mm9", "mm10", "hg19", "hg38"], "")
refSeq_genes_path["hg19"] = "/storage/scratch01/users/aquevedo/genomes/human/hg19/hg19_RefSeqCuratedGenes.bed"

## Genome sizes for bigWig computation, keyed by genome build
genome_size = dict(
    mm9=2620345972,
    mm10=2652783500,
    hg19=2864785220,
    hg38=2913022398,
)

In [44]:
## Add extra columns for selecting the appropriate wildcard paths to files.
## "Samples" is the sample identifier "<Protein>_<Condition>_<Rep>".
data["Samples"] = (
    data["Protein"]
    + "_"
    + data["Condition"]
    + "_"
    + data["Rep"]
)
data

Unnamed: 0,Protein,Condition,Rep,Ext,Run,File,Genome,Norm,Input,Samples
0,input,siC,S9,L001_R1_001.fastq.gz,,input-SiC_S9_L001_R1_001.fastq.gz,hg19,mm9,,input_siC_S9
1,input,siC,S9,L002_R1_001.fastq.gz,,input-SiC_S9_L002_R1_001.fastq.gz,hg19,mm9,,input_siC_S9
2,input,siC,S9,L003_R1_001.fastq.gz,,input-SiC_S9_L003_R1_001.fastq.gz,hg19,mm9,,input_siC_S9
3,input,siC,S9,L004_R1_001.fastq.gz,,input-SiC_S9_L004_R1_001.fastq.gz,hg19,mm9,,input_siC_S9
4,input,siNipbl,S10,L001_R1_001.fastq.gz,,input-SiNipbl_S10_L001_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10
5,input,siNipbl,S10,L002_R1_001.fastq.gz,,input-SiNipbl_S10_L002_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10
6,input,siNipbl,S10,L003_R1_001.fastq.gz,,input-SiNipbl_S10_L003_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10
7,input,siNipbl,S10,L004_R1_001.fastq.gz,,input-SiNipbl_S10_L004_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10
8,NiPBL,siC,S4,L001_R1_001.fastq.gz,,Sic-NiPBL_S4_L001_R1_001.fastq.gz,hg19,mm9,,NiPBL_siC_S4
9,NiPBL,siC,S4,L002_R1_001.fastq.gz,,Sic-NiPBL_S4_L002_R1_001.fastq.gz,hg19,mm9,,NiPBL_siC_S4


In [45]:
## Pair every non-input sample with the input sample of the same Condition.
## Rows whose Protein is "input" get an empty string.
## The Condition -> input sample lookup is built once (the first matching input
## row wins) instead of re-filtering `data` for every row.
input_by_condition = (
    data.loc[data["Protein"] == "input", ["Condition", "Samples"]]
    .drop_duplicates(subset="Condition", keep="first")
    .set_index("Condition")["Samples"]
    .to_dict()
)

## Fail early and name the offending conditions when a sample has no matching
## input, rather than failing with a bare IndexError.
conditions_without_input = sorted(
    {
        str(cond)
        for prot, cond in zip(data["Protein"], data["Condition"])
        if prot != "input" and cond not in input_by_condition
    }
)
if conditions_without_input:
    raise ValueError(
        "No input sample found for condition(s): "
        + ", ".join(conditions_without_input)
    )

data["Input"] = [
    "" if prot == "input" else input_by_condition[cond]
    for prot, cond in zip(data["Protein"], data["Condition"])
]
data

Unnamed: 0,Protein,Condition,Rep,Ext,Run,File,Genome,Norm,Input,Samples
0,input,siC,S9,L001_R1_001.fastq.gz,,input-SiC_S9_L001_R1_001.fastq.gz,hg19,mm9,,input_siC_S9
1,input,siC,S9,L002_R1_001.fastq.gz,,input-SiC_S9_L002_R1_001.fastq.gz,hg19,mm9,,input_siC_S9
2,input,siC,S9,L003_R1_001.fastq.gz,,input-SiC_S9_L003_R1_001.fastq.gz,hg19,mm9,,input_siC_S9
3,input,siC,S9,L004_R1_001.fastq.gz,,input-SiC_S9_L004_R1_001.fastq.gz,hg19,mm9,,input_siC_S9
4,input,siNipbl,S10,L001_R1_001.fastq.gz,,input-SiNipbl_S10_L001_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10
5,input,siNipbl,S10,L002_R1_001.fastq.gz,,input-SiNipbl_S10_L002_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10
6,input,siNipbl,S10,L003_R1_001.fastq.gz,,input-SiNipbl_S10_L003_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10
7,input,siNipbl,S10,L004_R1_001.fastq.gz,,input-SiNipbl_S10_L004_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10
8,NiPBL,siC,S4,L001_R1_001.fastq.gz,,Sic-NiPBL_S4_L001_R1_001.fastq.gz,hg19,mm9,input_siC_S9,NiPBL_siC_S4
9,NiPBL,siC,S4,L002_R1_001.fastq.gz,,Sic-NiPBL_S4_L002_R1_001.fastq.gz,hg19,mm9,input_siC_S9,NiPBL_siC_S4


In [46]:
## Look up the bowtie2 index prefix of each row's Genome build
data["PATH_genome"] = list(map(genome_path.__getitem__, data["Genome"]))
data

Unnamed: 0,Protein,Condition,Rep,Ext,Run,File,Genome,Norm,Input,Samples,PATH_genome
0,input,siC,S9,L001_R1_001.fastq.gz,,input-SiC_S9_L001_R1_001.fastq.gz,hg19,mm9,,input_siC_S9,/storage/scratch01/users/dgimenezl/genomes/hum...
1,input,siC,S9,L002_R1_001.fastq.gz,,input-SiC_S9_L002_R1_001.fastq.gz,hg19,mm9,,input_siC_S9,/storage/scratch01/users/dgimenezl/genomes/hum...
2,input,siC,S9,L003_R1_001.fastq.gz,,input-SiC_S9_L003_R1_001.fastq.gz,hg19,mm9,,input_siC_S9,/storage/scratch01/users/dgimenezl/genomes/hum...
3,input,siC,S9,L004_R1_001.fastq.gz,,input-SiC_S9_L004_R1_001.fastq.gz,hg19,mm9,,input_siC_S9,/storage/scratch01/users/dgimenezl/genomes/hum...
4,input,siNipbl,S10,L001_R1_001.fastq.gz,,input-SiNipbl_S10_L001_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10,/storage/scratch01/users/dgimenezl/genomes/hum...
5,input,siNipbl,S10,L002_R1_001.fastq.gz,,input-SiNipbl_S10_L002_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10,/storage/scratch01/users/dgimenezl/genomes/hum...
6,input,siNipbl,S10,L003_R1_001.fastq.gz,,input-SiNipbl_S10_L003_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10,/storage/scratch01/users/dgimenezl/genomes/hum...
7,input,siNipbl,S10,L004_R1_001.fastq.gz,,input-SiNipbl_S10_L004_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10,/storage/scratch01/users/dgimenezl/genomes/hum...
8,NiPBL,siC,S4,L001_R1_001.fastq.gz,,Sic-NiPBL_S4_L001_R1_001.fastq.gz,hg19,mm9,input_siC_S9,NiPBL_siC_S4,/storage/scratch01/users/dgimenezl/genomes/hum...
9,NiPBL,siC,S4,L002_R1_001.fastq.gz,,Sic-NiPBL_S4_L002_R1_001.fastq.gz,hg19,mm9,input_siC_S9,NiPBL_siC_S4,/storage/scratch01/users/dgimenezl/genomes/hum...


In [47]:
## Look up the genome size of each row's Genome build
data["Genome_size"] = list(map(genome_size.__getitem__, data["Genome"]))
data

Unnamed: 0,Protein,Condition,Rep,Ext,Run,File,Genome,Norm,Input,Samples,PATH_genome,Genome_size
0,input,siC,S9,L001_R1_001.fastq.gz,,input-SiC_S9_L001_R1_001.fastq.gz,hg19,mm9,,input_siC_S9,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220
1,input,siC,S9,L002_R1_001.fastq.gz,,input-SiC_S9_L002_R1_001.fastq.gz,hg19,mm9,,input_siC_S9,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220
2,input,siC,S9,L003_R1_001.fastq.gz,,input-SiC_S9_L003_R1_001.fastq.gz,hg19,mm9,,input_siC_S9,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220
3,input,siC,S9,L004_R1_001.fastq.gz,,input-SiC_S9_L004_R1_001.fastq.gz,hg19,mm9,,input_siC_S9,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220
4,input,siNipbl,S10,L001_R1_001.fastq.gz,,input-SiNipbl_S10_L001_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220
5,input,siNipbl,S10,L002_R1_001.fastq.gz,,input-SiNipbl_S10_L002_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220
6,input,siNipbl,S10,L003_R1_001.fastq.gz,,input-SiNipbl_S10_L003_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220
7,input,siNipbl,S10,L004_R1_001.fastq.gz,,input-SiNipbl_S10_L004_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220
8,NiPBL,siC,S4,L001_R1_001.fastq.gz,,Sic-NiPBL_S4_L001_R1_001.fastq.gz,hg19,mm9,input_siC_S9,NiPBL_siC_S4,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220
9,NiPBL,siC,S4,L002_R1_001.fastq.gz,,Sic-NiPBL_S4_L002_R1_001.fastq.gz,hg19,mm9,input_siC_S9,NiPBL_siC_S4,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220


In [48]:
## Look up the bowtie2 index prefix of the genome named in each row's Norm column
data["PATH_genome_cal"] = list(map(genome_path.__getitem__, data["Norm"]))
data

Unnamed: 0,Protein,Condition,Rep,Ext,Run,File,Genome,Norm,Input,Samples,PATH_genome,Genome_size,PATH_genome_cal
0,input,siC,S9,L001_R1_001.fastq.gz,,input-SiC_S9_L001_R1_001.fastq.gz,hg19,mm9,,input_siC_S9,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...
1,input,siC,S9,L002_R1_001.fastq.gz,,input-SiC_S9_L002_R1_001.fastq.gz,hg19,mm9,,input_siC_S9,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...
2,input,siC,S9,L003_R1_001.fastq.gz,,input-SiC_S9_L003_R1_001.fastq.gz,hg19,mm9,,input_siC_S9,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...
3,input,siC,S9,L004_R1_001.fastq.gz,,input-SiC_S9_L004_R1_001.fastq.gz,hg19,mm9,,input_siC_S9,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...
4,input,siNipbl,S10,L001_R1_001.fastq.gz,,input-SiNipbl_S10_L001_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...
5,input,siNipbl,S10,L002_R1_001.fastq.gz,,input-SiNipbl_S10_L002_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...
6,input,siNipbl,S10,L003_R1_001.fastq.gz,,input-SiNipbl_S10_L003_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...
7,input,siNipbl,S10,L004_R1_001.fastq.gz,,input-SiNipbl_S10_L004_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...
8,NiPBL,siC,S4,L001_R1_001.fastq.gz,,Sic-NiPBL_S4_L001_R1_001.fastq.gz,hg19,mm9,input_siC_S9,NiPBL_siC_S4,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...
9,NiPBL,siC,S4,L002_R1_001.fastq.gz,,Sic-NiPBL_S4_L002_R1_001.fastq.gz,hg19,mm9,input_siC_S9,NiPBL_siC_S4,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...


In [49]:
## Look up the RefSeq genes file path of each row's Genome build
data["PATH_refSeq_genes"] = list(map(refSeq_genes_path.__getitem__, data["Genome"]))
data

Unnamed: 0,Protein,Condition,Rep,Ext,Run,File,Genome,Norm,Input,Samples,PATH_genome,Genome_size,PATH_genome_cal,PATH_refSeq_genes
0,input,siC,S9,L001_R1_001.fastq.gz,,input-SiC_S9_L001_R1_001.fastq.gz,hg19,mm9,,input_siC_S9,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...,/storage/scratch01/users/aquevedo/genomes/huma...
1,input,siC,S9,L002_R1_001.fastq.gz,,input-SiC_S9_L002_R1_001.fastq.gz,hg19,mm9,,input_siC_S9,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...,/storage/scratch01/users/aquevedo/genomes/huma...
2,input,siC,S9,L003_R1_001.fastq.gz,,input-SiC_S9_L003_R1_001.fastq.gz,hg19,mm9,,input_siC_S9,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...,/storage/scratch01/users/aquevedo/genomes/huma...
3,input,siC,S9,L004_R1_001.fastq.gz,,input-SiC_S9_L004_R1_001.fastq.gz,hg19,mm9,,input_siC_S9,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...,/storage/scratch01/users/aquevedo/genomes/huma...
4,input,siNipbl,S10,L001_R1_001.fastq.gz,,input-SiNipbl_S10_L001_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...,/storage/scratch01/users/aquevedo/genomes/huma...
5,input,siNipbl,S10,L002_R1_001.fastq.gz,,input-SiNipbl_S10_L002_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...,/storage/scratch01/users/aquevedo/genomes/huma...
6,input,siNipbl,S10,L003_R1_001.fastq.gz,,input-SiNipbl_S10_L003_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...,/storage/scratch01/users/aquevedo/genomes/huma...
7,input,siNipbl,S10,L004_R1_001.fastq.gz,,input-SiNipbl_S10_L004_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...,/storage/scratch01/users/aquevedo/genomes/huma...
8,NiPBL,siC,S4,L001_R1_001.fastq.gz,,Sic-NiPBL_S4_L001_R1_001.fastq.gz,hg19,mm9,input_siC_S9,NiPBL_siC_S4,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...,/storage/scratch01/users/aquevedo/genomes/huma...
9,NiPBL,siC,S4,L002_R1_001.fastq.gz,,Sic-NiPBL_S4_L002_R1_001.fastq.gz,hg19,mm9,input_siC_S9,NiPBL_siC_S4,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...,/storage/scratch01/users/aquevedo/genomes/huma...


In [50]:
## Remove the ".fastq.gz" extension to use the basename with expand() in rule "all".
## Only a trailing extension is stripped, so a ".fastq.gz" that occurs elsewhere
## in a file name is left untouched; names without the extension are kept as is.
data["fqBasename"] = [
    f[: -len(".fastq.gz")] if f.endswith(".fastq.gz") else f
    for f in data["File"]
]
data

Unnamed: 0,Protein,Condition,Rep,Ext,Run,File,Genome,Norm,Input,Samples,PATH_genome,Genome_size,PATH_genome_cal,PATH_refSeq_genes,fqBasename
0,input,siC,S9,L001_R1_001.fastq.gz,,input-SiC_S9_L001_R1_001.fastq.gz,hg19,mm9,,input_siC_S9,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...,/storage/scratch01/users/aquevedo/genomes/huma...,input-SiC_S9_L001_R1_001
1,input,siC,S9,L002_R1_001.fastq.gz,,input-SiC_S9_L002_R1_001.fastq.gz,hg19,mm9,,input_siC_S9,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...,/storage/scratch01/users/aquevedo/genomes/huma...,input-SiC_S9_L002_R1_001
2,input,siC,S9,L003_R1_001.fastq.gz,,input-SiC_S9_L003_R1_001.fastq.gz,hg19,mm9,,input_siC_S9,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...,/storage/scratch01/users/aquevedo/genomes/huma...,input-SiC_S9_L003_R1_001
3,input,siC,S9,L004_R1_001.fastq.gz,,input-SiC_S9_L004_R1_001.fastq.gz,hg19,mm9,,input_siC_S9,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...,/storage/scratch01/users/aquevedo/genomes/huma...,input-SiC_S9_L004_R1_001
4,input,siNipbl,S10,L001_R1_001.fastq.gz,,input-SiNipbl_S10_L001_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...,/storage/scratch01/users/aquevedo/genomes/huma...,input-SiNipbl_S10_L001_R1_001
5,input,siNipbl,S10,L002_R1_001.fastq.gz,,input-SiNipbl_S10_L002_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...,/storage/scratch01/users/aquevedo/genomes/huma...,input-SiNipbl_S10_L002_R1_001
6,input,siNipbl,S10,L003_R1_001.fastq.gz,,input-SiNipbl_S10_L003_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...,/storage/scratch01/users/aquevedo/genomes/huma...,input-SiNipbl_S10_L003_R1_001
7,input,siNipbl,S10,L004_R1_001.fastq.gz,,input-SiNipbl_S10_L004_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...,/storage/scratch01/users/aquevedo/genomes/huma...,input-SiNipbl_S10_L004_R1_001
8,NiPBL,siC,S4,L001_R1_001.fastq.gz,,Sic-NiPBL_S4_L001_R1_001.fastq.gz,hg19,mm9,input_siC_S9,NiPBL_siC_S4,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...,/storage/scratch01/users/aquevedo/genomes/huma...,Sic-NiPBL_S4_L001_R1_001
9,NiPBL,siC,S4,L002_R1_001.fastq.gz,,Sic-NiPBL_S4_L002_R1_001.fastq.gz,hg19,mm9,input_siC_S9,NiPBL_siC_S4,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...,/storage/scratch01/users/aquevedo/genomes/huma...,Sic-NiPBL_S4_L002_R1_001
