# Metadata Generation

This notebook illustrates the process of generating metadata stored in `data` object in the Snakefile.
Each cell adds one column to `data` following the instructions and conditions

In [12]:
import os
import  pandas as pd
import numpy as np
import glob
from snakemake.io import expand

In [13]:
data = pd.read_csv("../config/table_siNipbl_cluster.tsv", sep="\t")

In [14]:
## Genome bowtie2 index prefixes paths
genome_path = {
	"mm9":"/storage/scratch01/users/dgimenezl/genomes/mouse/mm9/mm9",
    "mm10":"/storage/scratch01/users/dgimenezl/genomes/mouse/mm10/mm10" ,
    "hg19":"/storage/scratch01/users/dgimenezl/genomes/human/hg19/hg19",
    "hg38":"/storage/scratch01/users/dgimenezl/genomes/human/hg38/hg38",
    "-":""}
refSeq_genes_path = {
	"mm9" : "",
	"mm10" : "",
	"hg19" : "/storage/scratch01/users/aquevedo/genomes/human/hg19/hg19_RefSeqCuratedGenes.bed",
	"hg38" : ""
}
## Genome sizes for big wig computation
genome_size={"mm9":2620345972,
    "mm10":2652783500,
    "hg19":2864785220,
    "hg38":2913022398}

In [15]:
## Add extra cols for salecting the appropriate wildcards path to files
data["Samples"] = data.Protein +"_"+data.Condition+"_"+ data.Rep 
data

Unnamed: 0,Protein,Condition,Rep,Ext,Run,File,Genome,Norm,Input,Samples
0,input,siC,S9,L001_R1_001.fastq.gz,,input-SiC_S9_L001_R1_001.fastq.gz,hg19,mm9,,input_siC_S9
1,input,siC,S9,L002_R1_001.fastq.gz,,input-SiC_S9_L002_R1_001.fastq.gz,hg19,mm9,,input_siC_S9
2,input,siC,S9,L003_R1_001.fastq.gz,,input-SiC_S9_L003_R1_001.fastq.gz,hg19,mm9,,input_siC_S9
3,input,siC,S9,L004_R1_001.fastq.gz,,input-SiC_S9_L004_R1_001.fastq.gz,hg19,mm9,,input_siC_S9
4,input,siNipbl,S10,L001_R1_001.fastq.gz,,input-SiNipbl_S10_L001_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10
5,input,siNipbl,S10,L002_R1_001.fastq.gz,,input-SiNipbl_S10_L002_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10
6,input,siNipbl,S10,L003_R1_001.fastq.gz,,input-SiNipbl_S10_L003_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10
7,input,siNipbl,S10,L004_R1_001.fastq.gz,,input-SiNipbl_S10_L004_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10
8,NiPBL,siC,S4,L001_R1_001.fastq.gz,,Sic-NiPBL_S4_L001_R1_001.fastq.gz,hg19,mm9,,NiPBL_siC_S4
9,NiPBL,siC,S4,L002_R1_001.fastq.gz,,Sic-NiPBL_S4_L002_R1_001.fastq.gz,hg19,mm9,,NiPBL_siC_S4


In [16]:
data["Input"] = [ data.Samples[(data.Protein=="input") & (data.Condition==Cond)].values[0] \
                 if Prot != "input" \
                 else "" \
                 for Prot,Cond in zip(data.Protein,data.Condition)  ]
data

Unnamed: 0,Protein,Condition,Rep,Ext,Run,File,Genome,Norm,Input,Samples
0,input,siC,S9,L001_R1_001.fastq.gz,,input-SiC_S9_L001_R1_001.fastq.gz,hg19,mm9,,input_siC_S9
1,input,siC,S9,L002_R1_001.fastq.gz,,input-SiC_S9_L002_R1_001.fastq.gz,hg19,mm9,,input_siC_S9
2,input,siC,S9,L003_R1_001.fastq.gz,,input-SiC_S9_L003_R1_001.fastq.gz,hg19,mm9,,input_siC_S9
3,input,siC,S9,L004_R1_001.fastq.gz,,input-SiC_S9_L004_R1_001.fastq.gz,hg19,mm9,,input_siC_S9
4,input,siNipbl,S10,L001_R1_001.fastq.gz,,input-SiNipbl_S10_L001_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10
5,input,siNipbl,S10,L002_R1_001.fastq.gz,,input-SiNipbl_S10_L002_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10
6,input,siNipbl,S10,L003_R1_001.fastq.gz,,input-SiNipbl_S10_L003_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10
7,input,siNipbl,S10,L004_R1_001.fastq.gz,,input-SiNipbl_S10_L004_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10
8,NiPBL,siC,S4,L001_R1_001.fastq.gz,,Sic-NiPBL_S4_L001_R1_001.fastq.gz,hg19,mm9,input_siC_S9,NiPBL_siC_S4
9,NiPBL,siC,S4,L002_R1_001.fastq.gz,,Sic-NiPBL_S4_L002_R1_001.fastq.gz,hg19,mm9,input_siC_S9,NiPBL_siC_S4


In [17]:
data["PATH_genome"] = [genome_path[i] for i in data.Genome] 
data

Unnamed: 0,Protein,Condition,Rep,Ext,Run,File,Genome,Norm,Input,Samples,PATH_genome
0,input,siC,S9,L001_R1_001.fastq.gz,,input-SiC_S9_L001_R1_001.fastq.gz,hg19,mm9,,input_siC_S9,/storage/scratch01/users/dgimenezl/genomes/hum...
1,input,siC,S9,L002_R1_001.fastq.gz,,input-SiC_S9_L002_R1_001.fastq.gz,hg19,mm9,,input_siC_S9,/storage/scratch01/users/dgimenezl/genomes/hum...
2,input,siC,S9,L003_R1_001.fastq.gz,,input-SiC_S9_L003_R1_001.fastq.gz,hg19,mm9,,input_siC_S9,/storage/scratch01/users/dgimenezl/genomes/hum...
3,input,siC,S9,L004_R1_001.fastq.gz,,input-SiC_S9_L004_R1_001.fastq.gz,hg19,mm9,,input_siC_S9,/storage/scratch01/users/dgimenezl/genomes/hum...
4,input,siNipbl,S10,L001_R1_001.fastq.gz,,input-SiNipbl_S10_L001_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10,/storage/scratch01/users/dgimenezl/genomes/hum...
5,input,siNipbl,S10,L002_R1_001.fastq.gz,,input-SiNipbl_S10_L002_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10,/storage/scratch01/users/dgimenezl/genomes/hum...
6,input,siNipbl,S10,L003_R1_001.fastq.gz,,input-SiNipbl_S10_L003_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10,/storage/scratch01/users/dgimenezl/genomes/hum...
7,input,siNipbl,S10,L004_R1_001.fastq.gz,,input-SiNipbl_S10_L004_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10,/storage/scratch01/users/dgimenezl/genomes/hum...
8,NiPBL,siC,S4,L001_R1_001.fastq.gz,,Sic-NiPBL_S4_L001_R1_001.fastq.gz,hg19,mm9,input_siC_S9,NiPBL_siC_S4,/storage/scratch01/users/dgimenezl/genomes/hum...
9,NiPBL,siC,S4,L002_R1_001.fastq.gz,,Sic-NiPBL_S4_L002_R1_001.fastq.gz,hg19,mm9,input_siC_S9,NiPBL_siC_S4,/storage/scratch01/users/dgimenezl/genomes/hum...


In [18]:
data["Genome_size"] = [genome_size[i] for i in data.Genome]
data[1:12:2].Condition

1         siC
3         siC
5     siNipbl
7     siNipbl
9         siC
11        siC
Name: Condition, dtype: object

In [19]:
data["PATH_genome_cal"] = [genome_path[i] for i in data.Norm]
data

Unnamed: 0,Protein,Condition,Rep,Ext,Run,File,Genome,Norm,Input,Samples,PATH_genome,Genome_size,PATH_genome_cal
0,input,siC,S9,L001_R1_001.fastq.gz,,input-SiC_S9_L001_R1_001.fastq.gz,hg19,mm9,,input_siC_S9,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...
1,input,siC,S9,L002_R1_001.fastq.gz,,input-SiC_S9_L002_R1_001.fastq.gz,hg19,mm9,,input_siC_S9,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...
2,input,siC,S9,L003_R1_001.fastq.gz,,input-SiC_S9_L003_R1_001.fastq.gz,hg19,mm9,,input_siC_S9,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...
3,input,siC,S9,L004_R1_001.fastq.gz,,input-SiC_S9_L004_R1_001.fastq.gz,hg19,mm9,,input_siC_S9,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...
4,input,siNipbl,S10,L001_R1_001.fastq.gz,,input-SiNipbl_S10_L001_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...
5,input,siNipbl,S10,L002_R1_001.fastq.gz,,input-SiNipbl_S10_L002_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...
6,input,siNipbl,S10,L003_R1_001.fastq.gz,,input-SiNipbl_S10_L003_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...
7,input,siNipbl,S10,L004_R1_001.fastq.gz,,input-SiNipbl_S10_L004_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...
8,NiPBL,siC,S4,L001_R1_001.fastq.gz,,Sic-NiPBL_S4_L001_R1_001.fastq.gz,hg19,mm9,input_siC_S9,NiPBL_siC_S4,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...
9,NiPBL,siC,S4,L002_R1_001.fastq.gz,,Sic-NiPBL_S4_L002_R1_001.fastq.gz,hg19,mm9,input_siC_S9,NiPBL_siC_S4,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...


In [20]:
data["PATH_refSeq_genes"] = [refSeq_genes_path[i] for i in data.Genome]
data

Unnamed: 0,Protein,Condition,Rep,Ext,Run,File,Genome,Norm,Input,Samples,PATH_genome,Genome_size,PATH_genome_cal,PATH_refSeq_genes
0,input,siC,S9,L001_R1_001.fastq.gz,,input-SiC_S9_L001_R1_001.fastq.gz,hg19,mm9,,input_siC_S9,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...,/storage/scratch01/users/aquevedo/genomes/huma...
1,input,siC,S9,L002_R1_001.fastq.gz,,input-SiC_S9_L002_R1_001.fastq.gz,hg19,mm9,,input_siC_S9,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...,/storage/scratch01/users/aquevedo/genomes/huma...
2,input,siC,S9,L003_R1_001.fastq.gz,,input-SiC_S9_L003_R1_001.fastq.gz,hg19,mm9,,input_siC_S9,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...,/storage/scratch01/users/aquevedo/genomes/huma...
3,input,siC,S9,L004_R1_001.fastq.gz,,input-SiC_S9_L004_R1_001.fastq.gz,hg19,mm9,,input_siC_S9,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...,/storage/scratch01/users/aquevedo/genomes/huma...
4,input,siNipbl,S10,L001_R1_001.fastq.gz,,input-SiNipbl_S10_L001_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...,/storage/scratch01/users/aquevedo/genomes/huma...
5,input,siNipbl,S10,L002_R1_001.fastq.gz,,input-SiNipbl_S10_L002_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...,/storage/scratch01/users/aquevedo/genomes/huma...
6,input,siNipbl,S10,L003_R1_001.fastq.gz,,input-SiNipbl_S10_L003_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...,/storage/scratch01/users/aquevedo/genomes/huma...
7,input,siNipbl,S10,L004_R1_001.fastq.gz,,input-SiNipbl_S10_L004_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...,/storage/scratch01/users/aquevedo/genomes/huma...
8,NiPBL,siC,S4,L001_R1_001.fastq.gz,,Sic-NiPBL_S4_L001_R1_001.fastq.gz,hg19,mm9,input_siC_S9,NiPBL_siC_S4,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...,/storage/scratch01/users/aquevedo/genomes/huma...
9,NiPBL,siC,S4,L002_R1_001.fastq.gz,,Sic-NiPBL_S4_L002_R1_001.fastq.gz,hg19,mm9,input_siC_S9,NiPBL_siC_S4,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...,/storage/scratch01/users/aquevedo/genomes/huma...


In [21]:
## Remove .fastq.gz to use basename with expand() in rule "all"
data["fqBasename"] = [f.replace(".fastq.gz","") for f in data["File"]]
data

Unnamed: 0,Protein,Condition,Rep,Ext,Run,File,Genome,Norm,Input,Samples,PATH_genome,Genome_size,PATH_genome_cal,PATH_refSeq_genes,fqBasename
0,input,siC,S9,L001_R1_001.fastq.gz,,input-SiC_S9_L001_R1_001.fastq.gz,hg19,mm9,,input_siC_S9,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...,/storage/scratch01/users/aquevedo/genomes/huma...,input-SiC_S9_L001_R1_001
1,input,siC,S9,L002_R1_001.fastq.gz,,input-SiC_S9_L002_R1_001.fastq.gz,hg19,mm9,,input_siC_S9,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...,/storage/scratch01/users/aquevedo/genomes/huma...,input-SiC_S9_L002_R1_001
2,input,siC,S9,L003_R1_001.fastq.gz,,input-SiC_S9_L003_R1_001.fastq.gz,hg19,mm9,,input_siC_S9,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...,/storage/scratch01/users/aquevedo/genomes/huma...,input-SiC_S9_L003_R1_001
3,input,siC,S9,L004_R1_001.fastq.gz,,input-SiC_S9_L004_R1_001.fastq.gz,hg19,mm9,,input_siC_S9,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...,/storage/scratch01/users/aquevedo/genomes/huma...,input-SiC_S9_L004_R1_001
4,input,siNipbl,S10,L001_R1_001.fastq.gz,,input-SiNipbl_S10_L001_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...,/storage/scratch01/users/aquevedo/genomes/huma...,input-SiNipbl_S10_L001_R1_001
5,input,siNipbl,S10,L002_R1_001.fastq.gz,,input-SiNipbl_S10_L002_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...,/storage/scratch01/users/aquevedo/genomes/huma...,input-SiNipbl_S10_L002_R1_001
6,input,siNipbl,S10,L003_R1_001.fastq.gz,,input-SiNipbl_S10_L003_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...,/storage/scratch01/users/aquevedo/genomes/huma...,input-SiNipbl_S10_L003_R1_001
7,input,siNipbl,S10,L004_R1_001.fastq.gz,,input-SiNipbl_S10_L004_R1_001.fastq.gz,hg19,mm9,,input_siNipbl_S10,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...,/storage/scratch01/users/aquevedo/genomes/huma...,input-SiNipbl_S10_L004_R1_001
8,NiPBL,siC,S4,L001_R1_001.fastq.gz,,Sic-NiPBL_S4_L001_R1_001.fastq.gz,hg19,mm9,input_siC_S9,NiPBL_siC_S4,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...,/storage/scratch01/users/aquevedo/genomes/huma...,Sic-NiPBL_S4_L001_R1_001
9,NiPBL,siC,S4,L002_R1_001.fastq.gz,,Sic-NiPBL_S4_L002_R1_001.fastq.gz,hg19,mm9,input_siC_S9,NiPBL_siC_S4,/storage/scratch01/users/dgimenezl/genomes/hum...,2864785220,/storage/scratch01/users/dgimenezl/genomes/mou...,/storage/scratch01/users/aquevedo/genomes/huma...,Sic-NiPBL_S4_L002_R1_001


## Generate Arguments for MergeSamFiles
Demonstrate how the command string for MergeSamFiles is produced. We take advantage of the `data` table and the auxiliary function `Input_merge_bam`

In [70]:
def Input_merge_bam(*bams):
    '''
    Generate formated string for -I option for MergeSamFiles
    '''
    input_bams=["-I " + str(b) for b in bams]
    input_bams=" ".join(input_bams)
    return input_bams

In [71]:
## generate a list with .bam filenames the same way as in the Snakefile
sample=data.Samples[(data.Protein == 'SA2') & (data.Condition == 'siC')].unique()
print("sample: ", sample)
bams=[s +'_final.bam' for s in sample]
print("Input bams: ", bams)

sample:  ['SA2_siC_S11' 'SA2_siC_S2']
Input bams:  ['SA2_siC_S11_final.bam', 'SA2_siC_S2_final.bam']


In [72]:
## Apply function to the list of bams filenames
Input_merge_bam(*bams)

'-I SA2_siC_S11_final.bam -I SA2_siC_S2_final.bam'

In [86]:
## Column needed to match properly different replicates.
## Each entry is a string of (Protein, Condition) joined by "_"
data["Prot_Cond"] = ["_".join((Prot,Cond)) for Prot,Cond in zip(data.Protein,data.Condition)]

array(['input_siC', 'input_siNipbl', 'NiPBL_siC', 'SA1_siC', 'SA2_siC',
       'SMC1_siC', 'NiPBL_siNipbl', 'SA1_siNipbl', 'SA2_siNipbl',
       'SMC1_siNipbl'], dtype=object)

In [87]:
## Check that expand function works as expected
expand( "/align/{Prot_Cond}_final_merged.bam",
            Prot_Cond=data.Prot_Cond.unique())

['/align/input_siC_final_merged.bam',
 '/align/input_siNipbl_final_merged.bam',
 '/align/NiPBL_siC_final_merged.bam',
 '/align/SA1_siC_final_merged.bam',
 '/align/SA2_siC_final_merged.bam',
 '/align/SMC1_siC_final_merged.bam',
 '/align/NiPBL_siNipbl_final_merged.bam',
 '/align/SA1_siNipbl_final_merged.bam',
 '/align/SA2_siNipbl_final_merged.bam',
 '/align/SMC1_siNipbl_final_merged.bam']

In [22]:
## Esto es una prueba, quitarlo


" ".join(
    np.unique(
        expand({"/bw/{sample}_RPKM_scaled.bw"},
                       sample=list(filter(None,
                                          [ sampl \
                                    if prot != "input" \
                                    else "" \
         for (sampl,prot) in zip(data.Samples, data.Protein)]
                                 )
                          )
               )
    )
)


'/bw/NiPBL_siC_S4_RPKM_scaled.bw /bw/NiPBL_siNipbl_S8_RPKM_scaled.bw /bw/SA1_siC_S1_RPKM_scaled.bw /bw/SA1_siNipbl_S5_RPKM_scaled.bw /bw/SA2_siC_S11_RPKM_scaled.bw /bw/SA2_siC_S2_RPKM_scaled.bw /bw/SA2_siNipbl_S6_RPKM_scaled.bw /bw/SMC1_siC_S12_RPKM_scaled.bw /bw/SMC1_siC_S3_RPKM_scaled.bw /bw/SMC1_siNipbl_S7_RPKM_scaled.bw'

In [23]:
data.Samples.unique()

array(['input_siC_S9', 'input_siNipbl_S10', 'NiPBL_siC_S4', 'SA1_siC_S1',
       'SA2_siC_S11', 'SA2_siC_S2', 'SMC1_siC_S12', 'SMC1_siC_S3',
       'NiPBL_siNipbl_S8', 'SA1_siNipbl_S5', 'SA2_siNipbl_S6',
       'SMC1_siNipbl_S7'], dtype=object)

In [24]:
expand('{a}.gz', a=np.arange(0,10))

['0.gz',
 '1.gz',
 '2.gz',
 '3.gz',
 '4.gz',
 '5.gz',
 '6.gz',
 '7.gz',
 '8.gz',
 '9.gz']

## Filenames and Labels for compute_matrix

In [25]:
matrixFiles = data.Samples[data.Protein != 'input'].unique()
matrixFiles

array(['NiPBL_siC_S4', 'SA1_siC_S1', 'SA2_siC_S11', 'SA2_siC_S2',
       'SMC1_siC_S12', 'SMC1_siC_S3', 'NiPBL_siNipbl_S8',
       'SA1_siNipbl_S5', 'SA2_siNipbl_S6', 'SMC1_siNipbl_S7'],
      dtype=object)

In [26]:
import re

In [27]:
[re.sub("_S.+$","",f) for f in matrixFiles]

['NiPBL_siC',
 'SA1_siC',
 'SA2_siC',
 'SA2_siC',
 'SMC1_siC',
 'SMC1_siC',
 'NiPBL_siNipbl',
 'SA1_siNipbl',
 'SA2_siNipbl',
 'SMC1_siNipbl']

## Find samples to cluster heatmap

In [28]:
## Find where SA2 protein appears in matr
ix=np.where(np.array(["SA2" in Samp for Samp in matrixFiles ]))[0]
ix=ix+1 # first column is 1 in deeptools but np.where is 0 based
" ".join(ix.astype(str))

'3 4 9'

## Motif Enrichment

We are using homer's `findMotifsGenome.pl` script to find e riched motifs between conditions.
We must be able to 

In [31]:
PEAKSDIR = "/Users/aqo/Desktop/siNipblChipPeaks/unique/"

In [32]:
## Data frame with .narrowPeak files
naPeak = glob.glob(PEAKSDIR + "*uniquePeaks.bed")
meta=pd.DataFrame(naPeak, columns=["uniqPeakFile"])

In [33]:
## Extract basename
meta["Basename"]=[re.sub(pattern='^.+/', repl="",string=f) for f in meta.uniqPeakFile]
meta


Unnamed: 0,uniqPeakFile,Basename
0,/Users/aqo/Desktop/siNipblChipPeaks/unique/SMC...,SMC1_Common_siNipbl_uniquePeaks.bed
1,/Users/aqo/Desktop/siNipblChipPeaks/unique/SA1...,SA1_Common_siNipbl_uniquePeaks.bed
2,/Users/aqo/Desktop/siNipblChipPeaks/unique/SMC...,SMC1_treatOnly_siNipbl_uniquePeaks.bed
3,/Users/aqo/Desktop/siNipblChipPeaks/unique/SA1...,SA1_treatOnly_siNipbl_uniquePeaks.bed
4,/Users/aqo/Desktop/siNipblChipPeaks/unique/SA2...,SA2_treatOnly_siNipbl_uniquePeaks.bed
5,/Users/aqo/Desktop/siNipblChipPeaks/unique/NiP...,NiPBL_Common_siNipbl_uniquePeaks.bed
6,/Users/aqo/Desktop/siNipblChipPeaks/unique/SA1...,SA1_ContOnly_siNipbl_uniquePeaks.bed
7,/Users/aqo/Desktop/siNipblChipPeaks/unique/NiP...,NiPBL_ContOnly_siNipbl_uniquePeaks.bed
8,/Users/aqo/Desktop/siNipblChipPeaks/unique/SA2...,SA2_Common_siNipbl_uniquePeaks.bed
9,/Users/aqo/Desktop/siNipblChipPeaks/unique/SMC...,SMC1_ContOnly_siNipbl_uniquePeaks.bed


In [34]:
## EXtract Prot Condition and Rep from Basename
meta[["Protein","uniqueIn","Treatment","tmp"]] = meta["Basename"].str.split("_", expand=True)
meta=meta.drop(columns=['tmp'])

In [35]:
meta['Sample']=meta.Protein +"_"+meta.uniqueIn+"_"+ meta.Treatment
meta

Unnamed: 0,uniqPeakFile,Basename,Protein,uniqueIn,Treatment,Sample
0,/Users/aqo/Desktop/siNipblChipPeaks/unique/SMC...,SMC1_Common_siNipbl_uniquePeaks.bed,SMC1,Common,siNipbl,SMC1_Common_siNipbl
1,/Users/aqo/Desktop/siNipblChipPeaks/unique/SA1...,SA1_Common_siNipbl_uniquePeaks.bed,SA1,Common,siNipbl,SA1_Common_siNipbl
2,/Users/aqo/Desktop/siNipblChipPeaks/unique/SMC...,SMC1_treatOnly_siNipbl_uniquePeaks.bed,SMC1,treatOnly,siNipbl,SMC1_treatOnly_siNipbl
3,/Users/aqo/Desktop/siNipblChipPeaks/unique/SA1...,SA1_treatOnly_siNipbl_uniquePeaks.bed,SA1,treatOnly,siNipbl,SA1_treatOnly_siNipbl
4,/Users/aqo/Desktop/siNipblChipPeaks/unique/SA2...,SA2_treatOnly_siNipbl_uniquePeaks.bed,SA2,treatOnly,siNipbl,SA2_treatOnly_siNipbl
5,/Users/aqo/Desktop/siNipblChipPeaks/unique/NiP...,NiPBL_Common_siNipbl_uniquePeaks.bed,NiPBL,Common,siNipbl,NiPBL_Common_siNipbl
6,/Users/aqo/Desktop/siNipblChipPeaks/unique/SA1...,SA1_ContOnly_siNipbl_uniquePeaks.bed,SA1,ContOnly,siNipbl,SA1_ContOnly_siNipbl
7,/Users/aqo/Desktop/siNipblChipPeaks/unique/NiP...,NiPBL_ContOnly_siNipbl_uniquePeaks.bed,NiPBL,ContOnly,siNipbl,NiPBL_ContOnly_siNipbl
8,/Users/aqo/Desktop/siNipblChipPeaks/unique/SA2...,SA2_Common_siNipbl_uniquePeaks.bed,SA2,Common,siNipbl,SA2_Common_siNipbl
9,/Users/aqo/Desktop/siNipblChipPeaks/unique/SMC...,SMC1_ContOnly_siNipbl_uniquePeaks.bed,SMC1,ContOnly,siNipbl,SMC1_ContOnly_siNipbl


In [36]:
## add Background col to know which file use as bg in homer
meta['Background']= [ meta.uniqPeakFile[(meta.Protein == P) & (meta.uniqueIn == 'Common')].values[0] \
                     if (UniqIn != 'Common') else "" \
                     for P,UniqIn in zip(meta.Protein, meta.uniqueIn)]
meta

Unnamed: 0,uniqPeakFile,Basename,Protein,uniqueIn,Treatment,Sample,Background
0,/Users/aqo/Desktop/siNipblChipPeaks/unique/SMC...,SMC1_Common_siNipbl_uniquePeaks.bed,SMC1,Common,siNipbl,SMC1_Common_siNipbl,
1,/Users/aqo/Desktop/siNipblChipPeaks/unique/SA1...,SA1_Common_siNipbl_uniquePeaks.bed,SA1,Common,siNipbl,SA1_Common_siNipbl,
2,/Users/aqo/Desktop/siNipblChipPeaks/unique/SMC...,SMC1_treatOnly_siNipbl_uniquePeaks.bed,SMC1,treatOnly,siNipbl,SMC1_treatOnly_siNipbl,/Users/aqo/Desktop/siNipblChipPeaks/unique/SMC...
3,/Users/aqo/Desktop/siNipblChipPeaks/unique/SA1...,SA1_treatOnly_siNipbl_uniquePeaks.bed,SA1,treatOnly,siNipbl,SA1_treatOnly_siNipbl,/Users/aqo/Desktop/siNipblChipPeaks/unique/SA1...
4,/Users/aqo/Desktop/siNipblChipPeaks/unique/SA2...,SA2_treatOnly_siNipbl_uniquePeaks.bed,SA2,treatOnly,siNipbl,SA2_treatOnly_siNipbl,/Users/aqo/Desktop/siNipblChipPeaks/unique/SA2...
5,/Users/aqo/Desktop/siNipblChipPeaks/unique/NiP...,NiPBL_Common_siNipbl_uniquePeaks.bed,NiPBL,Common,siNipbl,NiPBL_Common_siNipbl,
6,/Users/aqo/Desktop/siNipblChipPeaks/unique/SA1...,SA1_ContOnly_siNipbl_uniquePeaks.bed,SA1,ContOnly,siNipbl,SA1_ContOnly_siNipbl,/Users/aqo/Desktop/siNipblChipPeaks/unique/SA1...
7,/Users/aqo/Desktop/siNipblChipPeaks/unique/NiP...,NiPBL_ContOnly_siNipbl_uniquePeaks.bed,NiPBL,ContOnly,siNipbl,NiPBL_ContOnly_siNipbl,/Users/aqo/Desktop/siNipblChipPeaks/unique/NiP...
8,/Users/aqo/Desktop/siNipblChipPeaks/unique/SA2...,SA2_Common_siNipbl_uniquePeaks.bed,SA2,Common,siNipbl,SA2_Common_siNipbl,
9,/Users/aqo/Desktop/siNipblChipPeaks/unique/SMC...,SMC1_ContOnly_siNipbl_uniquePeaks.bed,SMC1,ContOnly,siNipbl,SMC1_ContOnly_siNipbl,/Users/aqo/Desktop/siNipblChipPeaks/unique/SMC...


In [41]:
meta.Background.astype(str)

0                                                      
1                                                      
2     /Users/aqo/Desktop/siNipblChipPeaks/unique/SMC...
3     /Users/aqo/Desktop/siNipblChipPeaks/unique/SA1...
4     /Users/aqo/Desktop/siNipblChipPeaks/unique/SA2...
5                                                      
6     /Users/aqo/Desktop/siNipblChipPeaks/unique/SA1...
7     /Users/aqo/Desktop/siNipblChipPeaks/unique/NiP...
8                                                      
9     /Users/aqo/Desktop/siNipblChipPeaks/unique/SMC...
10    /Users/aqo/Desktop/siNipblChipPeaks/unique/SA2...
11    /Users/aqo/Desktop/siNipblChipPeaks/unique/NiP...
Name: Background, dtype: object