## We used the [TSS annotation](https://www.encodeproject.org/files/ENCFF493CCB/) provided by the [ENCODE ATAC-seq pipeline](https://github.com/ENCODE-DCC/atac-seq-pipeline) to annotate genomic regions by their proximity to the nearest gene. This annotation file uses versioned Ensembl gene IDs (e.g. `ENSG00000186092.6`) as the gene identifiers. The code below converts these identifiers to gene symbols.

In [2]:
library("org.Hs.eg.db") # remember to install it if you don't have it already


In [44]:
# Write a copy of a TSS-annotated BED file whose column V4 holds gene symbols.
#
# Reads `<file_name>.bed` from the TSS_annotated_bedFiles results directory,
# strips the version suffix from the identifiers in column V14
# (e.g. "ENSG00000186092.6" -> "ENSG00000186092"), maps them to gene symbols
# through the ENSEMBL -> SYMBOL tables of org.Hs.eg.db, stores the symbols in
# column V4 and writes the table, tab-separated and without header, to
# `TSS_geneSymbolAnnotated_bedFiles/<file_name>_geneSymbol.bed`.
#
# Args:
#   file_name: base name of the BED file (no directory, no ".bed" extension).
#
# Returns:
#   The value of write.table() (NULL, invisibly); called for its side effect.
#
# Notes:
#   * org.Hs.eg.db must be attached before calling (it provides mapIds()).
#   * Whatever V4 contained in the input is overwritten by the symbol.
#   * IDs without a symbol become NA and are written as "NA".
#   * NOTE(review): V14 is presumably the name column of the TSS record
#     appended by `bedtools closest`; confirm against the input layout.
convert_ensembleID_to_geneSymbol_bed <- function(file_name) {
    annotated_dir <- "/project/Mechanogenomics_data/ATACseq_analysis/DiffAcc/results/TSS_annotated_bedFiles/"
    bed_input_path <- paste0(annotated_dir, file_name, ".bed")

    # BED is plain tab-delimited text: switch off quote and comment parsing so
    # that fields containing ', " or # are read literally instead of merging
    # or truncating rows.
    bed_file <- read.table(bed_input_path, sep = "\t", header = FALSE,
                           quote = "", comment.char = "")

    # Fail early with a clear message; otherwise a missing V14 yields an empty
    # key vector and an unrelated-looking error from mapIds().
    if (is.null(bed_file$V14)) {
        stop("Expected a column V14 with Ensembl gene IDs in ", bed_input_path,
             call. = FALSE)
    }

    # Annotation packages are keyed by unversioned Ensembl IDs.
    ENS_IDs <- gsub("\\..*", "", bed_file$V14)

    # multiVals = "first" is the mapIds() default, spelled out here: when an ID
    # maps to several symbols (the "1:many mapping" message) the first is kept,
    # so the result has exactly one entry per input row.
    symbols <- mapIds(org.Hs.eg.db, keys = ENS_IDs, keytype = "ENSEMBL",
                      column = "SYMBOL", multiVals = "first")

    # mapIds() returns a vector named by key; drop the names before storing it.
    bed_file$V4 <- unname(symbols)

    out_name <- paste0(annotated_dir, "TSS_geneSymbolAnnotated_bedFiles/",
                       file_name, "_geneSymbol.bed")
    write.table(bed_file, file = out_name, quote = FALSE, sep = "\t",
                row.names = FALSE, col.names = FALSE)
}

In [30]:
# List the BED files located directly in the TSS-annotated results directory
# (base names only, no recursion into sub-directories).
# `pattern` is a regular expression, not a shell glob: "\\.bed$" matches only
# names ending in the literal ".bed", whereas the glob-style "*.bed$" lets "."
# match any character (e.g. a name ending in "_bed").
files <- list.files(
    path = "/project/Mechanogenomics_data/ATACseq_analysis/DiffAcc/results/TSS_annotated_bedFiles/",
    pattern = "\\.bed$",
    full.names = FALSE,
    recursive = FALSE
)


In [39]:
# Base names of the BED files with the trailing ".bed" extension removed.
files_noextension <- sub(pattern = "\\.bed$", replacement = "", x = files)

In [45]:
# Convert every annotated BED file in turn; each call writes one
# "<name>_geneSymbol.bed" file.
for (idx in seq_along(files_noextension)) {
    convert_ensembleID_to_geneSymbol_bed(files_noextension[[idx]])
}

'select()' returned 1:many mapping between keys and columns

'select()' returned 1:many mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:many mapping between keys and columns

'select()' returned 1:many mapping between keys and columns

'select()' returned 1:many mapping between keys and columns



## convert TSS bed file

In [50]:
# Load the sorted TSS annotation (tab-separated, no header row, so columns
# are auto-named V1, V2, ...) and preview the first rows.
TSS_bed_file <- read.table(
    file = "/project/Mechanogenomics_data/ATACseq_analysis/DiffAcc/annotation/ENCFF493CCB_TSS_sorted.bed",
    header = FALSE,
    sep = "\t"
)
head(TSS_bed_file)


Unnamed: 0_level_0,V1,V2,V3,V4,V5,V6
Unnamed: 0_level_1,<chr>,<int>,<int>,<chr>,<int>,<chr>
1,chr1,65417,65418,ENSG00000186092.6,0,+
2,chr1,451697,451698,ENSG00000284733.1,0,-
3,chr1,686673,686674,ENSG00000284662.1,0,-
4,chr1,923926,923927,ENSG00000187634.11,0,+
5,chr1,959309,959310,ENSG00000188976.10,0,-
6,chr1,960585,960586,ENSG00000187961.13,0,+


In [49]:
# Preview the Ensembl IDs with the version suffix removed.
# The TSS table has only columns V1-V6 and the IDs are in V4; V14 does not
# exist here, so referencing it would silently return character(0).
gsub("\\..*", "", TSS_bed_file$V4)

In [51]:
# Map the unversioned Ensembl IDs in V4 to gene symbols. multiVals = "first"
# is the mapIds() default, written out: for IDs with several symbols (the
# "1:many mapping" message) the first one is kept.
TSS_symbols <- mapIds(
    org.Hs.eg.db,
    keys = gsub("\\..*", "", TSS_bed_file$V4),
    column = "SYMBOL",
    keytype = "ENSEMBL",
    multiVals = "first"
)

'select()' returned 1:many mapping between keys and columns



In [52]:
# Replace the Ensembl IDs in V4 with the mapped symbols (the key names that
# mapIds() attaches are dropped), then preview the result.
TSS_bed_file$V4 <- unname(TSS_symbols)
head(TSS_bed_file)

Unnamed: 0_level_0,V1,V2,V3,V4,V5,V6
Unnamed: 0_level_1,<chr>,<int>,<int>,<chr>,<int>,<chr>
1,chr1,65417,65418,OR4F5,0,+
2,chr1,451697,451698,OR4F29,0,-
3,chr1,686673,686674,OR4F16,0,-
4,chr1,923926,923927,SAMD11,0,+
5,chr1,959309,959310,NOC2L,0,-
6,chr1,960585,960586,KLHL17,0,+


In [53]:
# Save the symbol-annotated TSS table as a headerless, unquoted,
# tab-separated BED file.
write.table(
    x = TSS_bed_file,
    file = "/project/Mechanogenomics_data/ATACseq_analysis/DiffAcc/annotation/ENCFF493CCB_TSS_sorted_geneSymbol.bed",
    sep = "\t",
    quote = FALSE,
    row.names = FALSE,
    col.names = FALSE
)

## Then, we used bedtools to assign the closest gene to each region as follows:

with Ensembl gene IDs (the identifiers used in the ENCODE TSS file):

bedtools closest -a ./sortedBedFiles/flow_bmp9_VS_stat_bmp9_sorted.bed -b /project/Mechanogenomics_data/ATACseq_analysis/DiffAcc/annotation/ENCFF493CCB_TSS_sorted.bed -d > ../TSS_annotated_bedFiles/flow_bmp9_VS_stat_bmp9_TSS_annotated.bed

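with gene symbols (note: as recorded, the commands below still pass the Ensembl-ID TSS file `ENCFF493CCB_TSS_sorted.bed` to `-b`; their output is what `convert_ensembleID_to_geneSymbol_bed` converts to gene symbols afterwards. To annotate with symbols directly, pass `ENCFF493CCB_TSS_sorted_geneSymbol.bed` to `-b` instead):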

bedtools closest -a ./sortedBedFiles/flow_bmp9_VS_stat_bmp9_sorted.bed -b /project/Mechanogenomics_data/ATACseq_analysis/DiffAcc/annotation/ENCFF493CCB_TSS_sorted.bed -d > ../TSS_annotated_bedFiles/flow_bmp9_VS_stat_bmp9_TSS_annotated.bed

bedtools closest -a ./sortedBedFiles/flow_bmp9_VS_stat_cont_sorted.bed -b /project/Mechanogenomics_data/ATACseq_analysis/DiffAcc/annotation/ENCFF493CCB_TSS_sorted.bed -d > ../TSS_annotated_bedFiles/flow_bmp9_VS_stat_cont_TSS_annotated.bed

bedtools closest -a ./sortedBedFiles/flow_cont_VS_flow_bmp9_sorted.bed -b /project/Mechanogenomics_data/ATACseq_analysis/DiffAcc/annotation/ENCFF493CCB_TSS_sorted.bed -d > ../TSS_annotated_bedFiles/flow_cont_VS_flow_bmp9_TSS_annotated.bed

bedtools closest -a ./sortedBedFiles/flow_cont_VS_stat_bmp9_sorted.bed -b /project/Mechanogenomics_data/ATACseq_analysis/DiffAcc/annotation/ENCFF493CCB_TSS_sorted.bed -d > ../TSS_annotated_bedFiles/flow_cont_VS_stat_bmp9_TSS_annotated.bed

bedtools closest -a ./sortedBedFiles/flow_cont_VS_stat_cont_sorted.bed -b /project/Mechanogenomics_data/ATACseq_analysis/DiffAcc/annotation/ENCFF493CCB_TSS_sorted.bed -d > ../TSS_annotated_bedFiles/flow_cont_VS_stat_cont_TSS_annotated.bed

bedtools closest -a ./sortedBedFiles/stat_bmp9_VS_stat_cont_sorted.bed -b /project/Mechanogenomics_data/ATACseq_analysis/DiffAcc/annotation/ENCFF493CCB_TSS_sorted.bed -d > ../TSS_annotated_bedFiles/stat_bmp9_VS_stat_cont_TSS_annotated.bed

