# Gene orientation at TAD boundaries genome-wide
# 1.Finding pairs of genes that are closest to the TAD boundaries
Gene file: azucena_genes.bed

TADs (1kb resolution): /scratch/ak8725/az_mrg/hicexplorer_1kb_tads.bed
this is essentially the same file as /scratch/ak8725/az_mrg/hicFindTADs/hicFindTADs2_out/az_mrg2_domains.bed

TAD boundaries are here /scratch/ak8725/az_mrg/hicFindTADs/hicFindTADs2_out/az_mrg2_boundaries.bed

In [5]:
sed -i 's/ /\t/g' /scratch/ak8725/az_mrg/azucena_genes.bed

In [1]:
head /scratch/ak8725/az_mrg/hicexplorer_1kb_tads.bed

chr01	72000	84000
chr01	84000	175000
chr01	175000	246000
chr01	246000	303000
chr01	303000	317000
chr01	317000	422000
chr01	422000	467000
chr01	467000	483000
chr01	483000	494000
chr01	494000	513000


In [6]:
cut -f 1,2,3,4,6 /scratch/ak8725/az_mrg/azucena_genes.bed > /scratch/ak8725/az_mrg/az_genes.bed

In [4]:
head /scratch/ak8725/az_mrg/azucena_genes.bed

chr01	2325	10183	gene:OsAzu_01g0000010	+
chr01	10736	11801	gene:OsAzu_01g0000020	+
chr01	10737	11793	gene:OsAzu_01g0000030	-
chr01	12076	15281	gene:OsAzu_01g0000040	+
chr01	15657	19273	gene:OsAzu_01g0000050	+
chr01	22199	26348	gene:OsAzu_01g0000060	+
chr01	26494	28061	gene:OsAzu_01g0000070	+
chr01	29224	33894	gene:OsAzu_01g0000080	+
chr01	35014	40593	gene:OsAzu_01g0000090	+
chr01	56572	57695	gene:OsAzu_01g0000100	+


In [None]:
#/scratch/ak8725/az_mrg/closest.sh

#!/bin/bash
#
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=1
#SBATCH --time=4:00:00
#SBATCH --mem=32GB
#SBATCH --mail-type=END,FAIL
#SBATCH --mail-user=ak8725@nyu.edu

module load bedtools/intel/2.29.2

# define input files
tads_file="/scratch/ak8725/az_mrg/hicexplorer_1kb_tads.bed"
genes_file="/scratch/ak8725/az_mrg/azucena_genes.bed"

# define output file
output_file="closest_genes.bed"

# create header for output file
echo -e "chr\tstart_tad\tgene1\tstart1\tend1\tstrand1\tend_tad\tgene2\tstart2\tend2\tstrand2" > $output_file

# loop through each feature in tads.bed
while read -r chr start_tad end_tad; do

    echo "Processing TAD: $chr $start_tad $end_tad"

    # find the closest gene to the start coordinate of the feature
    gene1=$(bedtools closest -a <(echo -e "$chr\\t$start_tad\\t$start_tad" | bedtools sort -i -) -b $genes_file -d -t first | awk '{print $7}')

    # find the closest gene to the end coordinate of the feature
    gene2=$(bedtools closest -a <(echo -e "$chr\\t$end_tad\\t$end_tad" | bedtools sort -i -) -b $genes_file -d -t first | awk '{print $7}')

    # if either gene1 or gene2 is empty, skip this TAD
    if [[ -z $gene1 || -z $gene2 ]]; then
        echo "Could not find closest gene for TAD: $chr $start_tad $end_tad"
        continue
    fi

    # get the information for gene1, removing duplicates if present
    gene1_info=$(grep -w $gene1 $genes_file | sort -u -k 4,4)
    gene1_chr=$(echo $gene1_info | awk '{print $1}')
    gene1_start=$(echo $gene1_info | awk '{print $2}')
    gene1_end=$(echo $gene1_info | awk '{print $3}')
    gene1_name=$(echo $gene1_info | awk '{print $4}' )
    gene1_strand=$(echo $gene1_info | awk '{print $5}')
    
    # get the information for gene2, removing duplicates if present
    gene2_info=$(grep -w $gene2 $genes_file | sort -u -k 4,4)
    gene2_chr=$(echo $gene2_info | awk '{print $1}')
    gene2_start=$(echo $gene2_info | awk '{print $2}')
    gene2_end=$(echo $gene2_info | awk '{print $3}')
    gene2_name=$(echo $gene2_info | awk '{print $4}' )
    gene2_strand=$(echo $gene2_info | awk '{print $5}')

    # print the output
    echo -e "$chr\t$start_tad\t$gene1_name\t$gene1_start\t$gene1_end\t$gene1_strand\t$gene2_name\t$gene2_start\t$gene2_end\t$gene2_strand" >> $output_file

done < $tads_file

In [13]:
wc -l genes_close_to_1kb_tads.bed

4651 genes_close_to_1kb_tads.bed


### Report how many gene pairs are convergent (+-), divergent (-+), none (++, --)

In [3]:
with open("/scratch/ak8725/az_mrg/genes_close_to_1kb_tads.bed", "r") as f:
    plus_minus = 0
    minus_plus = 0
    plus_plus = 0
    minus_minus = 0
    
    for line in f:
        line = line.strip().split("\t")
        if line[5] == "+" and line[10] == "-":
            plus_minus += 1
        elif line[5] == "-" and line[10] == "+":
            minus_plus += 1
        elif line[5] == "+" and line[10] == "+":
            plus_plus += 1
        else:
            minus_minus += 1

print("Number of TADs with convergent genes near/at boundaries = {}".format(plus_minus))
print("Number of TADs with divergent genes near/at boundaries = {}".format(minus_plus))
print("Number of TADs with genes in the same orientation near/at boundaries = {}".format(plus_plus + minus_minus))
print("Both forward = {}".format(plus_plus))
print("Both reverse = {}".format(minus_minus))

Number of TADs with convergent genes near/at boundaries = 1127
Number of TADs with divergent genes near/at boundaries = 1128
Number of TADs with genes in the same orientation near/at boundaries = 2395
Both forward = 1247
Both reverse = 1148


In [1]:
module load bedtools/intel/2.29.2

# Finding pairs of closest promoters to the TAD boundary

In [4]:
head azucena_genes.bed

chr01	2325	10183	gene:OsAzu_01g0000010	+
chr01	10736	11801	gene:OsAzu_01g0000020	+
chr01	10737	11793	gene:OsAzu_01g0000030	-
chr01	12076	15281	gene:OsAzu_01g0000040	+
chr01	15657	19273	gene:OsAzu_01g0000050	+
chr01	22199	26348	gene:OsAzu_01g0000060	+
chr01	26494	28061	gene:OsAzu_01g0000070	+
chr01	29224	33894	gene:OsAzu_01g0000080	+
chr01	35014	40593	gene:OsAzu_01g0000090	+
chr01	56572	57695	gene:OsAzu_01g0000100	+


In [9]:
awk '{if ($5 == "+") print $1, $2-1000, $2, $4, $5; else if ($5 == "-") print $1, $3, $3+1000, $4, $5;}' azucena_genes.bed > azucena_1kb_promoter+genes.bed

In [10]:
sort -k1,1 -k2,2n -k3,3n azucena_1kb_promoter+genes.bed > temp.bed && mv temp.bed azucena_1kb_promoter+genes.bed

In [19]:
sed -i 's/ /\t/g' /scratch/ak8725/az_mrg/azucena_1kb_promoters_genes.bed

In [6]:
mv azucena_1kb_promoter+genes.bed azucena_1kb_promoters_genes.bed

In [1]:
#/scratch/ak8725/az_mrg/closest.sh

#!/bin/bash
#
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=1
#SBATCH --time=4:00:00
#SBATCH --mem=32GB
#SBATCH --mail-type=END,FAIL
#SBATCH --mail-user=ak8725@nyu.edu

module load bedtools/intel/2.29.2

# define input files
tads_file="/scratch/ak8725/az_mrg/hicexplorer_1kb_tads.bed"
genes_file="/scratch/ak8725/az_mrg/azucena_1kb_promoters_genes.bed"

# define output file
output_file="promoters_close_to_1kb_tads.bed"

# create header for output file
echo -e "chr\tstart_tad\tgene1\tstart1\tend1\tstrand1\tend_tad\tgene2\tstart2\tend2\tstrand2" > $output_file

# loop through each feature in tads.bed
while read -r chr start_tad end_tad; do

    echo "Processing TAD: $chr $start_tad $end_tad"

    # find the closest gene to the start coordinate of the feature
    gene1=$(bedtools closest -a <(echo -e "$chr\\t$start_tad\\t$start_tad" | bedtools sort -i -) -b $genes_file -d -t first | awk '{print $7}')

    # find the closest gene to the end coordinate of the feature
    gene2=$(bedtools closest -a <(echo -e "$chr\\t$end_tad\\t$end_tad" | bedtools sort -i -) -b $genes_file -d -t first | awk '{print $7}')

    # if either gene1 or gene2 is empty, skip this TAD
    if [[ -z $gene1 || -z $gene2 ]]; then
        echo "Could not find closest gene for TAD: $chr $start_tad $end_tad"
        continue
    fi

    # get the information for gene1, removing duplicates if present
    gene1_info=$(grep -w $gene1 $genes_file | sort -u -k 4,4)
    gene1_chr=$(echo $gene1_info | awk '{print $1}')
    gene1_start=$(echo $gene1_info | awk '{print $2}')
    gene1_end=$(echo $gene1_info | awk '{print $3}')
    gene1_name=$(echo $gene1_info | awk '{print $4}' )
    gene1_strand=$(echo $gene1_info | awk '{print $5}')
    
    # get the information for gene2, removing duplicates if present
    gene2_info=$(grep -w $gene2 $genes_file | sort -u -k 4,4)
    gene2_chr=$(echo $gene2_info | awk '{print $1}')
    gene2_start=$(echo $gene2_info | awk '{print $2}')
    gene2_end=$(echo $gene2_info | awk '{print $3}')
    gene2_name=$(echo $gene2_info | awk '{print $4}' )
    gene2_strand=$(echo $gene2_info | awk '{print $5}')

    # print the output
    echo -e "$chr\t$start_tad\t$gene1_name\t$gene1_start\t$gene1_end\t$gene1_strand\t$gene2_name\t$gene2_start\t$gene2_end\t$gene2_strand" >> $output_file

done < $tads_file

SyntaxError: invalid syntax (<ipython-input-1-9f611e102aaa>, line 13)

### Report how many promoter pairs are convergent (+-), divergent (-+), none (++, --)

In [2]:
with open("/scratch/ak8725/az_mrg/promoters_close_to_1kb_tads.bed", "r") as f:
    plus_minus = 0
    minus_plus = 0
    plus_plus = 0
    minus_minus = 0
    
    for line in f:
        line = line.strip().split("\t")
        if line[5] == "+" and line[10] == "-":
            plus_minus += 1
        elif line[5] == "-" and line[10] == "+":
            minus_plus += 1
        elif line[5] == "+" and line[10] == "+":
            plus_plus += 1
        else:
            minus_minus += 1

print("Number of TADs with convergent promoters near/at boundaries = {}".format(plus_minus))
print("Number of TADs with divergent promoters near/at boundaries = {}".format(minus_plus))
print("Number of TADs with promoters in the same orientation near/at boundaries = {}".format(plus_plus + minus_minus))
print("Both forward = {}".format(plus_plus))
print("Both reverse = {}".format(minus_minus))

Number of TADs with convergent promoters near/at boundaries = 1113
Number of TADs with divergent promoters near/at boundaries = 1116
Number of TADs with promoters in the same orientation near/at boundaries = 2422
Both forward = 1272
Both reverse = 1150


In [1]:
cd ../az_mrg

In [12]:
head promoters_close_to_1kb_tads.bed

chr	start_tad	gene1	start1	end1	strand1	end_tad	gene2	start2	end2	strand2
chr01	72000	gene:OsAzu_01g0000140	71184	72184	+	84000	gene:OsAzu_01g0000160	83753	84753	+
chr01	84000	gene:OsAzu_01g0000160	83753	84753	+	175000	gene:OsAzu_01g0000290	174293	175293	+
chr01	175000	gene:OsAzu_01g0000290	174293	175293	+	246000	gene:OsAzu_01g0000390	244874	245874	+
chr01	246000	gene:OsAzu_01g0000390	244874	245874	+	303000	gene:OsAzu_01g0000470	303599	304599	+
chr01	303000	gene:OsAzu_01g0000470	303599	304599	+	317000	gene:OsAzu_01g0000500	317218	318218	-
chr01	317000	gene:OsAzu_01g0000500	317218	318218	-	422000	gene:OsAzu_01g0000640	421623	422623	+
chr01	422000	gene:OsAzu_01g0000640	421623	422623	+	467000	gene:OsAzu_01g0000680	467701	468701	+
chr01	467000	gene:OsAzu_01g0000680	467701	468701	+	483000	gene:OsAzu_01g0000700	482401	483401	+
chr01	483000	gene:OsAzu_01g0000700	482401	483401	+	494000	gene:OsAzu_01g0000710	493334	494334	+


## Find pairs of promoters overlapping 1kb TAD boundaries

In [25]:
head /scratch/ak8725/az_mrg/hicexplorer_1kb_tads.bed

chr01	72000	84000
chr01	84000	175000
chr01	175000	246000
chr01	246000	303000
chr01	303000	317000
chr01	317000	422000
chr01	422000	467000
chr01	467000	483000
chr01	483000	494000
chr01	494000	513000


In [24]:
head /scratch/ak8725/az_mrg/azucena_1kb_promoters_genes.bed

chr01	1325	2325	gene:OsAzu_01g0000010	+
chr01	9736	10736	gene:OsAzu_01g0000020	+
chr01	11076	12076	gene:OsAzu_01g0000040	+
chr01	11793	12793	gene:OsAzu_01g0000030	-
chr01	14657	15657	gene:OsAzu_01g0000050	+
chr01	21199	22199	gene:OsAzu_01g0000060	+
chr01	25494	26494	gene:OsAzu_01g0000070	+
chr01	28224	29224	gene:OsAzu_01g0000080	+
chr01	34014	35014	gene:OsAzu_01g0000090	+
chr01	55572	56572	gene:OsAzu_01g0000100	+


In [None]:
#/scratch/ak8725/az_mrg/overlap.sh

#!/bin/bash
#
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=1
#SBATCH --time=4:00:00
#SBATCH --mem=32GB
#SBATCH --mail-type=END,FAIL
#SBATCH --mail-user=ak8725@nyu.edu

module load bedtools/intel/2.29.2

# define input files
tads_file="/scratch/ak8725/az_mrg/hicexplorer_1kb_tads.bed"
genes_file="/scratch/ak8725/az_mrg/azucena_1kb_promoters_genes.bed"

# define output file
output_file="promoters_at_1kb_tads_boundaries.bed"

# create header for output file
echo -e "chr\tstart_tad\tgene1\tstrand1\tend_tad\tgene2\tstrand2" > $output_file

# loop through each feature in tads.bed
while read -r chr start_tad end_tad; do

    echo "Processing TAD: $chr $start_tad $end_tad"
    
    gene1=$(bedtools intersect -a <(echo -e "$chr\\t$start_tad-500\\t$start_tad-500" | bedtools sort -i -) -b $genes_file -wo | awk '{print $7}')
    gene2=$(bedtools closest -a <(echo -e "$chr\\t$end_tad-500\\t$end_tad+500" | bedtools sort -i -) -b $genes_file -wo | awk '{print $7}')

    if [[ -z $gene1 || -z $gene2 ]]; then
        echo "Could not find closest gene for TAD: $chr $start_tad $end_tad"
        continue
    fi

    gene1_info=$(grep -w $gene1 $genes_file | sort -u -k 4,4)
    gene1_name=$(echo $gene1_info | awk '{print $4}' )
    gene1_strand=$(echo $gene1_info | awk '{print $5}')

    gene2_info=$(grep -w $gene2 $genes_file | sort -u -k 4,4)
    gene2_name=$(echo $gene2_info | awk '{print $4}' )
    gene2_strand=$(echo $gene2_info | awk '{print $5}')

    echo -e "$chr\t$start_tad\t$gene1_name\t$gene1_strand\t$gene2_name\t$gene2_strand" >> $output_file

done < $tads_file

In [2]:
#calculate how many promoter pairs are con/div/none
with open("/scratch/ak8725/az_mrg/promoters_at_1kb_tads_boundaries.bed", "r") as f:
    plus_minus = 0
    minus_plus = 0
    plus_plus = 0
    minus_minus = 0
    
    for line in f:
        line = line.strip().split("\t")
        if line[3] == "+" and line[5] == "-":
            plus_minus += 1
        elif line[3] == "-" and line[5] == "+":
            minus_plus += 1
        elif line[3] == "+" and line[5] == "+":
            plus_plus += 1
        else:
            minus_minus += 1

print("Number of TADs with convergent promoters at boundaries = {}".format(plus_minus))
print("Number of TADs with divergent promoters at boundaries = {}".format(minus_plus))
print("Number of TADs with promoters in the same orientation at boundaries = {}".format(plus_plus + minus_minus))
print("Both forward = {}".format(plus_plus))
print("Both reverse = {}".format(minus_minus))

Number of TADs with convergent promoters at boundaries = 524
Number of TADs with divergent promoters at boundaries = 496
Number of TADs with promoters in the same orientation at boundaries = 1086
Both forward = 579
Both reverse = 507


In [2]:
wc -l promoters_at_1kb_tads_boundaries.bed

2106 promoters_at_1kb_tads_boundaries.bed


In [3]:
head promoters_at_1kb_tads_boundaries.bed

chr	start_tad	gene1	strand1	end_tad	gene2	strand2
chr01	72000	gene:OsAzu_01g0000140	+	gene:OsAzu_01g0000160	+
chr01	84000	gene:OsAzu_01g0000160	+	gene:OsAzu_01g0000290	+
chr01	175000	gene:OsAzu_01g0000290	+	gene:OsAzu_01g0000390	+
chr01	246000	gene:OsAzu_01g0000390	+	gene:OsAzu_01g0000470	+
chr01	317000	gene:OsAzu_01g0000500	-	gene:OsAzu_01g0000640	+
chr01	422000	gene:OsAzu_01g0000640	+	gene:OsAzu_01g0000680	+
chr01	483000	gene:OsAzu_01g0000690	-	gene:OsAzu_01g0000710	+
chr01	494000	gene:OsAzu_01g0000710	+	gene:OsAzu_01g0000750	-
chr01	513000	gene:OsAzu_01g0000750	-	gene:OsAzu_01g0000810	-
