### This script categorizes CRX-dependent activated genes by their associated HD motif types

In [71]:
import os
import sys
import warnings
import re
import itertools
import random 
random.seed(4444)

import numpy as np
import pandas as pd
import scipy
from scipy import stats
from scipy.stats import mannwhitneyu, normaltest
from scipy.cluster import hierarchy
import statsmodels
import fastcluster
import statsmodels.api as sm

import sklearn
from sklearn.preprocessing import StandardScaler

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.gridspec as gridspec
from matplotlib.colors import Normalize
import matplotlib.font_manager
from mpl_toolkits.axes_grid1 import make_axes_locatable
from matplotlib.patches import Patch
from matplotlib.patches import Rectangle
from matplotlib.lines import Line2D
import logomaker
import seaborn as sns
from seaborn.utils import relative_luminance
#from statannotations.Annotator import Annotator

### I. housekeeping

In [72]:
# Silence DeprecationWarning and FutureWarning messages for the rest of the session.
for _ignored_category in (DeprecationWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)

In [74]:
# compatible naming between WinOS and MacOS
# NOTE: the working directory is moved to the parent of the current one, so
# executing this cell a second time climbs one more directory level.
base_dir = os.path.dirname(os.getcwd())
os.chdir(base_dir)

# processed-data folders of the two analyses, both located under base_dir
chip_basedir, rna_basedir = (
    os.path.join(base_dir, analysis_dir, "processed_data")
    for analysis_dir in ("ChIPseq_analysis", "RNAseq_analysis")
)

In [77]:
# Every input below is a tab-separated table with a header row; each one is
# cast to object dtype right after loading.
_tsv_kwargs = {"sep": "\t", "header": 0}

# crx chip
crx_chip_clustered_matrix = pd.read_csv(
    os.path.join(chip_basedir, "lfc_cutoff_clustered_chip_regions.tsv"), **_tsv_kwargs
).astype(object)
# keep a single row per (seqnames, start) pair
crx_chip_rna_matrix = (
    pd.read_csv(os.path.join(rna_basedir, "wt_chip_rna_compiled_matrix.txt"), **_tsv_kwargs)
    .astype(object)
    .drop_duplicates(subset=["seqnames","start"])
    .copy()
)

# crx rna
crx_rna_full_matrix = pd.read_csv(
    os.path.join(rna_basedir, "compiled_hdmuts_lfc.tsv"), **_tsv_kwargs
).astype(object)

# aldiri normalized tables
aldiri_normalized_counts = pd.read_csv(
    os.path.join(rna_basedir, "aldiri_normalized_counts.tsv"), **_tsv_kwargs
).astype(object)
aldiri_normalized_rowz = pd.read_csv(
    os.path.join(rna_basedir, "aldiri_normalized_rowz.tsv"), **_tsv_kwargs
).astype(object)

### II. parse the FIMO tables

In [11]:
# read directly from file
# Two label-indexed containers hold one FIMO result table per motif set:
#   chip_fimo_raw_ser -> table read from the "chip_all_regions_<motif>" folder
#   masked_fimo_ser   -> table read from the "masked_all_regions_<motif>" folder
masked_fimo_ser = pd.Series(dtype=object)
chip_fimo_raw_ser = pd.Series(dtype=object)
# only the "pr" motif set is loaded; add further labels to the list to load more
for motif in ["pr"]:
    chip_fimo_raw_ser[motif] = pd.read_csv(os.path.join(chip_basedir, f"chip_all_regions_{motif}", "fimo.tsv"), sep="\t", header=0)
    masked_fimo_ser[motif] = pd.read_csv(os.path.join(chip_basedir, f"masked_all_regions_{motif}", "fimo.tsv"), sep="\t", header=0)

#### identify instances of monomeric HD motifs

In [11]:
# monomeric motif from masked fasta scanning
# The last three rows of the FIMO table are discarded before use
# (presumably FIMO's trailing comment lines -- TODO confirm).
prtf_fimo_table = masked_fimo_ser["pr"].iloc[:-3].copy().drop(columns=["motif_alt_id"])

# split the hits by the motif that produced them
k50_mono_fimo_table = prtf_fimo_table[prtf_fimo_table["motif_id"] == "CRX_Corbo"].copy()
q50_mono_fimo_table = prtf_fimo_table[prtf_fimo_table["motif_id"] == "RAX2.MA0717.1"].copy()

In [12]:
def _keep_aa_core(mono_table):
    """Add "core" and "6mer" columns to ``mono_table`` and keep the "AA"-core hits.

    "core" is characters 2-3 (0-based) of the matched sequence and "6mer" is
    the matched sequence without its first and last character. The columns
    are written onto ``mono_table`` itself; the return value is a copy of the
    rows whose core equals "AA".
    """
    mono_table["core"] = mono_table["matched_sequence"].apply(lambda seq: seq[2:4])
    mono_table["6mer"] = mono_table["matched_sequence"].apply(lambda seq: seq[1:-1])
    return mono_table.loc[mono_table["core"] == "AA", :].copy()


k50_mono_fimo_table = _keep_aa_core(k50_mono_fimo_table)

q50_mono_fimo_table = _keep_aa_core(q50_mono_fimo_table)

#### identify instances of dimeric HD motifs

In [15]:
# get the dimeric motif count from unmasked fasta scanning
# first filter by half site core motif match, at least to "TT" and "AA"
# The last three rows of the FIMO table are discarded before use
# (presumably FIMO's trailing comment lines -- TODO confirm).
prtf_dimer_table = chip_fimo_raw_ser["pr"].iloc[:-3].copy().drop(columns=["motif_alt_id"])

# split the hits by the motif that produced them
k50_dimer_fimo_table = prtf_dimer_table[prtf_dimer_table["motif_id"] == "E80A_atacLost.MEME-19"].copy()
q50_dimer_fimo_table = prtf_dimer_table[prtf_dimer_table["motif_id"] == "K88N_atacGain.STREME-1"].copy()

In [16]:
# K50 dimer: the core is the two 2-mers at [1:3] and [-3:-1] of the match, the
# spacer is [4:7]; only hits whose concatenated core reads "AATT" are kept.
k50_dimer_fimo_table = (
    k50_dimer_fimo_table
    .assign(
        core=lambda df: df["matched_sequence"].apply(lambda seq: seq[1:3] + seq[-3:-1]),
        spacer=lambda df: df["matched_sequence"].apply(lambda seq: seq[4:7]),
    )
    .loc[lambda df: df["core"] == "AATT", :]
    .copy()
    .reset_index(drop=True)
)

# Q50 dimer: same idea with every window shifted one position inwards
# (core from [2:4] and [-4:-2], spacer [5:8]).
q50_dimer_fimo_table = (
    q50_dimer_fimo_table
    .assign(
        core=lambda df: df["matched_sequence"].apply(lambda seq: seq[2:4] + seq[-4:-2]),
        spacer=lambda df: df["matched_sequence"].apply(lambda seq: seq[5:8]),
    )
    .loc[lambda df: df["core"] == "AATT", :]
    .copy()
    .reset_index(drop=True)
)

In [17]:
# base -> complementary base ("N" pairs with itself)
compliment = {
    "A": "T",
    "C": "G",
    "G": "C",
    "T": "A",
    "N": "N",
}


def reverse_rmatch(old_seq):
    """Return the reverse complement of ``old_seq``.

    The sequence is read back to front and every base is replaced through
    the ``compliment`` table. Only the upper-case characters A, C, G, T and N
    are known to the table; any other character raises ``KeyError``.
    """
    return "".join(compliment[base] for base in reversed(old_seq))
def _add_halfsites(dimer_table, first_window, second_window):
    """Write "halfsite1" and "halfsite2" columns onto ``dimer_table`` in place.

    ``first_window`` and ``second_window`` are slice objects applied to the
    matched sequence. The first half site is taken as is; the second one is
    reverse-complemented with ``reverse_rmatch``.
    """
    matches = dimer_table["matched_sequence"]
    dimer_table["halfsite1"] = matches.apply(lambda seq: seq[first_window])
    dimer_table["halfsite2"] = matches.apply(lambda seq: reverse_rmatch(seq[second_window]))


# K50 dimer: first six and last six characters of the match
_add_halfsites(k50_dimer_fimo_table, slice(None, 6), slice(-6, None))
# Q50 dimer: both six-character windows sit one position further inwards
_add_halfsites(q50_dimer_fimo_table, slice(1, 7), slice(-7, -1))

In [38]:
# count the rows of the clustered ChIP table per value of its "cluster" column
# (the result is displayed as the cell output that follows)
crx_chip_clustered_matrix["cluster"].value_counts()

e80a.notDB     4303
e80a.gained    2848
e80a.lost      1994
k88n.gained     687
Name: cluster, dtype: int64

#### III. annotate CRX target genes by the type of HD motifs in their associated CRX-bound regulatory elements

In [81]:
# all crx chip peak associated genes
# Keep one row per peak.id -- the one with the smallest absolute distTSS --
# then restore genomic order (seqnames, start) with a fresh 0..n-1 index.
crx_dependent_genes = (
    crx_chip_rna_matrix
    .sort_values(by="distTSS", key=abs, ascending=True)
    .drop_duplicates(subset=["peak.id"], keep="first")
    .sort_values(by=["seqnames","start"])
    .reset_index(drop=True)
    .copy()
)

In [43]:
# work on an independent (deep) copy so the clustered ChIP table stays untouched
subset_great_annot = crx_chip_clustered_matrix.copy(deep=True)

In [22]:
def annotate_peak_motif(peak_id, motif_match_peaks):
    """Flag whether ``peak_id`` occurs among the motif-matched peaks.

    ``motif_match_peaks`` must expose a ``.values`` array (e.g. a pandas
    Series of peak identifiers). Returns 1 if ``peak_id`` is one of those
    values, otherwise 0.
    """
    return 1 if peak_id in motif_match_peaks.values else 0

In [44]:
# instances of K50 motifs
# One 1/0 column per motif class: 1 when the peak.id appears in the
# "sequence_name" column of the corresponding FIMO table. After each merge
# every remaining NaN in the whole frame is replaced by 0.
for motif_column, motif_table in [("k50_mono", k50_mono_fimo_table), ("k50_di", k50_dimer_fimo_table)]:
    tmp = subset_great_annot["peak.id"].apply(annotate_peak_motif, args=(motif_table["sequence_name"],)).to_frame(motif_column)
    subset_great_annot = subset_great_annot.merge(tmp, left_index=True, right_index=True).fillna(0)

In [45]:
# instances of Q50 motifs
# One 1/0 column per motif class: 1 when the peak.id appears in the
# "sequence_name" column of the corresponding FIMO table. After each merge
# every remaining NaN in the whole frame is replaced by 0.
for motif_column, motif_table in [("q50_mono", q50_mono_fimo_table), ("q50_di", q50_dimer_fimo_table)]:
    tmp = subset_great_annot["peak.id"].apply(annotate_peak_motif, args=(motif_table["sequence_name"],)).to_frame(motif_column)
    subset_great_annot = subset_great_annot.merge(tmp, left_index=True, right_index=True).fillna(0)

In [46]:
# compile the chip and motif annotation dataframes
# Both merges use pandas' defaults: an inner join on every column name the
# two frames have in common.
subset_great_annot = (
    subset_great_annot
    .merge(crx_chip_clustered_matrix[["peak.id","cluster","row_cluster"]])
    .merge(crx_dependent_genes[["seqnames","start","end","gene"]])
)

In [82]:
# categorize into di+m, di, m, none groups
# Group code per gene and HD motif type:
#   3 = dimeric and monomeric, 2 = dimeric only, 1 = monomeric only, 0 = neither
# A gene counts as having a motif class when the corresponding 1/0 column sums
# to more than zero over all of its rows in subset_great_annot.
gene_motif_annot = pd.DataFrame(index=crx_dependent_genes["gene"].unique(), columns=["k50_group","q50_group"])
for gene in gene_motif_annot.index:
    # the row mask only depends on the gene, so build it once per gene
    gene_rows = subset_great_annot["gene"] == gene
    # loop variable is deliberately not called `type`: at module level that
    # would leave the builtin shadowed for everything that runs afterwards
    for hd_type in ["k50", "q50"]:
        has_mono = subset_great_annot.loc[gene_rows, f"{hd_type}_mono"].sum() > 0
        has_di = subset_great_annot.loc[gene_rows, f"{hd_type}_di"].sum() > 0

        # monomeric contributes 1, dimeric contributes 2 -> codes 0..3 as above
        gene_motif_annot.at[gene, f"{hd_type}_group"] = int(has_mono) + 2 * int(has_di)

In [49]:
# Attach the per-gene group codes to the peak rows. The gene names are moved
# from the index into a "gene" column first; the outer join keeps rows that
# are present on only one side.
subset_great_annot = pd.merge(
    subset_great_annot,
    gene_motif_annot.reset_index(drop=False).rename(columns={"index":"gene"}),
    how="outer",
)

In [75]:
# Write the peak-level annotation as a tab-separated file with a header row
# and without the index. The file name mentions k50 only, but the table also
# carries the q50 columns.
subset_great_annot.to_csv(
    os.path.join(rna_basedir, "crx_all_target_genes_k50_motif_annotation.tsv"),
    sep="\t",
    header=True,
    index=False,
)

#### IV. attach RNA lfc for both p10 and p21

In [None]:
# Join the per-gene group codes with the RNA table on gene name (index of
# gene_motif_annot against the GENENAME column), move the gene names into a
# "gene" column, and keep the first row of every gene.
unique_gene_motif = (
    gene_motif_annot
    .merge(crx_rna_full_matrix.set_index("GENENAME"), left_index=True, right_index=True)
    .reset_index(drop=False)
    .rename(columns={"index":"gene"})
    .drop_duplicates("gene")
    .reset_index(drop=True)
)

In [None]:
# Write the gene-level table as a tab-separated file with a header row and
# without the index.
unique_gene_motif.to_csv(
    os.path.join(rna_basedir, "crx_all_target_genes_cre_group_annotation.tsv"),
    sep="\t",
    header=True,
    index=False,
)