# Imports

In [9]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import scipy
import scipy.stats as st
from sklearn.decomposition import PCA
from lmfit import minimize, Parameters
from tqdm.notebook import tqdm

# Global matplotlib defaults: Arial as the sans-serif face, 10x6 figures, and
# pdf.fonttype 42 for saved PDFs.
mpl.rcParams.update(
    {
        "font.sans-serif": "Arial",
        "font.family": "sans-serif",
        "figure.figsize": (10, 6),
        "pdf.fonttype": 42,
    }
)

# Seaborn "ticks" style with short major ticks and black axes/ticks, followed by
# the "talk" context at its default font scale.
sns.set_style(
    style="ticks",
    rc={
        "xtick.major.size": 4,
        "ytick.major.size": 4,
        "font_color": "k",
        "axes.edgecolor": "k",
        "xtick.color": "k",
        "ytick.color": "k",
    },
)
sns.set_context(context="talk", font_scale=1.0)

# Three named matplotlib "tab:" colours used as a categorical palette.
act_rep_palette = [
    "tab:blue",
    "tab:orange",
    "tab:olive",
]

# Cut-offs for the d2 and d5 measurements. They are not referenced in the cells
# visible here; presumably consumed elsewhere - confirm before changing.
d2_threshold, d5_threshold = 0.639, 0.103

# Baseline cut-offs for the same two measurements.
d2_baseline_threshold, d5_baseline_threshold = -0.938, -0.300

# Importing Data

In [2]:
# Pairwise measurements table. Later cells read its domain1/domain2,
# d1_med_d5/d2_med_d5, avg_enrichment_d5, d1_baseline_type, d1_description and
# d1_Gene columns.
df = pd.read_csv("../../fig_2/01_activators_synergy/pairs_baselinesums.csv")
# Oligo library table. Used below to map a "Gene" value to its "label".
oligos = pd.read_csv("../../fig_1/01_raw_counts/csvs/base_oligo_library.csv")

# Fitting Data

In [16]:
def domain_line_fit(gene, ax, debug=False):
    """Fit and plot a line of combination score against partner score for one gene.

    Looks up the library label for ``gene`` in ``oligos``, collects every row of
    ``df`` in which that label appears as ``domain1`` or ``domain2``, and fits
    ``combo = m * partner + b``, where ``partner`` is the other domain's
    ``d*_med_d5`` value and ``combo`` is ``avg_enrichment_d5``. The points and
    the fitted line are drawn on ``ax``.

    Parameters
    ----------
    gene : str
        Value matched against ``oligos["Gene"]``; the first matching row's
        ``label`` identifies the domain.
    ax : matplotlib.axes.Axes
        Axes to draw the scatter and fitted line on.
    debug : bool, optional
        When True, print intermediate diagnostics.

    Returns
    -------
    pandas.DataFrame
        One row with columns ``gene``, ``label``, ``m`` and ``b``. An empty frame
        with the same columns is returned (and nothing is plotted) when ``df``
        has no rows for the domain or none of them has both values needed for
        the fit.

    Raises
    ------
    IndexError
        If ``gene`` does not occur in ``oligos["Gene"]``.
    """

    def printdb(s):
        if debug:
            print(s)

    def nothing_to_fit(message):
        # Shared notice + empty result for every "cannot fit" situation.
        print()
        print()
        print(message, gene)
        print("Returning empty dataframe, not plotting")
        print()
        return pd.DataFrame.from_dict({"gene": [], "label": [], "m": [], "b": []})

    # get the data
    labels = oligos.loc[oligos["Gene"] == gene, "label"]
    if labels.empty:
        # Same exception type the bare [0] lookup used to raise, now with context.
        raise IndexError("gene " + repr(gene) + " not found in oligos['Gene']")
    domain = labels.iloc[0]
    printdb("Test domain:\t" + domain)
    smoldf = df[(df["domain1"] == domain) | (df["domain2"] == domain)]
    if smoldf.shape[0] == 0:
        return nothing_to_fit("No entries for")
    printdb("Length of smol df:\t" + str(smoldf.shape))

    # For each pair, the partner ("that") score comes from the position the
    # domain does NOT occupy and the domain's own ("this") score from the
    # position it does occupy. Both were previously read from the partner
    # columns, so the "domain" column was a copy of "other".
    domain_is_first = smoldf["domain1"] == domain
    that_scores = np.where(domain_is_first, smoldf["d2_med_d5"], smoldf["d1_med_d5"])
    this_scores = np.where(domain_is_first, smoldf["d1_med_d5"], smoldf["d2_med_d5"])
    combo_scores = np.array(smoldf["avg_enrichment_d5"])
    test_df = pd.DataFrame.from_dict(
        {"domain": this_scores, "other": that_scores, "combo": combo_scores}
    )

    # fit the line
    # Only "other" and "combo" enter the fit, so only they decide which rows are
    # usable; a missing own-domain score must not discard a point.
    filt_df = test_df.dropna(subset=["other", "combo"])
    if filt_df.shape[0] == 0:
        return nothing_to_fit("No usable (non-NaN) entries for")

    def line(x, m, b):
        return m * x + b

    def residual(params, x, data):
        m = params["m"]
        b = params["b"]
        model = line(x, m, b)
        return data - model

    # NOTE(review): no starting value is given for m or b, so the optimizer
    # starts from whatever lmfit derives from the bounds - confirm this is
    # intended before relying on the fit for poorly conditioned data.
    params = Parameters()
    params.add("m", min=-20, max=20)
    params.add("b", min=-20, max=20)
    out = minimize(
        residual, params, args=(filt_df["other"], filt_df["combo"]), method="nelder"
    )

    # plot
    # Limits are fixed before drawing so every per-gene panel shares one frame.
    ax.set_xlim(-3, 3)
    ax.set_ylim(-5.5, 5)
    m = out.params["m"].value
    b = out.params["b"].value
    sns.scatterplot(data=filt_df, x="other", y="combo", ax=ax, marker=".")
    ax.set_title(gene)
    # Raw strings keep the mathtext backslash without an invalid "\l" escape.
    ax.set_xlabel(r"Partner Repression $\log_2$(ON:OFF)")
    ax.set_ylabel("Combo Repression\n" r"$\log_2$(ON:OFF)")
    x = np.linspace(-5.3, 5.3, 500)
    y = line(x, m, b)
    ax.plot(x, y, color="tab:red")

    return pd.DataFrame.from_dict(
        {"gene": [gene], "label": [domain], "m": [m], "b": [b]}
    )

In [17]:
# Rows whose first domain counts as a repressor: either its baseline type is
# "Repressor" or its description contains "Rep". na=False makes a missing
# description count as "no match" instead of propagating NaN into the mask.
rd1df = df[
    (df["d1_baseline_type"] == "Repressor")
    | (df["d1_description"].str.contains("Rep", na=False))
]
# Unique gene names in sorted order; NaNs are dropped first because sorting a
# mix of float NaN and str raises TypeError.
repressors = sorted(rd1df["d1_Gene"].dropna().unique())

# savefig does not create missing directories, so make sure the target exists.
os.makedirs("./fitplots", exist_ok=True)

df_list = []

for g in tqdm(repressors):
    fig, ax = plt.subplots(figsize=(4, 2))
    df_list.append(domain_line_fit(g, ax))
    sns.despine(fig=fig)
    fig.savefig("./fitplots/" + g + ".pdf", bbox_inches="tight")
    # Close this figure explicitly so the loop does not accumulate open figures.
    plt.close(fig)

# Every per-gene frame carries index 0; ignore_index gives the combined table a
# unique 0..n-1 index instead of n rows all labelled 0.
linedf = pd.concat(df_list, ignore_index=True)
linedf

  0%|          | 0/42 [00:00<?, ?it/s]

Unnamed: 0,gene,label,m,b
0,ADRM1,Silencer_tiles;ENSG00000130706;26,0.840653,0.197635
0,ASCL1,Short_nuclear_domain;ASCL1_HUMAN;HLH;105;52,0.212169,-1.010157
0,ATF1,Short_nuclear_domain;ATF1_HUMAN;bZIP_1;191;59,0.612362,-1.075269
0,ATRX,Silencer_tiles;ENSG00000085224;242,1.050732,-0.078799
0,BAZ2A,Silencer_tiles;ENSG00000076108;118,0.483654,-1.482712
0,BIN1,Short_nuclear_domain;BIN1_HUMAN;SH3_9;513;63,0.862839,-0.378814
0,CBX1,Short_nuclear_domain;CBX1_HUMAN;Chromo_shadow;...,0.087898,-2.358378
0,CBX7,Silencer_tiles;ENSG00000100307;18,0.31863,-1.770151
0,CDYL2,Short_nuclear_domain;CDYL2_HUMAN;Chromo;1;50,0.169697,-2.717624
0,CHD3,Short_nuclear_domain;CHD3_HUMAN;DUF1087;1286;60,0.905898,-0.033868


# Merge with prior data

In [14]:
# Baseline scores per domain. The "domain" column is duplicated as "label" so
# the table has a column named like the fit table's key.
bdf = pd.read_csv("../../fig_1/03_computing_baselines/baseline_scores.csv").assign(
    label=lambda scores: scores["domain"]
)
bdf.head()

Unnamed: 0,domain,avg_d2,med_d2,sd_d2,avg_d5,med_d5,sd_d5,description,baseline_type,label
0,Short_nuclear_domain;CBX1_HUMAN;Chromo_shadow;...,-2.723895,-2.796018,0.688257,-2.736312,-2.750528,0.642879,Repressor,Repressor,Short_nuclear_domain;CBX1_HUMAN;Chromo_shadow;...
1,Short_nuclear_domain;SUMO3_HUMAN;Rad60-SLD;12;70,-2.559898,-2.672776,0.499699,-2.57408,-2.592874,1.076885,Repressor,Repressor,Short_nuclear_domain;SUMO3_HUMAN;Rad60-SLD;12;70
2,Short_nuclear_domain;CDYL2_HUMAN;Chromo;1;50,-2.537211,-2.326746,0.790697,-3.094397,-3.350849,0.825153,Repressor,Repressor,Short_nuclear_domain;CDYL2_HUMAN;Chromo;1;50
3,Short_nuclear_domain;MPP8_HUMAN;Chromo;44;50,-2.639417,-2.695604,0.44816,-2.568733,-2.822486,1.093412,Repressor,Repressor,Short_nuclear_domain;MPP8_HUMAN;Chromo;44;50
4,Short_nuclear_domain;YAF2_HUMAN;YAF2_RYBP;78;32,-1.868021,-2.05097,0.917464,-2.632738,-3.048019,1.199348,Repressor,Repressor,Short_nuclear_domain;YAF2_HUMAN;YAF2_RYBP;78;32


In [15]:
# Attach the fitted slope (m) and intercept (b) to the baseline table by label
# and keep only domains that have a fit. A column-on-column merge already
# returns a fresh 0..n-1 index, so no reset_index() is needed (it only added an
# "index" column that the column selection dropped again).
adf = (
    bdf.merge(linedf, on="label", how="left")
    .loc[:, ["domain", "gene", "med_d5", "m", "b"]]
    .dropna(subset=["m", "b"])
)
adf.to_csv("./repressors_fitted.csv", index=False)
adf.head()

Unnamed: 0,domain,gene,med_d5,m,b
0,Short_nuclear_domain;CBX1_HUMAN;Chromo_shadow;...,CBX1,-2.750528,0.087898,-2.358378
1,Short_nuclear_domain;SUMO3_HUMAN;Rad60-SLD;12;70,SUMO3,-2.592874,0.065138,-2.485794
2,Short_nuclear_domain;CDYL2_HUMAN;Chromo;1;50,CDYL2,-3.350849,0.169697,-2.717624
3,Short_nuclear_domain;MPP8_HUMAN;Chromo;44;50,MPP8,-2.822486,0.244652,-2.308743
4,Short_nuclear_domain;YAF2_HUMAN;YAF2_RYBP;78;32,YAF2,-3.048019,0.12672,-2.274764
