## Finding differentially expressed genes (DEGs) from a raw counts matrix

In [None]:
import pandas as pd
import numpy as np
import csv
import requests
import json

from deseq2_analysis.DESeq2_TF_analysis import up_gene_list, down_gene_list, run_chea_kg, top_tfs

from maayanlab_bioinformatics.dge import deseq2_differential_expression
from maayanlab_bioinformatics.dge import characteristic_direction
from maayanlab_bioinformatics.dge import up_down_from_characteristic_direction
from maayanlab_bioinformatics.dge import limma_voom_differential_expression
from maayanlab_bioinformatics.dge import up_down_from_limma_voom
from maayanlab_bioinformatics.dge import limma_voom

In [None]:
# Applies to all DGE methods

# Load the raw count matrix and the per-sample annotation table.
raw_counts = pd.read_csv("/Users/andrewjenkinsvandusen/Downloads/GSE271120_RawCountFile_rsemgenes.CCBR1062.csv")
sample_metadata = pd.read_csv("/Users/andrewjenkinsvandusen/Downloads/sample_metadata.csv")

# Distinct time point annotations, kept in order of first appearance.
time_pt_list = list(dict.fromkeys(sample_metadata["time_pt_annotation"].tolist()))

# Position in time_pt_list -> (time point label, counts of that time point's
# samples indexed by gene_id).
time_pt_dict = {}
for i, time_pt in enumerate(time_pt_list):
    at_time_pt = sample_metadata["time_pt_annotation"] == time_pt
    samples_at_time_pt = sample_metadata.loc[at_time_pt, "sample_name"].tolist()
    counts_at_time_pt = raw_counts[["gene_id", *samples_at_time_pt]].set_index("gene_id")
    time_pt_dict[i] = (time_pt, counts_at_time_pt)
# print(time_pt_dict)

In [None]:
# ### [DESeq2] degs from adjacent time pt comparisons
# deseq2_adj_time_pt_comparisons = []
# deseq2_adj_time_pt_degs = []
# for i in range(len(time_pt_list) - 1):
#     controls, cases = time_pt_dict[i][1], time_pt_dict[i+1][1]
#     results_df = deseq2_differential_expression(controls, cases)

#     p_vals = [0.05, 0.01, 0.001, 0.0001, 0.00001]
#     significant_genes = results_df[results_df["padj"] < p_vals[0]]
#     up_count = (significant_genes["log2FoldChange"] > 0).sum()
#     down_count = (significant_genes["log2FoldChange"] < 0).sum()

#     idx = 1
#     while (up_count + down_count) > 2000:
#         significant_genes = results_df[results_df["padj"] < p_vals[idx]]
#         up_count = (significant_genes["log2FoldChange"] > 0).sum()
#         down_count = (significant_genes["log2FoldChange"] < 0).sum()
#         idx += 1
#     print("total DEGs:", (up_count + down_count), "up:", up_count, "down:", down_count, "padj:", p_vals[idx-1])
#     print(significant_genes.head())

#     ctrl_time_pt, case_time_pt = time_pt_dict[i][0], time_pt_dict[i+1][0]
#     deseq2_adj_time_pt_comparisons.append(f"{case_time_pt} v {ctrl_time_pt}")

#     file = f"deseq2_{case_time_pt}_v_{ctrl_time_pt}.csv"
#     significant_genes.to_csv(file)
#     deseq2_adj_time_pt_degs.append(file)

In [None]:
# ### [DESeq2] degs from comparison with time pt 0
# deseq2_time_pt_0_comparisons = []
# deseq2_time_pt_0_degs = []
# for i in range(1, len(time_pt_list)):
#     controls, cases = time_pt_dict[0][1], time_pt_dict[i][1]
#     results_df = deseq2_differential_expression(controls, cases)

#     p_vals = [0.05, 0.01, 0.001, 0.0001, 0.00001]
#     significant_genes = results_df[results_df["padj"] < p_vals[0]]
#     up_count = (significant_genes["log2FoldChange"] > 0).sum()
#     down_count = (significant_genes["log2FoldChange"] < 0).sum()

#     idx = 1
#     while (up_count + down_count) > 2000:
#         significant_genes = results_df[results_df["padj"] < p_vals[idx]]
#         up_count = (significant_genes["log2FoldChange"] > 0).sum()
#         down_count = (significant_genes["log2FoldChange"] < 0).sum()
#         idx += 1
#     print("total DEGs:", (up_count + down_count), "up:", up_count, "down:", down_count, "padj:", p_vals[idx-1])
#     print(significant_genes.head())

#     ctrl_time_pt, case_time_pt = time_pt_dict[0][0], time_pt_dict[i][0]
#     deseq2_time_pt_0_comparisons.append(f"{case_time_pt} v {ctrl_time_pt}")

#     file = f"deseq2_{case_time_pt}_v_{ctrl_time_pt}.csv"
#     significant_genes.to_csv(file)
#     deseq2_time_pt_0_degs.append(file)

In [None]:
### [CD] degs from adjacent time pt comparisons
def run_chea_kg(gene_list, num_tfs):
    """
    Outputs JSON of TF subnetwork best corresponding to input gene list.

    The gene list is first uploaded to the ChEA-KG ``/addList`` endpoint; the
    ``userListId`` from that response is then used in the enrichment query.

    NOTE: this definition replaces the ``run_chea_kg`` name imported from
    ``deseq2_analysis.DESeq2_TF_analysis`` at the top of the notebook.

    Parameters
    ----------
    gene_list : iterable of str
        Gene names; joined with newlines for the upload.
    num_tfs : int
        Passed as ``term_limit`` for the "Integrated--meanRank" library.

    Returns
    -------
    dict or str
        The parsed JSON response when the enrichment request succeeds,
        otherwise the raw response text of the failed request.

    Raises
    ------
    requests.HTTPError
        If the gene list upload is rejected by the server.
    """
    CHEA_KG = 'https://chea-kg.maayanlab.cloud/api/enrichment'

    description = "insert description here"
    payload = {
        'list': (None, "\n".join(gene_list)),
        'description': (None, description)
    }
    upload = requests.post(f"{CHEA_KG}/addList", files=payload)
    # Surface a failed upload as an HTTP error instead of an opaque
    # JSON-decoding or KeyError failure a few lines further down.
    upload.raise_for_status()
    user_list_id = json.loads(upload.text)['userListId']

    q = {
        'min_lib': 3, # minimum number of libraries that a TF must be ranked in
        'libraries': [
            {'library': "Integrated--meanRank", 'term_limit': num_tfs} # edit term_limit to change number of top-ranked TFs
        ],
        'limit':50, # controls number of edges returned - may cause issues with visualization if too large
        'userListId': user_list_id
    }

    res = requests.post(CHEA_KG, data=json.dumps(q))
    if res.ok:
        return json.loads(res.text)
    # Kept for backward compatibility: a failed query returns the raw body.
    return res.text


def top_tfs(gene_list, num_tfs=5):
    """
    Returns a list of the top N most enriched TFs corresponding to an input gene list.

    Parameters
    ----------
    gene_list : iterable of str
        Gene names forwarded to ``run_chea_kg``.
    num_tfs : int, optional
        Number of top-ranked TFs to request (default 5).

    Returns
    -------
    list
        The ``data.label`` value of every node in the returned subnetwork.

    Raises
    ------
    RuntimeError
        If ``run_chea_kg`` did not return a parsed JSON object, which is what
        it does when the enrichment request fails.
    """
    enriched_tfs = run_chea_kg(gene_list, num_tfs)
    # On failure run_chea_kg hands back the raw response text; indexing that
    # string with "nodes" would raise a confusing TypeError, so report the
    # server message instead.
    if not isinstance(enriched_tfs, dict):
        raise RuntimeError(f"ChEA-KG enrichment request failed: {enriched_tfs}")
    return [node["data"]["label"] for node in enriched_tfs["nodes"]]


# cd_adj_time_pt_comparisons = []
# cd_tf_time_dict_1 = {}
# for i in range(len(time_pt_list) - 1):
#     ctrl_time_pt, case_time_pt = time_pt_dict[i][0], time_pt_dict[i+1][0]
#     cd_adj_time_pt_comparisons.append(f"{case_time_pt} v {ctrl_time_pt}")

#     controls, cases = time_pt_dict[i][1], time_pt_dict[i+1][1]
#     results_df = characteristic_direction(controls, cases)
#     # print(results_df.head())

#     up_genes = up_down_from_characteristic_direction(results_df).up
#     up_list = []
#     for gene in up_genes:
#         if "_" in gene:
#             up_list.append(gene.split("_", 1)[1])
#         else:
#             up_list.append(gene)

#     down_genes = up_down_from_characteristic_direction(results_df).down
#     down_list = []
#     for gene in down_genes:
#         if "_" in gene:
#             down_list.append(gene.split("_", 1)[1])
#         else:
#             down_list.append(gene)

#     print(up_list)
#     print(down_list)
#     print(len(up_list), len(down_list))

#     cd_tf_time_dict_1[i] = (top_tfs(up_list), top_tfs(down_list))
# print(cd_tf_time_dict_1)

In [None]:
# ### [CD] degs from time pt 0 comparisons
# cd_time_pt_0_comparisons = []
# cd_tf_time_dict_2 = {}
# for i in range(1, len(time_pt_list)):
#     ctrl_time_pt, case_time_pt = time_pt_dict[0][0], time_pt_dict[i][0]
#     cd_time_pt_0_comparisons.append(f"{case_time_pt} v {ctrl_time_pt}")

#     controls, cases = time_pt_dict[0][1], time_pt_dict[i][1]
#     results_df = characteristic_direction(controls, cases)

#     up_genes = up_down_from_characteristic_direction(results_df).up
#     up_list = []
#     for gene in up_genes:
#         if "_" in gene:
#             up_list.append(gene.split("_", 1)[1])
#         else:
#             up_list.append(gene)

#     down_genes = up_down_from_characteristic_direction(results_df).down
#     down_list = []
#     for gene in down_genes:
#         if "_" in gene:
#             down_list.append(gene.split("_", 1)[1])
#         else:
#             down_list.append(gene)

#     print(up_list)
#     print(down_list)
#     print(len(up_list), len(down_list))

#     cd_tf_time_dict_2[i-1] = (top_tfs(up_list), top_tfs(down_list))
# print(cd_tf_time_dict_2)

In [None]:
### [limma] degs from adj time pt comparisons
# Labels of the comparisons, in the same order as the keys of limma_tf_time_dict_1.
limma_adj_time_pt_comparisons = []
# Comparison index -> (TFs for up genes, TFs for down genes).
limma_tf_time_dict_1 = {}
for i in range(len(time_pt_list) - 1):
    # Each time point is the "case"; the one right before it is the "control".
    ctrl_time_pt, controls = time_pt_dict[i]
    case_time_pt, cases = time_pt_dict[i + 1]
    limma_adj_time_pt_comparisons.append(f"{case_time_pt} v {ctrl_time_pt}")

    results_df = limma_voom_differential_expression(controls, cases)
    # print(results_df.head())

    # Derive the up/down sets once instead of recomputing them per direction.
    up_down = up_down_from_limma_voom(results_df)
    up_genes, down_genes = up_down.up, up_down.down

    # Keep only the text after the first "_" of each gene id; ids without
    # an underscore are left unchanged.
    up_list = [gene.split("_", 1)[-1] for gene in up_genes]
    down_list = [gene.split("_", 1)[-1] for gene in down_genes]

    print(up_list)
    print(down_list)
    print(len(up_list), len(down_list))

    limma_tf_time_dict_1[i] = (top_tfs(up_list), top_tfs(down_list))
print(limma_tf_time_dict_1)

In [None]:
### [limma] degs from time pt 0 comparisons
# Labels of the comparisons, in the same order as the keys of limma_tf_time_dict_2.
limma_time_pt_0_comparisons = []
# Comparison index (starting at 0) -> (TFs for up genes, TFs for down genes).
limma_tf_time_dict_2 = {}
for i in range(1, len(time_pt_list)):
    # The first time point is always the control; every later one is a case.
    ctrl_time_pt, controls = time_pt_dict[0]
    case_time_pt, cases = time_pt_dict[i]
    limma_time_pt_0_comparisons.append(f"{case_time_pt} v {ctrl_time_pt}")

    results_df = limma_voom_differential_expression(controls, cases)

    # Derive the up/down sets once instead of recomputing them per direction.
    up_down = up_down_from_limma_voom(results_df)
    up_genes, down_genes = up_down.up, up_down.down

    # Keep only the text after the first "_" of each gene id; ids without
    # an underscore are left unchanged.
    up_list = [gene.split("_", 1)[-1] for gene in up_genes]
    down_list = [gene.split("_", 1)[-1] for gene in down_genes]

    print(up_list)
    print(down_list)
    print(len(up_list), len(down_list))

    # Keys are shifted by one so that they start at 0.
    limma_tf_time_dict_2[i-1] = (top_tfs(up_list), top_tfs(down_list))
print(limma_tf_time_dict_2)

In [None]:
def csv_to_gmt(input_csv_list, comparisons, filename):
    """
    Collects the up and down gene lists of each DEG CSV into one GMT-style file.

    For the CSV at position ``i``, two terms are created, named
    ``"<comparisons[i]> up genes"`` and ``"<comparisons[i]> down genes"``.
    Each term is written as one line: the term, two tab characters, then the
    tab-separated genes.

    Parameters
    ----------
    input_csv_list : list
        CSV paths handed to ``up_gene_list`` and ``down_gene_list``.
    comparisons : list
        Comparison labels, parallel to ``input_csv_list``.
    filename : str
        Path of the file to write.

    Returns
    -------
    str
        ``filename``, unchanged.
    """
    gene_sets = {}
    for position, csv_path in enumerate(input_csv_list):
        ups = up_gene_list(csv_path)
        print(len(ups))
        downs = down_gene_list(csv_path)
        print(len(downs))
        label = comparisons[position]
        gene_sets[f"{label} up genes"] = ups
        gene_sets[f"{label} down genes"] = downs

    with open(filename, "w") as gmt_handle:
        for term, genes in gene_sets.items():
            gmt_handle.write(f"{term}\t\t" + "\t".join(genes) + "\n")
    print("FINISHED CONVERTING TO GMT")
    return filename

# deseq2_degs_gmt_1 = csv_to_gmt(deseq2_adj_time_pt_degs, deseq2_adj_time_pt_comparisons, "appyter_deseq2_adj_time_pt_degs.gmt")
# deseq2_degs_gmt_2 = csv_to_gmt(deseq2_time_pt_0_degs, deseq2_time_pt_0_comparisons, "appyter_deseq2_compare_w_time_pt_0_degs.gmt")

In [None]:
def gmt_to_tf_time_dict(gmt_file):
    """
    Converts GMT file containing DEGs to tf_time_dict.

    Each non-blank line is read as tab-separated fields: the term, a
    description field (empty in files written by ``csv_to_gmt``), then the
    genes. Blank lines are skipped and empty gene entries are dropped. Terms
    with no genes get an empty TF list without querying ``top_tfs``.

    Terms are paired in file order (first with second, third with fourth, ...),
    matching the up-then-down order in which ``csv_to_gmt`` writes them. A
    trailing unpaired term is ignored.

    Parameters
    ----------
    gmt_file : str
        Path of the GMT file to read.

    Returns
    -------
    tuple
        ``(tf_time_dict, new_comparisons)`` where ``tf_time_dict`` maps the
        pair index to ``(TFs of first term, TFs of second term)`` and
        ``new_comparisons`` lists the distinct term names with their last two
        space-separated words removed, in order of first appearance.
    """
    with open(gmt_file, 'r') as f:
        lines = f.readlines()

    temp_dict = {}
    for line in lines:
        # A blank line carries no term; indexing into it used to crash.
        if not line.strip():
            continue
        fields = line.rstrip("\r\n").split("\t")
        term = fields[0]
        # fields[1] is the description column; genes start at fields[2].
        candidates = (x.split(',')[0].strip() for x in fields[2:])
        genes = [name for name in candidates if name]
        if not genes:
            # Nothing to enrich: avoid sending an empty gene list to the API.
            temp_dict[term] = []
            continue
        temp_dict[term] = top_tfs(genes, 5)
        print("enriched TFs found")

    comparisons = list(temp_dict)
    # Drop the trailing "<direction> genes" words; keep first-seen order.
    new_comparisons = list(dict.fromkeys(item.rsplit(' ', 2)[0] for item in comparisons))
    print(new_comparisons)

    tf_time_dict = {
        i: (temp_dict[first_term], temp_dict[second_term])
        for i, (first_term, second_term) in enumerate(zip(comparisons[::2], comparisons[1::2]))
    }
    print(tf_time_dict)
    return tf_time_dict, new_comparisons

# print(gmt_to_tf_time_dict(deseq2_degs_gmt_1))
# print(gmt_to_tf_time_dict(deseq2_degs_gmt_2))