# Gene expression re-normalization

We need to renormalize gene expression to reduce the bias caused by chrM (mitochondrial) genes or by amplified MYC expression.

# Outline
- [Introduction](#gene-expression-re-normalization)
- [Setup](#setup)
- [Data](#data)
- [Reproduce problem](#reproduce-chrm-bias)

# Setup

In [None]:
%load_ext autoreload
%autoreload 2
from taigapy import TaigaClient
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from cds import plotting
from mgenepy.utils import helper
#from depmap_omics_upload import tracker as track

tc = TaigaClient()

# Data

In [None]:
%%capture

OmicsExpressionProteinCodingGenesTPMLogp1 = tc.get(name='internal-23q4-ac2b', version=68, file='OmicsExpressionProteinCodingGenesTPMLogp1')

In [None]:
OmicsExpressionProteinCodingGenesTPMLogp1.shape

In [None]:
%%capture

OmicsExpressionAllGenesEffectiveLengthProfile = tc.get(name='internal-23q4-ac2b', version=68, file='OmicsExpressionAllGenesEffectiveLengthProfile')
OmicsExpressionAllGenesTPMLogp1Profile = tc.get(name='internal-23q4-ac2b', version=68, file='OmicsExpressionAllGenesTPMLogp1Profile')
OmicsExpressionGenesExpectedCountProfile = tc.get(name='internal-23q4-ac2b', version=68, file='OmicsExpressionGenesExpectedCountProfile')
OmicsDefaultModelProfiles = tc.get(name='internal-23q4-ac2b', version=68, file='OmicsDefaultModelProfiles')
OmicsDefaultModelConditionProfiles = tc.get(name='internal-23q4-ac2b', version=68, file='OmicsDefaultModelConditionProfiles')


In [None]:
# Lookup table ProfileID -> ModelID, restricted to the RNA profiles.
rna_profiles = OmicsDefaultModelProfiles.query("ProfileType == 'rna'")
profile_to_model_dict = rna_profiles.set_index("ProfileID")["ModelID"].to_dict()

In [None]:
OmicsExpressionProteinCodingGenesTPMLogp1.shape, \
    OmicsExpressionAllGenesEffectiveLengthProfile.shape, \
    OmicsExpressionAllGenesTPMLogp1Profile.shape, \
    OmicsExpressionGenesExpectedCountProfile.shape, \
    OmicsExpressionProteinCodingGenesTPMLogp1.shape

In [None]:
OmicsExpressionAllGenesEffectiveLengthProfile_matched_genes = OmicsExpressionAllGenesEffectiveLengthProfile.loc[:, OmicsExpressionGenesExpectedCountProfile.columns]

In [None]:
OmicsExpressionAllGenesEffectiveLengthProfile_matched_genes.shape, OmicsExpressionGenesExpectedCountProfile.shape

# Reproduce TPM for protein coding genes

In [None]:
def calculate_tpm(counts_df, transcripts_length_df):
    """Recompute log2(TPM + 1) from a count table and a length table.

    Follows the RSEM computation at
    https://github.com/deweylab/RSEM/blob/8bc1e2115493c0cdf3c6bee80ef7a21a91b2acce/WriteResults.h#L77

    Each row is normalised independently: counts are divided by the row
    total, scaled by 1e9 / length to get FPKM, and the FPKM values are then
    rescaled so that every row sums to 1e6.

    Args:
        counts_df: DataFrame of counts; sums are taken along axis=1.
        transcripts_length_df: DataFrame of lengths with the same shape.

    Returns:
        DataFrame holding log2(TPM + 1).

    The number of null values after each step is printed as a diagnostic.
    """
    assert counts_df.shape == transcripts_length_df.shape

    tiny = 1e-300

    def floor_to_one(values):
        # Anything that is not strictly greater than `tiny` becomes 1, so the
        # divisions below never divide by zero.
        return values.where(values > tiny, 1)

    def count_nulls(obj):
        return obj.isnull().sum().sum()

    row_totals = floor_to_one(counts_df.sum(axis=1))
    print(count_nulls(row_totals))
    read_fraction = counts_df.div(row_totals, axis=0)
    print(count_nulls(read_fraction))

    safe_lengths = floor_to_one(transcripts_length_df)
    fpkm = read_fraction * 1e9 / safe_lengths
    print(count_nulls(fpkm))

    fpkm_totals = floor_to_one(fpkm.sum(axis=1))
    print(count_nulls(fpkm_totals))

    tpm_df = fpkm.div(fpkm_totals, axis=0) * 1e6
    print(count_nulls(tpm_df))
    return np.log2(tpm_df + 1)

In [None]:
OmicsExpressionGenesExpectedCountProfile_tpm = calculate_tpm(OmicsExpressionGenesExpectedCountProfile, OmicsExpressionAllGenesEffectiveLengthProfile_matched_genes)

In [None]:
mybiomart = helper.generateGeneNames(ensemble_server="http://nov2020.archive.ensembl.org/biomart", useCache=False)

In [None]:
# Renaming table for protein-coding genes that have an Entrez ID:
#   "SYMBOL (ensembl gene id)" -> "SYMBOL (entrez gene id)"
protein_coding = mybiomart[
    mybiomart.entrezgene_id.notna() & (mybiomart.gene_biotype == "protein_coding")
]

protcod_rename = {}
for symbol, ensembl_id, entrez_id in zip(
    protein_coding.hgnc_symbol,
    protein_coding.ensembl_gene_id,
    protein_coding.entrezgene_id,
):
    # NOTE(review): when several rows share the same symbol and Ensembl ID the
    # last row wins. A guard of the form `ensembl_gene_id not in protcod_rename`
    # cannot prevent that, because the keys are "SYMBOL (ENSG...)" strings and
    # never equal a bare Ensembl ID. Confirm whether first-wins was intended.
    protcod_rename[symbol + " (" + ensembl_id + ")"] = (
        symbol + " (" + str(int(entrez_id)) + ")"
    )

In [None]:
protcod_rename

In [None]:
OmicsExpressionGenesExpectedCountProfile_tpm.columns = OmicsExpressionGenesExpectedCountProfile_tpm.columns.map(protcod_rename)
OmicsExpressionGenesExpectedCountProfile_tpm = OmicsExpressionGenesExpectedCountProfile_tpm.loc[:, ~OmicsExpressionGenesExpectedCountProfile_tpm.columns.isnull()]

In [None]:
OmicsExpressionAllGenesTPMLogp1Profile.columns = OmicsExpressionAllGenesTPMLogp1Profile.columns.map(protcod_rename)
OmicsExpressionAllGenesTPMLogp1Profile = OmicsExpressionAllGenesTPMLogp1Profile.loc[:, ~OmicsExpressionAllGenesTPMLogp1Profile.columns.isnull()]

In [None]:
from scipy.stats import pearsonr

# Per-gene Pearson correlation between the recomputed TPM table and the
# reference TPM table (one coefficient per column of the reference table).
reference_tpm = OmicsExpressionAllGenesTPMLogp1Profile
recomputed_tpm = OmicsExpressionGenesExpectedCountProfile_tpm

# pearsonr pairs observations by position and ignores the index, so both
# tables must list their rows in the same order before they are compared.
if not recomputed_tpm.index.equals(reference_tpm.index):
    recomputed_tpm = recomputed_tpm.loc[reference_tpm.index]

tpm_correlation_list = [
    pearsonr(recomputed_tpm.loc[:, gene], reference_tpm.loc[:, gene])[0]
    for gene in reference_tpm.columns
]

In [None]:
np.isnan(tpm_correlation_list).sum(), np.min(tpm_correlation_list), np.mean(tpm_correlation_list)

In [None]:
sns.kdeplot(tpm_correlation_list)

In [None]:
OmicsExpressionGenesExpectedCountProfile_tpm.isnull().sum().sum()

In [None]:
OmicsExpressionGenesExpectedCountProfile_tpm.loc[:, "Model"] = OmicsExpressionGenesExpectedCountProfile_tpm.index.map(profile_to_model_dict)
OmicsExpressionGenesExpectedCountModel_tpm = OmicsExpressionGenesExpectedCountProfile_tpm.set_index("Model")
OmicsExpressionGenesExpectedCountModel_tpm = OmicsExpressionGenesExpectedCountModel_tpm.loc[OmicsExpressionProteinCodingGenesTPMLogp1.index, :]

In [None]:
OmicsExpressionGenesExpectedCountProfile_tpm.shape

# Reproduce chrM bias

In [None]:
sample_median_expression = OmicsExpressionProteinCodingGenesTPMLogp1.median(axis=1)
sample_median_expression_rank = sample_median_expression.rank()

In [None]:
# Locate the samples with the highest and the lowest median expression.
# argmax()/argmin() return integer positions, so the values are read back with
# .iloc: these Series are indexed by model ID, and plain [] with an integer
# on such a Series is a deprecated positional fallback.
top_1_sample_index = sample_median_expression.argmax()
top_1_sample_rank = sample_median_expression_rank.iloc[top_1_sample_index]
top_1_sample = sample_median_expression.index[top_1_sample_index]
top_1_sample_expression = sample_median_expression.iloc[top_1_sample_index]
bottom_1_sample_index = sample_median_expression.argmin()
bottom_1_sample_rank = sample_median_expression_rank.iloc[bottom_1_sample_index]
bottom_1_sample = sample_median_expression.index[bottom_1_sample_index]
bottom_1_sample_expression = sample_median_expression.iloc[bottom_1_sample_index]

# Median expression against its rank, with the two extreme samples annotated.
fig, ax = plt.subplots()
fig.set_size_inches(8, 5)
sns.scatterplot(y=sample_median_expression, x=sample_median_expression_rank, ax=ax, s=1)
ax.set_xlabel("Rank")
ax.set_ylabel("Median gene-level expression for LogTPM+1")
ax.annotate(top_1_sample, xy=(top_1_sample_rank, top_1_sample_expression), xytext=(0, 2), ha='center', textcoords='offset points', va='bottom')
ax.annotate(bottom_1_sample, xy=(bottom_1_sample_rank, bottom_1_sample_expression), 
            xytext=(bottom_1_sample_rank+50, 2), ha='center', va='bottom', textcoords='offset points', 
                arrowprops = dict(arrowstyle="simple", facecolor='red'))

sns.despine()

In [None]:
plotting.waterfall_plot(sample_median_expression, s=1)

In [None]:
new_sample_median_expression = OmicsExpressionGenesExpectedCountModel_tpm.median(axis=1)
plotting.waterfall_plot(new_sample_median_expression, s=1)

In [None]:
new_sample_median_expression.head()

In [None]:
fig, ax = plt.subplots(1, 2)
fig.set_size_inches(16, 5)
chrM_labels = OmicsExpressionProteinCodingGenesTPMLogp1.columns[OmicsExpressionProteinCodingGenesTPMLogp1.columns.str.contains("^MT-")]

# One panel per x-axis model, each plotted against ACH-000904, with the
# "MT-" columns passed as label_specific.
for panel, x_model in zip(ax, ("ACH-001386", "ACH-001388")):
    highest_sample = OmicsExpressionProteinCodingGenesTPMLogp1.loc[x_model, :]
    lowest_sample = OmicsExpressionProteinCodingGenesTPMLogp1.loc["ACH-000904", :]
    plotting.density_scatter(
        highest_sample,
        lowest_sample,
        ax=panel,
        label_specific=chrM_labels,
    )
    sns.despine()

    panel.set_xlabel(f"Expression in {x_model}")
    panel.set_ylabel("Expression in ACH-000904")
    panel.set_xlim(0, 18)
    panel.set_ylim(0, 18)

In [None]:
fig, ax = plt.subplots(1, 2)
fig.set_size_inches(16, 5)
chrM_labels = OmicsExpressionGenesExpectedCountModel_tpm.columns[OmicsExpressionGenesExpectedCountModel_tpm.columns.str.contains("^MT-")]

# Same two comparisons against ACH-000904, this time on the recomputed
# table OmicsExpressionGenesExpectedCountModel_tpm.
for panel, x_model in zip(ax, ("ACH-001386", "ACH-001388")):
    highest_sample = OmicsExpressionGenesExpectedCountModel_tpm.loc[x_model, :]
    lowest_sample = OmicsExpressionGenesExpectedCountModel_tpm.loc["ACH-000904", :]
    plotting.density_scatter(
        highest_sample,
        lowest_sample,
        ax=panel,
        label_specific=chrM_labels,
    )
    sns.despine()

    panel.set_xlabel(f"Expression in {x_model}")
    panel.set_ylabel("Expression in ACH-000904")
    panel.set_xlim(0, 18)
    panel.set_ylim(0, 18)

In [None]:
# For each table, draw one figure comparing the per-row median over all
# columns with the per-row median over the chrM_labels columns.
for expression in (
    OmicsExpressionProteinCodingGenesTPMLogp1,
    OmicsExpressionGenesExpectedCountModel_tpm,
):
    fig, ax = plt.subplots()
    fig.set_size_inches(8, 5)
    all_gene_median = expression.median(axis=1)
    mitochondrial_median = expression.loc[:, chrM_labels].median(axis=1)
    plotting.density_scatter(all_gene_median, mitochondrial_median, ax=ax)
    ax.set_ylabel("Mitochondrial median expression")
    ax.set_xlabel("All gene median expression")

# Apply the correction

In [None]:
def calculate_tpm_correct_chrM_log(counts_df, transcripts_length_df):
    """Compute log2(TPM + 1) with "MT-" columns left out of the normalising sums.

    Based on the RSEM computation at
    https://github.com/deweylab/RSEM/blob/8bc1e2115493c0cdf3c6bee80ef7a21a91b2acce/WriteResults.h#L77

    Columns whose name starts with "MT-" still receive a value, but they do
    not contribute to the per-row count total nor to the per-row FPKM total
    used for scaling.

    Args:
        counts_df: DataFrame of counts; sums are taken along axis=1.
        transcripts_length_df: DataFrame of lengths with the same shape.

    Returns:
        DataFrame holding log2(TPM + 1).

    The number of "MT-" columns and the number of null values after each step
    are printed as diagnostics.
    """
    assert counts_df.shape == transcripts_length_df.shape
    EPSILON = 1e-300

    def floor_to_one(values):
        # Anything that is not strictly greater than EPSILON becomes 1, so the
        # divisions below never divide by zero.
        return values.where(values > EPSILON, 1)

    def count_nulls(obj):
        return obj.isnull().sum().sum()

    is_mito = counts_df.columns.str.contains("^MT-")
    print(is_mito.sum())
    not_mito = ~is_mito

    row_totals = floor_to_one(counts_df.loc[:, not_mito].sum(axis=1))
    print(count_nulls(row_totals))
    read_fraction = counts_df.div(row_totals, axis=0)
    print(count_nulls(read_fraction))

    safe_lengths = floor_to_one(transcripts_length_df)
    fpkm = read_fraction * 1e9 / safe_lengths
    print(count_nulls(fpkm))

    fpkm_totals = floor_to_one(fpkm.loc[:, not_mito].sum(axis=1))
    print(count_nulls(fpkm_totals))

    tpm_df = fpkm.div(fpkm_totals, axis=0) * 1e6
    print(count_nulls(tpm_df))
    return np.log2(tpm_df + 1)

In [None]:
# Recompute log2(TPM + 1) with the "MT-" columns left out of the normalising sums.
OmicsExpressionGenesExpectedCountProfile_tpm_chrM = calculate_tpm_correct_chrM_log(OmicsExpressionGenesExpectedCountProfile, 
                                                                               OmicsExpressionAllGenesEffectiveLengthProfile_matched_genes)
# Rename columns through protcod_rename; columns without an entry map to null and are dropped.
OmicsExpressionGenesExpectedCountProfile_tpm_chrM.columns = OmicsExpressionGenesExpectedCountProfile_tpm_chrM.columns.map(protcod_rename)
OmicsExpressionGenesExpectedCountProfile_tpm_chrM = OmicsExpressionGenesExpectedCountProfile_tpm_chrM.loc[:, ~OmicsExpressionGenesExpectedCountProfile_tpm_chrM.columns.isnull()]

# Add the model of each profile as a "Model" column (this column stays on the
# profile-level table), then build a copy indexed by model.
OmicsExpressionGenesExpectedCountProfile_tpm_chrM.loc[:, "Model"] = OmicsExpressionGenesExpectedCountProfile_tpm_chrM.index.map(profile_to_model_dict)
OmicsExpressionGenesExpectedCountModel_tpm_chrM = OmicsExpressionGenesExpectedCountProfile_tpm_chrM.set_index("Model")
# Select the rows of OmicsExpressionProteinCodingGenesTPMLogp1, in the same order.
OmicsExpressionGenesExpectedCountModel_tpm_chrM = OmicsExpressionGenesExpectedCountModel_tpm_chrM.loc[OmicsExpressionProteinCodingGenesTPMLogp1.index, :]

In [None]:
new_sample_median_expression = OmicsExpressionGenesExpectedCountModel_tpm_chrM.median(axis=1)
plotting.waterfall_plot(new_sample_median_expression, s=1)

In [None]:
chrM_labels = OmicsExpressionGenesExpectedCountModel_tpm.columns[OmicsExpressionGenesExpectedCountModel_tpm.columns.str.contains("^MT-")]

fig, ax = plt.subplots()
fig.set_size_inches(8, 5)
plotting.density_scatter(OmicsExpressionGenesExpectedCountModel_tpm_chrM.loc[:, :].median(axis=1),
                         OmicsExpressionGenesExpectedCountModel_tpm_chrM.loc[:, chrM_labels].median(axis=1),
                         ax=ax)
ax.set_ylabel("Mitochondrial median expression")
ax.set_xlabel("All gene median expression")

fig, ax = plt.subplots()
fig.set_size_inches(8, 5)
plotting.density_scatter(OmicsExpressionAllGenesTPMLogp1Profile.loc[:, :].median(axis=1),
                         OmicsExpressionAllGenesTPMLogp1Profile.loc[:, chrM_labels].median(axis=1),
                         ax=ax)
ax.set_ylabel("Mitochondrial median expression")
ax.set_xlabel("All gene median expression")

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(8, 5)

highest_sample = OmicsExpressionGenesExpectedCountModel_tpm_chrM.loc['ACH-001388', :]
lowest_sample = OmicsExpressionGenesExpectedCountModel_tpm_chrM.loc['ACH-000904', :]
plotting.density_scatter(highest_sample,
                         lowest_sample, ax=ax,
                         label_specific=chrM_labels)
sns.despine()
                         
ax.set_xlabel("Expression in ACH-001388")
ax.set_ylabel("Expression in ACH-000904")
#ax[0].set_xlim(0, 18)
#ax[0].set_ylim(0, 18)

# fig, ax = plt.subplots()
# fig.set_size_inches(8, 5)
highest_sample = OmicsExpressionGenesExpectedCountModel_tpm.loc['ACH-001388', :]
lowest_sample = OmicsExpressionGenesExpectedCountModel_tpm.loc['ACH-000904', :]
plotting.density_scatter(highest_sample,
                         lowest_sample, ax=ax,
                         trend_line_args={'color': 'b'},
                         label_specific=chrM_labels)
sns.despine()
                       
ax.set_xlabel("Expression in ACH-001388")
ax.set_ylabel("Expression in ACH-000904")
#ax[1].set_xlim(0, 18)
#ax[1].set_ylim(0, 18)

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(8, 5)

highest_sample = OmicsExpressionGenesExpectedCountModel_tpm_chrM.loc['ACH-001388', :]
lowest_sample = OmicsExpressionGenesExpectedCountModel_tpm_chrM.loc['ACH-000904', :]
plotting.density_scatter(highest_sample,
                         lowest_sample, ax=ax,
                         label_specific=chrM_labels)
sns.despine()
                         
ax.set_xlabel("Expression in ACH-001388")
ax.set_ylabel("Expression in ACH-000904")
#ax[0].set_xlim(0, 18)
#ax[0].set_ylim(0, 18)

fig, ax = plt.subplots()
fig.set_size_inches(8, 5)
highest_sample = OmicsExpressionGenesExpectedCountModel_tpm.loc['ACH-001388', :]
lowest_sample = OmicsExpressionGenesExpectedCountModel_tpm.loc['ACH-000904', :]
plotting.density_scatter(highest_sample,
                         lowest_sample, ax=ax,
                         trend_line_args={'color': 'b'},
                         label_specific=chrM_labels)
sns.despine()
                       
ax.set_xlabel("Expression in ACH-001388")
ax.set_ylabel("Expression in ACH-000904")
#ax[1].set_xlim(0, 18)
#ax[1].set_ylim(0, 18)

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(8, 5)

# Draw ACH-000399 against ACH-003106 twice on the same axes: first from the
# chrM-corrected table, then from the uncorrected table (the second call
# passes trend_line_args with colour 'b').
for expression, extra_args in (
    (OmicsExpressionGenesExpectedCountModel_tpm_chrM, {}),
    (OmicsExpressionGenesExpectedCountModel_tpm, {"trend_line_args": {'color': 'b'}}),
):
    highest_sample = expression.loc['ACH-000399', :]
    lowest_sample = expression.loc['ACH-003106', :]
    plotting.density_scatter(highest_sample,
                             lowest_sample, ax=ax,
                             label_specific=chrM_labels,
                             **extra_args)
    sns.despine()

# The x-axis label must name the model that is actually plotted (ACH-000399).
ax.set_xlabel("Expression in ACH-000399")
ax.set_ylabel("Expression in ACH-003106")

## Upload the new version of the dataset

In [None]:
OmicsExpressionGenesExpectedCountModel_tpm_chrM.to_csv("OmicsExpressionGenesExpectedCountModel_tpm_chrM.csv")

In [None]:
update = True
if update:
   new_dataset_id = tc.update_dataset(
        dataset_id="post-23q4-gene-expression-renormalization-chrm-8a6b",
        changes_description="post 23Q4 adjust gene expression normalization 2",
        upload_files=[
            {
                "path": "OmicsExpressionGenesExpectedCountModel_tpm_chrM.csv",
                "name": "OmicsExpressionGenesExpectedCountModel_tpm_chrM", # optional, will use file name if not provided
                "format": "NumericMatrixCSV", # or "NumericMatrixCSV" or "TableCSV"
                "encoding": "utf-8" # optional (but recommended), will use iso-8859-1 if not provided
            }
        ],
        upload_async=False,
        dataset_description="post 23Q4 adjust gene expression normalization",
   )
else:
   new_dataset_id = tc.create_dataset(
       "post-23q4-gene-expression-renormalization-chrM",
       dataset_description="post 23Q4 adjust gene expression normalization",
       upload_files=[
           {
               "path": "OmicsExpressionGenesExpectedCountModel_tpm_chrM.csv",
               "name": "OmicsExpressionGenesExpectedCountModel_tpm_chrM", # optional, will use file name if not provided
               "format": "NumericMatrixCSV", # or "NumericMatrixCSV" or "TableCSV"
               "encoding": "utf-8" # optional (but recommended), will use iso-8859-1 if not provided
           }
       ],
       folder_id="a9eedc220a6a4e70b8f1e64d2e57ed87", # optional, will default to your home folder if not provided
   )

new_dataset_id

# Square root normalization

In [None]:
def calculate_tpm_correct_chrM(counts_df, transcripts_length_df, chrM=True):
    """Compute sqrt(TPM + 1), optionally leaving "MT-" columns out of the sums.

    Based on the RSEM computation at
    https://github.com/deweylab/RSEM/blob/8bc1e2115493c0cdf3c6bee80ef7a21a91b2acce/WriteResults.h#L77

    Args:
        counts_df: DataFrame of counts; sums are taken along axis=1.
        transcripts_length_df: DataFrame of lengths with the same shape.
        chrM: When True, columns whose name starts with "MT-" are excluded
            from the per-row count total and from the per-row FPKM total
            (they still receive a value). When False, every column
            contributes to both totals.

    Returns:
        DataFrame holding sqrt(TPM + 1).

    The number of "MT-" columns and the number of null values after each step
    are printed as diagnostics.
    """
    assert counts_df.shape == transcripts_length_df.shape
    EPSILON = 1e-300

    def floor_to_one(values):
        # Anything that is not strictly greater than EPSILON becomes 1, so the
        # divisions below never divide by zero.
        return values.where(values > EPSILON, 1)

    def count_nulls(obj):
        return obj.isnull().sum().sum()

    is_mito = counts_df.columns.str.contains("^MT-")
    print(is_mito.sum())

    # Boolean column mask selecting what enters the two normalising sums.
    in_sums = ~is_mito if chrM else np.ones(len(is_mito), dtype=bool)

    row_totals = floor_to_one(counts_df.loc[:, in_sums].sum(axis=1))
    print(count_nulls(row_totals))
    read_fraction = counts_df.div(row_totals, axis=0)
    print(count_nulls(read_fraction))

    safe_lengths = floor_to_one(transcripts_length_df)
    fpkm = read_fraction * 1e9 / safe_lengths
    print(count_nulls(fpkm))

    fpkm_totals = floor_to_one(fpkm.loc[:, in_sums].sum(axis=1))
    print(count_nulls(fpkm_totals))

    tpm_df = fpkm.div(fpkm_totals, axis=0) * 1e6
    print(count_nulls(tpm_df))
    return np.sqrt(tpm_df + 1)

In [None]:
# sqrt(TPM + 1) with chrM=False, i.e. every column contributes to the normalising sums.
OmicsExpressionGenesExpectedCountProfile_tpm_sqrt = calculate_tpm_correct_chrM(OmicsExpressionGenesExpectedCountProfile, 
                                                                               OmicsExpressionAllGenesEffectiveLengthProfile_matched_genes, chrM=False)
# Rename columns through protcod_rename; columns without an entry map to null and are dropped.
OmicsExpressionGenesExpectedCountProfile_tpm_sqrt.columns = OmicsExpressionGenesExpectedCountProfile_tpm_sqrt.columns.map(protcod_rename)
OmicsExpressionGenesExpectedCountProfile_tpm_sqrt = OmicsExpressionGenesExpectedCountProfile_tpm_sqrt.loc[:, ~OmicsExpressionGenesExpectedCountProfile_tpm_sqrt.columns.isnull()]

# Add the model of each profile as a "Model" column (this column stays on the
# profile-level table), then build a copy indexed by model.
OmicsExpressionGenesExpectedCountProfile_tpm_sqrt.loc[:, "Model"] = OmicsExpressionGenesExpectedCountProfile_tpm_sqrt.index.map(profile_to_model_dict)
OmicsExpressionGenesExpectedCountModel_tpm_sqrt = OmicsExpressionGenesExpectedCountProfile_tpm_sqrt.set_index("Model")
# Select the rows of OmicsExpressionProteinCodingGenesTPMLogp1, in the same order.
OmicsExpressionGenesExpectedCountModel_tpm_sqrt = OmicsExpressionGenesExpectedCountModel_tpm_sqrt.loc[OmicsExpressionProteinCodingGenesTPMLogp1.index, :]

In [None]:
# sqrt(TPM + 1) with the default chrM=True, i.e. "MT-" columns are left out of the normalising sums.
# NOTE: this rebinds OmicsExpressionGenesExpectedCountProfile_tpm_chrM and
# OmicsExpressionGenesExpectedCountModel_tpm_chrM, which held log2(TPM + 1)
# values from calculate_tpm_correct_chrM_log earlier in the notebook; from
# here on they are on the square-root scale.
OmicsExpressionGenesExpectedCountProfile_tpm_chrM = calculate_tpm_correct_chrM(OmicsExpressionGenesExpectedCountProfile, 
                                                                               OmicsExpressionAllGenesEffectiveLengthProfile_matched_genes)
# Rename columns through protcod_rename; columns without an entry map to null and are dropped.
OmicsExpressionGenesExpectedCountProfile_tpm_chrM.columns = OmicsExpressionGenesExpectedCountProfile_tpm_chrM.columns.map(protcod_rename)
OmicsExpressionGenesExpectedCountProfile_tpm_chrM = OmicsExpressionGenesExpectedCountProfile_tpm_chrM.loc[:, ~OmicsExpressionGenesExpectedCountProfile_tpm_chrM.columns.isnull()]

# Add the model of each profile as a "Model" column (this column stays on the
# profile-level table), then build a copy indexed by model.
OmicsExpressionGenesExpectedCountProfile_tpm_chrM.loc[:, "Model"] = OmicsExpressionGenesExpectedCountProfile_tpm_chrM.index.map(profile_to_model_dict)
OmicsExpressionGenesExpectedCountModel_tpm_chrM = OmicsExpressionGenesExpectedCountProfile_tpm_chrM.set_index("Model")
# Select the rows of OmicsExpressionProteinCodingGenesTPMLogp1, in the same order.
OmicsExpressionGenesExpectedCountModel_tpm_chrM = OmicsExpressionGenesExpectedCountModel_tpm_chrM.loc[OmicsExpressionProteinCodingGenesTPMLogp1.index, :]

In [None]:
chrM_labels = OmicsExpressionGenesExpectedCountModel_tpm_chrM.columns[OmicsExpressionGenesExpectedCountModel_tpm_chrM.columns.str.contains("^MT-")]

fig, ax = plt.subplots()
fig.set_size_inches(8, 5)
plotting.density_scatter(OmicsExpressionGenesExpectedCountModel_tpm_chrM.loc[:, :].median(axis=1),
                         OmicsExpressionGenesExpectedCountModel_tpm_chrM.loc[:, chrM_labels].median(axis=1),
                         ax=ax)
ax.set_ylabel("Mitochondrial median expression")
ax.set_xlabel("All gene median expression")
#ax.set_ylim(0, 200)

fig, ax = plt.subplots()
fig.set_size_inches(8, 5)
plotting.density_scatter(OmicsExpressionAllGenesTPMLogp1Profile.loc[:, :].median(axis=1),
                         OmicsExpressionAllGenesTPMLogp1Profile.loc[:, chrM_labels].median(axis=1),
                         ax=ax)
ax.set_ylabel("Mitochondrial median expression")
ax.set_xlabel("All gene median expression")

In [None]:
chrM_labels = OmicsExpressionGenesExpectedCountModel_tpm_sqrt.columns[OmicsExpressionGenesExpectedCountModel_tpm_sqrt.columns.str.contains("^MT-")]

fig, ax = plt.subplots()
fig.set_size_inches(8, 5)
plotting.density_scatter(OmicsExpressionGenesExpectedCountModel_tpm_sqrt.loc[:, :].median(axis=1),
                         OmicsExpressionGenesExpectedCountModel_tpm_sqrt.loc[:, chrM_labels].median(axis=1),
                         ax=ax)
ax.set_ylabel("Mitochondrial median expression")
ax.set_xlabel("All gene median expression")
#ax.set_ylim(0, 200)

fig, ax = plt.subplots()
fig.set_size_inches(8, 5)
plotting.density_scatter(OmicsExpressionAllGenesTPMLogp1Profile.loc[:, :].median(axis=1),
                         OmicsExpressionAllGenesTPMLogp1Profile.loc[:, chrM_labels].median(axis=1),
                         ax=ax)
ax.set_ylabel("Mitochondrial median expression")
ax.set_xlabel("All gene median expression")

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(8, 5)

highest_sample = OmicsExpressionGenesExpectedCountModel_tpm_chrM.loc['ACH-001388', :]
lowest_sample = OmicsExpressionGenesExpectedCountModel_tpm_chrM.loc['ACH-000904', :]
plotting.density_scatter(highest_sample,
                         lowest_sample, ax=ax,
                         label_specific=chrM_labels)
sns.despine()
                         
ax.set_xlabel("Expression in ACH-001388")
ax.set_ylabel("Expression in ACH-000904")
#ax[0].set_xlim(0, 18)
#ax[0].set_ylim(0, 18)

fig, ax = plt.subplots()
fig.set_size_inches(8, 5)
highest_sample = OmicsExpressionGenesExpectedCountModel_tpm.loc['ACH-001388', :]
lowest_sample = OmicsExpressionGenesExpectedCountModel_tpm.loc['ACH-000904', :]
plotting.density_scatter(highest_sample,
                         lowest_sample, ax=ax,
                         trend_line_args={'color': 'b'},
                         label_specific=chrM_labels)
sns.despine()
                       
ax.set_xlabel("Expression in ACH-001388")
ax.set_ylabel("Expression in ACH-000904")
#ax[1].set_xlim(0, 18)
#ax[1].set_ylim(0, 18)

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(8, 5)

highest_sample = OmicsExpressionGenesExpectedCountModel_tpm_sqrt.loc['ACH-001388', :]
lowest_sample = OmicsExpressionGenesExpectedCountModel_tpm_sqrt.loc['ACH-000904', :]
plotting.density_scatter(highest_sample,
                         lowest_sample, ax=ax,
                         label_specific=chrM_labels)
sns.despine()
                         
ax.set_xlabel("Expression in ACH-001388")
ax.set_ylabel("Expression in ACH-000904")
#ax[0].set_xlim(0, 18)
#ax[0].set_ylim(0, 18)

fig, ax = plt.subplots()
fig.set_size_inches(8, 5)
highest_sample = OmicsExpressionGenesExpectedCountModel_tpm.loc['ACH-001388', :]
lowest_sample = OmicsExpressionGenesExpectedCountModel_tpm.loc['ACH-000904', :]
plotting.density_scatter(highest_sample,
                         lowest_sample, ax=ax,
                         trend_line_args={'color': 'b'},
                         label_specific=chrM_labels)
sns.despine()
                       
ax.set_xlabel("Expression in ACH-001388")
ax.set_ylabel("Expression in ACH-000904")
#ax[1].set_xlim(0, 18)
#ax[1].set_ylim(0, 18)

In [None]:
OmicsExpressionGenesExpectedCountModel_tpm_chrM.to_csv("OmicsExpressionGenesExpectedCountModel_tpm_chrM_sqrt.csv")

In [None]:
update = True
if update:
   new_dataset_id = tc.update_dataset(
        dataset_id="post-23q4-gene-expression-renormalization-chrm-sqrt-e82f",
        changes_description="post 23Q4 adjust gene expression normalization square root normalization after removing chrM from denomitor",
        upload_files=[
            {
                "path": "OmicsExpressionGenesExpectedCountModel_tpm_chrM_sqrt.csv",
                "name": "OmicsExpressionGenesExpectedCountModel_tpm_chrM_sqrt", # optional, will use file name if not provided
                "format": "NumericMatrixCSV", # or "NumericMatrixCSV" or "TableCSV"
                "encoding": "utf-8" # optional (but recommended), will use iso-8859-1 if not provided
            }
        ],
        upload_async=False,
        dataset_description="post 23Q4 adjust gene expression normalization after removing chrM",
   )
else:
   new_dataset_id = tc.create_dataset(
       "post-23q4-gene-expression-renormalization-chrM-sqrt",
       dataset_description="post 23Q4 adjust gene expression normalization",
       upload_files=[
           {
               "path": "OmicsExpressionGenesExpectedCountModel_tpm_chrM_sqrt.csv",
               "name": "OmicsExpressionGenesExpectedCountModel_tpm_chrM_sqrt", # optional, will use file name if not provided
               "format": "NumericMatrixCSV", # or "NumericMatrixCSV" or "TableCSV"
               "encoding": "utf-8" # optional (but recommended), will use iso-8859-1 if not provided
           }
       ],
       folder_id="a9eedc220a6a4e70b8f1e64d2e57ed87", # optional, will default to your home folder if not provided
   )

new_dataset_id

In [None]:
OmicsExpressionGenesExpectedCountModel_tpm_sqrt.to_csv("OmicsExpressionGenesExpectedCountModel_tpm_sqrt.csv")

In [None]:
# Upload OmicsExpressionGenesExpectedCountModel_tpm_sqrt.csv as a new dataset.
# With update = False only the create_dataset branch runs.
update = False
if update:
   # NOTE(review): this branch is not taken. It targets the chrM-sqrt dataset
   # and uploads the *_tpm_chrM_sqrt.csv file, whereas the create branch below
   # uploads *_tpm_sqrt.csv -- it looks copied from another upload cell.
   # Fix the dataset_id, path and name before setting update = True.
   new_dataset_id = tc.update_dataset(
        dataset_id="post-23q4-gene-expression-renormalization-chrm-sqrt-e82f",
        changes_description="post 23Q4 adjust gene expression normalization square root normalization after removing chrM from denomitor",
        upload_files=[
            {
                "path": "OmicsExpressionGenesExpectedCountModel_tpm_chrM_sqrt.csv",
                "name": "OmicsExpressionGenesExpectedCountModel_tpm_chrM_sqrt", # optional, will use file name if not provided
                "format": "NumericMatrixCSV", # or "TableCSV"
                "encoding": "utf-8" # optional (but recommended), will use iso-8859-1 if not provided
            }
        ],
        upload_async=False,
        dataset_description="post 23Q4 adjust gene expression normalization after removing chrM",
   )
else:
   new_dataset_id = tc.create_dataset(
       "post-23q4-gene-expression-renormalization-sqrt",
       dataset_description="post 23Q4 adjust gene expression normalization",
       upload_files=[
           {
               "path": "OmicsExpressionGenesExpectedCountModel_tpm_sqrt.csv",
               "name": "OmicsExpressionGenesExpectedCountModel_tpm_sqrt", # optional, will use file name if not provided
               "format": "NumericMatrixCSV", # or "TableCSV"
               "encoding": "utf-8" # optional (but recommended), will use iso-8859-1 if not provided
           }
       ],
       folder_id="a9eedc220a6a4e70b8f1e64d2e57ed87", # optional, will default to your home folder if not provided
   )

# Display the value returned by the upload call.
new_dataset_id

# Aggregate read count table for testing

In [None]:
import pandas as pd
import dalmatian as dm

In [None]:
wm = dm.WorkspaceManager("broad-firecloud-ccle/DEV_DepMap_hg38_RNAseq")
terra_rnaseq_df = wm.get_samples()

In [None]:
def load_rnaseqc(terra_path):
    """Load one tab-separated gene count file and index it by gene label.

    The first two lines of the file are skipped. The remaining table must
    contain a "Name" and a "Description" column; every row is labelled
    "<Description> (<Name up to the first '.'>)" and both columns are then
    dropped, leaving only the count column(s).

    Args:
        terra_path: Path, URL or file-like object accepted by pandas.read_csv.

    Returns:
        DataFrame of the remaining columns, indexed by the gene label.
    """
    rnaseqc_count_df = pd.read_csv(terra_path, sep='\t', skiprows=2)
    # Address the two columns by name: integer keys on a label-indexed row
    # (x[0], x[1]) depend on a deprecated positional fallback in pandas.
    # Building the labels in one pass also avoids a row-wise DataFrame.apply.
    # NOTE(review): assumes "Name" and "Description" are the first two
    # columns, as the positional access did -- confirm against the files.
    gene_labels = [
        f"{description} ({name.split('.')[0]})"
        for name, description in zip(rnaseqc_count_df["Name"], rnaseqc_count_df["Description"])
    ]
    rnaseqc_count_df = rnaseqc_count_df.set_index(pd.Index(gene_labels))
    return rnaseqc_count_df.drop(["Name", "Description"], axis=1)

In [None]:
from multiprocessing import Pool

# Load every file listed in terra_rnaseq_df.rnaseqc2_gene_counts with 8 worker
# processes. The context manager shuts the workers down when the block exits,
# including when one of the loads raises.
with Pool(8) as pool:
    rnaseqc_count_dfs = pool.map(load_rnaseqc, terra_rnaseq_df.rnaseqc2_gene_counts)

In [None]:
pool.close()

In [None]:
rnaseqc_count_mat = pd.concat(rnaseqc_count_dfs, axis=1)

In [None]:
rnaseqc_count_mat = rnaseqc_count_mat.T

In [None]:
rnaseqc_count_mat.shape

In [None]:
import pickle
with open("/home/ubuntu/pr_table.pkl", "rb") as input_file:
    pr_dict = pickle.load(input_file)

In [None]:
rnaseqc_count_mat.index = rnaseqc_count_mat.index.map(pr_dict)

In [None]:
rnaseqc_count_mat.head()

In [None]:
rnaseqc_count_mat = rnaseqc_count_mat.loc[~rnaseqc_count_mat.index.isnull(), :]

In [None]:
rnaseqc_count_mat.shape

In [None]:
rnaseqc_count_lengths = OmicsExpressionAllGenesEffectiveLengthProfile.loc[:, np.intersect1d(rnaseqc_count_mat.columns, OmicsExpressionAllGenesEffectiveLengthProfile.columns)]

In [None]:
rnaseqc_count_mat.shape, rnaseqc_count_lengths.shape

In [None]:
rnaseqc_count_lengths = rnaseqc_count_lengths.loc[np.intersect1d(rnaseqc_count_mat.index, rnaseqc_count_lengths.index), :]

In [None]:
rnaseqc_count_lengths.shape

In [None]:
rnaseqc_count_mat = rnaseqc_count_mat.loc[rnaseqc_count_lengths.index, :]

In [None]:
rnaseqc_count_mat = rnaseqc_count_mat.loc[:, rnaseqc_count_lengths.columns]

In [None]:
rnaseqc_count_mat.shape

In [None]:
rnaseqc_count_mat = rnaseqc_count_mat.loc[:, ~rnaseqc_count_mat.columns.duplicated()]

In [None]:
rnaseqc_count_mat.shape

In [None]:
rnaseqc_count_mat.head()

In [None]:
rnaseqc_count_mat.to_csv("rnaseqc_count_mat.csv")

In [None]:
# Upload rnaseqc_count_mat.csv as a new dataset.
# With update = False only the create_dataset branch runs.
update = False
if update:
   # NOTE(review): this branch is not taken. It targets the chrM
   # renormalization dataset and uploads the *_tpm_chrM_sqrt.csv file, not
   # rnaseqc_count_mat.csv -- it looks copied from another upload cell.
   # Fix the dataset_id, path and name before setting update = True.
   new_dataset_id = tc.update_dataset(
        dataset_id="post-23q4-gene-expression-renormalization-chrm-8a6b",
        changes_description="post 23Q4 adjust gene expression normalization 2",
        upload_files=[
            {
                "path": "OmicsExpressionGenesExpectedCountModel_tpm_chrM_sqrt.csv",
                "name": "OmicsExpressionGenesExpectedCountModel_tpm_chrM", # optional, will use file name if not provided
                "format": "NumericMatrixCSV", # or "TableCSV"
                "encoding": "utf-8" # optional (but recommended), will use iso-8859-1 if not provided
            }
        ],
        upload_async=False,
        dataset_description="post 23Q4 adjust gene expression normalization",
   )
else:
   new_dataset_id = tc.create_dataset(
       "rnqseqc2_gene_count_profile",
       dataset_description="rnqseqc2 count profile",
       upload_files=[
           {
               "path": "rnaseqc_count_mat.csv",
               "name": "rnqseqc2_gene_count_profile", # optional, will use file name if not provided
               "format": "NumericMatrixCSV", # or "TableCSV"
               "encoding": "utf-8" # optional (but recommended), will use iso-8859-1 if not provided
           }
       ],
       folder_id="a9eedc220a6a4e70b8f1e64d2e57ed87", # optional, will default to your home folder if not provided
   )

# Display the value returned by the upload call.
new_dataset_id

In [None]:
rnaseqc_count_mat_tpm = calculate_tpm_correct_chrM_log(rnaseqc_count_mat, rnaseqc_count_lengths)

In [None]:
rnaseqc_count_mat_tpm.head()

In [None]:
rnaseqc_count_mat_tpm.columns = rnaseqc_count_mat_tpm.columns.map(protcod_rename)
rnaseqc_count_mat_tpm = rnaseqc_count_mat_tpm.loc[:, ~rnaseqc_count_mat_tpm.columns.isnull()]

In [None]:
rnaseqc_count_mat_tpm.shape

In [None]:
rnaseqc_count_mat_tpm.index = rnaseqc_count_mat_tpm.index.map(profile_to_model_dict)

In [None]:
rnaseqc_count_mat_tpm.shape

In [None]:
rnaseqc_count_mat_tpm = rnaseqc_count_mat_tpm.loc[~rnaseqc_count_mat_tpm.index.isnull(), :]

In [None]:
rnaseqc_count_mat_tpm.shape

In [None]:
new_sample_median_expression = rnaseqc_count_mat_tpm.median(axis=1)
plotting.waterfall_plot(new_sample_median_expression, s=1)

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(8, 5)
chrM_labels = rnaseqc_count_mat_tpm.columns[rnaseqc_count_mat_tpm.columns.str.contains("^MT-")]

plotting.density_scatter(rnaseqc_count_mat_tpm.loc[:, :].median(axis=1),
                         rnaseqc_count_mat_tpm.loc[:, chrM_labels].median(axis=1),
                         ax=ax)
ax.set_ylabel("Mitochondrial median expression")
ax.set_xlabel("All gene median expression")

In [None]:
fig, ax = plt.subplots(1, 2)
fig.set_size_inches(16, 5)
chrM_labels = rnaseqc_count_mat_tpm.columns[rnaseqc_count_mat_tpm.columns.str.contains("^MT-")]

# One panel per (x model, y model) pair, with the "MT-" columns passed as
# label_specific.
model_pairs = (
    ("ACH-001386", "ACH-000904"),
    ("ACH-000399", "ACH-001097"),
)
for panel, (x_model, y_model) in zip(ax, model_pairs):
    highest_sample = rnaseqc_count_mat_tpm.loc[x_model, :]
    lowest_sample = rnaseqc_count_mat_tpm.loc[y_model, :]
    plotting.density_scatter(
        highest_sample,
        lowest_sample,
        ax=panel,
        label_specific=chrM_labels,
    )
    sns.despine()

    panel.set_xlabel(f"Expression in {x_model}")
    panel.set_ylabel(f"Expression in {y_model}")
    panel.set_xlim(0, 18)
    panel.set_ylim(0, 18)