### This notebook contains code for generating batch-corrected versions of the RNAseq data in the 24q2 data release and uploading them to Taiga.

In [None]:
from taigapy import TaigaClient
import pandas as pd
from inmoose.pycombat import pycombat_norm, pycombat_seq
import numpy as np

In [None]:
# Taiga client used by the tc.get(...) download cells that follow.
tc = TaigaClient()

In [None]:
# Protein-coding TPMLogp1 expression matrices for each 24Q2 release tier:
# the original file plus its "Stranded" counterpart.

# --- internal ---
OmicsExpressionProteinCodingGenesTPMLogp1_internal = tc.get(
    name='internal-24q2-3719',
    version=63,
    file='OmicsExpressionProteinCodingGenesTPMLogp1',
)
OmicsExpressionProteinCodingGenesTPMLogp1Stranded_internal = tc.get(
    name='internal-24q2-3719',
    version=63,
    file='OmicsExpressionProteinCodingGenesTPMLogp1Stranded',
)

# --- public ---
OmicsExpressionProteinCodingGenesTPMLogp1_public = tc.get(
    name='public-24q2-356f',
    version=33,
    file='OmicsExpressionProteinCodingGenesTPMLogp1',
)
OmicsExpressionProteinCodingGenesTPMLogp1Stranded_public = tc.get(
    name='public-24q2-356f',
    version=33,
    file='OmicsExpressionProteinCodingGenesTPMLogp1Stranded',
)

# --- dmc ---
OmicsExpressionProteinCodingGenesTPMLogp1_dmc = tc.get(
    name='dmc-24q2-5194',
    version=40,
    file='OmicsExpressionProteinCodingGenesTPMLogp1',
)
OmicsExpressionProteinCodingGenesTPMLogp1Stranded_dmc = tc.get(
    name='dmc-24q2-5194',
    version=40,
    file='OmicsExpressionProteinCodingGenesTPMLogp1Stranded',
)

In [None]:
# Profile-level metadata per release tier; the 'Stranded' and 'ProfileID'
# columns are what the filtering cells below read.
# NOTE(review): public and dmc are pinned to version=2 here, while their
# expression matrices are fetched from versions 33 and 40 — confirm that
# mixing versions is intentional.
OmicsProfiles_internal = tc.get(
    name='internal-24q2-3719',
    version=63,
    file='OmicsProfiles',
)
OmicsProfiles_public = tc.get(
    name='public-24q2-356f',
    version=2,
    file='OmicsProfiles',
)
OmicsProfiles_dmc = tc.get(
    name='dmc-24q2-5194',
    version=2,
    file='OmicsProfiles',
)

In [None]:
# Default-profile tables per release tier; the 'ProfileID' and 'ModelID'
# columns are used below to map selected profiles onto model IDs.
# NOTE(review): public and dmc are pinned to version=2 here, while their
# expression matrices are fetched from versions 33 and 40 — confirm that
# mixing versions is intentional.
OmicsDefaultModelProfiles_internal = tc.get(
    name='internal-24q2-3719',
    version=63,
    file='OmicsDefaultModelProfiles',
)
OmicsDefaultModelProfiles_public = tc.get(
    name='public-24q2-356f',
    version=2,
    file='OmicsDefaultModelProfiles',
)
OmicsDefaultModelProfiles_dmc = tc.get(
    name='dmc-24q2-5194',
    version=2,
    file='OmicsDefaultModelProfiles',
)

### Internal

Retrieve the profile IDs for the internal release and split them by strandedness. Some profile IDs correspond to the same model ID, so duplicates are removed using OmicsDefaultModelProfiles.

In [None]:
# Profile IDs of the internal profiles whose 'Stranded' flag is False.
nonstranded_profiles = OmicsProfiles_internal.loc[
    OmicsProfiles_internal['Stranded'] == False,
    'ProfileID',
]

In [None]:
# Model IDs whose default profile is one of the non-stranded profiles.
nonstranded_model_ids = OmicsDefaultModelProfiles_internal.loc[
    OmicsDefaultModelProfiles_internal['ProfileID'].isin(nonstranded_profiles),
    'ModelID',
]

In [None]:
# Profile IDs of the internal profiles whose 'Stranded' flag is True.
stranded_profiles = OmicsProfiles_internal.loc[
    OmicsProfiles_internal['Stranded'] == True,
    'ProfileID',
]

In [None]:
# Model IDs whose default profile is one of the stranded profiles.
stranded_model_ids = OmicsDefaultModelProfiles_internal.loc[
    OmicsDefaultModelProfiles_internal['ProfileID'].isin(stranded_profiles),
    'ModelID',
]

In [None]:
# Sanity check: (number of non-stranded models, number of stranded models).
len(nonstranded_model_ids), len(stranded_model_ids)

Find overlapping gene names between the original expression dataset (includes unstranded RNAseq w/ unstranded RSEM mode + stranded RNAseq w/ unstranded RSEM mode) and new expression dataset (stranded RNAseq w/ stranded RSEM mode). 

In [None]:
# Genes present in BOTH internal expression matrices.
# The result keeps the column order of the original (non-stranded) matrix, so
# the gene order of everything built from it is the same on every run. A
# list(set(...)) would order genes by string hash, which changes between
# kernel sessions. dict.fromkeys() de-duplicates labels while keeping order.
# Only .columns is read, so the matrices are not row-sliced (and copied) here;
# the .loc selections in the following cells still validate the model IDs.
_stranded_gene_set = set(OmicsExpressionProteinCodingGenesTPMLogp1Stranded_internal.columns)
genes_overlap_ids = [
    gene
    for gene in dict.fromkeys(OmicsExpressionProteinCodingGenesTPMLogp1_internal.columns)
    if gene in _stranded_gene_set
]

In [None]:
# Rows: non-stranded model IDs; columns: genes shared by both matrices.
nonstranded_24q2 = OmicsExpressionProteinCodingGenesTPMLogp1_internal.loc[
    nonstranded_model_ids,
    genes_overlap_ids,
]

In [None]:
# Rows: stranded model IDs (taken from the Stranded matrix); columns: shared genes.
stranded_24q2 = OmicsExpressionProteinCodingGenesTPMLogp1Stranded_internal.loc[
    stranded_model_ids,
    genes_overlap_ids,
]

In [None]:
# Stack the two groups row-wise: non-stranded rows first, stranded rows after.
# The batch vector built below relies on this ordering.
tpm_24q2 = pd.concat(
    [nonstranded_24q2, stranded_24q2],
    axis="index",
)

In [None]:
# Display the combined matrix for a visual check.
tpm_24q2

In [None]:
# Batch label per row of tpm_24q2: 0 for each non-stranded model, then 1 for
# each stranded model (same order in which the two groups were concatenated).
strandness = np.array(
    [0 for _ in nonstranded_model_ids] + [1 for _ in stranded_model_ids]
)

In [None]:
# Batch-correct using strandness as the batch label. The matrix is transposed
# for the call — presumably pycombat_norm expects genes as rows and samples as
# columns; verify against the inmoose documentation.
tpm_corrected_24q2 = pycombat_norm(
    tpm_24q2.transpose(),
    strandness,
)

In [None]:
# Put the corrected matrix back into the orientation of tpm_24q2 (model IDs as
# rows). The transpose is guarded so that re-running this cell does not flip
# the matrix a second time: once the row index matches tpm_24q2's row index,
# the frame is already in its final orientation.
if not tpm_corrected_24q2.index.equals(tpm_24q2.index):
    tpm_corrected_24q2 = tpm_corrected_24q2.T

In [None]:
# Display the batch-corrected matrix for a visual check.
tpm_corrected_24q2

In [None]:
tpm_corrected_24q2.to_csv('/Users/gulatide/Documents/strandedness/proteinCoding_genes_tpm_logp1_profile_batch_corrected.csv')

In [None]:
from taigapy import default_tc as tc

new_dataset_id = tc.update_dataset(
    changes_description="adding batch corrected version to 24Q2 release",
    dataset_permaname="internal-24q2-3719",
    upload_files=[
        {
            "path": "/Users/gulatide/Documents/strandedness/proteinCoding_genes_tpm_logp1_profile_batch_corrected.csv",
            "name": "OmicsExpressionProteinCodingGenesTPMLogp1BatchCorrected", # optional, will use file name if not provided
            "format": "NumericMatrixCSV", # or "NumericMatrixCSV" or "TableCSV"
            "encoding": "utf-8" # optional (but recommended), will use iso-8859-1 if not provided
        }
    ],
    add_all_existing_files=True,
    upload_async=False,
    dataset_description="24Q2 release of the DepMap dataset for the DepMap Portal. Please look at the README file for additional information about this dataset.", # optional (but recommended)
)

Validate batch correction does what it is supposed to:

In [None]:
# Placeholder lineage labels: every sample gets the same value (0), so the
# lineage-coloured panel in the plot below shows a single group.
lineage_v2 = [0]*len(strandness)

In [None]:
from sklearn.decomposition import PCA
import pandas as pd
import colorcet as cc
import umap
import seaborn as sns
import matplotlib.pyplot as plt

# Diagnostic figure for the internal batch correction: a 3x3 grid with one row
# per PCA dimensionality (5, 10, 30 components).
#   column 0: a pair of principal components, coloured by strandness
#   column 1: UMAP of the PCA components, coloured by strandness
#   column 2: the same UMAP, coloured by lineage
# Seeded like the PCA below so the embedding is the same on every run.
u = umap.UMAP(random_state=999)
palette = sns.color_palette(cc.glasbey, n_colors=33)

fig, ax = plt.subplots(3, 3)
fig.set_size_inches(21, 20)

# Zero-based (x, y) principal-component indices drawn in column 0 of each row.
pc_pairs = [(0, 1), (0, 2), (1, 2)]

for i, comp_number in enumerate([5, 10, 30]):
    pca = PCA(n_components=comp_number, whiten=True, random_state=999)
    # tpm_corrected_24q2 already has one row per sample (it was transposed
    # back after batch correction), so it is passed as-is. Transposing again
    # would fit the PCA on genes and yield one row per gene, which cannot be
    # paired with the per-sample strandness / lineage labels.
    components = pca.fit_transform(tpm_corrected_24q2)
    assert components.shape[0] == len(strandness), (
        "PCA input must have one row per sample "
        f"(got {components.shape[0]} rows for {len(strandness)} batch labels)"
    )
    print(components.shape, strandness.shape)

    # Column 0: the selected pair of principal components.
    x_idx, y_idx = pc_pairs[i]
    x_col, y_col = f"comp{x_idx + 1}", f"comp{y_idx + 1}"
    pca_plot_data = pd.DataFrame({
        x_col: components[:, x_idx],
        y_col: components[:, y_idx],
        "lineage": lineage_v2,
        "strandness": strandness,
    })
    sns.scatterplot(data=pca_plot_data, x=x_col, y=y_col, hue="strandness", alpha=0.8, s=8, ax=ax[i][0], palette=palette)
    ax[i][0].set_xlabel(f'{x_col} {pca.explained_variance_ratio_[x_idx]*100:.2f} %')
    ax[i][0].set_ylabel(f'{y_col} {pca.explained_variance_ratio_[y_idx]*100:.2f} %')
    ax[i][0].set_title(f'PCA ({comp_number} components), by strandness')

    # Columns 1 and 2: one UMAP embedding of the PCA components, drawn twice
    # with different colourings.
    umap_rna = u.fit_transform(components)
    umap_plot_data = pd.DataFrame({
        "comp1": umap_rna[:, 0],
        "comp2": umap_rna[:, 1],
        "lineage": lineage_v2,
        "strandness": strandness,
    })
    sns.scatterplot(data=umap_plot_data, x="comp1", y="comp2", hue="strandness", alpha=0.8, s=8, ax=ax[i][1], palette=palette)
    ax[i][1].set_title(f'UMAP of {comp_number} PCs, by strandness')

    sns.scatterplot(data=umap_plot_data, x="comp1", y="comp2", hue="lineage", alpha=0.8, s=8, ax=ax[i][2], palette=palette)
    ax[i][2].set_title(f'UMAP of {comp_number} PCs, by lineage')
    sns.move_legend(ax[i][2], "upper left", bbox_to_anchor=(1, 1), frameon=False, ncols=3)


### Public

In [None]:
# Profile IDs of the public profiles whose 'Stranded' flag is False.
# (Re-binds the name that the internal section used for the same purpose.)
nonstranded_profiles = OmicsProfiles_public.loc[
    OmicsProfiles_public['Stranded'] == False,
    'ProfileID',
]

In [None]:
# Public model IDs whose default profile is one of the non-stranded profiles.
nonstranded_model_ids_public = OmicsDefaultModelProfiles_public.loc[
    OmicsDefaultModelProfiles_public['ProfileID'].isin(nonstranded_profiles),
    'ModelID',
]

In [None]:
# Profile IDs of the public profiles whose 'Stranded' flag is True.
# (Re-binds the name that the internal section used for the same purpose.)
stranded_profiles = OmicsProfiles_public.loc[
    OmicsProfiles_public['Stranded'] == True,
    'ProfileID',
]

In [None]:
# Public model IDs whose default profile is one of the stranded profiles.
stranded_model_ids_public = OmicsDefaultModelProfiles_public.loc[
    OmicsDefaultModelProfiles_public['ProfileID'].isin(stranded_profiles),
    'ModelID',
]

In [None]:
# Sanity check: (number of non-stranded models, number of stranded models) in public.
len(nonstranded_model_ids_public), len(stranded_model_ids_public)

In [None]:
# Genes present in BOTH public expression matrices.
# The result keeps the column order of the original (non-stranded) matrix, so
# the gene order of everything built from it is the same on every run. A
# list(set(...)) would order genes by string hash, which changes between
# kernel sessions. dict.fromkeys() de-duplicates labels while keeping order.
# Only .columns is read, so the matrices are not row-sliced (and copied) here;
# the .loc selections in the following cells still validate the model IDs.
_stranded_gene_set_public = set(OmicsExpressionProteinCodingGenesTPMLogp1Stranded_public.columns)
genes_overlap_ids_public = [
    gene
    for gene in dict.fromkeys(OmicsExpressionProteinCodingGenesTPMLogp1_public.columns)
    if gene in _stranded_gene_set_public
]

In [None]:
# Rows: non-stranded public model IDs; columns: genes shared by both matrices.
nonstranded_24q2_public = OmicsExpressionProteinCodingGenesTPMLogp1_public.loc[
    nonstranded_model_ids_public,
    genes_overlap_ids_public,
]

In [None]:
# Rows: stranded public model IDs (taken from the Stranded matrix); columns: shared genes.
stranded_24q2_public = OmicsExpressionProteinCodingGenesTPMLogp1Stranded_public.loc[
    stranded_model_ids_public,
    genes_overlap_ids_public,
]

In [None]:
# Stack the two groups row-wise: non-stranded rows first, stranded rows after.
# The batch vector built below relies on this ordering.
tpm_24q2_public = pd.concat(
    [nonstranded_24q2_public, stranded_24q2_public],
    axis="index",
)

Drop the gene below from expression matrix since the variance is zero across model IDs, which leads to a divide-by-zero error when doing batch correction:

In [None]:
# Remove the gene column that breaks batch correction. errors="ignore" keeps
# this cell re-runnable: on a second run, or if the gene is not among the
# overlapping genes, there is nothing to drop and the frame is left as-is
# instead of raising KeyError.
tpm_24q2_public = tpm_24q2_public.drop(
    columns=['DEFB131A (644414)'],
    errors="ignore",
)

In [None]:
# Batch label per row of tpm_24q2_public: 0 for each non-stranded model, then
# 1 for each stranded model (same order in which the groups were concatenated).
strandness_public = np.array(
    [0 for _ in nonstranded_model_ids_public]
    + [1 for _ in stranded_model_ids_public]
)

In [None]:
# Sanity check: total number of batch labels (one per public model).
len(strandness_public)

In [None]:
from inmoose.pycombat import pycombat_norm

In [None]:
# Batch-correct using strandness as the batch label. The matrix is transposed
# for the call — presumably pycombat_norm expects genes as rows and samples as
# columns; verify against the inmoose documentation.
tpm_corrected_24q2_public = pycombat_norm(
    tpm_24q2_public.transpose(),
    strandness_public,
)

In [None]:
# Display the batch-corrected public matrix for a visual check.
tpm_corrected_24q2_public

In [None]:
# Put the corrected matrix back into the orientation of tpm_24q2_public (model
# IDs as rows). The transpose is guarded so that re-running this cell does not
# flip the matrix a second time: once the row index matches tpm_24q2_public's
# row index, the frame is already in its final orientation.
if not tpm_corrected_24q2_public.index.equals(tpm_24q2_public.index):
    tpm_corrected_24q2_public = tpm_corrected_24q2_public.T

In [None]:
# Sanity check: dimensions of the corrected public matrix after the transpose.
tpm_corrected_24q2_public.shape

In [None]:
tpm_corrected_24q2_public.to_csv('/Users/gulatide/Documents/strandedness/proteinCoding_genes_tpm_logp1_profile_batch_corrected.csv')

In [None]:
from taigapy import default_tc as tc

new_dataset_id = tc.update_dataset(
    changes_description="adding batch corrected version to 24Q2 release",
    dataset_permaname="public-24q2-356f",
    upload_files=[
        {
            "path": "/Users/gulatide/Documents/strandedness/proteinCoding_genes_tpm_logp1_profile_batch_corrected.csv",
            "name": "OmicsExpressionProteinCodingGenesTPMLogp1BatchCorrected", # optional, will use file name if not provided
            "format": "NumericMatrixCSV", # or "NumericMatrixCSV" or "TableCSV"
            "encoding": "utf-8" # optional (but recommended), will use iso-8859-1 if not provided
        }
    ],
    add_all_existing_files=True,
    upload_async=False,
    dataset_description="24Q2 release of the DepMap dataset for the DepMap Portal. Please look at the README file for additional information about this dataset.", # optional (but recommended)
)

In [None]:
# Placeholder lineage labels for the public samples: every sample gets the
# same value (0), so the lineage-coloured panel below shows a single group.
lineage_v2 = [0]*len(strandness_public)

In [None]:
from sklearn.decomposition import PCA
import pandas as pd
import colorcet as cc
import umap
import seaborn as sns
import matplotlib.pyplot as plt

# Diagnostic figure for the public batch correction: a 3x3 grid with one row
# per PCA dimensionality (5, 10, 30 components).
#   column 0: a pair of principal components, coloured by strandness
#   column 1: UMAP of the PCA components, coloured by strandness
#   column 2: the same UMAP, coloured by lineage
# Seeded like the PCA below so the embedding is the same on every run.
u = umap.UMAP(random_state=999)
palette = sns.color_palette(cc.glasbey, n_colors=33)

fig, ax = plt.subplots(3, 3)
fig.set_size_inches(21, 20)

# Zero-based (x, y) principal-component indices drawn in column 0 of each row.
pc_pairs = [(0, 1), (0, 2), (1, 2)]

for i, comp_number in enumerate([5, 10, 30]):
    pca = PCA(n_components=comp_number, whiten=True, random_state=999)
    # tpm_corrected_24q2_public already has one row per sample (it was
    # transposed back after batch correction), so it is passed as-is.
    # Transposing again would fit the PCA on genes and yield one row per gene,
    # which cannot be paired with the per-sample strandness / lineage labels.
    components = pca.fit_transform(tpm_corrected_24q2_public)
    assert components.shape[0] == len(strandness_public), (
        "PCA input must have one row per sample "
        f"(got {components.shape[0]} rows for {len(strandness_public)} batch labels)"
    )
    print(components.shape, strandness_public.shape)

    # Column 0: the selected pair of principal components.
    x_idx, y_idx = pc_pairs[i]
    x_col, y_col = f"comp{x_idx + 1}", f"comp{y_idx + 1}"
    pca_plot_data = pd.DataFrame({
        x_col: components[:, x_idx],
        y_col: components[:, y_idx],
        "lineage": lineage_v2,
        "strandness": strandness_public,
    })
    sns.scatterplot(data=pca_plot_data, x=x_col, y=y_col, hue="strandness", alpha=0.8, s=8, ax=ax[i][0], palette=palette)
    ax[i][0].set_xlabel(f'{x_col} {pca.explained_variance_ratio_[x_idx]*100:.2f} %')
    ax[i][0].set_ylabel(f'{y_col} {pca.explained_variance_ratio_[y_idx]*100:.2f} %')
    ax[i][0].set_title(f'PCA ({comp_number} components), by strandness')

    # Columns 1 and 2: one UMAP embedding of the PCA components, drawn twice
    # with different colourings.
    umap_rna = u.fit_transform(components)
    umap_plot_data = pd.DataFrame({
        "comp1": umap_rna[:, 0],
        "comp2": umap_rna[:, 1],
        "lineage": lineage_v2,
        "strandness": strandness_public,
    })
    sns.scatterplot(data=umap_plot_data, x="comp1", y="comp2", hue="strandness", alpha=0.8, s=8, ax=ax[i][1], palette=palette)
    ax[i][1].set_title(f'UMAP of {comp_number} PCs, by strandness')

    sns.scatterplot(data=umap_plot_data, x="comp1", y="comp2", hue="lineage", alpha=0.8, s=8, ax=ax[i][2], palette=palette)
    ax[i][2].set_title(f'UMAP of {comp_number} PCs, by lineage')
    sns.move_legend(ax[i][2], "upper left", bbox_to_anchor=(1, 1), frameon=False, ncols=3)


### DMC

In [None]:
# Profile IDs of the DMC profiles whose 'Stranded' flag is False.
# (Re-binds the name that the earlier sections used for the same purpose.)
nonstranded_profiles = OmicsProfiles_dmc.loc[
    OmicsProfiles_dmc['Stranded'] == False,
    'ProfileID',
]

In [None]:
# DMC model IDs whose default profile is one of the non-stranded profiles.
nonstranded_model_ids_dmc = OmicsDefaultModelProfiles_dmc.loc[
    OmicsDefaultModelProfiles_dmc['ProfileID'].isin(nonstranded_profiles),
    'ModelID',
]

In [None]:
# Profile IDs of the DMC profiles whose 'Stranded' flag is True.
# (Re-binds the name that the earlier sections used for the same purpose.)
stranded_profiles = OmicsProfiles_dmc.loc[
    OmicsProfiles_dmc['Stranded'] == True,
    'ProfileID',
]

In [None]:
# DMC model IDs whose default profile is one of the stranded profiles.
stranded_model_ids_dmc = OmicsDefaultModelProfiles_dmc.loc[
    OmicsDefaultModelProfiles_dmc['ProfileID'].isin(stranded_profiles),
    'ModelID',
]

In [None]:
# Sanity check: (number of non-stranded models, number of stranded models) in DMC.
len(nonstranded_model_ids_dmc), len(stranded_model_ids_dmc)

In [None]:
# Genes present in BOTH DMC expression matrices.
# The result keeps the column order of the original (non-stranded) matrix, so
# the gene order of everything built from it is the same on every run. A
# list(set(...)) would order genes by string hash, which changes between
# kernel sessions. dict.fromkeys() de-duplicates labels while keeping order.
# Only .columns is read, so the matrices are not row-sliced (and copied) here;
# the .loc selections in the following cells still validate the model IDs.
_stranded_gene_set_dmc = set(OmicsExpressionProteinCodingGenesTPMLogp1Stranded_dmc.columns)
genes_overlap_ids_dmc = [
    gene
    for gene in dict.fromkeys(OmicsExpressionProteinCodingGenesTPMLogp1_dmc.columns)
    if gene in _stranded_gene_set_dmc
]

In [None]:
# Rows: non-stranded DMC model IDs; columns: genes shared by both matrices.
nonstranded_24q2_dmc = OmicsExpressionProteinCodingGenesTPMLogp1_dmc.loc[
    nonstranded_model_ids_dmc,
    genes_overlap_ids_dmc,
]

In [None]:
# Rows: stranded DMC model IDs (taken from the Stranded matrix); columns: shared genes.
stranded_24q2_dmc = OmicsExpressionProteinCodingGenesTPMLogp1Stranded_dmc.loc[
    stranded_model_ids_dmc,
    genes_overlap_ids_dmc,
]

In [None]:
# Stack the two groups row-wise: non-stranded rows first, stranded rows after.
# The batch vector built below relies on this ordering.
tpm_24q2_dmc = pd.concat(
    [nonstranded_24q2_dmc, stranded_24q2_dmc],
    axis="index",
)

In [None]:
# Batch label per row of tpm_24q2_dmc: 0 for each non-stranded model, then 1
# for each stranded model (same order in which the groups were concatenated).
strandness_dmc = np.array(
    [0 for _ in nonstranded_model_ids_dmc]
    + [1 for _ in stranded_model_ids_dmc]
)

In [None]:
# Batch-correct using strandness as the batch label. The matrix is transposed
# for the call — presumably pycombat_norm expects genes as rows and samples as
# columns; verify against the inmoose documentation.
tpm_corrected_24q2_dmc = pycombat_norm(
    tpm_24q2_dmc.transpose(),
    strandness_dmc,
)

In [None]:
# Put the corrected matrix back into the orientation of tpm_24q2_dmc (model
# IDs as rows). The transpose is guarded so that re-running this cell does not
# flip the matrix a second time: once the row index matches tpm_24q2_dmc's row
# index, the frame is already in its final orientation.
if not tpm_corrected_24q2_dmc.index.equals(tpm_24q2_dmc.index):
    tpm_corrected_24q2_dmc = tpm_corrected_24q2_dmc.T

In [None]:
# Sanity check: dimensions of the corrected DMC matrix after the transpose.
tpm_corrected_24q2_dmc.shape

In [None]:
tpm_corrected_24q2_dmc.to_csv('/Users/gulatide/Documents/strandedness/proteinCoding_genes_tpm_logp1_profile_batch_corrected.csv')

In [None]:
from taigapy import default_tc as tc

new_dataset_id = tc.update_dataset(
    changes_description="adding batch corrected version to 24Q2 release",
    dataset_permaname="dmc-24q2-5194",
    upload_files=[
        {
            "path": "/Users/gulatide/Documents/strandedness/proteinCoding_genes_tpm_logp1_profile_batch_corrected.csv",
            "name": "OmicsExpressionProteinCodingGenesTPMLogp1BatchCorrected", # optional, will use file name if not provided
            "format": "NumericMatrixCSV", # or "NumericMatrixCSV" or "TableCSV"
            "encoding": "utf-8" # optional (but recommended), will use iso-8859-1 if not provided
        }
    ],
    add_all_existing_files=True,
    upload_async=False,
    dataset_description="24Q2 release of the DepMap dataset for the DepMap Portal. Please look at the README file for additional information about this dataset.", # optional (but recommended)
)