## Reproducing the method from this Nature 2021 paper https://www.nature.com/articles/s41586-020-03114-6 by converting the original R code https://github.com/broadinstitute/Aneuploidy_dependencies/tree/master into Python

In [None]:
import pandas as pd
from taigapy import TaigaClient
tc = TaigaClient()

### Load ProfileID-level PureCN outputs from the latest release

In [None]:
OmicsAbsoluteCNSegmentsProfile = tc.get(name='internal-23q2-1e49', version=97, file='OmicsAbsoluteCNSegmentsProfile')
OmicsSignaturesProfile = tc.get(name='internal-23q2-1e49', version=97, file='OmicsSignaturesProfile')
OmicsProfiles = tc.get(name='internal-23q2-1e49', version=97, file='OmicsProfiles')

In [None]:
# pre-23Q4 only: rename misleading column name. Should be fixed after 23Q4
OmicsAbsoluteCNSegmentsProfile = OmicsAbsoluteCNSegmentsProfile.rename(columns={"MajorAlleleAbsoluteCN": "SegmentAbsoluteCN"})

In [None]:
# Turn the index into a "ProfileID" column, drop the MSIScore, CIN,
# LoHFraction and WGD columns, then keep only rows that have a Ploidy value.
OmicsSignaturesProfile = (
    OmicsSignaturesProfile.reset_index()
    .rename(columns={"index": "ProfileID"})
    .drop(columns=["MSIScore", "CIN", "LoHFraction", "WGD"])
)
OmicsSignaturesProfile = OmicsSignaturesProfile[OmicsSignaturesProfile.Ploidy.notna()]

### Hg38 centromere coordinates taken from "Modeled centromeres and heterochromatin regions" https://www.ncbi.nlm.nih.gov/grc/human

In [None]:
# Read the GRCh38 modeled-regions table, discard regions whose name starts
# with "HET" and anything on chromosomes X/Y, and rename the coordinate
# columns to the names used by the arm-assignment code.
cent = pd.read_csv("../data/Modeled_regions_for_GRCh38.tsv", sep="\t", index_col=False)
cent = cent[~(cent["#region_name"].str.startswith("HET")) & (~cent["chr"].isin(["X", "Y"]))]
cent = cent.drop(columns=["#region_name"])
cent = cent.rename(columns={"chr": "Chromosome", "start": "cent_start", "stop": "cent_end"})
# Integer midpoint of each remaining region.
cent["cent_mid"] = ((cent["cent_start"] + cent["cent_end"]) / 2).round().astype(int)

In [None]:
cent['Chromosome']=cent['Chromosome'].astype(int)
OmicsAbsoluteCNSegmentsProfile['Chromosome']=OmicsAbsoluteCNSegmentsProfile['Chromosome'].astype(int)

In [None]:
merged_seg = OmicsAbsoluteCNSegmentsProfile.merge(cent, on=['Chromosome'], how="left")
merged_seg = merged_seg.merge(OmicsSignaturesProfile, on=['ProfileID'], how="left")

## Make arm-level calls

In [None]:
def arm_call(df, cn_colname="SegmentAbsoluteCN", width_colname="seg_width", ploidy_colname="Ploidy"):
    """Call one chromosome arm as gained (1), lost (-1) or neutral (0).

    The arm-level copy number is the width-weighted median of ``cn_colname``:
    segments are sorted by copy number and the first value at which the
    cumulative ``width_colname`` reaches half of the total width is taken.
    That value and the ploidy are both rounded to integers and compared.

    Args:
        df: Segments of one group (one sample and one arm when used with
            ``groupby(...).apply``). Must contain an ``arm`` column and the
            three columns named below. Rows with a missing ``arm`` are
            ignored. The frame is not modified.
        cn_colname: Column holding the copy number of each segment.
        width_colname: Column holding the width (weight) of each segment.
        ploidy_colname: Column holding the ploidy; the value of the first
            segment after sorting is used, so it is expected to be constant
            within the group.

    Returns:
        1 if the rounded median exceeds the rounded ploidy, -1 if it is
        lower, 0 if they are equal, and NaN when no call can be made (no
        usable segments, or a missing copy number / ploidy).
    """
    no_call = float("nan")

    # sort_values returns a new frame, so neither the caller's data nor a
    # filtered slice of it is sorted in place.
    segs = df[df["arm"].notna()].sort_values(cn_colname)
    if segs.empty:
        return no_call

    cum_width = segs[width_colname].cumsum()
    half_width = segs[width_colname].sum() / 2.0
    # Positional mask: avoids index alignment on the re-ordered frame.
    reached_half = (cum_width >= half_width).to_numpy()
    if not reached_half.any():
        return no_call

    median_cn = segs[cn_colname].to_numpy()[reached_half][0]
    ploidy = segs[ploidy_colname].iloc[0]
    # Casting NaN to int yields a platform-dependent garbage value that would
    # turn into a spurious gain/loss call, so report "no call" instead.
    if pd.isna(median_cn) or pd.isna(ploidy):
        return no_call

    median_cn = int(round(float(median_cn)))
    ploidy = int(round(float(ploidy)))
    if median_cn > ploidy:
        return 1
    if median_cn < ploidy:
        return -1
    return 0

In [None]:
def get_which_arm(df, start_colname="Start", end_colname="End"):
    """Label each segment with the chromosome arm its midpoint falls on.

    Adds two columns to ``df`` in place and returns the same frame:
    ``seg_cent`` (the midpoint of ``start_colname`` and ``end_colname``) and
    ``arm`` ('p' when the midpoint is below ``cent_start``, 'q' when it is
    above ``cent_end``, otherwise None). ``df`` must already contain the
    ``cent_start`` and ``cent_end`` columns.
    """
    midpoint = (df[start_colname] + df[end_colname]) * 0.5
    df["seg_cent"] = midpoint
    df["arm"] = None
    arm_masks = (
        ("p", midpoint < df["cent_start"]),
        ("q", midpoint > df["cent_end"]),
    )
    for label, on_arm in arm_masks:
        df.loc[on_arm, "arm"] = label
    return df

In [None]:
seg_with_arm = get_which_arm(merged_seg)

In [None]:
seg_with_arm["seg_width"] = seg_with_arm["End"] - seg_with_arm["Start"]
seg_with_arm["chrom_arm"] = seg_with_arm["Chromosome"].astype(str) + seg_with_arm["arm"]

In [None]:
cna_table = seg_with_arm.groupby(["ProfileID", "chrom_arm"]).apply(arm_call).unstack(level=1)

In [None]:
cna_table = cna_table.drop(columns=["21p", "22p"])

In [None]:
cna_table

In [None]:
aneuploidy = cna_table.abs().sum(axis=1)

In [None]:
renaming_dict = dict(zip(OmicsProfiles.ProfileID.tolist(), OmicsProfiles.ModelID.tolist()))

In [None]:
aneuploidy_model = aneuploidy.rename(index=renaming_dict)

In [None]:
aneuploidy

## attempt to replicate CCLE results using ABSOLUTE seg data

In [None]:
CCLE_ABSOLUTE_combined_segtab = tc.get(name='ccle-absolute-cn', version=5, file='CCLE_ABSOLUTE_combined_segtab')
CCLE_ABSOLUTE_combined_table = tc.get(name='ccle-absolute-cn', version=5, file='CCLE_ABSOLUTE_combined_table')

In [None]:
CCLE_ABSOLUTE_combined_segtab[(CCLE_ABSOLUTE_combined_segtab.Chromosome == 15) & (CCLE_ABSOLUTE_combined_segtab.DepMap_ID == "ACH-000001")]

In [None]:
CCLE_ABSOLUTE_combined_segtab = CCLE_ABSOLUTE_combined_segtab.drop(columns=["CCLE_ID", "Length", "Num_Probes", "Modal_HSCN_1", "Modal_HSCN_2", "Subclonal_HSCN_a1", "Subclonal_HSCN_a2", "Cancer_cell_frac_a1", "Ccf_ci95_low_a1", "Ccf_ci95_high_a1", "Cancer_cell_frac_a2", "Ccf_ci95_low_a2", "Ccf_ci95_high_a2", "LOH", "Homozygous_deletion"])
CCLE_ABSOLUTE_combined_segtab = CCLE_ABSOLUTE_combined_segtab.rename(columns={"Modal_Total_CN": "SegmentAbsoluteCN"})

In [None]:
CCLE_ABSOLUTE_combined_segtab = CCLE_ABSOLUTE_combined_segtab.merge(CCLE_ABSOLUTE_combined_table[["DepMap_ID", "ploidy"]].rename(columns={"ploidy": "Ploidy"}), on=['DepMap_ID'], how="left")

In [None]:
CCLE_ABSOLUTE_combined_table[CCLE_ABSOLUTE_combined_table.DepMap_ID == "ACH-000001"]

In [None]:
# hg19 centromere coordinates extracted from the rCGH package
hg19_cent = pd.read_csv("../data/hg19_cent.csv").rename(columns={"chrom": "Chromosome", "centStart": "cent_start", "centEnd": "cent_end"})

In [None]:
hg19_cent["cent_mid"] = (0.5*(hg19_cent["cent_start"] + hg19_cent["cent_end"])).round().astype(int)
hg19_cent['Chromosome']=hg19_cent['Chromosome'].astype(int)
CCLE_ABSOLUTE_combined_segtab['Chromosome']=CCLE_ABSOLUTE_combined_segtab['Chromosome'].astype(int)
ccle_merged_seg = CCLE_ABSOLUTE_combined_segtab.merge(hg19_cent, on=['Chromosome'], how="left")


In [None]:
ccle_merged_seg

In [None]:
def split_cent_crosses(df):
    """Split every segment that spans a centromere into two segments.

    A segment spans the centromere when ``Start < cent_start`` and
    ``End > cent_end``. Each such segment is replaced by a first half ending
    at ``cent_mid`` and a second half starting at ``cent_mid``; all other
    columns are copied to both halves.

    Args:
        df: Segment table with ``Start``, ``End``, ``cent_start``,
            ``cent_end`` and ``cent_mid`` columns. It is not modified.

    Returns:
        A new DataFrame with a fresh 0..n-1 index: the original rows (with
        spanning segments truncated at ``cent_mid``) followed by the second
        halves of the spanning segments.
    """
    cross_segs = (df.Start < df.cent_start) & (df.End > df.cent_end)
    print("number of segments that span centromere: ", cross_segs.sum())

    # First halves: original rows, with the spanning ones cut at the midpoint.
    first_halves = df.copy()
    first_halves.loc[cross_segs, "End"] = first_halves.loc[cross_segs, "cent_mid"]

    # Second halves: copies of the spanning rows starting at the midpoint.
    second_halves = df[cross_segs].copy()
    second_halves["Start"] = second_halves["cent_mid"]

    # DataFrame.append returned a new frame (and was removed in pandas 2.0),
    # so the second halves must be concatenated explicitly and the result kept.
    return pd.concat([first_halves, second_halves], ignore_index=True)

In [None]:
split_ccle_merged_seg = split_cent_crosses(ccle_merged_seg)

In [None]:
ccle_seg_with_arm = get_which_arm(split_ccle_merged_seg)

In [None]:
ccle_seg_with_arm["seg_width"] = ccle_seg_with_arm["End"] - ccle_seg_with_arm["Start"]
ccle_seg_with_arm["chrom_arm"] = ccle_seg_with_arm["Chromosome"].astype(str) + ccle_seg_with_arm["arm"]

In [None]:
ccle_cna_table = ccle_seg_with_arm.groupby(["DepMap_ID", "chrom_arm"]).apply(arm_call).unstack(level=1)

In [None]:
ccle_aneuploidy = ccle_cna_table.abs().sum(axis=1)
ccle_aneuploidy.name = "new_aneuploidy"

In [None]:
ccle_merged_seg[(ccle_merged_seg.Chromosome == 22) & (ccle_merged_seg.DepMap_ID == "ACH-000001")]

In [None]:
ccle_aneuploidy

In [None]:
published_aneuploidy_scores = pd.read_csv("../data/aneuploidy_scores.csv")

In [None]:
old_and_new = published_aneuploidy_scores.merge(ccle_aneuploidy, on="DepMap_ID", how="left")

In [None]:
old_and_new

### R's weighted.median function does something weird! 

In [None]:
import numpy as np
import matplotlib.pyplot as plt

x = old_and_new["Aneuploidy score"]
y = old_and_new["new_aneuploidy"]

plt.scatter(x, y, alpha=0.5)

plt.xlabel("CCLE method")
plt.ylabel("current method")
plt.show()

## using the same (current) method, compare ABSOLUTE and PureCN outputs from WES

In [None]:
from depmap_omics_upload import tracker

mytracker = tracker.SampleTracker()
pr_table = mytracker.add_model_cols_to_prtable(["ModelID"])

In [None]:
wes_prs = pr_table[(pr_table.Datatype == "wes") & (pr_table.ProfileSource != "taiga")]

In [None]:
wes_prs

In [None]:
wes_aneuploidy = aneuploidy[aneuploidy.index.isin(wes_prs.index)]
renaming_dict = dict(zip(wes_prs.index.tolist(), wes_prs.ModelID.tolist()))
wes_aneuploidy_model = wes_aneuploidy.rename(index=renaming_dict)

In [None]:
wes_aneuploidy_model.name = "PureCN_aneuploidy"

In [None]:
wes_aneuploidy_model.index.name = "DepMap_ID"

In [None]:
wes_aneuploidy_model

In [None]:
old_and_new = published_aneuploidy_scores.merge(wes_aneuploidy_model, on="DepMap_ID", how="left")

In [None]:
import numpy as np
import matplotlib.pyplot as plt

x = old_and_new["Aneuploidy score"]
y = old_and_new["PureCN_aneuploidy"]

plt.scatter(x, y, alpha=0.2)

plt.xlabel("ABSOLUTE")
plt.ylabel("PureCN")
plt.title("WES only")
plt.show()

In [None]:
old_and_new[(old_and_new["Aneuploidy score"] < 10) & (old_and_new["PureCN_aneuploidy"] > 30)]

In [None]:
aneuploidy['PR-ZxbCmO']

## using the same (current) method, compare ABSOLUTE and PureCN outputs with DepMap's prioritization scheme (WGS > WES)

In [None]:
OmicsAbsoluteCNSegmentsProfile = tc.get(name='internal-23q2-1e49', version=97, file='OmicsAbsoluteCNSegmentsProfile')
OmicsSignaturesProfile = tc.get(name='internal-23q2-1e49', version=97, file='OmicsSignaturesProfile')
OmicsAbsoluteCNSegmentsProfile = OmicsAbsoluteCNSegmentsProfile.rename(columns={"MajorAlleleAbsoluteCN": "SegmentAbsoluteCN"})

In [None]:
def get_cna_and_aneuploidy(seg, sig_table, cent_filename="../data/Modeled_regions_for_GRCh38.tsv", id_col="DepMap_ID", ploidy_col="Ploidy", arms_to_drop=("21p", "22p")):
    """Compute arm-level copy-number calls and an aneuploidy score per sample.

    Args:
        seg: Segment table with ``Chromosome``, ``Start``, ``End``, the
            copy-number column expected by ``arm_call`` and ``id_col``.
            It is not modified.
        sig_table: Per-sample table containing ``ploidy_col``. Its index is
            reset into a column and a column named "index" is renamed to
            ``id_col`` (assumes the sample IDs live in an unnamed index --
            TODO confirm for other inputs).
        cent_filename: Tab-separated file with ``#region_name``, ``chr``,
            ``start`` and ``stop`` columns.
        id_col: Sample identifier column shared by both tables.
        ploidy_col: Ploidy column in ``sig_table``.
        arms_to_drop: Arm columns removed from the call table when present.

    Returns:
        ``(cna_table, sig_table)``: the sample-by-arm table of calls from
        ``arm_call``, and the reset ``sig_table`` with an added
        ``Aneuploidy`` column holding the sum of absolute arm calls.
    """
    # parse centromere file
    cent = pd.read_csv(cent_filename, sep="\t", index_col=False)
    cent = (
        cent[~(cent["#region_name"].str.startswith("HET")) & (~cent["chr"].isin(["X", "Y"]))]
        .drop(columns=["#region_name"])
        .rename(columns={"chr": "Chromosome", "start": "cent_start", "stop": "cent_end"})
    )
    cent["cent_mid"] = (0.5 * (cent["cent_start"] + cent["cent_end"])).round().astype(int)
    cent["Chromosome"] = cent["Chromosome"].astype(int)

    # Work on a copy so the caller's segment table keeps its dtypes/columns.
    seg = seg.copy()
    seg["Chromosome"] = seg["Chromosome"].astype(int)
    seg["seg_width"] = seg["End"] - seg["Start"]
    merged_seg = seg.merge(cent, on=["Chromosome"], how="left")
    sig_table = sig_table.reset_index().rename(columns={"index": id_col})
    merged_seg = merged_seg.merge(sig_table[[id_col, ploidy_col]], on=[id_col], how="left")

    seg_with_arm = get_which_arm(merged_seg)
    seg_with_arm["chrom_arm"] = seg_with_arm["Chromosome"].astype(str) + seg_with_arm["arm"]

    # Forward ploidy_col so a non-default ploidy column is actually used.
    cna_table = (
        seg_with_arm.groupby([id_col, "chrom_arm"])
        .apply(arm_call, ploidy_colname=ploidy_col)
        .unstack(level=1)
    )
    # errors="ignore": an arm with no segments has no column to drop.
    cna_table = cna_table.drop(columns=list(arms_to_drop), errors="ignore")

    aneuploidy = cna_table.abs().sum(axis=1).to_dict()
    sig_table["Aneuploidy"] = sig_table[id_col].map(aneuploidy)

    return cna_table, sig_table

In [None]:
cna_table, sig_table = get_cna_and_aneuploidy(OmicsAbsoluteCNSegmentsProfile, OmicsSignaturesProfile, id_col="ProfileID")

In [None]:
sig_table

In [None]:
OmicsDefaultModelProfiles = tc.get(name='internal-23q2-1e49', version=97, file='OmicsDefaultModelProfiles')
OmicsDefaultModelProfiles

In [None]:
# Keep only the profiles listed in OmicsDefaultModelProfiles. The explicit
# copy makes the ModelID assignment below write to an independent frame
# rather than to a filtered slice of sig_table.
sig_table_model = sig_table[sig_table.ProfileID.isin(OmicsDefaultModelProfiles.ProfileID)].copy()
renaming_dict = dict(zip(OmicsDefaultModelProfiles.ProfileID.tolist(), OmicsDefaultModelProfiles.ModelID.tolist()))
sig_table_model["ModelID"] = sig_table_model["ProfileID"].map(renaming_dict)
# Restrict to models that also have a published aneuploidy score.
sig_table_model = sig_table_model[sig_table_model.ModelID.isin(published_aneuploidy_scores.DepMap_ID)]

In [None]:
sig_table_model = sig_table_model.merge(published_aneuploidy_scores.rename(columns={"DepMap_ID": "ModelID"})[["ModelID", "Aneuploidy score"]], on=["ModelID"])

In [None]:
sig_table_model

In [None]:
x = sig_table_model["Aneuploidy score"]
y = sig_table_model["Aneuploidy"]

plt.scatter(x, y, alpha=0.2)

plt.xlabel("CCLE - WES and SNP array")
plt.ylabel("Current DepMap - WGS and WES")
plt.show()

In [None]:
sig_table_model[(sig_table_model["Aneuploidy score"] < 10) & (sig_table_model["Aneuploidy"] > 25)]

In [None]:
sig_table[(sig_table.ProfileID == "PR-qskzui")]

In [None]:
sig_table_with_model = sig_table.merge(pr_table.reset_index().rename(columns={"index": "ProfileID"})[["ProfileID", "ModelID", "Datatype"]], on=["ProfileID"])

In [None]:
sig_table_with_model[sig_table_with_model.ModelID == "ACH-000649"]

In [None]:
sig_table_with_model.set_index("ProfileID")

In [None]:
print("lines that have multiple DNAseq profiles with drastically different aneuploidy scores: ")
# Visit each model once (in order of first appearance) instead of once per
# profile row, so a model is reported a single time and the table is not
# re-filtered for every row.
for m, s in sig_table_with_model.groupby("ModelID", sort=False):
    if len(s) > 1 and s.Aneuploidy.max() - s.Aneuploidy.min() > 10:
        print(m)