In [None]:
#https://gist.github.com/dceoy/99d976a2c01e7f0ba1c813778f9db744

import io
import os
import pandas as pd
import gzip
import numpy as np
from google.cloud import storage
from depmap_omics_upload import tracker as track


def read_vcf(path):
    """Download a VCF from Google Cloud Storage and parse its records.

    Parameters
    ----------
    path : str
        ``gs://<bucket>/<object>`` URI. When the path ends in ``.gz`` the
        payload is gunzipped in memory before parsing.

    Returns
    -------
    pandas.DataFrame
        One row per non-``#`` line, with the columns CHROM, POS, ID, REF,
        ALT, QUAL, FILTER, INFO, FORMAT and GT (the tenth, per-sample field).
    """
    # "gs://bucket/a/b.vcf".split("/") -> ["gs:", "", "bucket", "a", "b.vcf"]
    parts = path.split("/")
    bucket_name, blob_name = parts[2], "/".join(parts[3:])
    blob = storage.Client().bucket(bucket_name).blob(blob_name)

    # download_as_bytes() is the supported name; download_as_string() is a
    # deprecated alias that returns the same bytes.
    payload = blob.download_as_bytes()
    if path.endswith(".gz"):
        payload = gzip.decompress(payload)

    # Drop every header/meta line (anything starting with '#') so pandas only
    # sees tab-separated records; blank lines are skipped by read_csv itself.
    records = [
        line for line in payload.decode("utf-8").split("\n")
        if not line.startswith("#")
    ]
    return pd.read_csv(
        io.StringIO("\n".join(records)),
        names=['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'GT'],
        sep='\t'
    )

In [None]:
def transformGB(row):
    """Parse the second colon-separated subfield of a sample field into ints.

    generateSTRRow applies this to the ``GT`` column returned by read_vcf.

    Parameters
    ----------
    row : str
        Colon-separated sample field whose second subfield holds
        ``|``-separated integers (e.g. ``"0|1:-4|0:..."``).

    Returns
    -------
    list of int or float
        The parsed integers, or ``np.nan`` when the value is missing: not a
        string (e.g. NaN), shorter than two characters, has no second
        subfield, or that subfield is empty or ``"."``.
    """
    # Non-string cells (NaN from an empty column) have no len()/split().
    if not isinstance(row, str) or len(row) < 2:
        return np.nan
    fields = row.split(':')
    if len(fields) < 2 or fields[1] in ("", "."):
        return np.nan
    return [int(value) for value in fields[1].split('|')]

In [None]:
def gb2str(row):
    """Render a row's base-pair offsets as a comma-separated allele string.

    For every offset in ``row["GB"]`` the total length
    ``row["REF"] * row["PERIOD"] + offset`` is split into whole repeats and
    leftover bases, written as ``"<repeats>"`` or ``"<repeats>.<leftover>"``.

    Parameters
    ----------
    row : mapping
        Must provide ``"GB"`` (list of ints, or anything else for a missing
        call), ``"REF"`` and ``"PERIOD"``.

    Returns
    -------
    str
        ``"NA"`` when ``row["GB"]`` is not a list; otherwise the distinct
        alleles joined by ``", "`` in ascending numeric order, so the result
        is stable across runs (iterating a set of strings is not).
    """
    offsets = row["GB"]
    if not isinstance(offsets, list):
        return "NA"

    period = row["PERIOD"]
    # label -> numeric sort key; the dict also de-duplicates identical alleles.
    labels = {}
    for offset in offsets:
        total_bp = row["REF"] * period + offset
        repeats = total_bp // period
        leftover = total_bp % period
        if leftover == 0:
            label = str(int(repeats))
        else:
            label = str(int(repeats)) + "." + str(int(leftover))
        labels[label] = (repeats, leftover)
    return ', '.join(sorted(labels, key=labels.get))

In [None]:
def generateSTRRow(paths_df, method, colname="str",
                   sites_path="/home/xiaomeng/bin/depmap_omics/data/str_hg38.bed"):
    """Build one row of STR allele strings per sample from per-sample VCFs.

    Parameters
    ----------
    paths_df : pandas.DataFrame
        Indexed by sample ID; column ``colname`` holds the ``gs://`` path of
        that sample's VCF (missing values are reported and skipped).
    method : {"hipstr", "gangstr"}
        Which caller produced the VCFs; selects how alleles are decoded.
    colname : str, default "str"
        Name of the column in ``paths_df`` that holds the VCF path.
    sites_path : str
        Tab-separated site list with columns CHROM, START, END, PERIOD, REF,
        ID. Defaults to the path this notebook was developed against.

    Returns
    -------
    pandas.DataFrame
        Index ``sample_id``, one column per site ``ID``, values are allele
        strings (``"NA"`` where a site has no call). Empty when no sample had
        a VCF path.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported callers.
    """
    # Fail fast: an unknown method used to surface later as a KeyError on "STR".
    if method not in ("hipstr", "gangstr"):
        raise ValueError("method must be 'hipstr' or 'gangstr', got: " + repr(method))

    hg38_sites = pd.read_csv(
        sites_path,
        sep="\t",
        names=["CHROM", "START", "END", "PERIOD", "REF", "ID"],
    ).astype({'PERIOD': 'int32'})

    str_rows = []
    for sample_id, paths in paths_df.iterrows():
        if pd.isna(paths[colname]):
            # str() keeps this working for non-string index labels.
            print("no " + method + " vcf available for: " + str(sample_id))
            continue

        calls = read_vcf(paths[colname])
        if method == "hipstr":
            calls["GB"] = calls["GT"].apply(transformGB)
            # Left-merge on the site list so every site gets a column,
            # including sites absent from this sample's VCF.
            merged = hg38_sites.merge(calls[["ID", "GB"]], on='ID', how='left')
            merged["STR"] = merged.apply(gb2str, axis=1)
        else:  # gangstr
            calls = calls.rename(columns={"POS": "START"})
            merged = hg38_sites.merge(
                calls[['CHROM', 'START', "ALT"]], on=['CHROM', 'START'], how='left'
            )
            merged["STR"] = merged.apply(altAllele2str, axis=1)

        merged["sample_id"] = sample_id
        str_rows.append(merged.pivot(index='sample_id', columns='ID', values='STR'))

    # pd.concat([]) raises; hand back an empty frame when nothing was processed.
    if not str_rows:
        return pd.DataFrame()
    return pd.concat(str_rows)

In [None]:
# Full panel of 14 STR loci.
locations_14 = [
    'CSF1PO', 'D13S317', 'D16S539', 'D18S51', 'D21S11', 'D3S1358', 'D5S818',
    'D7S820', 'D8S1179', 'FGA', 'PentaD', 'PentaE', 'TH01', 'TPOX',
]
# Same panel, same order, without D21S11.
locations_13 = [locus for locus in locations_14 if locus != 'D21S11']

In [None]:
def _parse_str_alleles(cell):
    """Return the set of allele labels in one STR cell; empty set if missing."""
    # NaN/None cells count as "no call", same as the literal "NA".
    if cell is None or (isinstance(cell, float) and np.isnan(cell)):
        return set()
    alleles = {allele.strip() for allele in cell.split(",")}
    alleles.discard("")
    if alleles == {"NA"}:
        return set()
    return alleles


def computeTanabe(df1, idx1, df2, idx2, colnames=locations_13):
    """Compute the Tanabe similarity between two STR profiles.

    The score is ``2 * shared / (alleles_in_1 + alleles_in_2)``, accumulated
    over the loci in ``colnames``. A locus is skipped when either profile has
    no call there (``"NA"``, NaN/None or an empty cell).

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Profile tables whose cells are comma-separated allele strings.
    idx1, idx2 : label
        Row labels of the two profiles to compare.
    colnames : list of str
        Loci (column names present in both frames) to score.

    Returns
    -------
    float
        Similarity in [0, 1], or ``np.nan`` when no locus was comparable.
    """
    match = 0
    total = 0
    for col in colnames:
        a1 = _parse_str_alleles(df1.loc[idx1, col])
        a2 = _parse_str_alleles(df2.loc[idx2, col])
        # Both must have a call. The previous test,
        # `a1 != {'NA'} & a2 != {'NA'}`, bound `&` before `!=` and so still
        # counted loci where only the first profile was NA.
        if a1 and a2:
            match += len(a1 & a2)
            total += len(a1) + len(a2)
    if total == 0:
        # No comparable locus: avoid ZeroDivisionError, report "unknown".
        return np.nan
    return 2 * match / total

In [None]:
def makeScoreMatrixDatabase(df_seqid, df_achid, colnames=locations_13):
    """Score every profile in ``df_seqid`` against every profile in ``df_achid``.

    Parameters
    ----------
    df_seqid : pandas.DataFrame
        STR profiles whose index labels are looked up in the tracker's
        sequencing table to obtain a ``ModelID``.
    df_achid : pandas.DataFrame
        Reference STR profiles; rows with a missing index label are ignored.
    colnames : list of str
        Loci forwarded to computeTanabe.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df_seqid``; one column per reference label holding the
        Tanabe score, plus a ``ModelID`` column.
    """
    mytracker = track.SampleTracker()
    seq_table = mytracker.add_model_cols_to_seqtable(cols=["ModelID"])
    # Keep first-seen order so the column layout is the same on every run.
    valid_achids = df_achid.index.dropna().unique().tolist()
    scoremat = pd.DataFrame(columns=valid_achids + ['ModelID'], index=(df_seqid.index))
    for i in df_seqid.index:
        scoremat.loc[i, "ModelID"] = seq_table.loc[i, "ModelID"]
        for j in valid_achids:
            # Forward colnames; it was previously accepted but silently ignored.
            scoremat.loc[i, j] = computeTanabe(df_seqid, i, df_achid, j, colnames=colnames)
    return scoremat

In [None]:
fn1 = 'gs://fc-secure-6b6a3e1a-6fb8-4d30-b0df-a359e6c5d6e6/submissions/90b6e606-b120-4fe6-925d-25d259102afc/hipstr/6e27f596-8bff-43ff-beb5-96350f06ecc4/call-run_hipstr/CDS-d7p66m.vcf.gz'
fn2 = "gs://fc-secure-6b6a3e1a-6fb8-4d30-b0df-a359e6c5d6e6/submissions/7979e8bb-42bf-41b9-857d-d6186cb54aa5/hipstr/3f5d6cab-a0c0-492f-b8ea-b970e9de5365/call-run_hipstr/CDS-5P6nT1.vcf.gz"
fn3 = 'gs://fc-secure-6b6a3e1a-6fb8-4d30-b0df-a359e6c5d6e6/submissions/7979e8bb-42bf-41b9-857d-d6186cb54aa5/hipstr/861270da-afcb-4c4b-92b5-8b9309b57a47/call-run_hipstr/CDS-BkvCmE.vcf.gz'

In [None]:
path_mapping = pd.DataFrame(data=[fn1], columns=["str"], index=["CDS-d7p66m"])

In [None]:
wgs_str_hipstr = generateSTRRow(path_mapping, method="hipstr")

In [None]:
wgs_str_hipstr

In [None]:
import dalmatian as dm

ws = dm.WorkspaceManager("broad-firecloud-ccle/DEV_DepMap_WGS_CN")
samples = ws.get_samples()

In [None]:
samples = samples.iloc[0:50]

In [None]:
samples

In [None]:
wgs_str_hipstr

In [None]:
from gsheets import Sheets

broad_internal_str_url = "https://docs.google.com/spreadsheets/d/134zxrQ77yMdDL4hLYybJJQN6pxLZIVLz-hORXIpid50/edit#gid=1107673601"
MY_ID = "~/.client_secret.json"
MYSTORAGE_ID = "~/.storage.json"
str_names = ['D3S1358','TH01', 'D21S11', 'D18S51', 'Penta E', 'D5S818', 'D13S317', 'D7S820',
             'D16S539', 'CSF1PO', 'Penta D', 'D8S1179', 'TPOX', 'FGA']

sheets = Sheets.from_files(MY_ID, MYSTORAGE_ID)
broad_str = sheets.get(broad_internal_str_url).sheets[0].to_frame(index_col=0)[str_names + ['Source']].rename(columns={"Penta E": "PentaE", "Penta D": "PentaD"})
broad_str = broad_str.fillna("NA")
broad_str = broad_str[~broad_str.index.isin(set([np.nan]))]

In [None]:
minerva_str = sheets.get("https://docs.google.com/spreadsheets/d/1RsJS2e6zgyHwnt4bUyedYoyMbXMNbSmZfsr7SoSKn5U/edit?usp=sharing").sheets[0].to_frame(index_col=0)[str_names].rename(columns={"Penta E": "PentaE", "Penta D": "PentaD"})
minerva_str = minerva_str.fillna("NA")
minerva_str = minerva_str[~minerva_str.index.isin(set([np.nan]))]

In [None]:
mnemosyne_str = sheets.get("https://docs.google.com/spreadsheets/d/1U9S8nvFj87ZcBelKabdMjjfjNUvBKq8Ja8hCWoLHgPw/edit#gid=1623015619").sheets[0].to_frame(index_col=0)[str_names].rename(columns={"Penta E": "PentaE", "Penta D": "PentaD"})
mnemosyne_str = mnemosyne_str.fillna("NA")
mnemosyne_str = mnemosyne_str[~mnemosyne_str.index.isin(set([np.nan]))]


In [None]:
# drop dup rows, if from multiple source, keep the one from achilles
# IDs that occur more than once in the index. Index.duplicated is a single
# vectorised pass, unlike calling list.count once per ID.
dups = set(broad_str.index[broad_str.index.duplicated(keep=False)])
broad_str = broad_str.reset_index()
todrop = []
for d in dups:
    dup_rows = broad_str[broad_str['Arxspan ID'] == d]
    sources = set(dup_rows.Source)
    # Only prune when the duplicates disagree on source and there is an
    # Achilles row to keep.
    if len(sources) > 1 and "Achilles" in sources:
        todrop.extend(dup_rows[dup_rows['Source'] != "Achilles"].index.tolist())
broad_str = broad_str.drop(todrop)
# NOTE: this cell is not re-runnable as-is; "Source" is gone after this line.
broad_str = broad_str.drop(columns=["Source"])
broad_str = broad_str.set_index('Arxspan ID')

In [None]:
broad_str

In [None]:
dom_str = sheets.get("https://docs.google.com/spreadsheets/d/1UtrkGRB0APTXKw_xfp5LObP892YPj0n0sFRfZBKYX0g/edit#gid=1807658353").sheets[0].to_frame(index_col=0)[str_names].rename(columns={"Penta E": "PentaE", "Penta D": "PentaD"}).astype(str)


In [None]:
dom_str

In [None]:
score_mat = makeScoreMatrixDatabase(wgs_str_hipstr, broad_str.loc[["ACH-002056"]])

In [None]:
score_mat

In [None]:
score_mat_minerva = makeScoreMatrixDatabase(wgs_str_hipstr, minerva_str)

In [None]:
score_mat_mnemosyne = makeScoreMatrixDatabase(wgs_str_hipstr, mnemosyne_str)

In [None]:
# Print every reference whose score against CDS-d7p66m exceeds 0.6.
score_columns = [c for c in score_mat_mnemosyne.columns.tolist() if c != "ModelID"]
for c in score_columns:
    if score_mat_mnemosyne.loc["CDS-d7p66m", c] > 0.6:
        print(c)

In [None]:
score_mat_mnemosyne.loc["CDS-d7p66m", ["2817-0", "2817-4"]]

In [None]:
# NOTE(review): `wgs_str` is not assigned in any cell above, so this fails on a
# fresh kernel. Presumably wgs_str_hipstr or wgs_str_gangstr was meant - confirm.
wgs_str.loc[["CDS-00Nrci"]][locations_13]

In [None]:
broad_str.loc[["ACH-000839"]][locations_13]

In [None]:
# NOTE(review): `score_mat_minerva_t` is first assigned in the next cell, so on
# a fresh kernel this lookup only works after that cell has run.
score_mat_minerva_t.loc["2869-1", "CDS-Zdl1W4"]

In [None]:
score_mat_minerva_t = score_mat_minerva.T
score_mat_minerva_t = score_mat_minerva_t.drop(["ModelID"])
score_mat_minerva_t[score_mat_minerva_t["CDS-Zdl1W4"].astype(float) > 0.6]

In [None]:
mytracker = track.SampleTracker()
seq_table = mytracker.add_model_cols_to_seqtable(cols=["ModelID", "PatientID"])

model_table = mytracker.read_model_table()

In [None]:
# Split each sample's scores into two pools: the score against its own model,
# and the scores against every other model that belongs to a different patient.
scores = []
mismatch_scores = []
for i in score_mat.index:
    own_model = score_mat.loc[i, "ModelID"]
    all_columns = set(score_mat.columns)
    if own_model not in all_columns:
        continue
    patient_id = seq_table.loc[i, "PatientID"]
    same_patient = set(model_table[model_table.PatientID == patient_id].index)
    scores.append(score_mat.loc[i, own_model])
    mismatches = all_columns - {"ModelID", own_model} - same_patient
    mismatch_scores.extend(score_mat.loc[i, list(mismatches)].tolist())

In [None]:
score_mat[score_mat.ModelID == "ACH-002835"]["ACH-002835"]

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Distribution of scores between each sample and its own model's STR profile.
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(scores, bins=20)
ax.set(
    title="hipSTR: sample vs. own model",
    xlabel="Tanabe similarity",
    ylabel="Number of comparisons",
)

# Show plot
plt.show()

In [None]:
# plot mismatch scores:

# Distribution of scores between each sample and models of other patients.
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(mismatch_scores, bins=20)
ax.set(
    title="hipSTR: sample vs. unrelated models",
    xlabel="Tanabe similarity",
    ylabel="Number of comparisons",
)

# Show plot
plt.show()

In [None]:
mismatch_scores.sort()

In [None]:
mismatch_scores[-20:]

# GangSTR

In [None]:
def altAllele2str(row):
    """Render a row's ALT allele sequences as a comma-separated allele string.

    Each comma-separated sequence in ``row["ALT"]`` is converted from its
    length into whole repeats of ``row["PERIOD"]`` plus leftover bases,
    written as ``"<repeats>"`` or ``"<repeats>.<leftover>"``.

    Parameters
    ----------
    row : mapping
        Must provide ``"ALT"`` (str, ``"."`` or NaN) and ``"PERIOD"``.

    Returns
    -------
    str
        ``"NA"`` when ALT is missing or ``"."``; otherwise the distinct
        alleles joined by ``", "`` in ascending numeric order, so the result
        is stable across runs (iterating a set of strings is not).
    """
    # NOTE(review): only ALT is inspected, so a reference allele carried by the
    # sample never appears in the output - confirm this is intended.
    alt = row["ALT"]
    if pd.isna(alt) or alt == ".":
        return "NA"

    period = row["PERIOD"]
    # label -> numeric sort key; the dict also de-duplicates identical alleles.
    labels = {}
    for allele_seq in alt.split(","):
        length = len(allele_seq)
        repeats = length // period
        leftover = length % period
        if leftover == 0:
            label = str(int(repeats))
        else:
            label = str(int(repeats)) + "." + str(int(leftover))
        labels[label] = (repeats, leftover)
    return ', '.join(sorted(labels, key=labels.get))

In [None]:
gangstr_fn1 = "gs://fc-secure-9dffc819-20a8-49ea-8fa8-1b1bab1475d0/submissions/7b4fe301-f573-4859-bd05-98191c15a608/GangSTR/84e4fe9a-096e-4613-a0fd-462a1081a294/call-CallGangSTR/attempt-2/CDS-00Nrci.vcf"
gangstr_fn2 = "gs://fc-secure-9dffc819-20a8-49ea-8fa8-1b1bab1475d0/submissions/7b4fe301-f573-4859-bd05-98191c15a608/GangSTR/1bea68db-ec31-4e18-b2e0-9c76fce185ed/call-CallGangSTR/attempt-2/CDS-0b4jFH.vcf"
gangstr_fn3 = "gs://fc-secure-9dffc819-20a8-49ea-8fa8-1b1bab1475d0/submissions/7b4fe301-f573-4859-bd05-98191c15a608/GangSTR/643addc8-e6f0-45ae-812e-79fa599b3328/call-CallGangSTR/CDS-0bV15m.vcf"
gangstr_fn4 = "gs://fc-secure-9dffc819-20a8-49ea-8fa8-1b1bab1475d0/submissions/7b4fe301-f573-4859-bd05-98191c15a608/GangSTR/4d798622-7f63-41b2-8a05-5c06287556f9/call-CallGangSTR/CDS-0e3PRe.vcf"
gangstr_fn5 = "gs://fc-secure-9dffc819-20a8-49ea-8fa8-1b1bab1475d0/submissions/7b4fe301-f573-4859-bd05-98191c15a608/GangSTR/d987a65c-0cd0-48d0-9e8d-a439de082dcc/call-CallGangSTR/attempt-2/CDS-0ewUnk.vcf"

In [None]:
path_mapping_gangstr = pd.DataFrame(data=[gangstr_fn1, gangstr_fn2, gangstr_fn3, gangstr_fn4, gangstr_fn5], columns=["str"], index=["CDS-00Nrci", "CDS-0b4jFH", "CDS-0bV15m", "CDS-0e3PRe", "CDS-0ewUnk"])

In [None]:
wgs_str_gangstr = generateSTRRow(path_mapping_gangstr, method="gangstr")

In [None]:
wgs_str_gangstr.loc[["CDS-00Nrci"]]

In [None]:
wgs_str_hipstr.loc[["CDS-00Nrci"]]

In [None]:
df = read_vcf(gangstr_fn1)
df

In [None]:
from taigapy import TaigaClient
tc = TaigaClient()

CCLE_gene_cn = tc.get(name='internal-20q4-2540', version=47, file='CCLE_gene_cn')

In [None]:
"ACH-001955" in CCLE_gene_cn.index

In [None]:
wgs_str_gangstr_all = generateSTRRow(samples, method="gangstr", colname="GangSTR_vcf")

In [None]:
wgs_str_gangstr_all

In [None]:
score_mat_gangstr = makeScoreMatrixDatabase(wgs_str_gangstr_all, broad_str, colnames=locations_14)

In [None]:
score_mat_gangstr

In [None]:
# Same split as for hipSTR: own-model scores vs. scores against models that
# belong to a different patient.
scores_gangstr = []
mismatch_scores_gangstr = []
for i in score_mat_gangstr.index:
    own_model = score_mat_gangstr.loc[i, "ModelID"]
    all_columns = set(score_mat_gangstr.columns)
    if own_model not in all_columns:
        continue
    patient_id = seq_table.loc[i, "PatientID"]
    same_patient = set(model_table[model_table.PatientID == patient_id].index)
    scores_gangstr.append(score_mat_gangstr.loc[i, own_model])
    mismatches = all_columns - {"ModelID", own_model} - same_patient
    mismatch_scores_gangstr.extend(score_mat_gangstr.loc[i, list(mismatches)].tolist())

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Distribution of scores between each sample and its own model's STR profile.
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(scores_gangstr, bins=20)
ax.set(
    title="GangSTR: sample vs. own model",
    xlabel="Tanabe similarity",
    ylabel="Number of comparisons",
)

# Show plot
plt.show()

In [None]:
# plot mismatch scores:

# Distribution of scores between each sample and models of other patients.
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(mismatch_scores_gangstr, bins=20)
ax.set(
    title="GangSTR: sample vs. unrelated models",
    xlabel="Tanabe similarity",
    ylabel="Number of comparisons",
)

# Show plot
plt.show()

In [None]:
# check ACH-000600 and ACH-000658 against STR database

In [None]:
ids = ["CDS-0DgVJy", "CDS-p5raBU", "CDS-2FC7DW"]

In [None]:
import dalmatian as dm

wm = dm.WorkspaceManager("broad-firecloud-ccle/CCLE_SNP_QC-copy").disable_hound()

In [None]:
mytracker = track.SampleTracker()
seq_table = mytracker.read_seq_table()

samples = seq_table.loc[ids]

In [None]:
HG38BAMCOL = ["bam_filepath", "bai_filepath"]

# Copy before adding a column: `samples[HG38BAMCOL]` is a slice of `samples`,
# and assigning into it triggers SettingWithCopyWarning.
bams = samples[HG38BAMCOL].copy()
bams["id"] = bams.index

# Upload sample sheet
# The sample ID is used for both sample_id and participant_id.
samples_df = pd.DataFrame(
    bams[HG38BAMCOL + ["id", "id"]].values,
    columns=["internal_bam_filepath", "internal_bai_filepath"] + ["sample_id", "participant_id"],
)

In [None]:
samples_df = samples_df.set_index("sample_id")
wm.upload_samples(samples_df, add_participant_samples=True)

In [None]:
fn1 = "gs://fc-secure-9c2c0017-9eb5-47e5-9f43-983097f1f7b6/submissions/5160df09-c8f6-47c1-a1b0-1016aad12f93/hipstr/09f55f83-7f97-4e61-9033-d34b950b9456/call-run_hipstr/CDS-0DgVJy.vcf.gz"
fn2 = "gs://fc-secure-9c2c0017-9eb5-47e5-9f43-983097f1f7b6/submissions/5160df09-c8f6-47c1-a1b0-1016aad12f93/hipstr/1be62168-30cf-491b-8227-c7f1876ac603/call-run_hipstr/CDS-p5raBU.vcf.gz"
fn3 = "gs://fc-secure-9c2c0017-9eb5-47e5-9f43-983097f1f7b6/submissions/5160df09-c8f6-47c1-a1b0-1016aad12f93/hipstr/86b096e1-53db-4665-8dfa-c731885d7312/call-run_hipstr/cacheCopy/CDS-2FC7DW.vcf.gz"

In [None]:
path_mapping = pd.DataFrame(data=[fn1, fn2, fn3], columns=["str"], index=["CDS-0DgVJy", "CDS-p5raBU", "CDS-2FC7DW"])

In [None]:
path_mapping = pd.DataFrame(data=[fn3], columns=["str"], index=["CDS-2FC7DW"])

In [None]:
wgs_str_hipstr = generateSTRRow(path_mapping, method="hipstr")

In [None]:
score_mat = makeScoreMatrixDatabase(wgs_str_hipstr, broad_str)

In [None]:
score_mat.loc[["CDS-0DgVJy", "CDS-p5raBU", "CDS-2FC7DW"], ["ACH-001341", "ACH-000658"]]

In [None]:
wgs_str_hipstr

# Trying out Gumbo STR table

In [None]:
from depmap_omics_upload import tracker as track

mytracker = track.SampleTracker(gumbo_env="staging")

In [None]:
model_table = mytracker.read_model_table()

In [None]:
# `client` is never defined in this notebook; the Gumbo client is reached
# through the tracker, as the insert_only/commit cells below do.
str_table = mytracker.client.get("str_profile")

In [None]:
str_table.columns

In [None]:
str_col_rename = {'D3S1358': 'd3s1358', 
                  'TH01': 'th01', 
                  'D21S11': 'd21s11', 
                  'D18S51': 'd18s51', 
                  'Penta E': 'penta_e',
                  'D5S818': 'd5s818', 
                  'D13S317': 'd13s317', 
                  'D7S820':'d7s820', 
                  'D16S539': 'd16s539', 
                  'CSF1PO':'csf1po', 
                  'Penta D': 'penta_d', 
                  'vWA': 'vwa',
                  'D8S1179': 'd8s1179', 
                  'TPOX': 'tpox', 
                  'FGA': 'fga', 
                  'Amelogenin': 'amelogenin', 
                  'Mouse': 'mouse', 
                  'Mycoplasma': 'mycoplasma',
                  'LabCorpSpecNbr': 'lab_corp_spec_nbr', 
                  'LabCorpCaseNbr': 'lab_corp_case_nbr', 
                  'Source': 'source_group'}

In [None]:
from gsheets import Sheets
import numpy as np

broad_internal_str_url = "https://docs.google.com/spreadsheets/d/134zxrQ77yMdDL4hLYybJJQN6pxLZIVLz-hORXIpid50/edit#gid=1107673601"
MY_ID = "~/.client_secret.json"
MYSTORAGE_ID = "~/.storage.json"
str_names = ['D3S1358','TH01', 'D21S11', 'D18S51', 'Penta E', 'D5S818', 'D13S317', 'D7S820',
             'D16S539', 'CSF1PO', 'Penta D', 'D8S1179', 'TPOX', 'FGA']

sheets = Sheets.from_files(MY_ID, MYSTORAGE_ID)
broad_str = sheets.get(broad_internal_str_url).sheets[0].to_frame()[list(str_col_rename.keys()) + ["Arxspan ID"]].rename(columns=str_col_rename)

In [None]:
broad_str.columns

In [None]:
# Explicit copy: the following cells add columns to this frame, which would
# otherwise write into a slice of broad_str (SettingWithCopyWarning).
# Note the slice holds ten rows despite the variable name.
first5 = broad_str.iloc[1055:1065].copy()

In [None]:
first5['source'] = 'Internal'
first5['is_reference'] = False

In [None]:
for i in first5.index:
    patient_id = model_table.loc[first5.loc[i, "Arxspan ID"], "PatientID"]
    first5.loc[i, "patient_id"] = patient_id

In [None]:
first5 = first5.drop(columns=["Arxspan ID"])

In [None]:
first5

In [None]:
mytracker.client.insert_only("str_profile", first5)

In [None]:
mytracker.client.commit()

# Clean up and reformat existing STR sheets

In [None]:
from depmap_omics_upload import tracker as track

mytracker = track.SampleTracker(gumbo_env="production")

mc_table = mytracker.read_mc_table()
model_table = mytracker.read_model_table()

## DMX

In [None]:
from gsheets import Sheets
import numpy as np

broad_internal_str_url = "https://docs.google.com/spreadsheets/d/134zxrQ77yMdDL4hLYybJJQN6pxLZIVLz-hORXIpid50/edit#gid=1107673601"
MY_ID = "~/.client_secret.json"
MYSTORAGE_ID = "~/.storage.json"
str_names = ['D3S1358','TH01', 'D21S11', 'D18S51', 'Penta E', 'D5S818', 'D13S317', 'D7S820',
             'D16S539', 'CSF1PO', 'Penta D', 'D8S1179', 'TPOX', 'FGA']

sheets = Sheets.from_files(MY_ID, MYSTORAGE_ID)
broad_str = sheets.get(broad_internal_str_url).sheets[0].to_frame()[list(str_col_rename.keys()) + ["Arxspan ID", "Annotation", "Notes", "Sample Reference"]].rename(columns=str_col_rename)

In [None]:
import pandas as pd
import numpy as np

# replace all NA* with NaN
broad_str = broad_str.replace({"NA*": np.nan})

In [None]:
# rename arxspan ids that are no longer in gumbo as a result of model merging:
arxspan_rename = {"ACH-001665": "ACH-000338", "ACH-003070": "ACH-001281", "ACH-000643-03": "ACH-000643"}
broad_str["Arxspan ID"] = broad_str["Arxspan ID"].replace(arxspan_rename)

In [None]:
# ignore arxspan ids that are not in model table
broad_str = broad_str[broad_str["Arxspan ID"].isin(set(model_table.index))]

In [None]:
# ignore minerva entries, since they're duplicates
broad_str = broad_str[broad_str["source_group"] != "Minerva"]

In [None]:
# fill patient ids
# Look up each row's PatientID by its Arxspan ID in one vectorised step.
# This replaces a row loop that pre-filled the column with np.NaN, an alias
# that was removed in NumPy 2.0. Every Arxspan ID is known to be in
# model_table because of the filter applied two cells above.
broad_str["patient_id"] = broad_str["Arxspan ID"].map(model_table["PatientID"])

In [None]:
# rename values in mouse column:
mouse_rename_dict = {"NA*": "NA", "Negative": "Not Detected", "POSITIVE": "Positive", np.nan: "NA"}
broad_str["mouse"] = broad_str["mouse"].replace(mouse_rename_dict)

In [None]:
# check STRs in the DMX sheet that only maps to one SANGER MC
# so sam can create BROAD MCs for them in gumbo
# Print models whose single model condition in gumbo comes from SANGER.
for arxs in broad_str["Arxspan ID"].unique().tolist():
    mcs = mc_table[mc_table.ModelID == arxs]
    sanger_count = (mcs.Source == "SANGER").sum()
    if len(mcs) == 1 and sanger_count == 1:
        print(arxs)

In [None]:
# attempts to fill mc-ids!
# see under each model, if there's one and only one Broad MC:
# Start from an empty column (as the Minerva section does) so the column always
# exists for the cells below and a re-run does not keep stale assignments.
broad_str["model_condition_id"] = None
for arxs in broad_str["Arxspan ID"].unique().tolist():
    mcs = mc_table[(mc_table.ModelID == arxs)]
    if len(mcs) == 0:
        print(str(arxs) + ": no MC in gumbo!")
    elif len(mcs) != 1:
        # Several MCs: accept only an unambiguous Broad MC that was not
        # expanded by CCLF, Minerva or Mnemosyne.
        broad_mcs = mcs[(mcs.Source == "BROAD") & (~mcs.expansion_team.isin(["CCLF", "Minerva", "Mnemosyne"]))]
        if len(broad_mcs) != 1:
            # Ambiguous; printed so it can be mapped by hand.
            print(arxs)
        else:
            broad_str.loc[broad_str["Arxspan ID"] == arxs, "model_condition_id"] = broad_mcs.index.tolist()[0]
    else:
        # Exactly one MC: use it unless it is a SANGER one.
        if len(mcs[mcs.Source == "SANGER"]) != 1:
            broad_str.loc[broad_str["Arxspan ID"] == arxs, "model_condition_id"] = mcs.index.tolist()[0]

In [None]:
# for lines that can't be programmatically mapped above, rename manually
model2mc_dict={'ACH-002411': 'MC-002411-Xj7Q',
'ACH-002415': 'MC-002415-MwHT',
'ACH-002416': 'MC-002416-sOXV',
'ACH-002418': 'MC-002418-FEiN',
'ACH-000219': 'MC-000219-yd7j',
'ACH-001163': 'MC-001163-vV3z',
'ACH-001164': 'MC-001164-VOSE',
'ACH-000614': 'MC-000614-7X1Q',
'ACH-002410': 'MC-002410-gy9C',
'ACH-002452': 'MC-002452-vQCK',
'ACH-002454': 'MC-002454-LPKC',
'ACH-002456': 'MC-002456-294k'}

In [None]:
# Apply the hand-curated model -> MC mapping to rows still lacking an MC.
rows_without_mc = broad_str[broad_str.model_condition_id.isna()].index
for i in rows_without_mc:
    arxspan_id = broad_str.loc[i, "Arxspan ID"]
    if arxspan_id in model2mc_dict:
        broad_str.loc[i, "model_condition_id"] = model2mc_dict[arxspan_id]
    else:
        print("no mapping available for " + arxspan_id)

In [None]:
# combine "annotation" and "notes" to populate the "comment" column
def combine_comments(row):
    """Merge a row's "Annotation" and "Notes" into a single comment.

    Parameters
    ----------
    row : mapping
        Must provide ``"Annotation"`` and ``"Notes"`` (str or null).

    Returns
    -------
    str or null
        ``"<Annotation>; <Notes>"`` when both are present, whichever one is
        present when the other is null, and the (null) Annotation value when
        both are null.
    """
    annotation = row["Annotation"]
    notes = row["Notes"]
    if pd.isnull(notes):
        return annotation
    # A null Annotation with real Notes used to raise TypeError (NaN + str).
    if pd.isnull(annotation):
        return notes
    return annotation + "; " + notes

broad_str["comments"] = broad_str.apply(lambda x: combine_comments(x), axis=1)

In [None]:
# all PRISM profiles shouldn't have MCs assigned yet
is_prism = broad_str.source_group == "PRISM"
broad_str.loc[is_prism, "model_condition_id"] = np.nan

In [None]:
# all DMX STR profiles are internal
broad_str["source"] = "Internal"

In [None]:
# drop columns that are not in gumbo
# NOTE(review): relies on `str_table` assigned in the earlier
# "Trying out Gumbo STR table" section; run that cell first on a fresh kernel.
broad_str = broad_str[[c for c in broad_str.columns if c in str_table.columns]]

In [None]:
broad_str

In [None]:
broad_str[broad_str.source_group == "PRISM"]

In [None]:
from depmap_omics_upload import tracker as track

mytracker = track.SampleTracker(gumbo_env="staging")

mytracker.client.insert_only("str_profile", broad_str)

In [None]:
mytracker.client.commit()

## Minerva

In [None]:
str_col_rename_minerva = {
                        'D3S1358': 'd3s1358', 
                        'TH01': 'th01', 
                        'D21S11': 'd21s11', 
                        'D18S51': 'd18s51', 
                        'Penta E': 'penta_e',
                        'D5S818': 'd5s818', 
                        'D13S317': 'd13s317', 
                        'D7S820':'d7s820',
                        'D16S539': 'd16s539', 
                        'CSF1PO':'csf1po', 
                        'Penta D': 'penta_d', 
                        'vWA': 'vwa',
                        'D8S1179': 'd8s1179', 
                        'TPOX': 'tpox', 
                        'FGA': 'fga', 
                        'AMEL': 'amelogenin', 
                        'Mouse': 'mouse',
                        'LabCorpSpecNbr': 'lab_corp_spec_nbr', 
                        'LabCorpCaseNbr': 'lab_corp_case_nbr',
                        'Date Submitted': "pellet_submitted_date",
                        'Date Collected': "pellet_creation_date",
                        'SampleReferenceNbr': "sample_reference"}

In [None]:
from gsheets import Sheets
import numpy as np

MY_ID = "~/.client_secret.json"
MYSTORAGE_ID = "~/.storage.json"

sheets = Sheets.from_files(MY_ID, MYSTORAGE_ID)

minerva_str = sheets.get("https://docs.google.com/spreadsheets/d/1RsJS2e6zgyHwnt4bUyedYoyMbXMNbSmZfsr7SoSKn5U/edit?usp=sharing").sheets[0].to_frame(index_col=0)[list(str_col_rename_minerva.keys())].rename(columns=str_col_rename_minerva)

In [None]:
# add Arxspan id column
minerva_str["Arxspan ID"] = "ACH-00" + minerva_str.index.str.split("-").str[0]

In [None]:
# fill patient ids
# Look up each row's PatientID by its Arxspan ID in one vectorised step.
# This replaces a row loop that pre-filled the column with np.NaN, an alias
# that was removed in NumPy 2.0.
unknown_models = ~minerva_str["Arxspan ID"].isin(model_table.index)
if unknown_models.any():
    # Keep the loop's loud failure for IDs that are missing from the model table.
    raise KeyError(
        "Arxspan IDs not in model table: "
        + ", ".join(sorted(set(minerva_str.loc[unknown_models, "Arxspan ID"].astype(str))))
    )
minerva_str["patient_id"] = minerva_str["Arxspan ID"].map(model_table["PatientID"])

In [None]:
# attempts to fill mc-ids!
# see under each model, if there's one and only one Broad MC:
minerva_str["model_condition_id"] = None
for arxs in minerva_str["Arxspan ID"].unique().tolist():
    mcs = mc_table[(mc_table.ModelID == arxs) & (mc_table.expansion_team == "Minerva") & (mc_table.Source == "BROAD")]
    if len(mcs) == 0:
        print(str(arxs) + ": no Broad Minerva MC in gumbo!")
    elif len(mcs) > 1:
        print(str(arxs) + ": multiple Broad Minerva MCs in gumbo!")
    else:
        minerva_str.loc[minerva_str["Arxspan ID"] == arxs, "model_condition_id"] = mcs.index.tolist()[0]

In [None]:
# rename values in mouse column:
mouse_rename_dict = {"NA*": "NA", "Negative": "Not Detected", 'Not detected': 'Not Detected', "POSITIVE": "Positive", np.nan: "NA"}
minerva_str["mouse"] = minerva_str["mouse"].replace(mouse_rename_dict)

In [None]:
# drop redundant columns
minerva_str = minerva_str.drop(columns=["Arxspan ID"])

In [None]:
# fill in source, source group and is_ref columns
minerva_str["source"] = "Internal"
minerva_str["source_group"] = "Minerva"
minerva_str["is_reference"] = False

In [None]:
minerva_str

In [None]:
#mytracker.client.insert_only("str_profile", minerva_str.loc[["2730-1", "2730-2", "2730-3"]])

In [None]:
#mytracker.client.commit()

## Mnemosyne

In [None]:
from gsheets import Sheets
import numpy as np

MY_ID = "~/.client_secret.json"
MYSTORAGE_ID = "~/.storage.json"

sheets = Sheets.from_files(MY_ID, MYSTORAGE_ID)
# The DMX sheet is deliberately not re-read into `broad_str` here. Doing so
# replaced the cleaned DMX frame (with patient_id) built in the DMX section,
# and the "Assign reference" cells further down read broad_str.patient_id.

mnemosyne_str = sheets.get("https://docs.google.com/spreadsheets/d/1U9S8nvFj87ZcBelKabdMjjfjNUvBKq8Ja8hCWoLHgPw/edit#gid=1623015619").sheets[0].to_frame(index_col=0)[list(str_col_rename_minerva.keys())].rename(columns=str_col_rename_minerva)

In [None]:
# add Arxspan id column
mnemosyne_str["Arxspan ID"] = "ACH-00" + mnemosyne_str.index.str.split("-").str[0]

In [None]:
# fill patient ids
print("indices not in gumbo: ")
print(mnemosyne_str[~mnemosyne_str["Arxspan ID"].isin(model_table.index.tolist())].index)

# Keep only profiles whose model exists in gumbo. The copy means the column
# assignments below act on an independent frame rather than a filtered slice.
mnemosyne_str = mnemosyne_str[mnemosyne_str["Arxspan ID"].isin(model_table.index.tolist())].copy()
# Vectorised PatientID lookup, replacing a row loop that pre-filled the column
# with np.NaN (an alias removed in NumPy 2.0).
mnemosyne_str["patient_id"] = mnemosyne_str["Arxspan ID"].map(model_table["PatientID"])

In [None]:
# attempts to fill mc-ids!
# see under each model, if there's one and only one Broad MC:
# Start from an empty column (as the Minerva section does) so the column exists
# even when no model has a unique Mnemosyne MC, and a re-run starts clean.
mnemosyne_str["model_condition_id"] = None
for arxs in mnemosyne_str["Arxspan ID"].unique().tolist():
    mcs = mc_table[(mc_table.ModelID == arxs) & (mc_table.expansion_team == "Mnemosyne") & (mc_table.Source == "BROAD")]
    if len(mcs) == 0:
        print(str(arxs) + ": no Broad Mnemosyne MC in gumbo!")
    elif len(mcs) > 1:
        print(str(arxs) + ": multiple Broad Mnemosyne MCs in gumbo!")
    else:
        # Exactly one candidate: assign it to every profile of this model.
        mnemosyne_str.loc[mnemosyne_str["Arxspan ID"] == arxs, "model_condition_id"] = mcs.index.tolist()[0]

In [None]:
# rename values in mouse column:
mouse_rename_dict = {"NA*": "NA", "Negative": "Not Detected", 'Not detected': 'Not Detected', "POSITIVE": "Positive", np.nan: "NA"}
mnemosyne_str["mouse"] = mnemosyne_str["mouse"].replace(mouse_rename_dict)

In [None]:
# drop redundant columns
mnemosyne_str = mnemosyne_str.drop(columns=["Arxspan ID"])

In [None]:
# fill in source, source group and is_ref columns
mnemosyne_str["source"] = "Internal"
mnemosyne_str["source_group"] = "Mnemosyne"
mnemosyne_str["is_reference"] = False

In [None]:
mnemosyne_str

## WGS derived

In [None]:
str_col_rename_omics = {'D3S1358': 'd3s1358', 
                  'TH01': 'th01', 
                  'D21S11': 'd21s11', 
                  'D18S51': 'd18s51', 
                  'PentaE': 'penta_e',
                  'D5S818': 'd5s818', 
                  'D13S317': 'd13s317', 
                  'D7S820':'d7s820', 
                  'D16S539': 'd16s539', 
                  'CSF1PO':'csf1po', 
                  'PentaD': 'penta_d', 
                  'vWA': 'vwa',
                  'D8S1179': 'd8s1179', 
                  'TPOX': 'tpox', 
                  'FGA': 'fga'}

In [None]:
import dalmatian as dm

ws = dm.WorkspaceManager("broad-firecloud-ccle/DEV_DepMap_WGS_CN")
samples = ws.get_samples()

In [None]:
wgs_str_hipstr = generateSTRRow(samples.iloc[:10], method="hipstr")

In [None]:
# replace "NA" with nan
wgs_str_hipstr = wgs_str_hipstr.replace({"NA": np.nan})

In [None]:
# map patient and MC ids
from depmap_omics_upload import tracker as track

mytracker = track.SampleTracker(gumbo_env="staging")

seq_table = mytracker.read_seq_table()
pr_table = mytracker.add_model_cols_to_prtable(["ModelID", "PatientID"])

In [None]:
for i in wgs_str_hipstr.index:
    pr_id = seq_table.loc[i, "ProfileID"]
    wgs_str_hipstr.loc[i, "patient_id"] = pr_table.loc[pr_id, "PatientID"]
    wgs_str_hipstr.loc[i, "model_condition_id"] = pr_table.loc[pr_id, "ModelCondition"]

In [None]:
# rename columns into snake case
wgs_str_hipstr = wgs_str_hipstr.rename(columns=str_col_rename_omics)

In [None]:
# fill in source, source group and is_ref columns
wgs_str_hipstr["source"] = "Internal"
wgs_str_hipstr["source_group"] = "WGS Inferred"
wgs_str_hipstr["is_reference"] = False

In [None]:
# randomly generate indices
from depmap_omics_upload.mgenepy.utils import helper as h

wgs_str_hipstr["id"] = [
        "STR-" + h.randomString(stringLength=6, stype="all", withdigits=True)
        for _ in range(len(wgs_str_hipstr))
    ]

In [None]:
for i in wgs_str_hipstr.index:
    seq_table.loc[i, "str_profile"] = wgs_str_hipstr.loc[i, "id"]

In [None]:
wgs_str_hipstr = wgs_str_hipstr.reset_index()
wgs_str_hipstr = wgs_str_hipstr.drop(columns="sample_id")

In [None]:
wgs_str_hipstr

In [None]:
mytracker.client.insert_only("str_profile", wgs_str_hipstr)

In [None]:
mytracker.client.commit()

In [None]:
mytracker.write_seq_table(seq_table)

# Assign reference

In [None]:
from depmap_omics_upload import tracker as track

mytracker = track.SampleTracker(gumbo_env="production")

mc_table = mytracker.read_mc_table()
model_table = mytracker.read_model_table()

In [None]:
print("total number of patients: ")
print(len(model_table.PatientID.unique()))

In [None]:
print("number of patients in DMX's sheet (minerva excluded): ")
print(len(broad_str.patient_id.unique()))

In [None]:
print("number of patients with multiple DMX STR profiles: ")
print(len(broad_str[broad_str.patient_id.duplicated()].patient_id.unique()))

In [None]:
tda_str = pd.concat([minerva_str, mnemosyne_str], ignore_index=True)

In [None]:
tda_str

In [None]:
# Minerva/Mnemosyne profiles whose patient also has a DMX profile.
tda_dups_dmx = tda_str[tda_str.patient_id.isin(broad_str.patient_id)]
print("number of patients with multiple STR profiles, between DMX and minerva/mnemosyne:")
# The message announces a count, so print it (it was previously never shown).
print(len(tda_dups_dmx.patient_id.unique()))
tda_dups_dmx.to_csv("tda_dups_dmx.csv")