In [None]:
#https://gist.github.com/dceoy/99d976a2c01e7f0ba1c813778f9db744

import io
import os
import pandas as pd
import gzip
import numpy as np
from google.cloud import storage
from depmap_omics_upload import tracker as track


def read_vcf(path):
    """Download a gzipped VCF from Google Cloud Storage into a DataFrame.

    Args:
        path: Object location split on "/": the third component is used as
            the bucket name and everything after it as the blob name
            (i.e. the "gs://<bucket>/<blob path>" layout).

    Returns:
        pandas.DataFrame with the columns CHROM, POS, ID, REF, ALT, QUAL,
        FILTER, INFO, FORMAT and GT, one row per line of the file that does
        not start with "#".
    """
    client = storage.Client()
    parts = path.split("/")
    bucket = client.bucket(parts[2])
    blob = bucket.blob("/".join(parts[3:]))
    compressed = io.BytesIO(blob.download_as_string())

    # Header/meta lines all start with "#"; everything else is a record.
    records = []
    with gzip.open(compressed, 'r') as handle:
        for raw in handle:
            if raw.startswith(b'#'):
                continue
            records.append(raw.decode("utf-8"))

    column_names = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'GT']
    return pd.read_csv(io.StringIO(''.join(records)), names=column_names, sep='\t')

In [None]:
def transformGB(row):
    """Extract the second ":"-separated sub-field of a sample string as ints.

    Args:
        row: The per-sample VCF field as a string (the "GT" column produced
            by read_vcf). Presumably the second sub-field is HipSTR's GB
            (base-pair difference from the reference) — TODO confirm.

    Returns:
        A list of ints parsed from the "|"-separated second sub-field, or
        np.nan when the string is shorter than two characters (e.g. ".").
    """
    if len(row) >= 2:
        gb_field = row.split(':')[1]
        return list(map(int, gb_field.split('|')))
    return np.nan

In [None]:
def gb2str(row):
    """Render one locus' genotype as a string of repeat-count allele labels.

    Each entry of row["GB"] is added to the reference length
    (row["REF"] * row["PERIOD"]) and the total is expressed in units of
    row["PERIOD"]: "<whole repeats>" when the total divides evenly, otherwise
    "<whole repeats>.<leftover>".

    Args:
        row: Mapping/Series with keys "GB" (list of ints, or a non-list
            placeholder when there is no call), "REF" and "PERIOD".

    Returns:
        "NA" when row["GB"] is not a list; otherwise the distinct allele
        labels joined with ", ", ordered by increasing allele length.
    """
    if not isinstance(row["GB"], list):
        return "NA"

    period = row["PERIOD"]
    # label -> total length; the dict de-duplicates identical alleles.
    lengths_by_label = {}
    for offset in row["GB"]:
        total_length = row["REF"] * period + offset
        whole = total_length // period
        leftover = total_length % period
        label = str(int(whole))
        if leftover != 0:
            label += "." + str(int(leftover))
        lengths_by_label.setdefault(label, total_length)

    # Sort by length instead of relying on set iteration order, which varies
    # between interpreter runs because of string hash randomization.
    return ', '.join(sorted(lengths_by_label, key=lengths_by_label.get))

In [None]:
def generateSTRRow(paths_df, sites_path="/home/xiaomeng/bin/depmap_omics/data/str_hg38.bed"):
    """Build one row of STR allele strings per sample from hipSTR VCFs.

    Args:
        paths_df: DataFrame indexed by sample ID with a "str" column holding
            the location of each sample's hipSTR VCF (missing values are
            skipped with a message).
        sites_path: Tab-separated, header-less file with the columns
            CHROM, START, END, PERIOD, REF, ID describing the STR sites.
            Defaults to the location the notebook originally hard-coded.

    Returns:
        DataFrame indexed by "sample_id" with one column per site ID and the
        string produced by gb2str as value. When no sample has a VCF, an
        empty frame with the same columns is returned.
    """
    hg38_sites = pd.read_csv(
        sites_path,
        sep="\t",
        names=["CHROM", "START", "END", "PERIOD", "REF", "ID"],
    ).astype({'PERIOD': 'int32'})

    str_rows = []
    for sample_id, paths in paths_df.iterrows():
        if pd.isna(paths["str"]):
            # str() so that non-string index labels do not break the message.
            print("no hipSTR vcf available for: " + str(sample_id))
            continue

        calls = read_vcf(paths["str"])
        calls["GB"] = calls["GT"].map(transformGB)
        # Left merge keeps every site, so sites absent from the VCF get a
        # missing GB and are rendered as "NA" by gb2str.
        profile = hg38_sites.merge(calls[["ID", "GB"]], on='ID', how='left')
        profile["STR"] = profile.apply(gb2str, axis=1)
        # Maybe no need to convert to model IDs here yet?
        profile["sample_id"] = sample_id
        str_rows.append(profile.pivot(index='sample_id', columns='ID', values='STR'))

    if not str_rows:
        # pd.concat raises on an empty list; return an empty table instead.
        return pd.DataFrame(
            index=pd.Index([], name="sample_id"),
            columns=pd.Index(hg38_sites["ID"], name="ID"),
        )
    return pd.concat(str_rows)

In [None]:
# STR loci available for comparison (14-locus panel).
locations_14 = [
    'CSF1PO', 'D13S317', 'D16S539', 'D18S51', 'D21S11', 'D3S1358', 'D5S818',
    'D7S820', 'D8S1179', 'FGA', 'PentaD', 'PentaE', 'TH01', 'TPOX',
]
# Same panel with D21S11 left out (13 loci).
locations_13 = [locus for locus in locations_14 if locus != 'D21S11']

In [None]:
def computeTanabe(df1, idx1, df2, idx2, colnames=locations_13):
    """Compute the Tanabe similarity between two STR profiles.

    For every locus where both profiles have a call, the score accumulates
    the number of shared alleles and the number of alleles in each profile:
    score = 2 * shared / (alleles in profile 1 + alleles in profile 2).

    Args:
        df1: DataFrame holding the first profile; cells are ", "-separated
            allele strings, with "NA" meaning no call.
        idx1: Row label of the first profile in df1.
        df2: DataFrame holding the second profile (same cell format).
        idx2: Row label of the second profile in df2.
        colnames: Loci (column names) to compare.

    Returns:
        The similarity as a float, or np.nan when no locus is called in both
        profiles (the score is undefined in that case).
    """
    no_call = {'NA'}
    match = 0
    total = 0
    for col in colnames:
        # TODO: how to best handle NAs?
        a1 = set(df1.loc[idx1, col].split(", "))
        a2 = set(df2.loc[idx2, col].split(", "))
        # `and`, not `&`: `&` binds tighter than `!=`, which turned the check
        # into a chained comparison that never excluded an uncalled a1.
        if a1 != no_call and a2 != no_call:
            match += len(a1 & a2)
            total += len(a1) + len(a2)
    if total == 0:
        # No comparable locus: avoid dividing by zero.
        return np.nan
    return 2 * match / total

In [None]:
def makeScoreMatrixDatabase(df_seqid, df_achid):
    """Score every profile in df_seqid against every profile in df_achid.

    Args:
        df_seqid: STR profiles whose index labels are looked up in the
            tracker's sequencing table to obtain a ModelID.
        df_achid: Reference STR profiles; NaN index labels are ignored.

    Returns:
        DataFrame indexed like df_seqid with one column per reference ID
        holding the computeTanabe score, plus a "ModelID" column taken from
        the sequencing table.
    """
    tracker = track.SampleTracker()
    seq_to_model = tracker.add_model_cols_to_seqtable(cols=["ModelID"])
    reference_ids = list(set(df_achid.index) - {np.nan})
    score_matrix = pd.DataFrame(index=df_seqid.index, columns=reference_ids + ['ModelID'])
    for seq_id in df_seqid.index:
        score_matrix.loc[seq_id, "ModelID"] = seq_to_model.loc[seq_id, "ModelID"]
        for reference_id in reference_ids:
            score_matrix.loc[seq_id, reference_id] = computeTanabe(
                df_seqid, seq_id, df_achid, reference_id
            )
    return score_matrix

In [None]:
# hipSTR VCFs of the two samples used to try out the pipeline.
fn1 = (
    "gs://fc-secure-9dffc819-20a8-49ea-8fa8-1b1bab1475d0"
    "/submissions/fb75cad6-1ee7-41b9-b25d-d725a87067a2"
    "/hipstr/6d988cc2-8510-40f5-b77e-cbbe13b29ec4"
    "/call-run_hipstr/CDS-0b4jFH.vcf.gz"
)
fn2 = (
    "gs://fc-secure-9dffc819-20a8-49ea-8fa8-1b1bab1475d0"
    "/submissions/fb75cad6-1ee7-41b9-b25d-d725a87067a2"
    "/hipstr/b364a608-44d1-4ac0-8abe-86bf7651d7e5"
    "/call-run_hipstr/CDS-00Nrci.vcf.gz"
)

In [None]:
import dalmatian as dm

# Open the workspace through dalmatian and pull its sample table.
# NOTE(review): get_samples() presumably returns a pandas DataFrame (the
# result is sliced with .iloc in a later cell) — confirm.
ws = dm.WorkspaceManager("broad-firecloud-ccle/DEV_DepMap_WGS_CN")
samples = ws.get_samples()

In [None]:
# Keep only the first 50 samples.
samples = samples.head(50)

In [None]:
# Display the (truncated) sample table.
samples

In [None]:
# One row per sample ID; the "str" column holds that sample's hipSTR VCF path.
path_mapping = pd.DataFrame(
    {"str": [fn1, fn2]},
    index=["CDS-0b4jFH", "CDS-00Nrci"],
)

In [None]:
# Build the STR profile table (one row per sample in path_mapping).
wgs_str = generateSTRRow(path_mapping)

In [None]:
# Display the STR profiles derived from the hipSTR VCFs.
wgs_str

In [None]:
from gsheets import Sheets

# Location of the reference STR spreadsheet and of the local credential files
# handed to gsheets.
broad_internal_str_url = "https://docs.google.com/spreadsheets/d/134zxrQ77yMdDL4hLYybJJQN6pxLZIVLz-hORXIpid50/edit#gid=1107673601"
MY_ID = "~/.client_secret.json"
MYSTORAGE_ID = "~/.storage.json"
# Locus columns as they are named in the spreadsheet.
str_names = [
    'D3S1358', 'TH01', 'D21S11', 'D18S51', 'Penta E', 'D5S818', 'D13S317',
    'D7S820', 'D16S539', 'CSF1PO', 'Penta D', 'D8S1179', 'TPOX', 'FGA',
]

sheets = Sheets.from_files(MY_ID, MYSTORAGE_ID)
# First worksheet, first column as index, restricted to the loci plus "Source".
broad_str = sheets.get(broad_internal_str_url).sheets[0].to_frame(index_col=0)
broad_str = broad_str[str_names + ['Source']]
# Drop the space so the names match the columns of the WGS-derived table.
broad_str = broad_str.rename(columns={"Penta E": "PentaE", "Penta D": "PentaD"})
# Missing calls are encoded as the string "NA".
broad_str = broad_str.fillna("NA")
# Discard rows whose index label is NaN.
broad_str = broad_str[~broad_str.index.isin(set([np.nan]))]

In [None]:
# Drop duplicate rows: when an ID has rows from several sources and one of
# them is "Achilles", keep only the Achilles row(s).
# NOTE(review): IDs duplicated without an Achilles row, or with several
# Achilles rows, stay duplicated after this cell — confirm that is intended,
# since scalar .loc lookups on such IDs return a Series instead of a string.
idx = broad_str.index
# Index.duplicated(keep=False) flags every occurrence of a repeated label in
# one pass, replacing the quadratic list.count scan.
dups = set(idx[idx.duplicated(keep=False)])
broad_str = broad_str.reset_index()
todrop = []
for d in dups:
    dup_rows = broad_str[broad_str['Arxspan ID'] == d]
    sources = set(dup_rows.Source)
    if len(sources) > 1 and "Achilles" in sources:
        todrop.extend(dup_rows[dup_rows['Source'] != "Achilles"].index.tolist())
broad_str = broad_str.drop(todrop)
broad_str = broad_str.drop(columns=["Source"])
broad_str = broad_str.set_index('Arxspan ID')

In [None]:
# Display the reference STR table after duplicate handling.
broad_str

In [None]:
# Score every WGS-derived profile against every reference profile in broad_str.
score_mat = makeScoreMatrixDatabase(wgs_str, broad_str)

In [None]:
# WGS-derived calls of one sample at the 13 comparison loci.
wgs_str.loc[["CDS-00Nrci"], locations_13]

In [None]:
# Reference calls of one model at the 13 comparison loci.
broad_str.loc[["ACH-000839"], locations_13]

In [None]:
# Display the score matrix (rows: sequenced samples, columns: reference IDs plus ModelID).
score_mat

In [None]:
# Open the sample tracker and load the sequencing table with ModelID and
# PatientID columns added.
# NOTE(review): seq_table is later looked up with score_mat's index labels, so
# it is presumably indexed by sequencing ID — confirm.
mytracker = track.SampleTracker()
seq_table = mytracker.add_model_cols_to_seqtable(cols=["ModelID", "PatientID"])

In [None]:
# Load the model table; a later cell filters it on PatientID and treats its
# index labels as model IDs.
model_table = mytracker.read_model_table()

In [None]:
# Split the score matrix into:
#   scores          - each sample against its own model's reference profile
#   mismatch_scores - each sample against every other reference profile,
#                     leaving out models that share the sample's patient
scores = []
mismatch_scores = []
reference_ids = set(score_mat.columns)
for seq_id in score_mat.index:
    own_model = score_mat.loc[seq_id, "ModelID"]
    # Samples whose model has no reference profile cannot be evaluated.
    if own_model not in reference_ids:
        continue
    patient_id = seq_table.loc[seq_id, "PatientID"]
    same_patient = set(model_table[model_table.PatientID == patient_id].index)
    scores.append(score_mat.loc[seq_id, own_model])
    mismatches = reference_ids - set(["ModelID", own_model]) - same_patient
    mismatch_scores.extend(score_mat.loc[seq_id, list(mismatches)].tolist())

In [None]:
# Score of the sample(s) mapped to ACH-002835 against that model's own reference profile.
score_mat.loc[score_mat.ModelID == "ACH-002835", "ACH-002835"]

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Distribution of the scores between each sample and its own model's
# reference profile.
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(scores, bins=20)
# Label the figure so it can be read without the surrounding code.
ax.set(
    title="Sample vs. own-model reference profile",
    xlabel="Tanabe similarity",
    ylabel="Number of samples",
)

# Show plot
plt.show()

In [None]:
# Distribution of the scores between each sample and the reference profiles
# of other models (same-patient models excluded).
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(mismatch_scores, bins=20)
# Label the figure so it can be read without the surrounding code.
ax.set(
    title="Sample vs. other models' reference profiles",
    xlabel="Tanabe similarity",
    ylabel="Number of sample-model pairs",
)

# Show plot
plt.show()

In [None]:
# Sort ascending in place so the highest mismatch scores end up at the tail.
mismatch_scores.sort()

In [None]:
# Last 20 entries of mismatch_scores (the highest ones once the list has been sorted ascending).
mismatch_scores[-20:]