In [None]:
# read_vcf below is adapted from https://gist.github.com/dceoy/99d976a2c01e7f0ba1c813778f9db744

import io
import os
import pandas as pd
import gzip
import numpy as np
from google.cloud import storage
from depmap_omics_upload import tracker as track


def read_vcf(path):
    """Download a gzipped VCF from Google Cloud Storage and parse it into a DataFrame.

    Parameters
    ----------
    path : str
        Object URI of the form ``gs://<bucket>/<object name>``.

    Returns
    -------
    pandas.DataFrame
        One row per non-header line of the file, with the columns CHROM, POS,
        ID, REF, ALT, QUAL, FILTER, INFO, FORMAT and GT (the tenth
        tab-separated column). Lines starting with ``#`` are dropped.

    Raises
    ------
    ValueError
        If ``path`` is not a ``gs://<bucket>/<object>`` URI.
    """
    scheme = "gs://"
    # Validate before touching GCS so a local path or other URL fails with a
    # clear message instead of an IndexError or a lookup in the wrong bucket.
    if not isinstance(path, str) or not path.startswith(scheme):
        raise ValueError(f"expected a gs://<bucket>/<object> URI, got {path!r}")
    bucket_name, _, blob_name = path[len(scheme):].partition("/")
    if not bucket_name or not blob_name:
        raise ValueError(f"expected a gs://<bucket>/<object> URI, got {path!r}")

    storage_client = storage.Client()
    blob = storage_client.bucket(bucket_name).blob(blob_name)
    # download_as_string() is a deprecated alias of download_as_bytes().
    data = io.BytesIO(blob.download_as_bytes())
    with gzip.open(data, 'r') as f:
        # Skip every header line ("##..." metadata and the "#CHROM" row);
        # column names are supplied explicitly below.
        lines = [line.decode("utf-8") for line in f if not line.startswith(b'#')]
    return pd.read_csv(
        io.StringIO(''.join(lines)),
        names=['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'GT'],
        sep='\t'
    )

In [None]:
def transformGB(row):
    """Parse the second colon-separated sub-field of a sample field into ints.

    Parameters
    ----------
    row : str or missing value
        A per-sample VCF field such as ``"0|1:-2|4:..."``. Presumably the
        second sub-field is HipSTR's ``GB`` (confirm against the FORMAT
        column of the input VCFs).

    Returns
    -------
    list of int or float
        The ``|``-separated integers of the second sub-field, or ``np.nan``
        when the field is missing: a non-string value (None/NaN) or a string
        shorter than two characters such as ``"."``.
    """
    # Non-string values (None, NaN from a short or empty record) have no
    # len(); treat them as missing rather than raising TypeError.
    if not isinstance(row, str) or len(row) < 2:
        return np.nan
    split_gt = row.split(':')[1].split('|')
    return [int(i) for i in split_gt]

In [None]:
def gb2str(row):
    """Format a row's alleles as ``"<whole repeats>[.<extra bases>]"`` strings.

    Parameters
    ----------
    row : mapping
        Must provide ``"GB"`` (list of integer offsets added to the reference
        length, or a non-list value when there is no call), ``"REF"``
        (reference copy number, may be fractional) and ``"PERIOD"`` (repeat
        unit length).

    Returns
    -------
    str
        ``"NA"`` when ``row["GB"]`` is not a list; otherwise the alleles
        joined by ``", "``. An allele with no leftover bases is written as
        the repeat count alone (``"14"``), otherwise as ``"12.2"`` (12 full
        repeats plus 2 bases).
    """
    if not isinstance(row["GB"], list):
        return "NA"

    period = int(row["PERIOD"])
    ref_length = row["REF"] * period
    strs = []
    for offset in row["GB"]:
        # REF may be a rounded fraction (e.g. 77.8333 * 6 = 466.9998), so the
        # float total is rounded to the nearest whole base before splitting.
        # Truncating the float remainder would turn 4.9998 into 4 ("77.4"
        # instead of "77.5") or emit a spurious ".0" suffix.
        total_length = int(round(ref_length + offset))
        integer, remainder = divmod(total_length, period)
        if remainder == 0:
            strs.append(str(int(integer)))
        else:
            strs.append(str(int(integer)) + "." + str(int(remainder)))
    return ', '.join(strs)

In [None]:
def generateSTRRow(paths_df, hg38_sites_path="/home/xiaomeng/bin/depmap_omics/data/str_hg38.bed"):
    """Build one row of STR allele strings per sample listed in ``paths_df``.

    Parameters
    ----------
    paths_df : pandas.DataFrame
        Must have a ``"str"`` column holding the ``gs://`` path of each
        sample's VCF. Its index labels are used to look up ``"ModelID"`` in
        the table returned by the sample tracker.
    hg38_sites_path : str, optional
        Tab-separated, header-less site file with the columns CHROM, START,
        END, PERIOD, REF, ID. Defaults to the location the notebook was
        originally written against; pass another path to run elsewhere.

    Returns
    -------
    pandas.DataFrame
        The per-sample frames concatenated: one row per entry of
        ``paths_df`` (indexed by ``sample_id``, i.e. the looked-up ModelID)
        and one column per site ID, holding the string produced by
        ``gb2str`` (``"NA"`` for sites absent from the sample's VCF).
    """
    hg38_sites = pd.read_csv(
        hg38_sites_path,
        sep="\t",
        names=["CHROM", "START", "END", "PERIOD", "REF", "ID"],
    ).astype({'PERIOD': 'int32'})

    mytracker = track.SampleTracker()
    try:
        seq_table = mytracker.add_model_cols_to_seqtable(cols=["ModelID"])
    finally:
        # Release the client even when the lookup above raises.
        mytracker.close_gumbo_client()

    str_rows = []
    for seq_id, sample_paths in paths_df.iterrows():
        calls = read_vcf(sample_paths["str"])
        # Only the sample column is needed, so map over it directly instead
        # of a row-wise apply over the whole frame.
        calls["GB"] = calls["GT"].map(transformGB)
        # Left merge keeps every reference site; sites with no call get a
        # NaN GB, which gb2str renders as "NA".
        profile = hg38_sites.merge(calls[["ID", "GB"]], on='ID', how='left')
        profile["STR"] = profile.apply(gb2str, axis=1)
        profile["sample_id"] = seq_table.loc[seq_id, "ModelID"]
        str_rows.append(profile.pivot(index='sample_id', columns='ID', values='STR'))
    return pd.concat(str_rows)

In [None]:
# gs:// locations of two HipSTR output VCFs, one per sample.
fn1 = (
    "gs://fc-secure-9dffc819-20a8-49ea-8fa8-1b1bab1475d0"
    "/submissions/fb75cad6-1ee7-41b9-b25d-d725a87067a2"
    "/hipstr/6d988cc2-8510-40f5-b77e-cbbe13b29ec4"
    "/call-run_hipstr/CDS-0b4jFH.vcf.gz"
)
fn2 = (
    "gs://fc-secure-9dffc819-20a8-49ea-8fa8-1b1bab1475d0"
    "/submissions/fb75cad6-1ee7-41b9-b25d-d725a87067a2"
    "/hipstr/b364a608-44d1-4ac0-8abe-86bf7651d7e5"
    "/call-run_hipstr/CDS-00Nrci.vcf.gz"
)

In [None]:
# One row per sample: the index labels are the CDS identifiers from the file
# names, and the "str" column holds the matching VCF location.
path_mapping = pd.DataFrame(
    {"str": [fn1, fn2]},
    index=["CDS-0b4jFH", "CDS-00Nrci"],
)

In [None]:
# Build and display the STR table for the samples listed in path_mapping.
generateSTRRow(paths_df=path_mapping)