In [None]:
import pandas as pd
import os
import re
import dalmatian as dm
from mgenepy.utils import helper as h
from depmap_omics_upload import tracker

In [None]:
# Open the RNA-seq and WES workspaces through dalmatian and read the "samples"
# entry of the "23Q2" sample set from each of them.
# NOTE(review): the later `rna_23q2 + wes_23q2` assumes both entries are lists
# of sample IDs — confirm against the workspace data model.
rna_wm = dm.WorkspaceManager("broad-firecloud-ccle/DepMap_hg38_RNAseq")
rna_23q2 = rna_wm.get_sample_sets().loc["23Q2", "samples"]
wes_wm = dm.WorkspaceManager("broad-firecloud-ccle/DepMap_WES_CN_hg38")
wes_23q2 = wes_wm.get_sample_sets().loc["23Q2", "samples"]

In [None]:
# Sequencing table from the sample tracker with the "ModelID" and "Sex" columns
# requested from the tracker.
# NOTE(review): presumably indexed by sequencing (CDS-) ID, since the rest of the
# notebook looks rows up by the workspace sample IDs — confirm.
mytracker = tracker.SampleTracker()
seq_table = mytracker.add_model_cols_to_seqtable(["ModelID", "Sex"])

In [None]:
# Rows whose source is "CCLF" and whose index appears in either 23Q2 sample set.
# NOTE(review): `cclf_seq_table` is reassigned from `seq_table.loc[wes_23q2]` in a
# later cell before it is used, which discards this source filter.
cclf_seq_table = seq_table[(seq_table.source == "CCLF") & (seq_table.index.isin(set(rna_23q2 + wes_23q2)))]

# Inference from X chrom SNPs

In [None]:
import io
import gzip
import numpy as np
from google.cloud import storage

def read_vcf(path):
    """Download a VCF from Google Cloud Storage and load its records into a DataFrame.

    Args:
        path (str): object URL of the form ``<scheme>://<bucket>/<object>``
            (e.g. ``gs://my-bucket/dir/sample.vcf.gz``). Objects whose path ends
            in ``.gz`` are gunzipped before parsing.

    Returns:
        pd.DataFrame: one row per VCF record with the columns CHROM, POS, ID,
        REF, ALT, QUAL, FILTER, INFO, FORMAT and SAMPLE. Header lines (starting
        with ``#``) are dropped. A file without any record yields an empty frame
        with those columns.

    Raises:
        ValueError: if no bucket or object name can be extracted from ``path``.
    """
    columns = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE']

    _, _, remainder = path.partition("://")
    bucket_name, _, blob_name = remainder.partition("/")
    if not bucket_name or not blob_name:
        raise ValueError("expected a path like gs://bucket/object, got: " + str(path))

    blob = storage.Client().bucket(bucket_name).blob(blob_name)
    # download_as_bytes() replaces the deprecated download_as_string() alias
    raw = blob.download_as_bytes()
    if path.endswith(".gz"):
        raw = gzip.decompress(raw)

    # Every header line, including the #CHROM column line, starts with '#'.
    records = [
        line
        for line in raw.decode("utf-8").split("\n")
        if line and not line.startswith("#")
    ]
    if not records:
        return pd.DataFrame(columns=columns)

    return pd.read_csv(
        io.StringIO("\n".join(records)),
        names=columns,
        sep='\t',
        # contig names such as "1" or "X" must stay strings so that later
        # comparisons against "X"/"chrX" see a single dtype
        dtype={"CHROM": str},
    )

In [None]:
def extractInfo(sample, form, field):
    """Look up one entry of a colon-delimited VCF sample string.

    `form` is the record's FORMAT string (e.g. "GT:AF:DP") and `sample` the
    matching per-sample string; the position of `field` inside `form` selects
    the entry of `sample` that is returned (as a string).

    Raises ValueError when `field` does not occur in `form`.
    """
    position = form.split(":").index(field)
    return sample.split(":")[position]

In [None]:
def extractAndFilter(vcf_df, afcutoff = 0.2, mincoverage = 10, germline_only=True):
    """Reduce a single-sample VCF table to the variants used for sex inference.

    Args:
        vcf_df (pd.DataFrame): table with the columns produced by `read_vcf`
            (CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, SAMPLE).
        afcutoff (float): minimum allele fraction (exclusive). Only applied
            when `germline_only` is False.
        mincoverage (float): minimum read depth (exclusive), taken from the DP
            entry of the sample column.
        germline_only (bool): if True, keep only records whose FILTER contains
            "germline" and drop the FILTER column from the result.

    Returns:
        pd.DataFrame: the surviving rows without ID/QUAL/INFO, plus the string
        columns AF and DP parsed from the sample column. Multiallelic records
        and records on chrY/Y/chrM/M are always removed.
    """
    vcf_df = vcf_df.drop(columns=["ID", "QUAL", "INFO"])
    # ignore multiallelic mutations
    vcf_df = vcf_df[~vcf_df.ALT.str.contains(",", na=False)]
    if germline_only:
        # Keep the records flagged "germline". The mask used to be negated,
        # which removed exactly the variants this option asks for.
        vcf_df = vcf_df[vcf_df.FILTER.str.contains("germline", na=False)]
        vcf_df = vcf_df.drop(columns=["FILTER"])
    # work on an independent frame so adding columns never targets a slice
    vcf_df = vcf_df.copy()
    # plain comprehensions also behave on an empty table, unlike apply(axis=1)
    vcf_df["AF"] = [extractInfo(s, f, "AF") for s, f in zip(vcf_df["SAMPLE"], vcf_df["FORMAT"])]
    vcf_df["DP"] = [extractInfo(s, f, "DP") for s, f in zip(vcf_df["SAMPLE"], vcf_df["FORMAT"])]
    keep = (vcf_df.DP.astype(float) > mincoverage) & (~vcf_df.CHROM.isin(["chrY", "Y", "chrM", "M"]))
    if not germline_only:
        keep &= vcf_df.AF.astype(float) > afcutoff
    return vcf_df[keep]

In [None]:
from scipy.stats import fisher_exact
import numpy as np
from scipy.stats.contingency import odds_ratio

def predictSex(vcf_df_filtered, min_or = 4, min_or_na = 2.5, max_pv = 0.001, homafcutoff = 0.95):
    """Call sex from the excess of homozygous variants on chrX.

    Variants are split into homozygous (AF > `homafcutoff`) and heterozygous
    (AF <= `homafcutoff`) and into chrX/X versus every other contig. The
    resulting 2x2 table [[hom X, het X], [hom other, het other]] is tested with
    Fisher's exact test and summarised by scipy's conditional odds ratio.

    Args:
        vcf_df_filtered (pd.DataFrame): table with CHROM and AF columns, as
            returned by `extractAndFilter`.
        min_or (float): minimum odds ratio for a male call.
        min_or_na (float): odds ratio from which a sample is no longer called
            female.
        max_pv (float): maximum Fisher p-value for a male call.
        homafcutoff (float): allele fraction above which a variant counts as
            homozygous.

    Returns:
        tuple: (predicted_sex, odds) where predicted_sex is "M", "F" or "NA"
        (inconclusive) and odds is the odds ratio. When the odds ratio is NaN
        (a row or column of the table is all zero, e.g. no usable variants) the
        call is "NA".
    """
    allele_fraction = vcf_df_filtered.AF.astype(float)
    on_x = vcf_df_filtered.CHROM.isin(["chrX", "X"])
    is_hom = allele_fraction > homafcutoff
    # explicit <= (rather than ~is_hom) so rows with a missing AF count nowhere
    is_het = allele_fraction <= homafcutoff

    homx = int((is_hom & on_x).sum())
    hetx = int((is_het & on_x).sum())
    homauto = int((is_hom & ~on_x).sum())
    hetauto = int((is_het & ~on_x).sum())

    table = np.array([[homx, hetx], [homauto, hetauto]])
    pval = fisher_exact(table)[1]
    odds = odds_ratio(table).statistic
    print("odds ratio: " + str(odds))

    if np.isnan(odds):
        # no evidence either way; every comparison below would be False and
        # the sample would otherwise silently be reported as female
        predicted_sex = "NA"
    elif odds >= min_or and pval <= max_pv:
        predicted_sex = "M"
    elif odds >= min_or_na or odds >= min_or:
        # elevated odds ratio that is either below min_or or not significant
        predicted_sex = "NA"
    else:
        predicted_sex = "F"
    return predicted_sex, odds

In [None]:
# Try the pipeline on a single sample. The VCF path is looked up in the
# workspace sample table directly because `wes_cclf_samples` is only created in
# a later cell, so referencing it here fails on a fresh kernel.
vcf_df = read_vcf(wes_wm.get_samples().loc["CDS-1aJRXx", "mutect2_fixed_vcf"])
vcf_df_filtered = extractAndFilter(vcf_df)

In [None]:
predictSex(vcf_df_filtered)

In [None]:
vcf_df

In [None]:
vcf_df_filtered

In [None]:
# Full WES sample table of the workspace, and the subset of rows belonging to
# the 23Q2 sample set.
# NOTE(review): despite its name, `wes_cclf_samples` is selected by sample set
# only; no filter on the sample source is applied here.
wes_samples = wes_wm.get_samples()
wes_cclf_samples = wes_samples.loc[wes_23q2]

In [None]:
# Sequencing-table rows of the 23Q2 WES samples. This replaces the earlier
# `cclf_seq_table` that was filtered on source == "CCLF".
# NOTE(review): `.loc` with a list raises KeyError if any 23Q2 sample ID is
# absent from `seq_table` — confirm every ID is tracked.
cclf_seq_table = seq_table.loc[wes_23q2]

In [None]:
# Infer sex from the Mutect2 VCF of every 23Q2 WES sample and record the call
# and its odds ratio next to the reported sex.
print("CCLF:")
for i in wes_cclf_samples.index.tolist():
    print(i)
    # str() so a missing (NaN) reported sex does not abort the whole loop
    print("reported sex: " + str(seq_table.loc[i, "Sex"]))
    vcfpath = wes_cclf_samples.loc[i, "mutect2_fixed_vcf"]
    if not isinstance(vcfpath, str):
        # no VCF recorded for this sample; leave its inferred columns empty
        print("no mutect2_fixed_vcf, skipping")
        continue
    vcf_df = read_vcf(vcfpath)
    vcf_df_filtered = extractAndFilter(vcf_df)
    sex, odds = predictSex(vcf_df_filtered)
    print("inferred sex: " + sex)
    cclf_seq_table.loc[i, "snp_inferred_sex"] = sex
    cclf_seq_table.loc[i, "odds_ratio"] = odds
    

In [None]:
# Spell out the one-letter calls so they can be compared with the "Sex" column;
# values other than "F" and "M" (e.g. "NA") are left as they are.
cclf_seq_table["snp_inferred_sex"] = cclf_seq_table["snp_inferred_sex"].replace({"F": "Female", "M": "Male"})

In [None]:
# Drop rows without a version. The explicit copy makes the result an
# independent frame, so the row-wise `.loc` writes further down do not target a
# filtered slice (SettingWithCopyWarning).
cclf_seq_table = cclf_seq_table[cclf_seq_table.version.notna()].copy()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Overlaid odds-ratio histograms of the WES rows, one per reported sex.
cclf_wes_rows = cclf_seq_table[cclf_seq_table.expected_type == "wes"]
for reported_sex, legend_label in (("Male", "reported male"), ("Female", "reported female")):
    plt.hist(
        cclf_wes_rows.loc[cclf_wes_rows.Sex == reported_sex, "odds_ratio"],
        alpha=0.5,
        label=legend_label,
        bins=40,
    )

plt.xlim(0, 10)
plt.legend(loc='upper right')
plt.title('odds ratio - CCLF')
plt.show()

In [None]:
# Agreement between reported sex and the SNP-based call, counted via boolean masks.
reported = cclf_seq_table.Sex
inferred = cclf_seq_table.snp_inferred_sex
print("CCLF")
print("total: ", (reported != "Unknown").sum())
print("total male: ", (reported == "Male").sum())
print("correct male: ", ((reported == "Male") & (inferred == "Male")).sum())
print("incorrect male: ", ((reported == "Male") & (inferred == "Female")).sum())
print("inconclusive male: ", ((reported == "Male") & (inferred == "NA")).sum())
print("total female: ", (reported == "Female").sum())
print("correct female: ", ((reported == "Female") & (inferred == "Female")).sum())
print("incorrect female: ", ((reported == "Female") & (inferred == "Male")).sum())
print("inconclusive female: ", ((reported == "Female") & (inferred == "NA")).sum())


In [None]:
# `ccle_seq_table` is otherwise only created in the coverage-based section at
# the end of the notebook; build it here so this cell runs on a fresh kernel.
ccle_seq_table = seq_table[(seq_table.source == "CCLE2") & (seq_table.expected_type.isin(["wes"]))].copy()

# Infer sex from the Mutect2 VCF of every CCLE WES sample present in the workspace.
print("CCLE:")
for i in ccle_seq_table.index.tolist():
    print(i)
    if i in wes_samples.index:
        # str() so a missing (NaN) reported sex does not abort the whole loop
        print("reported sex: " + str(seq_table.loc[i, "Sex"]))
        vcfpath = wes_samples.loc[i, "mutect2_fixed_vcf"]
        if not isinstance(vcfpath, str):
            # no VCF recorded for this sample; leave its inferred columns empty
            print("no mutect2_fixed_vcf, skipping")
            continue
        vcf_df = read_vcf(vcfpath)
        vcf_df_filtered = extractAndFilter(vcf_df)
        sex, odds = predictSex(vcf_df_filtered)
        print("inferred sex: " + sex)
        ccle_seq_table.loc[i, "snp_inferred_sex"] = sex
        ccle_seq_table.loc[i, "odds_ratio"] = odds

In [None]:
ccle_seq_table

In [None]:
# Reuse the odds ratios already stored in `cclf_seq_table` for the samples that
# appear in both tables. A single index intersection replaces the per-row loop
# that rebuilt the set of CCLF IDs on every iteration.
shared_ids = ccle_seq_table.index.intersection(cclf_seq_table.index)
if len(shared_ids) > 0:
    ccle_seq_table.loc[shared_ids, "odds_ratio"] = cclf_seq_table.loc[shared_ids, "odds_ratio"]

In [None]:
# Spell out the one-letter calls so they can be compared with the "Sex" column;
# values other than "F" and "M" (e.g. "NA") are left as they are.
ccle_seq_table["snp_inferred_sex"] = ccle_seq_table["snp_inferred_sex"].replace({"F": "Female", "M": "Male"})

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Overlaid odds-ratio histograms of the WES rows, one per reported sex.
ccle_wes_rows = ccle_seq_table[ccle_seq_table.expected_type == "wes"]
for reported_sex, legend_label in (("Male", "reported male"), ("Female", "reported female")):
    plt.hist(
        ccle_wes_rows.loc[ccle_wes_rows.Sex == reported_sex, "odds_ratio"],
        alpha=0.5,
        label=legend_label,
        bins=30,
    )

plt.xlim(0, 10)
plt.legend(loc='upper right')
plt.title('odds ratio - CCLE')
plt.show()

In [None]:
# Agreement between reported sex and the SNP-based call, counted via boolean masks.
reported = ccle_seq_table.Sex
inferred = ccle_seq_table.snp_inferred_sex
print("CCLE")
print("total: ", (reported != "Unknown").sum())
print("total male: ", (reported == "Male").sum())
print("correct male: ", ((reported == "Male") & (inferred == "Male")).sum())
print("incorrect male: ", ((reported == "Male") & (inferred == "Female")).sum())
print("inconclusive male: ", ((reported == "Male") & (inferred == "NA")).sum())
print("total female: ", (reported == "Female").sum())
print("correct female: ", ((reported == "Female") & (inferred == "Female")).sum())
print("incorrect female: ", ((reported == "Female") & (inferred == "Male")).sum())
print("inconclusive female: ", ((reported == "Female") & (inferred == "NA")).sum())


In [None]:
path = "gs://fc-secure-d2a2d895-a7af-4117-bdc7-652d7d268324/6001c090-b09a-4785-8b8a-33aa9c3a7ec6/omics_post_mutect2/6b5fc00b-2b07-4b26-967e-1b4386a7094f/call-fixm2/cacheCopy/CDS-00rz9N_fixed.vcf.gz"

In [None]:
vcf_df = read_vcf(path)

In [None]:
# `extractADfromX` is not defined anywhere in this notebook; `extractAndFilter`
# is the step that produces the AF/CHROM table predictSex expects.
vcf_df_filtered = extractAndFilter(vcf_df)

In [None]:
predictSex(vcf_df_filtered)

In [None]:
vcf_df_filtered

# Inference from log2(X coverage/Y coverage)

In [None]:
# For every sample, read per-contig mapped-read counts with `samtools idxstats`
# and call sex from the ratio of reads mapped to X versus Y.
le = len(cclf_seq_table.index)
j = 0
for i in cclf_seq_table.index:
    j += 1
    h.showcount(j, le)
    bam = cclf_seq_table.loc[i, "hg19_bam_filepath"]
    # the context manager closes the pipe so a long table does not leak handles
    with os.popen("export GCS_OAUTH_TOKEN=`gcloud auth application-default print-access-token` && samtools idxstats " + bam) as data:
        res = data.read()
    # skip blank lines: the trailing newline (or empty output after a samtools
    # failure) would otherwise yield a malformed one-column row
    rows = [y.split('\t') for y in res.split('\n') if y]
    df = pd.DataFrame(rows, columns=["chrom", "length", "mapped", "unmapped"]).set_index('chrom')

    # Without usable counts the sample is left inconclusive instead of being
    # reported as "Male" with a ratio of 0.
    ratio = float("nan")
    inferred = "NA"
    for x_name, y_name in (("chrX", "chrY"), ("X", "Y")):
        if x_name in df.index and y_name in df.index:
            x_reads = int(df.loc[x_name, "mapped"])
            y_reads = int(df.loc[y_name, "mapped"])
            if y_reads > 0:
                ratio = x_reads / y_reads
            elif x_reads > 0:
                # no read on Y at all: avoid ZeroDivisionError
                ratio = float("inf")
            if ratio == ratio:  # not NaN
                inferred = "Female" if ratio > 4 else "Male"
            break
    else:
        print("wrong contig name")
    cclf_seq_table.loc[i, "X_to_Y_ratio"] = ratio
    cclf_seq_table.loc[i, "inferred_gender"] = inferred

In [None]:
print("done")

In [None]:
cclf_seq_table.Sex.value_counts()

In [None]:
cclf_seq_table = cclf_seq_table[cclf_seq_table.Sex != "Unknown"]

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# log2(X/Y mapped-read ratio) of the samples annotated as male. The label
# 'petal_length' was a leftover from unrelated example code.
plt.hist(np.log2(cclf_seq_table[cclf_seq_table.Sex == "Male"].X_to_Y_ratio), bins=50, label='annotated male')
plt.legend(loc='upper right')
plt.title('log2(X/Y) - CCLF, annotated male')
plt.show()

In [None]:
# Overlaid log2(X/Y) histograms of the WES rows, one per annotated sex.
cclf_wes_rows = cclf_seq_table[cclf_seq_table.expected_type == "wes"]
for annotated_sex, legend_label in (("Male", "annotated male"), ("Female", "annotated female")):
    plt.hist(
        cclf_wes_rows.loc[cclf_wes_rows.Sex == annotated_sex, "X_to_Y_ratio"].apply(np.log2),
        alpha=0.5,
        label=legend_label,
        bins=50,
    )

plt.legend(loc='upper right')
plt.title('log2(X/Y) - CCLF')
plt.show()

In [None]:
# Samples whose annotated Sex differs from the coverage-based call. The
# comparison is element-wise, so a row with a missing value on either side is
# also reported as a mismatch.
mismatch = cclf_seq_table[cclf_seq_table.Sex != cclf_seq_table.inferred_gender]

In [None]:
cclf_seq_table.inferred_gender.value_counts()

In [None]:
mismatch[mismatch.Sex == "Female"]

In [None]:
mismatch[mismatch.Sex == "Male"]

In [None]:
# idxstats for a single BAM in a requester-pays bucket. The project id is
# assigned as a plain value: wrapped in backticks the shell tried to execute
# `broad-firecloud-ccle` as a command and exported an empty variable.
data = os.popen(
                "export GCS_OAUTH_TOKEN=`gcloud auth application-default print-access-token` && export GCS_REQUESTER_PAYS_PROJECT=broad-firecloud-ccle && samtools idxstats "
                + "gs://cclebams/rna/CDS-0ERvMG.bam"
            )
res = data.read()
data.close()
# skip blank lines so the trailing newline does not become an empty row
df = pd.DataFrame([y.split('\t') for y in res.split('\n') if y], columns=["chrom", "length", "mapped", "unmapped"]).set_index('chrom')


In [None]:
df.index

In [None]:
pd.DataFrame([y.split('\t') for y in res.split('\n')], columns=["chrom", "length", "mapped", "unmapped"]).set_index('chrom')

In [None]:
# CCLE2 WES rows of the sequencing table. The explicit copy makes this an
# independent frame, so the row-wise `.loc` writes of the coverage loop do not
# target a filtered slice of `seq_table` (SettingWithCopyWarning).
ccle_seq_table = seq_table[(seq_table.source == "CCLE2") & (seq_table.expected_type.isin(["wes"]))].copy()

In [None]:
# For every sample, read per-contig mapped-read counts with `samtools idxstats`
# and call sex from the ratio of reads mapped to X versus Y.
le = len(ccle_seq_table.index)
j = 0
for i in ccle_seq_table.index:
    j += 1
    h.showcount(j, le)
    bam = ccle_seq_table.loc[i, "bam_filepath"]
    # the context manager closes the pipe so a long table does not leak handles
    with os.popen("export GCS_OAUTH_TOKEN=`gcloud auth application-default print-access-token` && samtools idxstats " + bam) as data:
        res = data.read()
    # skip blank lines: the trailing newline (or empty output after a samtools
    # failure) would otherwise yield a malformed one-column row
    rows = [y.split('\t') for y in res.split('\n') if y]
    df = pd.DataFrame(rows, columns=["chrom", "length", "mapped", "unmapped"]).set_index('chrom')

    # Without usable counts the sample is left inconclusive instead of being
    # reported as "Male" with a ratio of 0.
    ratio = float("nan")
    inferred = "NA"
    for x_name, y_name in (("chrX", "chrY"), ("X", "Y")):
        if x_name in df.index and y_name in df.index:
            x_reads = int(df.loc[x_name, "mapped"])
            y_reads = int(df.loc[y_name, "mapped"])
            if y_reads > 0:
                ratio = x_reads / y_reads
            elif x_reads > 0:
                # no read on Y at all: avoid ZeroDivisionError
                ratio = float("inf")
            if ratio == ratio:  # not NaN
                inferred = "Female" if ratio > 4 else "Male"
            break
    else:
        print("wrong contig name")
    ccle_seq_table.loc[i, "X_to_Y_ratio"] = ratio
    ccle_seq_table.loc[i, "inferred_gender"] = inferred

In [None]:
ccle_seq_table.inferred_gender.value_counts()

In [None]:
# Overlaid log2(X/Y) histograms of the WES rows, one per annotated sex.
ccle_wes_rows = ccle_seq_table[ccle_seq_table.expected_type == "wes"]
for annotated_sex, legend_label in (("Male", "annotated male"), ("Female", "annotated female")):
    plt.hist(
        ccle_wes_rows.loc[ccle_wes_rows.Sex == annotated_sex, "X_to_Y_ratio"].apply(np.log2),
        alpha=0.5,
        label=legend_label,
        bins=50,
    )

plt.legend(loc='upper right')
plt.title('log2(X/Y) - CCLE')
plt.show()