In [None]:
import pandas as pd

In [None]:
# Load the segdup regions table, keeping only the coordinate and match-fraction columns.
segdup = pd.read_csv("../data/segdup_regions.csv").loc[
    :,
    [
        "chrom",
        "chromStart",
        "chromEnd",
        "fracMatch",
        "fracMatchIndel",
        "otherChrom",
        "otherStart",
        "otherEnd",
    ],
]

In [None]:
# Inspect the freshly loaded table (the bare expression is rendered as the cell output).
segdup

## convert to bed

In [None]:
# Retain rows with fracMatch above 0.98 whose chrom name contains no underscore.
segdup = segdup.loc[
    segdup["fracMatch"].gt(0.98) & ~segdup["chrom"].str.contains("_")
]

In [None]:
# Order rows by chromosome number, then start, then end, leaving the chrom labels
# untouched. X, Y and M are ranked as 23, 24 and 25 through a temporary key column.
# Building a new frame (instead of assigning columns into `segdup`, which is a
# filtered slice at this point) avoids chained assignment and the
# SettingWithCopyWarning it can raise.
segdup = (
    segdup.assign(
        _chrom_rank=lambda df: df["chrom"]
        .replace({"chrX": "chr23", "chrY": "chr24", "chrM": "chr25"})
        .str.replace("chr", "", regex=False)
        .astype("int")
    )
    .sort_values(by=["_chrom_rank", "chromStart", "chromEnd"])
    .drop(columns="_chrom_rank")
)

In [None]:
# Inspect the filtered, chromosome-sorted table before it is exported.
segdup

In [None]:
# Tally the retained rows by their otherChrom value.
segdup.otherChrom.value_counts()

In [None]:
# Write the three coordinate columns as a headerless, tab-separated BED file.
# NOTE(review): the file name says "withAltContigs", yet rows whose chrom contains
# "_" were filtered out earlier in this notebook — confirm the name is accurate.
segdup.loc[:, ["chrom", "chromStart", "chromEnd"]].to_csv(
    "../data/segDup_majorAllele_withAltContigs_98pcFracMatch.bed",
    sep="\t",
    header=False,
    index=False,
)

### Many segments in this BED file overlap each other, so `bedtools merge` had to be run to get the union of those segments.

## see how many genes in the CN matrix are going to be masked

In [None]:
from taigapy import TaigaClient

# Fetch the OmicsCNGene file from Taiga, pinned to an explicit dataset version.
tc = TaigaClient()
OmicsCNGene = tc.get(
    name="internal-23q2-1e49",
    version=97,
    file="OmicsCNGene",
)


In [None]:
from mgenepy.utils import helper as h

# Request the gene table with start/end/chromosome attributes, then rename those
# three columns to the names used in the rest of the notebook.
mybiomart = h.generateGeneNames(
    attributes=["start_position", "end_position", "chromosome_name"],
).rename(
    columns={
        "start_position": "start",
        "end_position": "end",
        "chromosome_name": "Chromosome",
    }
)

In [None]:
# Inspect the gene table after the coordinate columns were renamed.
mybiomart

In [None]:
# Recode X/Y/MT as 23/24/25, keep only chromosomes 1-25, and store the column as
# integers so that the later sort is numeric.
# The whole step is one expression that returns a new frame, so nothing is
# assigned into a filtered slice (which can raise SettingWithCopyWarning).
# astype(str) first makes the membership test robust to integer-typed values, so
# such rows are not silently dropped and the cell can be re-run after the
# conversion to int.
mybiomart = (
    mybiomart.assign(
        Chromosome=lambda df: df["Chromosome"]
        .astype(str)
        .replace({"X": "23", "Y": "24", "MT": "25"})
    )
    .loc[lambda df: df["Chromosome"].isin({str(n) for n in range(1, 26)})]
    .assign(Chromosome=lambda df: df["Chromosome"].astype(int))
)

In [None]:
# Sort numerically by chromosome (then start, end) and keep the first row per
# HGNC symbol. The explicit copy gives the frame its own data so the column
# assignments below are not chained assignments on a slice.
mybiomart = (
    mybiomart.sort_values(by=["Chromosome", "start", "end"])
    .drop_duplicates("hgnc_symbol", keep="first")
    .copy()
)

# Build "SYMBOL (entrez)" labels column-wise instead of looping with iterrows(),
# which constructs a Series for every row. str(...).split(".")[0] drops the ".0"
# suffix of float-typed ids.
# Note: a missing hgnc_symbol now yields a missing gene_name instead of raising.
mybiomart["gene_name"] = (
    mybiomart["hgnc_symbol"]
    + " ("
    + mybiomart["entrezgene_id"].map(lambda v: str(v).split(".")[0])
    + ")"
)

# Turn the numeric codes back into X/Y/MT and add the "chr" prefix.
mybiomart["Chromosome"] = "chr" + mybiomart["Chromosome"].replace(
    {23: "X", 24: "Y", 25: "MT"}
).astype(str)


In [None]:
# Export Chromosome/start/end/gene_name rows, restricted by isin(OmicsCNGene), as a
# headerless, tab-separated file.
# NOTE(review): isin() is handed the DataFrame itself; this is expected to match
# gene_name against its column labels — confirm.
mybiomart.loc[
    mybiomart["gene_name"].isin(OmicsCNGene),
    ["Chromosome", "start", "end", "gene_name"],
].to_csv(
    "../data/biomart_cngenes.bed",
    sep="\t",
    header=False,
    index=False,
)

In [None]:
# Map every gene_name to its [Chromosome, start, end] list.
gene_dict = (
    mybiomart.loc[:, ["Chromosome", "start", "end", "gene_name"]]
    .set_index("gene_name")
    .transpose()
    .to_dict(orient="list")
)

In [None]:
# Inspect the gene_name -> [Chromosome, start, end] lookup.
gene_dict

In [None]:
# Read the headerless, tab-separated overlap table and label its four columns.
# NOTE(review): absolute, user-specific path, unlike the "../data/" paths used
# elsewhere in this notebook — confirm whether it should be made relative.
overlap_df = pd.read_csv(
    "/home/xiaomeng/bin/depmap_omics/data/overlap_withalt.txt",
    sep="\t",
    names=["chrom", "start", "end", "gene_name"],
)

In [None]:
# Inspect the overlap rows (chrom, start, end, gene_name) that were just loaded.
overlap_df

In [None]:
# Flag a gene for masking when more than half of its start-to-end span is covered
# by the overlap segments listed for it in overlap_df.
# Segment lengths are summed per gene in one grouped pass (sort=False keeps the
# genes in order of first appearance) instead of re-filtering overlap_df and
# iterating its rows for every gene.
overlap_by_gene = (
    (overlap_df["end"] - overlap_df["start"])
    .groupby(overlap_df["gene_name"], sort=False)
    .sum()
)

masked_genes = []
for g, overlap_length in overlap_by_gene.items():
    _, start, end = gene_dict[g]
    gene_length = end - start
    # A zero-length gene has no defined overlap fraction: skip it rather than
    # dividing by zero.
    if gene_length > 0 and overlap_length / gene_length > 0.5:
        masked_genes.append(g)

In [None]:
# Genes whose summed overlap exceeds half of their start-to-end span.
masked_genes

In [None]:
# Number of genes that exceeded the 0.5 overlap fraction.
len(masked_genes)

In [None]:
# List the OmicsCNGene column labels that begin with "LINC01670".
list(filter(lambda col: col.startswith("LINC01670"), OmicsCNGene.columns))

In [None]:
# Look up the [Chromosome, start, end] entry stored for this one gene.
gene_dict["LINC01670 (105379487)"]

In [None]:
# Column sums of the masked genes only, tallied by how often each sum occurs.
OmicsCNGene[masked_genes].sum().value_counts()

In [None]:
# The same tally of column sums over every column of the matrix, for comparison.
OmicsCNGene.sum().value_counts()

# A reusable function for the gene-masking step

In [None]:
import pybedtools

In [None]:
def maskSegDup(cnmatrix, matname, mybiomart, save_output="", segdup_bed="../data/repeatMasker_max10_noAlt_merged.bed", thresh=0.5):
    """Drop genes from a CN matrix that are largely covered by masked regions.

    The genes of ``cnmatrix`` are written to a BED file, intersected with
    ``segdup_bed`` through pybedtools, and every gene whose summed overlap
    divided by its ``end - start`` span exceeds ``thresh`` is removed.

    Args:
        cnmatrix: DataFrame whose column labels are gene names in the same
            format as ``mybiomart.gene_name``.
        matname: Currently unused; kept so existing callers keep working.
        mybiomart: DataFrame with ``Chromosome``, ``start``, ``end`` and
            ``gene_name`` columns, already formatted the way they should
            appear in the BED file.
        save_output: Prefix that is string-concatenated in front of the two
            intermediate file names (``biomart_cngenes.bed`` and
            ``mask_overlap.bed``); include a trailing separator when passing a
            directory.
        segdup_bed: BED file of regions to mask, handed to ``intersect``.
        thresh: Overlap fraction above which (strictly) a gene is masked.

    Returns:
        ``(cnmatrix_without_masked_genes, masked_genes)`` where
        ``masked_genes`` is a list ordered by first appearance in the
        intersect output.

    Side effects:
        Writes the two intermediate BED files and prints the number of masked
        genes.
    """
    bed_columns = ["Chromosome", "start", "end", "gene_name"]
    genes_bed_path = save_output + "biomart_cngenes.bed"
    overlap_bed_path = save_output + "mask_overlap.bed"

    # Match against the column labels explicitly rather than relying on the
    # DataFrame being iterated implicitly by isin().
    in_matrix = mybiomart["gene_name"].isin(cnmatrix.columns)
    mybiomart.loc[in_matrix, bed_columns].to_csv(genes_bed_path, sep="\t", header=False, index=False)

    pybedtools.BedTool(genes_bed_path).intersect(segdup_bed).saveas(overlap_bed_path)

    overlap_columns = ["chrom", "start", "end", "gene_name"]
    try:
        overlap_df = pd.read_csv(overlap_bed_path, sep="\t", names=overlap_columns)
    except pd.errors.EmptyDataError:
        # No gene intersects the mask at all: nothing to drop.
        overlap_df = pd.DataFrame(columns=overlap_columns)

    masked_genes = []
    if not overlap_df.empty:
        # Sum the overlap per gene in one grouped pass; sort=False keeps the genes
        # in order of first appearance.
        overlap_length = (
            (overlap_df["end"] - overlap_df["start"])
            .groupby(overlap_df["gene_name"], sort=False)
            .sum()
        )
        # keep="last" mirrors a dict lookup should a gene_name ever be duplicated.
        spans = mybiomart.drop_duplicates("gene_name", keep="last").set_index("gene_name")
        gene_length = (spans["end"] - spans["start"]).reindex(overlap_length.index)
        # Zero-length genes become NaN, so they never exceed the threshold and
        # no division by zero occurs.
        covered_fraction = overlap_length / gene_length.where(gene_length > 0)
        masked_genes = covered_fraction[covered_fraction > thresh].index.tolist()

    print("masking " + str(len(masked_genes)) + " genes from CN matrix")
    cnmatrix = cnmatrix.drop(columns=masked_genes)

    return cnmatrix, masked_genes

In [None]:
# Wrap the exported gene BED file in a pybedtools BedTool for interval operations.
genes = pybedtools.BedTool("../data/biomart_cngenes.bed")

In [None]:
# Intersect the gene intervals with the given merged segDup BED file and save the
# result as ov.bed (a relative path, so it lands in the current working directory).
# NOTE(review): the input is an absolute, user-specific path, unlike the "../data/"
# paths used elsewhere in this notebook — confirm before running on another machine.
genes.intersect("/home/xiaomeng/bin/depmap_omics/data/segDup_majorAllele_98pcFracMatch_merged.bed").saveas('ov.bed')

In [None]:
# Apply the mask to OmicsCNGene with an overlap threshold of 0.01 against the merged
# repeatMasker BED file. This rebinds masked_genes from the earlier exploration.
cnmatrix, masked_genes = maskSegDup(
    OmicsCNGene,
    "CN",
    mybiomart,
    segdup_bed="../data/repeatMasker_max10_noAlt_merged.bed",
    thresh=0.01,
)

In [None]:
# Genes that the maskSegDup call dropped from the matrix.
masked_genes

## repeatmasker

In [None]:
# Load the repeatmasker table, keeping the three milli* columns and the coordinates.
# NOTE(review): absolute, user-specific path — confirm whether it should be relative
# like the "../data/" paths used elsewhere in this notebook.
repeats = pd.read_csv(
    "/home/xiaomeng/bin/depmap_omics/data/repeatmasker.csv"
).loc[
    :,
    ["milliDiv", "milliDel", "milliIns", "genoName", "genoStart", "genoEnd"],
]

In [None]:
# Row-wise maximum of the three milli* columns.
repeats["max"] = repeats.loc[:, ["milliDiv", "milliDel", "milliIns"]].max(
    axis="columns"
)

In [None]:
# Keep rows whose "max" is below 10 and whose genoName contains no underscore,
# then order them by chromosome number, start and end.
# X, Y and M are ranked as 23, 24 and 25 through a temporary key column, so the
# genoName labels are never rewritten and nothing is assigned into the filtered
# slice (chained assignment, which can raise SettingWithCopyWarning).
repeats = (
    repeats[(repeats["max"] < 10) & (~repeats.genoName.str.contains("_"))]
    .assign(
        _chrom_rank=lambda df: df["genoName"]
        .replace({"chrX": "chr23", "chrY": "chr24", "chrM": "chr25"})
        .str.replace("chr", "", regex=False)
        .astype("int")
    )
    .sort_values(by=["_chrom_rank", "genoStart", "genoEnd"])
    .drop(columns="_chrom_rank")
)

In [None]:
# Write the three coordinate columns as a headerless, tab-separated BED file.
repeats.loc[:, ["genoName", "genoStart", "genoEnd"]].to_csv(
    "../data/repeatMasker_max10_noAlt.bed",
    sep="\t",
    header=False,
    index=False,
)

In [None]:
# Inspect the filtered, chromosome-sorted repeat table.
repeats

In [None]:
# Spot-check: retained rows on chr11 whose genoStart lies strictly between
# 55635000 and 55641309.
repeats.loc[
    repeats["genoName"].eq("chr11")
    & repeats["genoStart"].gt(55635000)
    & repeats["genoStart"].lt(55641309)
]