In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pybedtools, os

from CN_utils import *

In [3]:
# directory containing the .tsv FACETS CN segments files for each dataset. Default repo location given.
input_dir = "../../data/copy_number/"
CN_temp_dir = "../../data/copy_number/"
resource_dir = "../../data/resource/"

# names of datasets which you want to process- should match copy number file names.
# external datasets ("PUTH" and "SCORT") not included with our repo- you will have to generate these yourself
dataset_names = ["HTAN_WGS", "HTAN_WES", "PUTH", "SCORT"]

all_unfiltered = []
WGD_info = []
sample_to_patient = {}
for dataset in dataset_names:
    data_to_add = pd.read_csv(input_dir+dataset+"_CN_unfiltered.tsv", sep="\t")
    all_unfiltered.append(data_to_add)
    sample_to_patient.update(dict(zip(data_to_add["sample_id"], data_to_add["patient"])))
    doubling_to_add = pd.read_csv(input_dir+"genome_doubling/"+dataset+"_doubled.tsv", sep="\t", index_col=0)
    WGD_info.append(dict(zip(doubling_to_add.index, doubling_to_add["genome_doubled"])))

In [4]:
removed_regions_loc = resource_dir+"hg38.UCSC.centromere.telomere.encode.bed"

removed_regions = pd.read_csv(removed_regions_loc, sep="\t", names=["chrom", "start_pos", "end_pos", "length", "source", "reason"])

telomere_table = removed_regions[removed_regions["reason"]=="telomere"]

telomere_dict = {}
for chrom in list(set(telomere_table["chrom"])):
    only_chrom = telomere_table[telomere_table["chrom"]==chrom]
    telomere_dict[chrom] = [only_chrom.iloc[0]["end_pos"], only_chrom.iloc[1]["start_pos"]]
    
genome_len = sum([telomere_dict[x][1]-telomere_dict[x][0] if not x in ["chrX", "chrY"] else 0 for x in telomere_dict])

In [5]:
CN_dir_loc = CN_temp_dir + "overlap_filtered_beds/"
os.makedirs(CN_dir_loc, exist_ok=True)
overlap_threshold = 0.3

blacklist_filtered = []
for i,save_df in enumerate(all_unfiltered):
    save_name = dataset_names[i]
    save_loc = CN_dir_loc+save_name+"_unfiltered.bed"
    to_save = save_df[["chrom", "loc_start", "loc_end", "sample_id"]]
    to_save["seg_ID"] = [to_save.iloc[i]["sample_id"]+"_"+str(i) for i in range(len(to_save))]
    to_save.to_csv(save_loc, header=False, index=False, sep="\t")
    to_filter = save_df.copy()
    to_filter.index = to_save["seg_ID"]
    
    filtered_loc = CN_dir_loc+save_name+"_overlap_filtered.bed"
    bed_to_intersect = pybedtools.BedTool(save_loc)
    bed_to_intersect.intersect(removed_regions_loc, f=overlap_threshold, v=True).saveas(filtered_loc)
    filtered_df = pd.read_csv(filtered_loc, sep="\t", names=["chrom", "loc_start", "loc_end", "sample_id", "seg_ID"])

    blacklist_filtered.append(to_filter.loc[filtered_df["seg_ID"]])
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  to_save["seg_ID"] = [to_save.iloc[i]["sample_id"]+"_"+str(i) for i in range(len(to_save))]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  to_save["seg_ID"] = [to_save.iloc[i]["sample_id"]+"_"+str(i) for i in range(len(to_save))]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  to_save["seg_ID"] = [to

In [6]:
len_threshold = 1e6
cf_threshold = 0.15
    
length_filtered = []
for to_filter in blacklist_filtered:
    to_filter["length"] = to_filter["loc_end"] - to_filter["loc_start"]
    length_filtered.append(to_filter[to_filter["length"] > len_threshold])
    
cf_filtered = []
for to_filter in length_filtered:
    cf_filtered.append(to_filter[to_filter["cf_em"] > cf_threshold])

In [7]:
bed_columns = ["chrom", "loc_start", "loc_end", "tcn_em", "lcn_em"]
bed_to_intersect = pybedtools.BedTool(resource_dir+"hg38_gene_locs.bed")

for i,to_save in enumerate(cf_filtered):
    save_name = dataset_names[i]
    merged = extend_merge_all(to_save, telomere_dict, WGD_info[i])
    all_samples = list(set(merged["sample_id"]))
    
    bed_dir_path = CN_temp_dir+"final_filtered_beds/"+save_name+"/"
    os.makedirs(bed_dir_path, exist_ok=True)
    
    gene_dir_path = CN_temp_dir+"gene_overlap_beds/"+save_name+"/"
    os.makedirs(gene_dir_path, exist_ok=True)
    for sample in all_samples:
        merged_CN_bed_loc = bed_dir_path+sample+"_filtered_merged.bed"
        only_sample = merged[merged["sample_id"]==sample]
        only_sample[bed_columns].to_csv(merged_CN_bed_loc, header=False, index=False, sep="\t", na_rep="NaN")
        
        gene_intersect_bed_loc = gene_dir_path+sample+"_gene_overlaps.bed"
        bed_to_intersect.intersect(merged_CN_bed_loc, wb=True).saveas(gene_intersect_bed_loc)
    
    to_save.to_csv(input_dir+save_name+"_CN_filtered.tsv", index=False, sep="\t")
    merged.to_csv(input_dir+save_name+"_CN_filtered_merged.tsv", index=False, sep="\t")
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cacher_needs_updating = self._check_is_chained_assignment_possible()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cacher_needs_updating = self._check_is_chained_assignment_possible()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cacher_needs_updating = self._check_is_chained_assignment_possible()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

In [None]:
number_of_genes = len(pd.read_csv(resource_dir+"hg38_gene_locs.bed", sep="\t", header=None))    

all_gene_calls = []
for i,to_save in enumerate(cf_filtered):
    save_name = dataset_names[i]
    
    gene_dir_path = CN_temp_dir+"gene_overlap_beds/"+save_name+"/"
    all_samples = list(set(to_save["sample_id"]))
    
    to_concat = []
    for sample in all_samples:
        gene_intersect_bed_loc = gene_dir_path+sample+"_gene_overlaps.bed"
        gene_overlaps = pd.read_csv(gene_intersect_bed_loc, sep="\t", header=None, names=["gene_chrom", "gene_start", "gene_end", "gene_name", "seg_chrom", "seg_start", "seg_end", "tcn_em", "lcn_em"])
        gene_overlaps = deduplicate_gene_calls(gene_overlaps)
        add_CN_types(gene_overlaps, sample_to_patient[sample], WGD_info[i][sample])
        gene_overlaps["sample_id"] = sample
        
        assert len(gene_overlaps) == number_of_genes, "wrong number of total gene CN calls"
        
        to_concat.append(gene_overlaps)
    all_gene_calls.append(pd.concat(to_concat))
        

In [12]:
os.makedirs(input_dir+"gene_CN_calls/", exist_ok=True)
for i,to_save in enumerate(all_gene_calls):
    save_name = dataset_names[i]
    to_save.to_csv(input_dir+"gene_CN_calls/"+save_name+"_gene_CNs.tsv", sep="\t", index=False)

## APPENDIX: making a gene to genomic location table from the GRCh38 reference

In [98]:
grch38_gff_loc = "" #path to gencode.v33.basic.annotation.gff3.gz
grch38_genes = pd.read_csv(grch38_gff_loc, comment="#", compression="gzip", sep="\t", header=None, names=["chrom", "source", "type", "start", "end", "nothing", "strand", "nothing2", "info"])


In [69]:
def parse_info_column(info):
    splitted = info.split(";")
    for token in splitted:
        pair = token.split("=")
        if pair[0] == "gene_name":
            return pair[1]
    return "NONE"

only_genes = grch38_genes[grch38_genes["type"]=="gene"]
only_genes["gene_name"] = [parse_info_column(x) for x in only_genes["info"]]
only_genes = only_genes[~np.isin(only_genes["chrom"], ["chrY", "chrM"])]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  only_genes["gene_name"] = [parse_info_column(x) for x in only_genes["info"]]


In [71]:
duplicate_names = only_genes.value_counts("gene_name")
duplicate_names = duplicate_names[duplicate_names > 1]

deduplicated = []
for gene in duplicate_names.index:
    both_listings = only_genes[only_genes["gene_name"]==gene]
    to_add = pd.DataFrame(both_listings.iloc[0]).transpose()
    to_add["start"] = np.min(both_listings["start"])
    to_add["end"] = np.max(both_listings["end"])
    deduplicated.append(to_add)
    
deduplicated = pd.concat(deduplicated)
only_genes = only_genes[~np.isin(only_genes["gene_name"], duplicate_names.index)]
only_genes = pd.concat([only_genes, deduplicated])

In [76]:
grch38_to_bed = only_genes[["chrom", "start", "end", "gene_name"]]
grch38_to_bed.to_csv(resource_dir+"hg38_gene_locs.bed", sep="\t", index=False, header=False)