In [44]:
import pandas as pd
import numpy as np 
import os 
import matplotlib.pyplot as plt
import shutil
import ast

Preprocessing
===

In [45]:
# Folder whose immediate sub-directories are scanned by move_bed_files
# (presumably one sub-directory per sample of AA output - TODO confirm).
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
directory_path = r"C:\Users\albao\Downloads\AA_CCLE_hg38_aggregated_050323\AA_CCLE_hg38_aggregated_050323\AA_outputs"
# Flat destination folder for the copied .bed files; move_bed_files creates it if missing.
# NOTE(review): mixes "\" and "/" separators - accepted on Windows, where this path lives.
output_path = r"C:\Users\albao\Downloads\AA_CCLE_hg38_aggregated_050323/BREAST"
def move_bed_files(directory_path, output_path, cancertype):
    """Copy the .bed files of matching sub-directories into one flat folder.

    Every immediate sub-directory of ``directory_path`` whose name contains
    ``cancertype`` is scanned (non-recursively), and each regular file whose
    name contains ".bed" is copied into ``output_path``.

    Despite the name, files are copied, not moved: the sources stay in place.

    Parameters
    ----------
    directory_path : str
        Folder whose immediate sub-directories are scanned.
    output_path : str
        Destination folder; created (with parents) if it does not exist.
    cancertype : str
        Substring a sub-directory name must contain to be processed.

    Returns
    -------
    None

    Notes
    -----
    Files are written under their own basename, so two sub-directories
    holding a file with the same name overwrite each other in ``output_path``.
    """
    os.makedirs(output_path, exist_ok=True)
    for directory in os.listdir(directory_path):
        if cancertype not in directory:
            continue
        sample_path = os.path.join(directory_path, directory)
        # A plain file whose name happens to contain `cancertype` (e.g. a
        # summary table) would make os.listdir raise NotADirectoryError.
        if not os.path.isdir(sample_path):
            continue
        for sample in os.listdir(sample_path):
            source = os.path.join(sample_path, sample)
            # Substring match is kept on purpose (it also accepts names such as
            # "x.bed.gz"); only regular files can be handed to shutil.copy.
            if ".bed" in sample and os.path.isfile(source):
                shutil.copy(source, os.path.join(output_path, sample))
# Gather the .bed files of every sub-directory whose name contains "BREAST" into output_path.
move_bed_files(directory_path, output_path, "BREAST")                   

In [46]:
# Read every file in output_path as a headerless, tab-separated table with the
# five columns below and stack the results into a single `locations` frame.
BED_COLUMNS = ['chromosome', 'start', 'end', 'stat', 'cycle_file']

bed_frames = []
for file in os.listdir(output_path):
    try:
        bed_frames.append(
            pd.read_csv(os.path.join(output_path, file), sep="\t", names=BED_COLUMNS)
        )
    except Exception as e:
        # Best effort: an unreadable file is reported (with its name) and skipped.
        print(f"{file}: {e}")

# Concatenate once instead of re-copying the growing frame on every iteration.
# Each file keeps its own 0..n-1 row labels, so index values repeat across files.
# With no readable file, fall back to an empty frame that still has the columns.
locations = pd.concat(bed_frames) if bed_frames else pd.DataFrame(columns=BED_COLUMNS)
locations.head(2)

Unnamed: 0,chromosome,start,end,stat,cycle_file
0,chr3,37936027,38301037,4.696643,/expanse/lustre/projects/csd714/edwin5588/CCLE...
1,chr7,114216157,115016148,7.31299,/expanse/lustre/projects/csd714/edwin5588/CCLE...


In [47]:
# Load the aggregated results table, keeping only the columns used below.
columns_of_interest = ['Sample name', 'Classification', 'All genes', "Location", "Oncogenes", "Captured interval length"]
aggregated = pd.read_csv(
    r"C:\Users\albao\Downloads\AA_CCLE_hg38_aggregated_050323\AA_CCLE_hg38_aggregated_050323\aggregated_results.csv"
)[columns_of_interest]

# Narrow down in three steps, applied in this order:
#   1. rows whose sample name contains 'BREAST'
#   2. rows that have a (non-NaN) classification
#   3. rows whose classification contains 'ecDNA'
brCan = (
    aggregated
    .loc[lambda frame: frame['Sample name'].str.contains('BREAST')]
    .dropna(subset=['Classification'])
    .loc[lambda frame: frame['Classification'].str.contains('ecDNA')]
)
brCan.head(2)

Unnamed: 0,Sample name,Classification,All genes,Location,Oncogenes,Captured interval length
23,AU565_BREAST,ecDNA,"['ACAA1', 'DLEC1', 'MYD88', 'OXSR1', 'PLCD1', ...",['chr3:37937674-38298744'],['MYD88'],361070.0
24,AU565_BREAST,ecDNA,"['ANXA13', 'ATAD2', 'C8orf76', 'C8orf89', 'CAL...","['chr8:71754758-71754831', 'chr8:71755523-7287...","['HEY1', 'MYC', 'TPD52']",23353056.0


In [48]:
def _collect_gene_set(column):
    """Return the union of all gene names stored in ``column``.

    Each non-missing cell is expected to hold the string form of a Python
    list (e.g. "['MYC', 'HEY1']" or "[]") and is parsed with
    ``ast.literal_eval``. Missing cells (NaN) are skipped instead of being
    handed to the parser, which would raise on them.

    Parameters
    ----------
    column : pandas.Series
        Column of list-literal strings.

    Returns
    -------
    set
        Every distinct element found in the parsed lists.
    """
    genes = set()
    for cell in column.dropna():
        # "[]" parses to an empty list, so it needs no special case.
        genes.update(ast.literal_eval(cell))
    return genes


# Distinct oncogenes / genes across all rows of brCan.
oncogenes = _collect_gene_set(brCan["Oncogenes"])
print(len(oncogenes))

all_genes = _collect_gene_set(brCan["All genes"])
print(len(all_genes))

# Sanity check: how many of the oncogenes also appear among all genes.
elements = all_genes.intersection(oncogenes)
len(elements)

84
1306


84

Obtaining all potential locations
===

In [88]:
def merge_intervals(intervals, range_limit = 50000):
    """Merge intervals that overlap or lie within ``range_limit`` of each other.

    Intervals are processed in order of their start coordinate. An interval is
    folded into the previously merged one when
    ``start <= previous_end + range_limit``; otherwise it opens a new merged
    interval.

    Parameters
    ----------
    intervals : iterable of (start, end)
        Intervals to merge. The input is not modified.
    range_limit : number, default 50000
        Largest gap between one interval's end and the next interval's start
        that is still bridged by a merge.

    Returns
    -------
    list of tuple
        Merged ``(start, end)`` tuples ordered by start; ``[]`` for empty input.
    """
    if not intervals:
        return []

    # sorted() works on a copy, so the caller's list keeps its original order.
    ordered = sorted(intervals, key=lambda span: span[0])

    first = ordered[0]
    merged_intervals = [(first[0], first[1])]

    # Start from the second interval: comparing the first one with itself would
    # duplicate it whenever start > end + range_limit (e.g. negative range_limit).
    for current in ordered[1:]:
        last_start, last_end = merged_intervals[-1]
        if current[0] <= last_end + range_limit:
            merged_intervals[-1] = (last_start, max(last_end, current[1]))
        else:
            merged_intervals.append((current[0], current[1]))

    return merged_intervals


In [98]:
# Set of all chromosome names present in the BED rows.
chromosomes = set(locations["chromosome"])

# Group the intervals by chromosome: {chromosome: [(start, end), ...]}.
# Keys are inserted in order of first appearance, so the printed summary is
# stable between kernel restarts (iterating a set of strings is not).
locations_dict = {}
for chrom, start, end in zip(locations["chromosome"], locations["start"], locations["end"]):
    locations_dict.setdefault(chrom, []).append((start, end))

# Number of intervals per chromosome, and overall.
lengths = {key: len(spans) for key, spans in locations_dict.items()}
total_locations = sum(len(value) for value in locations_dict.values())
print(f"total potential locations: {total_locations}")
print(lengths)

total potential locations: 519
{'chr11': 39, 'chr17': 70, 'chr2': 9, 'chr5': 11, 'chr8': 71, 'chr3': 11, 'chr6': 13, 'chr16': 9, 'chr20': 45, 'chr1': 36, 'chr12': 16, 'chr14': 7, 'chr13': 17, 'chrX': 85, 'chr22': 5, 'chr9': 16, 'chr18': 5, 'chr15': 6, 'chr4': 11, 'chr21': 2, 'chr7': 9, 'chr19': 16, 'chr10': 10}


In [99]:
# Replace each chromosome's interval list (in place, same dict object) with its
# merged form, bridging gaps of up to 500000 between neighbouring intervals.
for key, spans in locations_dict.items():
    locations_dict[key] = merge_intervals(spans, range_limit=500000)

# Recount the intervals per chromosome after merging.
lengths = {key: len(spans) for key, spans in locations_dict.items()}

total_locations = sum(map(len, locations_dict.values()))
print(f"total potential locations: {total_locations}")
print(lengths)


total potential locations: 202
{'chr11': 9, 'chr17': 17, 'chr2': 9, 'chr5': 10, 'chr8': 5, 'chr3': 10, 'chr6': 9, 'chr16': 6, 'chr20': 6, 'chr1': 21, 'chr12': 12, 'chr14': 6, 'chr13': 9, 'chrX': 14, 'chr22': 5, 'chr9': 9, 'chr18': 4, 'chr15': 6, 'chr4': 9, 'chr21': 2, 'chr7': 9, 'chr19': 8, 'chr10': 7}
