In [1]:
import pandas as pd
import numpy as np 
import os 
import matplotlib.pyplot as plt
import shutil
import ast

Preprocessing
===

In [2]:
directory_path = r".\data\AA_CCLE_hg38_aggregated_050323\AA_CCLE_hg38_aggregated_050323\AA_outputs"
output_path = r".\data\AA_CCLE_hg38_aggregated_050323/BREAST"
def move_bed_files(directory_path, output_path, cancertype):
    os.makedirs(output_path, exist_ok=True)
    for directory in os.listdir(directory_path):
        if cancertype in directory:
            sample_path = os.path.join(directory_path, directory)
            for sample in os.listdir(sample_path):
                if ".bed" in sample:
                    shutil.copy(os.path.join(sample_path, sample), os.path.join(output_path, sample))
move_bed_files(directory_path, output_path, "BREAST")                   

In [3]:
# obtain the different types:
locations = pd.DataFrame()
for file in os.listdir(output_path):
    try:
        temp = pd.read_csv(os.path.join(output_path, file), sep = "\t", names = ['chromosome', 'start', 'end', 'stat', 'cycle_file'])
        locations = pd.concat([locations, temp])
    except Exception as e:
        print(e)
locations.head(2)

  locations = pd.concat([locations, temp])
  locations = pd.concat([locations, temp])
  locations = pd.concat([locations, temp])


Unnamed: 0,chromosome,start,end,stat,cycle_file
0,chr3,37936027,38301037,4.696643,/expanse/lustre/projects/csd714/edwin5588/CCLE...
1,chr7,114216157,115016148,7.31299,/expanse/lustre/projects/csd714/edwin5588/CCLE...


In [4]:
aggregated = pd.read_csv(r".\data\AA_CCLE_hg38_aggregated_050323\AA_CCLE_hg38_aggregated_050323\aggregated_results.csv")
aggregated = aggregated[['Sample name', 'Classification', 'All genes', "Location", "Oncogenes", "Captured interval length"]]
brCan = aggregated[aggregated['Sample name'].str.contains('BREAST')]

#drop rows with NaN in 'Classification' column
brCan = brCan.dropna(subset=['Classification'])

#filter for ecDNA classifications
brCan = brCan[brCan['Classification'].str.contains('ecDNA')]
brCan.head(2)

Unnamed: 0,Sample name,Classification,All genes,Location,Oncogenes,Captured interval length
23,AU565_BREAST,ecDNA,"['ACAA1', 'DLEC1', 'MYD88', 'OXSR1', 'PLCD1', ...",['chr3:37937674-38298744'],['MYD88'],361070.0
24,AU565_BREAST,ecDNA,"['ANXA13', 'ATAD2', 'C8orf76', 'C8orf89', 'CAL...","['chr8:71754758-71754831', 'chr8:71755523-7287...","['HEY1', 'MYC', 'TPD52']",23353056.0


In [5]:
# obtain set of all oncogenes and genes
oncogenes = set()
for og_str in np.array(brCan["Oncogenes"]):
    if og_str != '[]': 
        oncogenes.update(ast.literal_eval(og_str))
print(len(oncogenes))

all_genes = set()
for og_str in np.array(brCan["All genes"]):
    if og_str != '[]':
        all_genes.update(ast.literal_eval(og_str))
print(len(all_genes))

# oncogenes included within the all genes (sanity check)
elements = all_genes.intersection(oncogenes)
len(elements)

84
1306


84

Obtaining all potential locations
===

In [8]:
def merge_intervals(df, range_limit=50000):
    def merge_group(group):
        group = group.sort_values('start').reset_index(drop=True)
        merged_intervals = []

        for _, row in group.iterrows():
            if not merged_intervals:
                merged_intervals.append(row)
            else:
                last_merged = merged_intervals[-1]
                if row['start'] <= last_merged['end'] + range_limit:
                    merged_intervals[-1]['end'] = max(last_merged['end'], row['end'])
                else:
                    merged_intervals.append(row)

        return pd.DataFrame(merged_intervals)

    merged_df = df.groupby('chromosome').apply(merge_group).reset_index(drop=True)
    return merged_df

merged_locations_df = merge_intervals(locations, range_limit=500000)
merged_locations_df.head(2)


  merged_df = df.groupby('chromosome').apply(merge_group).reset_index(drop=True)


Unnamed: 0,chromosome,start,end,stat,cycle_file
0,chr1,30934470,31089474,6.13184,/expanse/lustre/projects/csd714/edwin5588/CCLE...
1,chr1,36224586,36299587,5.244299,/expanse/lustre/projects/csd714/edwin5588/CCLE...


In [9]:
def obtain_stats_locations(locations):
    locations.head(3)
    # first create set of all chromosomes
    chromosomes = set(locations["chromosome"])
    # create dictionary 
    locations_dict = {}
    for chrom in chromosomes:
        locations_dict[chrom] = []
    for i, row in locations.iterrows():
        chrom = row["chromosome"]
        start = row["start"]
        end = row["end"]
        locations_dict[chrom].append((start, end))
    lengths = {}
    for key in locations_dict.keys():
        lengths[key] = len(locations_dict[key])
    total_locations = sum(len(value) for value in locations_dict.values())
    print(f"total potential locations: {total_locations}")
    print(lengths)
print("prefiltering\n")
obtain_stats_locations(locations)
print("-------------------------------------------\n")
print("post filtering\n")
obtain_stats_locations(merged_locations_df)

prefiltering

total potential locations: 519
{'chr4': 11, 'chr13': 17, 'chrX': 85, 'chr9': 16, 'chr11': 39, 'chr5': 11, 'chr3': 11, 'chr17': 70, 'chr10': 10, 'chr8': 71, 'chr16': 9, 'chr2': 9, 'chr21': 2, 'chr14': 7, 'chr7': 9, 'chr12': 16, 'chr19': 16, 'chr1': 36, 'chr15': 6, 'chr20': 45, 'chr22': 5, 'chr18': 5, 'chr6': 13}
-------------------------------------------

post filtering

total potential locations: 202
{'chr4': 9, 'chr13': 9, 'chrX': 14, 'chr9': 9, 'chr11': 9, 'chr5': 10, 'chr3': 10, 'chr17': 17, 'chr10': 7, 'chr8': 5, 'chr16': 6, 'chr2': 9, 'chr21': 2, 'chr14': 6, 'chr7': 9, 'chr12': 12, 'chr19': 8, 'chr1': 21, 'chr15': 6, 'chr20': 6, 'chr22': 5, 'chr18': 4, 'chr6': 9}


In [10]:
# file location -> need to also include the intensity 
output_bed = r".\data\AA_CCLE_hg38_aggregated_050323/BREAST_filtered_threshold_500000.bed"
merged_locations_df.drop(columns=["stat"]).to_csv(output_bed, header=False, index=False, sep = "\t")