# Benchmarking of extrachromosomal circular DNA (eccDNA) and circular RNA detection software

In this study, we performed a comprehensive evaluation of five extrachromosomal circular DNA (eccDNA) detection tools (Circle-Map, CIRCexplorer2, CircleFinder, ecc_finder-bwa, and ecc_finder-minimap2) and five circular RNA (circRNA) detection tools (CIRCexplorer2, circRNA_finder, CIRIquant, find_circ, and segemehl). We evaluated their performance on both in silico and biological data, and we propose a new protocol, based on Rosette detection, that combines more than two algorithms to enhance the detection of eccDNAs and circRNAs.

## Library and function imports

In [None]:
import os
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from collections import defaultdict
import re
from math import isclose

In [None]:
from functions.load_bed_file import load_bed_file
from functions.compare_circular_data import analyze_circular_data
from functions.metrics import plot_circular_detection
from functions.save_metrics import process_and_save_metrics
from functions.coordinates_precision import plot_coordinates_precision
from functions.lengths import plot_length_distributions, distribution_comparison_ks_test
from functions.repeat_elements import annotate_repeat_elements, process_repeat_elements, calculate_repeat_element_metrics, plot_stats_repeat_elements
from functions.genomic_elements import annotate_genomic_elements, annotate_bed_files_genomic_elements, process_genomic_elements, calculate_genomic_element_metrics, plot_stats_genomic_elements
from functions.combinations import process_eccDNA_filtering, process_circRNA_filtering, plot_combinations, compare_combinations, analyze_and_save_circles, merge_files
from functions.ratio import process_circle_matrix, circle_diff, circle_diff_real, diff_cj_combinations, plot_diffCJ_scatterplot, process_and_plot_ratios
from functions.matrix import matrix_real, create_circle_presence_matrix
from functions.upsetR import upset_plot
from functions.validation import matrix_fp


## 1. in silico

We created 1,000 circles with the following characteristics:

- eccDNAs were generated from the whole genome and circRNAs from transcript regions.
- Length: [175, 10,000]
- Lognormal distribution (mean = 1,000, sd = 1)
- Biological mutation ratio: 0.01 (Kimura model)
- Sequencing error rate: 0.001 (random)

Additionally, 1,000 linear fragments with the same characteristics as the circular molecules (except for a minimum length of 501), taken from randomly selected genomic regions, were introduced as false-positive molecules.

### Unfilter

In [None]:
# Define paths and parameters
# eccDNA: reference BED, directory holding the unfiltered tool outputs, and the
# directory that receives the results of the comparison.
eccDNA_bed = 'data/insilico/eccDNA/eccDNA.bed'
eccDNA_dir = 'data/insilico/eccDNA/unfilter'
eccDNA_output = 'results/eccDNA/insilico/unfilter/'
# Tool names are reused as path components when BED files are located later on.
eccDNA_tools = ['CIRCexplorer2', 'Circle-Map', 'Circle_finder', 'ecc_finder-bwa', 'ecc_finder-minimap2']

# circRNA: same layout as the eccDNA settings above.
circRNA_bed = 'data/insilico/circRNA/circRNA.bed'
circRNA_dir = 'data/insilico/circRNA/unfilter'
circRNA_output = 'results/circRNA/insilico/unfilter/'
circRNA_tools = ['CIRCexplorer2', 'circRNA_finder', 'CIRIquant', 'find_circ', 'segemehl']

# Coverage levels; they appear in BED file names as cov<N> (e.g. cov30).
coverages = [5, 7, 10, 15, 20, 30, 50, 70, 100]

In [None]:
# Run the circular-data comparison (functions.compare_circular_data) on the unfiltered eccDNA calls.
# NOTE(review): presumably writes the statistics CSVs that the next cell plots — confirm.
analyze_circular_data(eccDNA_dir, eccDNA_bed, eccDNA_output, eccDNA_tools, coverages)

In [None]:
# eccDNA_dir / eccDNA_output are rebound here: they now point at the statistics
# folder and its graphs sub-folder, no longer at the raw tool outputs.
eccDNA_dir = 'results/eccDNA/insilico/unfilter/statistics'
eccDNA_output = f'{eccDNA_dir}/graphs/'

# One CSV per metric, looked up inside the statistics folder.
data_files = [
    'truepositives.csv',
    'falsepositives.csv',
    'falsenegatives.csv',
    'fscore.csv',
    'recall.csv',
    'precision.csv'
]

# Display names, paired positionally with data_files by zip() below.
stats = ['True Positives', 'False Positives', 'False Negatives', 'F-score', 'Recall', 'Precision']

# One plot_circular_detection call per metric file.
for data_file, name in zip(data_files, stats):
    data_path = f'{eccDNA_dir}/{data_file}'
    plot_circular_detection(data_path, name, eccDNA_output, 'eccDNA')

In [None]:
# Run the circular-data comparison (functions.compare_circular_data) on the unfiltered circRNA calls.
analyze_circular_data(circRNA_dir, circRNA_bed, circRNA_output, circRNA_tools, coverages)

In [None]:
# Rebind circRNA_dir / circRNA_output to the statistics folder and its graphs sub-folder.
circRNA_dir = 'results/circRNA/insilico/unfilter/statistics'
circRNA_output = f'{circRNA_dir}/graphs/'

# Reuses data_files and stats as currently bound in the notebook; zip() pairs them positionally.
for data_file, name in zip(data_files, stats):
    data_path = f'{circRNA_dir}/{data_file}'
    plot_circular_detection(data_path, name, circRNA_output, 'circRNA')

### Filter-split

In [None]:
# Define paths and parameters
# Input and output folders for the filter-split level; the reference BEDs,
# tool lists and coverages defined earlier in the notebook are reused.
eccDNA_dir = 'data/insilico/eccDNA/filter-split'
eccDNA_output = 'results/eccDNA/insilico/filter-split'

circRNA_dir = 'data/insilico/circRNA/filter-split'
circRNA_output = 'results/circRNA/insilico/filter-split'

In [None]:
# Run the circular-data comparison on the filter-split eccDNA calls.
analyze_circular_data(eccDNA_dir, eccDNA_bed, eccDNA_output, eccDNA_tools, coverages)

In [None]:
# Rebind eccDNA_dir / eccDNA_output to the filter-split statistics folder and its graphs sub-folder.
eccDNA_dir = 'results/eccDNA/insilico/filter-split/statistics'
eccDNA_output = f'{eccDNA_dir}/graphs/'

# Reuses data_files and stats as currently bound in the notebook; zip() pairs them positionally.
for data_file, name in zip(data_files, stats):
    data_path = f'{eccDNA_dir}/{data_file}'
    plot_circular_detection(data_path, name, eccDNA_output, 'eccDNA')

In [None]:
# Run the circular-data comparison on the filter-split circRNA calls.
analyze_circular_data(circRNA_dir, circRNA_bed, circRNA_output, circRNA_tools, coverages)

In [None]:
# Rebind circRNA_dir / circRNA_output to the filter-split statistics folder and its graphs sub-folder.
circRNA_dir = 'results/circRNA/insilico/filter-split/statistics'
circRNA_output = f'{circRNA_dir}/graphs/'

# Reuses data_files and stats as currently bound in the notebook; zip() pairs them positionally.
for data_file, name in zip(data_files, stats):
    data_path = f'{circRNA_dir}/{data_file}'
    plot_circular_detection(data_path, name, circRNA_output, 'circRNA')

### Filter-duplicates

In [None]:
# Define paths and parameters
# Input and output folders for the filter-duplicates level; the reference BEDs,
# tool lists and coverages defined earlier in the notebook are reused.
eccDNA_dir = 'data/insilico/eccDNA/filter-duplicates'
eccDNA_output = 'results/eccDNA/insilico/filter-duplicates'

circRNA_dir = 'data/insilico/circRNA/filter-duplicates'
circRNA_output = 'results/circRNA/insilico/filter-duplicates'

In [None]:
# Run the circular-data comparison on the filter-duplicates eccDNA calls.
analyze_circular_data(eccDNA_dir, eccDNA_bed, eccDNA_output, eccDNA_tools, coverages)

In [None]:
# Rebind eccDNA_dir / eccDNA_output to the filter-duplicates statistics folder and its graphs sub-folder.
eccDNA_dir = 'results/eccDNA/insilico/filter-duplicates/statistics'
eccDNA_output = f'{eccDNA_dir}/graphs/'

# Reuses data_files and stats as currently bound in the notebook; zip() pairs them positionally.
for data_file, name in zip(data_files, stats):
    data_path = f'{eccDNA_dir}/{data_file}'
    plot_circular_detection(data_path, name, eccDNA_output, 'eccDNA')

In [None]:
# Run the circular-data comparison on the filter-duplicates circRNA calls.
analyze_circular_data(circRNA_dir, circRNA_bed, circRNA_output, circRNA_tools, coverages)

In [None]:
# Rebind circRNA_dir / circRNA_output to the filter-duplicates statistics folder and its graphs sub-folder.
circRNA_dir = 'results/circRNA/insilico/filter-duplicates/statistics'
circRNA_output = f'{circRNA_dir}/graphs/'

# Reuses data_files and stats as currently bound in the notebook; zip() pairs them positionally.
for data_file, name in zip(data_files, stats):
    data_path = f'{circRNA_dir}/{data_file}'
    plot_circular_detection(data_path, name, circRNA_output, 'circRNA')

### Filter

In [None]:
# Define paths and parameters
# Input and output folders for the "filter" level; the reference BEDs,
# tool lists and coverages defined earlier in the notebook are reused.
eccDNA_dir = 'data/insilico/eccDNA/filter'
eccDNA_output = 'results/eccDNA/insilico/filter'

circRNA_dir = 'data/insilico/circRNA/filter'
circRNA_output = 'results/circRNA/insilico/filter'

In [None]:
# Run the circular-data comparison on the "filter" eccDNA calls.
analyze_circular_data(eccDNA_dir, eccDNA_bed, eccDNA_output, eccDNA_tools, coverages)

In [None]:
# Rebind eccDNA_dir / eccDNA_output to the "filter" statistics folder and its graphs sub-folder.
eccDNA_dir = 'results/eccDNA/insilico/filter/statistics'
eccDNA_output = f'{eccDNA_dir}/graphs/'

# Same metric files and display names as in the Unfilter section, declared again
# here so this cell does not depend on the earlier one having been run.
data_files = [
    'truepositives.csv',
    'falsepositives.csv',
    'falsenegatives.csv',
    'fscore.csv',
    'recall.csv',
    'precision.csv'
]

# Display names, paired positionally with data_files by zip() below.
stats = ['True Positives', 'False Positives', 'False Negatives', 'F-score', 'Recall', 'Precision']

# One plot_circular_detection call per metric file.
for data_file, name in zip(data_files, stats):
    data_path = f'{eccDNA_dir}/{data_file}'
    plot_circular_detection(data_path, name, eccDNA_output, 'eccDNA')

In [None]:
# Run the circular-data comparison on the "filter" circRNA calls.
analyze_circular_data(circRNA_dir, circRNA_bed, circRNA_output, circRNA_tools, coverages)

In [None]:
# Rebind circRNA_dir / circRNA_output to the "filter" statistics folder and its graphs sub-folder.
circRNA_dir = 'results/circRNA/insilico/filter/statistics'
circRNA_output = f'{circRNA_dir}/graphs/'

# Reuses data_files and stats as currently bound in the notebook; zip() pairs them positionally.
for data_file, name in zip(data_files, stats):
    data_path = f'{circRNA_dir}/{data_file}'
    plot_circular_detection(data_path, name, circRNA_output, 'circRNA')

### Save metrics

In [None]:
# Count the lines of every eccDNA BED file per tool, coverage and filtering
# level, and save the table as a CSV.
eccDNA_base = "data/insilico/eccDNA"
filtering_methods = ["unfilter", "filter-split", "filter-duplicates", "filter"]

data = []

for tool in eccDNA_tools:
    for cov in coverages:
        # One output row per (tool, coverage); one column per filtering level.
        row_counts = {"Tool": tool, "Coverage": cov}
        for filtering in filtering_methods:
            bed_file = os.path.join(eccDNA_base, filtering, tool, f"cov{cov}_{tool}.bed")
            # A missing BED file is recorded as 0 rather than raising.
            if os.path.exists(bed_file):
                with open(bed_file, "r") as f:
                    # Counts every line of the file, including any blank or header lines.
                    row_count = sum(1 for line in f)
            else:
                row_count = 0
            # Column name is the capitalized filtering level, e.g. "Filter-split".
            row_counts[filtering.capitalize()] = row_count
        data.append(row_counts)

df = pd.DataFrame(data)
# Creating this nested folder also creates results/eccDNA/insilico, where the CSV is written.
os.makedirs("results/eccDNA/insilico/filter/statistics", exist_ok=True)
df.to_csv("results/eccDNA/insilico/eccDNA_bed_counts.csv", index=False)

In [None]:
# Count the lines of every circRNA BED file per tool, coverage and filtering
# level, and save the table as a CSV.
circRNA_base = "data/insilico/circRNA"

data = []
# Iterate over the circRNA tools: the eccDNA tool list names folders that do
# not exist under circRNA_base (except CIRCexplorer2), which yields all-zero rows.
for tool in circRNA_tools:
    for cov in coverages:
        # One output row per (tool, coverage); one column per filtering level.
        row_counts = {"Tool": tool, "Coverage": cov}
        for filtering in filtering_methods:
            bed_file = os.path.join(circRNA_base, filtering, tool, f"cov{cov}_{tool}.bed")
            # A missing BED file is recorded as 0 rather than raising.
            if os.path.exists(bed_file):
                with open(bed_file, "r") as f:
                    # Counts every line of the file, including any blank or header lines.
                    row_count = sum(1 for line in f)
            else:
                row_count = 0
            # Column name is the capitalized filtering level, e.g. "Filter-split".
            row_counts[filtering.capitalize()] = row_count
        data.append(row_counts)

df = pd.DataFrame(data)
# Make sure the target folder exists, as the eccDNA counterpart of this cell does.
os.makedirs("results/circRNA/insilico", exist_ok=True)
df.to_csv("results/circRNA/insilico/circRNA_bed_counts.csv", index=False)

In [None]:
# For the metrics export, eccDNA_bed is rebound to the data directory (not the
# reference BED file) and eccDNA_dir to the in silico results root.
eccDNA_bed = "data/insilico/eccDNA/"
eccDNA_dir = "results/eccDNA/insilico"
# Path with an .xlsx extension, passed as the last argument below.
eccDNA_output='results/eccDNA/insilico/analysis_results.xlsx'

process_and_save_metrics(eccDNA_bed, eccDNA_dir, eccDNA_tools, coverages, filtering_methods, eccDNA_output)

In [None]:
# For the metrics export, circRNA_bed is rebound to the data directory (not the
# reference BED file) and circRNA_dir to the in silico results root.
circRNA_bed = "data/insilico/circRNA/"
circRNA_dir = "results/circRNA/insilico"
# Path with an .xlsx extension, passed as the last argument below.
circRNA_output='results/circRNA/insilico/analysis_results.xlsx'

process_and_save_metrics(circRNA_bed, circRNA_dir, circRNA_tools, coverages, filtering_methods, circRNA_output)

### Coordinates precision

In [None]:
# Define paths and parameters
# The *_bed variables are set back to the reference BED files (the Save metrics
# cells had rebound them to directories); *_dir points at the "filter" level.
eccDNA_bed = 'data/insilico/eccDNA/eccDNA.bed'
eccDNA_dir = 'data/insilico/eccDNA/filter'
eccDNA_output = 'results/eccDNA/insilico/filter/coordinates_precision'

circRNA_bed = 'data/insilico/circRNA/circRNA.bed'
circRNA_dir = 'data/insilico/circRNA/filter'
circRNA_output = 'results/circRNA/insilico/filter/coordinates_precision'

In [None]:
# Plot coordinate precision (functions.coordinates_precision) for the eccDNA tools at every coverage.
plot_coordinates_precision(eccDNA_tools, coverages, eccDNA_bed, eccDNA_dir, eccDNA_output, circle_type='eccDNA')

In [None]:
# Plot coordinate precision (functions.coordinates_precision) for the circRNA tools at every coverage.
plot_coordinates_precision(circRNA_tools, coverages, circRNA_bed, circRNA_dir, circRNA_output, circle_type = 'circRNA')

### Length distributions

In [None]:
# Rebind eccDNA_output to the length-distribution folder; eccDNA_bed and
# eccDNA_dir keep the values set in the Coordinates precision section.
eccDNA_output = 'results/eccDNA/insilico/filter/length_distributions'
plot_length_distributions(eccDNA_bed, eccDNA_dir, eccDNA_tools, eccDNA_output, circle_type='eccDNA')

In [None]:
# Run distribution_comparison_ks_test (functions.lengths) with the same inputs and output folder as the plot above.
distribution_comparison_ks_test(eccDNA_bed, eccDNA_dir, eccDNA_tools, eccDNA_output, circle_type='eccDNA')

In [None]:
# Rebind circRNA_output to the length-distribution folder.
circRNA_output = 'results/circRNA/insilico/filter/length_distributions'
# min_length / max_length are passed only for circRNA; the eccDNA call relies on the function defaults.
plot_length_distributions(circRNA_bed, circRNA_dir, circRNA_tools, circRNA_output, circle_type='circRNA', min_length=240, max_length=480)

In [None]:
# Run distribution_comparison_ks_test (functions.lengths) with the same inputs and output folder as the plot above.
distribution_comparison_ks_test(circRNA_bed, circRNA_dir, circRNA_tools, circRNA_output, circle_type='circRNA')

## Repeat Elements

In [None]:
# Define paths and parameters
repeats_file = 'repeatmasker/data/repeat_elements_hg38.txt'
eccDNA_base = "results/eccDNA/insilico/filter"
# Ends with '/', so later cells that append '/<name>' build paths with a doubled separator.
eccDNA_output = f'{eccDNA_base}/repeat_elements/'
# Only this coverage level is used for the element annotations.
coverage = 'cov30'

# Generate true positive, false negative, and false positive paths
# Each list starts with the reference BED (eccDNA_bed) followed by one file per tool.
bed_files_true = [eccDNA_bed] + [f'{eccDNA_base}/truepositives/{tool}/{coverage}_{tool}.bed' for tool in eccDNA_tools]
bed_files_false_neg = [eccDNA_bed] + [f'{eccDNA_base}/falsenegatives/{tool}/{coverage}_{tool}.bed' for tool in eccDNA_tools]
bed_files_false_pos = [eccDNA_bed] + [f'{eccDNA_base}/falsepositives/{tool}/{coverage}_{tool}.bed' for tool in eccDNA_tools]

# Annotate BED files for true positives, false negatives, and false positives
annotate_repeat_elements(repeats_file, bed_files_true, eccDNA_output, eccDNA_tools, true_or_false='truepositives')
annotate_repeat_elements(repeats_file, bed_files_false_neg, eccDNA_output, eccDNA_tools, true_or_false='falsenegatives')
annotate_repeat_elements(repeats_file, bed_files_false_pos, eccDNA_output, eccDNA_tools, true_or_false='falsepositives')

In [None]:
# Define tools + reference
eccDNA_all = ['eccDNA'] + eccDNA_tools

# Define categories and the suffix used in their file names
categories = ['truepositives', 'falsenegatives', 'falsepositives']
category_basenames = {'truepositives': 'TP', 'falsenegatives': 'FN', 'falsepositives': 'FP'}

# Process each category
for category in categories:
    # Explicit lookup: an unknown category raises KeyError instead of silently
    # reusing the basename left over from the previous iteration.
    basename = category_basenames[category]

    # Annotated BED files for the current category: the reference ('eccDNA')
    # first, then one file per tool, in the same order as eccDNA_all.
    eccDNA_annotated_files = [
        f'{eccDNA_output}/{category}/{name}_repeat_elements_{basename}.bed' for name in eccDNA_all
    ]

    # Output CSV file (kept in its own variable so eccDNA_output is not overwritten)
    eccDNA_csv = f'{eccDNA_output}/repeat_elements_counts_{basename}.csv'

    # Process the annotated BED files and create the CSV
    process_repeat_elements(eccDNA_annotated_files, eccDNA_all, eccDNA_csv)


In [None]:
# Per-category count CSVs written by the previous cell (TP / FN / FP).
eccDNA_tp_file = f'{eccDNA_output}/repeat_elements_counts_TP.csv'
eccDNA_fn_file = f'{eccDNA_output}/repeat_elements_counts_FN.csv'
eccDNA_fp_file = f'{eccDNA_output}/repeat_elements_counts_FP.csv'

# NOTE(review): presumably writes the repeat_elements_<stat>.csv files plotted in the next cell — confirm.
calculate_repeat_element_metrics(eccDNA_tp_file, eccDNA_fn_file, eccDNA_fp_file, eccDNA_output, eccDNA_all)

In [None]:
# List of statistics to process
# NOTE: this rebinds `stats`, replacing the six display names used by the
# detection-metric cells. Re-running one of those zip(data_files, stats) loops
# afterwards would pair only the first three files with these three names.
stats = ['precision', 'recall', 'fscore']

# Loop through each stat, generate file path, and plot metrics
for stat in stats:
    stat_file = f'{eccDNA_output}/repeat_elements_{stat}.csv'
    plot_stats_repeat_elements(stat_file, stat, eccDNA_output)

In [None]:
# Define paths and parameters
# repeats_file and coverage are reused from the eccDNA repeat-elements cell.
circRNA_base = "results/circRNA/insilico/filter"
circRNA_output = f'{circRNA_base}/repeat_elements'

# Generate true positive, false negative, and false positive paths
# Each list starts with the reference BED (circRNA_bed) followed by one file per tool.
bed_files_true = [circRNA_bed] + [f'{circRNA_base}/truepositives/{tool}/{coverage}_{tool}.bed' for tool in circRNA_tools]
bed_files_false_neg = [circRNA_bed] + [f'{circRNA_base}/falsenegatives/{tool}/{coverage}_{tool}.bed' for tool in circRNA_tools]
bed_files_false_pos = [circRNA_bed] + [f'{circRNA_base}/falsepositives/{tool}/{coverage}_{tool}.bed' for tool in circRNA_tools]

# Annotate BED files for true positives, false negatives, and false positives
annotate_repeat_elements(repeats_file, bed_files_true, circRNA_output, circRNA_tools, true_or_false='truepositives')
annotate_repeat_elements(repeats_file, bed_files_false_neg, circRNA_output, circRNA_tools, true_or_false='falsenegatives')
annotate_repeat_elements(repeats_file, bed_files_false_pos, circRNA_output, circRNA_tools, true_or_false='falsepositives')

In [None]:
# Define tools + reference
circRNA_all = ['circRNA'] + circRNA_tools

# Suffix used in the file names of each category
category_basenames = {'truepositives': 'TP', 'falsenegatives': 'FN', 'falsepositives': 'FP'}

# Process each category
for category in categories:
    # Explicit lookup: an unknown category raises KeyError instead of silently
    # reusing the basename left over from the previous iteration.
    basename = category_basenames[category]

    # Annotated BED files for the current category: the reference ('circRNA')
    # first, then one file per tool, in the same order as circRNA_all.
    circRNA_annotated_files = [
        f'{circRNA_output}/{category}/{name}_repeat_elements_{basename}.bed' for name in circRNA_all
    ]

    # Output CSV file (kept in its own variable so circRNA_output is not overwritten)
    circRNA_csv = f'{circRNA_output}/repeat_elements_counts_{basename}.csv'

    # Process the annotated BED files and create the CSV
    process_repeat_elements(circRNA_annotated_files, circRNA_all, circRNA_csv)

In [None]:
# Per-category count CSVs written by the previous cell (TP / FN / FP).
circRNA_tp_file = f'{circRNA_output}/repeat_elements_counts_TP.csv'
circRNA_fn_file = f'{circRNA_output}/repeat_elements_counts_FN.csv'
circRNA_fp_file = f'{circRNA_output}/repeat_elements_counts_FP.csv'

# NOTE(review): presumably writes the repeat_elements_<stat>.csv files plotted in the next cell — confirm.
calculate_repeat_element_metrics(circRNA_tp_file, circRNA_fn_file, circRNA_fp_file, circRNA_output, circRNA_all)

In [None]:
# Loop through each stat, generate file path, and plot metrics
# Uses `stats` as currently bound; the repeat-elements eccDNA cell sets it to
# ['precision', 'recall', 'fscore'], which matches the CSV names read here.
for stat in stats:
    stat_file = f'{circRNA_output}/repeat_elements_{stat}.csv'
    plot_stats_repeat_elements(stat_file, stat, circRNA_output)

## Genomic Elements

In [None]:
# GRCh38 GTF annotation files passed to annotate_genomic_elements below.
# These are absolute paths, unlike the relative data/ and results/ paths used elsewhere.
gene_file = "/data/database/genomes/GRCh38/genomic_genes.gtf"
exon_file = "/data/database/genomes/GRCh38/exon_genes.gtf"
other_file = "/data/database/genomes/GRCh38/genomic_annotation.gtf"

In [None]:
# Define paths and parameters
eccDNA_base = "results/eccDNA/insilico/filter"
# Ends with '/', so later cells that append '/<name>' build paths with a doubled separator.
eccDNA_output = f'{eccDNA_base}/genomic_elements/'

# Only this coverage level is used for the element annotations.
coverage = 'cov30'

# Generate true positive, false negative, and false positive paths
# Each list starts with the reference BED (eccDNA_bed) followed by one file per tool.
bed_files_true = [eccDNA_bed] + [f'{eccDNA_base}/truepositives/{tool}/{coverage}_{tool}.bed' for tool in eccDNA_tools]
bed_files_false_neg = [eccDNA_bed] + [f'{eccDNA_base}/falsenegatives/{tool}/{coverage}_{tool}.bed' for tool in eccDNA_tools]
bed_files_false_pos = [eccDNA_bed] + [f'{eccDNA_base}/falsepositives/{tool}/{coverage}_{tool}.bed' for tool in eccDNA_tools]

# Annotate BED files for true positives, false negatives, and false positives
annotate_bed_files_genomic_elements(bed_files_true, eccDNA_output, eccDNA_tools, true_or_false='truepositives')
annotate_bed_files_genomic_elements(bed_files_false_neg, eccDNA_output, eccDNA_tools, true_or_false='falsenegatives')
annotate_bed_files_genomic_elements(bed_files_false_pos, eccDNA_output, eccDNA_tools, true_or_false='falsepositives')

In [None]:
# Despite its name, output_dir holds a file path: the annotated reference BED inside eccDNA_output.
output_dir = eccDNA_output + 'eccDNA_genomic_elements.bed'
annotate_genomic_elements(eccDNA_bed, output_dir, gene_file, exon_file, other_file)

In [None]:

# Suffix used in the file names of each category
category_basenames = {'truepositives': 'TP', 'falsenegatives': 'FN', 'falsepositives': 'FP'}

for category in categories:
    # Explicit lookup: an unknown category raises KeyError instead of silently
    # reusing the basename left over from the previous iteration.
    basename = category_basenames[category]

    # Annotated BED files for the current category: the reference ('eccDNA')
    # first, then one file per tool, in the same order as eccDNA_all.
    eccDNA_annotated_files = [
        f'{eccDNA_output}/{category}/{name}_genomic_elements_{basename}.bed' for name in eccDNA_all
    ]

    eccDNA_csv = f'{eccDNA_output}/genomic_elements_counts_{basename}.csv'

    # Unlike process_repeat_elements, the CSV path is passed before the list of names here.
    process_genomic_elements(eccDNA_annotated_files, eccDNA_csv, eccDNA_all)


In [None]:
# Per-category count CSVs written by the previous cell (TP / FN / FP).
eccDNA_tp_file = f'{eccDNA_output}/genomic_elements_counts_TP.csv'
eccDNA_fn_file = f'{eccDNA_output}/genomic_elements_counts_FN.csv'
eccDNA_fp_file = f'{eccDNA_output}/genomic_elements_counts_FP.csv'

# NOTE(review): the repeat-elements counterpart is called with eccDNA_all (reference + tools),
# while this call passes eccDNA_tools only — confirm the difference is intended.
calculate_genomic_element_metrics(eccDNA_tp_file, eccDNA_fn_file, eccDNA_fp_file, eccDNA_output, eccDNA_tools)

In [None]:
# Loop through each stat, generate file path, and plot metrics
# Uses `stats` as currently bound; the Repeat Elements section sets it to
# ['precision', 'recall', 'fscore'], which matches the CSV names read here.
for stat in stats:
    stat_file = f'{eccDNA_output}/genomic_elements_{stat}.csv'
    plot_stats_genomic_elements(stat_file, stat, eccDNA_output)

In [None]:
# Define paths and parameters
circRNA_base = "results/circRNA/insilico/filter"
# Ends with '/', so later cells that append '/<name>' build paths with a doubled separator.
circRNA_output = f'{circRNA_base}/genomic_elements/'

# Generate true positive, false negative, and false positive paths
# Each list starts with the reference BED (circRNA_bed) followed by one file per tool;
# `coverage` is reused from the eccDNA cell.
bed_files_true = [circRNA_bed] + [f'{circRNA_base}/truepositives/{tool}/{coverage}_{tool}.bed' for tool in circRNA_tools]
bed_files_false_neg = [circRNA_bed] + [f'{circRNA_base}/falsenegatives/{tool}/{coverage}_{tool}.bed' for tool in circRNA_tools]
bed_files_false_pos = [circRNA_bed] + [f'{circRNA_base}/falsepositives/{tool}/{coverage}_{tool}.bed' for tool in circRNA_tools]

# Annotate BED files for true positives, false negatives, and false positives
annotate_bed_files_genomic_elements(bed_files_true, circRNA_output, circRNA_tools, true_or_false='truepositives')
annotate_bed_files_genomic_elements(bed_files_false_neg, circRNA_output, circRNA_tools, true_or_false='falsenegatives')
annotate_bed_files_genomic_elements(bed_files_false_pos, circRNA_output, circRNA_tools, true_or_false='falsepositives')

In [None]:
# Despite its name, output_dir holds a file path: the annotated reference BED inside circRNA_output.
output_dir = circRNA_output + 'circRNA_genomic_elements.bed'
annotate_genomic_elements(circRNA_bed, output_dir, gene_file, exon_file, other_file)

In [None]:
# Suffix used in the file names of each category
category_basenames = {'truepositives': 'TP', 'falsenegatives': 'FN', 'falsepositives': 'FP'}

for category in categories:
    # Explicit lookup: an unknown category raises KeyError instead of silently
    # reusing the basename left over from the previous iteration.
    basename = category_basenames[category]

    # Annotated BED files for the current category: the reference ('circRNA')
    # first, then one file per tool, in the same order as circRNA_all.
    circRNA_annotated_files = [
        f'{circRNA_output}/{category}/{name}_genomic_elements_{basename}.bed' for name in circRNA_all
    ]

    circRNA_csv = f'{circRNA_output}/genomic_elements_counts_{basename}.csv'

    # Unlike process_repeat_elements, the CSV path is passed before the list of names here.
    process_genomic_elements(circRNA_annotated_files, circRNA_csv, circRNA_all)

In [None]:
# Per-category count CSVs written by the previous cell (TP / FN / FP).
circRNA_tp_file = f'{circRNA_output}/genomic_elements_counts_TP.csv'
circRNA_fn_file = f'{circRNA_output}/genomic_elements_counts_FN.csv'
circRNA_fp_file = f'{circRNA_output}/genomic_elements_counts_FP.csv'

# NOTE(review): the repeat-elements counterpart is called with circRNA_all (reference + tools),
# while this call passes circRNA_tools only — confirm the difference is intended.
calculate_genomic_element_metrics(circRNA_tp_file, circRNA_fn_file, circRNA_fp_file, circRNA_output, circRNA_tools)

In [None]:
# Loop through each stat, generate file path, and plot metrics
# Uses `stats` as currently bound; the Repeat Elements section sets it to
# ['precision', 'recall', 'fscore'], which matches the CSV names read here.
for stat in stats:
    stat_file = f'{circRNA_output}/genomic_elements_{stat}.csv'
    plot_stats_genomic_elements(stat_file, stat, circRNA_output)

## Combination analysis

In [None]:
# Run the eccDNA combination processing (functions.combinations) once per filtering level.
for filtering in filtering_methods:
    process_eccDNA_filtering(filtering, eccDNA_bed, eccDNA_tools)

In [None]:
# Run the circRNA combination processing (functions.combinations) once per filtering level.
for filtering in filtering_methods:
    process_circRNA_filtering(filtering, circRNA_bed, circRNA_tools)

In [None]:
# Rebind eccDNA_dir to the in silico results root and plot the combinations found there.
eccDNA_dir = "results/eccDNA/insilico"
plot_combinations(eccDNA_dir)

In [None]:
# Rebind circRNA_dir to the in silico results root and plot the combinations found there.
circRNA_dir = "results/circRNA/insilico"
plot_combinations(circRNA_dir)

In [None]:
# Relies on eccDNA_dir still being the in silico results root set in the plot_combinations cell.
compare_combinations(eccDNA_dir)

In [None]:
# Relies on circRNA_dir still being the in silico results root set in the plot_combinations cell.
compare_combinations(circRNA_dir)

### CJ validation

In [None]:
# Path of the eccDNA true-positive matrix CSV, consumed by the following cells.
eccDNA_matrix = 'results/eccDNA/insilico/filter/truepositives/truepositives_matrix.csv'
# NOTE(review): the true-positive matrix is produced with matrix_fp (functions.validation) while
# the false-positive matrix below uses create_circle_presence_matrix — confirm this pairing is intended.
matrix_fp(eccDNA_bed, eccDNA_tools, eccDNA_matrix, 'eccDNA')

In [None]:
# BAM file for the cov30 in silico eccDNA sample and the CSV path for the read-level table.
eccDNA_bam = 'output/eccDNA/insilico/bam/sorted_unknown_DNA_cov30.bam'
eccDNA_reads = 'results/eccDNA/insilico/filter/truepositives/truepositives_reads.csv'
# NOTE(review): presumably reads the matrix and BAM and writes eccDNA_reads — confirm in functions.ratio.
process_circle_matrix(eccDNA_matrix, eccDNA_bam, eccDNA_reads, N_offset=20)

In [None]:
eccDNA_output = 'results/eccDNA/insilico/filter/truepositives/'
# `filtering` is a list here (it is a loop variable in other cells); the
# true-positive run includes an extra "Simulated" entry.
filtering = ["Simulated", "unfilter" ,"filter-split", "filter-duplicates", "filter"]
circle_diff(eccDNA_matrix, eccDNA_reads, filtering, eccDNA_output, group='truepositives')

In [None]:
# Rebind eccDNA_matrix to the false-positive matrix CSV; the following cells use this path.
eccDNA_matrix = 'results/eccDNA/insilico/filter/falsepositives/falsepositives_matrix.csv'
create_circle_presence_matrix(eccDNA_bed, eccDNA_tools, eccDNA_matrix, circular_type='eccDNA')

In [None]:
# Same BAM as for the true positives; the read-level table goes to the falsepositives folder.
eccDNA_bam = 'output/eccDNA/insilico/bam/sorted_unknown_DNA_cov30.bam'
eccDNA_reads = 'results/eccDNA/insilico/filter/falsepositives/falsepositives_reads.csv'
process_circle_matrix(eccDNA_matrix, eccDNA_bam, eccDNA_reads, N_offset=20)

In [None]:
eccDNA_output = 'results/eccDNA/insilico/filter/falsepositives/'
# Unlike the true-positive run, the list has no "Simulated" entry.
filtering = ["unfilter" ,"filter-split", "filter-duplicates", "filter"]
circle_diff(eccDNA_matrix, eccDNA_reads, filtering, eccDNA_output, group='falsepositives')

In [None]:
# Path of the circRNA true-positive matrix CSV, consumed by the following cells.
circRNA_matrix = 'results/circRNA/insilico/filter/truepositives/truepositives_matrix.csv'
# NOTE(review): the true-positive matrix is produced with matrix_fp (functions.validation) while
# the false-positive matrix below uses create_circle_presence_matrix — confirm this pairing is intended.
matrix_fp(circRNA_bed, circRNA_tools, circRNA_matrix, 'circRNA')

In [None]:
# BAM file for the cov30 in silico circRNA sample and the CSV path for the read-level table.
circRNA_bam = 'output/circRNA/insilico/bam/circRNA_cov30.sorted.bam'
circRNA_reads = 'results/circRNA/insilico/filter/truepositives/truepositives_reads.csv'
# Differs from the eccDNA call: use_chr_prefix=True is passed and N_offset is left at its default.
process_circle_matrix(circRNA_matrix, circRNA_bam, circRNA_reads, use_chr_prefix=True)

In [None]:
circRNA_output = 'results/circRNA/insilico/filter/truepositives/'
# `filtering` is a list here (it is a loop variable in other cells); the
# true-positive run includes an extra "Simulated" entry.
filtering = ["Simulated", "unfilter" ,"filter-split", "filter-duplicates", "filter"]
circle_diff(circRNA_matrix, circRNA_reads, filtering, circRNA_output, group='truepositives')

In [None]:
# Rebind circRNA_matrix to the false-positive matrix CSV; the following cells use this path.
circRNA_matrix = 'results/circRNA/insilico/filter/falsepositives/falsepositives_matrix.csv'
create_circle_presence_matrix(circRNA_bed, circRNA_tools, circRNA_matrix, circular_type='circRNA')

In [None]:
# Same BAM as for the true positives; the read-level table goes to the falsepositives folder.
circRNA_bam = 'output/circRNA/insilico/bam/circRNA_cov30.sorted.bam'
circRNA_reads = 'results/circRNA/insilico/filter/falsepositives/falsepositives_reads.csv'
process_circle_matrix(circRNA_matrix, circRNA_bam, circRNA_reads, use_chr_prefix=True)

In [None]:
circRNA_output = 'results/circRNA/insilico/filter/falsepositives/'
# Unlike the true-positive run, the list has no "Simulated" entry.
filtering = ["unfilter" ,"filter-split", "filter-duplicates", "filter"]
circle_diff(circRNA_matrix, circRNA_reads, filtering, circRNA_output, group='falsepositives')

# 2. Real

## Metrics

In [None]:
# Run matrix_real for the ATAC-seq sample at every filtering level.
# eccDNA_dir / eccDNA_output are rebound on each iteration and keep the values
# of the last filtering level after the loop.
# NOTE(review): presumably writes the matrix.csv read by the upset-plot cells — confirm.
sample = "ATAC-seq"
for filtering in filtering_methods:
    eccDNA_dir = f"data/real/eccDNA/{filtering}"
    eccDNA_output = f"results/eccDNA/real/{filtering}/{sample}"
    matrix_real(eccDNA_dir, eccDNA_tools, sample, eccDNA_output)

In [None]:
# Run matrix_real for the Circle-Seq sample at every filtering level.
# eccDNA_dir / eccDNA_output are rebound on each iteration and keep the values
# of the last filtering level after the loop.
sample = "Circle-Seq"
for filtering in filtering_methods:
    eccDNA_dir = f"data/real/eccDNA/{filtering}"
    eccDNA_output = f"results/eccDNA/real/{filtering}/{sample}"
    matrix_real(eccDNA_dir, eccDNA_tools, sample, eccDNA_output)


In [None]:
# Run matrix_real for the CNT sample at every filtering level.
# circRNA_dir / circRNA_output are rebound on each iteration and keep the values
# of the last filtering level after the loop.
sample = "CNT"
for filtering in filtering_methods:
    circRNA_dir = f"data/real/circRNA/{filtering}"
    circRNA_output = f"results/circRNA/real/{filtering}/{sample}"
    matrix_real(circRNA_dir, circRNA_tools, sample, circRNA_output)

In [None]:
# Run matrix_real for the RNASE sample at every filtering level.
# circRNA_dir / circRNA_output are rebound on each iteration and keep the values
# of the last filtering level after the loop.
sample = "RNASE"
for filtering in filtering_methods:
    circRNA_dir = f"data/real/circRNA/{filtering}"
    circRNA_output = f"results/circRNA/real/{filtering}/{sample}"
    matrix_real(circRNA_dir, circRNA_tools, sample, circRNA_output)

In [None]:
# UpSet plot of the Circle-Seq matrix at every filtering level.
# `data` holds the sample name here (it is a list of rows in the count cells);
# matrix_dir is a CSV file path despite its name.
data = 'Circle-Seq'
for filtering in filtering_methods:
    matrix_dir = f'results/eccDNA/real/{filtering}/{data}/matrix.csv'
    output_dir = f'results/eccDNA/real/{filtering}/{data}/'
    upset_plot(matrix_dir, output_dir, eccDNA_tools)

In [None]:
# UpSet plot of the ATAC-seq matrix at every filtering level.
# `data` holds the sample name; matrix_dir is a CSV file path despite its name.
data = 'ATAC-seq'
for filtering in filtering_methods:
    matrix_dir = f'results/eccDNA/real/{filtering}/{data}/matrix.csv'
    output_dir = f'results/eccDNA/real/{filtering}/{data}/'
    upset_plot(matrix_dir, output_dir, eccDNA_tools)

In [None]:
# UpSet plot of the CNT matrix at every filtering level.
# `data` holds the sample name; matrix_dir is a CSV file path despite its name.
data = 'CNT'
for filtering in filtering_methods:
    matrix_dir = f'results/circRNA/real/{filtering}/{data}/matrix.csv'
    output_dir = f'results/circRNA/real/{filtering}/{data}/'
    upset_plot(matrix_dir, output_dir, circRNA_tools)

In [None]:
# UpSet plot of the RNASE matrix at every filtering level.
# `data` holds the sample name; matrix_dir is a CSV file path despite its name.
data = 'RNASE'
for filtering in filtering_methods:
    matrix_dir = f'results/circRNA/real/{filtering}/{data}/matrix.csv'
    output_dir = f'results/circRNA/real/{filtering}/{data}/'
    upset_plot(matrix_dir, output_dir, circRNA_tools)

In [None]:
# Count the lines of every real-data eccDNA BED file per tool and filtering
# level, writing one CSV per sample.
samples = ['Circle-Seq', "ATAC-seq"]
eccDNA_base = "data/real/eccDNA"

for sample in samples:
    # Rows are collected per sample, so each CSV only holds that sample's counts.
    data = []
    for tool in eccDNA_tools:
        row_counts = {"Tool": tool}
        for filtering in filtering_methods:
            bed_file = os.path.join(eccDNA_base, filtering, tool, f"{sample}.{tool}.bed")
            # A missing BED file is recorded as 0 rather than raising.
            if os.path.exists(bed_file):
                with open(bed_file, "r") as f:
                    # Counts every line of the file, including any blank or header lines.
                    row_count = sum(1 for line in f)
            else:
                row_count = 0
            row_counts[filtering.capitalize()] = row_count
        data.append(row_counts)

    df = pd.DataFrame(data)
    df.to_csv(f"results/eccDNA/real/{sample}_counts.csv", index=False)

In [None]:
# Count the lines of every real-data circRNA BED file per tool and filtering
# level, writing one CSV per sample.
samples = ['CNT', "RNASE"]
circRNA_base = "data/real/circRNA"

for sample in samples:
    # Rows are collected per sample, so each CSV only holds that sample's counts.
    data = []
    for tool in circRNA_tools:
        row_counts = {"Tool": tool}
        for filtering in filtering_methods:
            bed_file = os.path.join(circRNA_base, filtering, tool, f"{sample}.{tool}.bed")
            # A missing BED file is recorded as 0 rather than raising.
            if os.path.exists(bed_file):
                with open(bed_file, "r") as f:
                    # Counts every line of the file, including any blank or header lines.
                    row_count = sum(1 for line in f)
            else:
                row_count = 0
            row_counts[filtering.capitalize()] = row_count
        data.append(row_counts)

    df = pd.DataFrame(data)
    df.to_csv(f"results/circRNA/real/{sample}_counts.csv", index=False)

In [None]:
# Run process_circle_matrix on the Circle-Seq matrix of every filtering level.
# The BAM path does not depend on the filtering level, so it is a plain string
# rather than a placeholder-less f-string. use_chr_prefix is not passed here
# (function default), unlike in the ATAC-seq and circRNA calls.
for filtering in filtering_methods:
    process_circle_matrix(
        f"results/eccDNA/real/{filtering}/Circle-Seq/matrix.csv",
        "output/eccDNA/real/Circle-Seq/bwa/CS_sorted_unknown_circle.bam",
        f"results/eccDNA/real/{filtering}/Circle-Seq/matrix_with_reads.csv",
        N_offset=20,
        verbose=False
    )

In [None]:
# Run process_circle_matrix on the ATAC-seq matrix of every filtering level.
# The BAM path does not depend on the filtering level, so it is a plain string
# rather than a placeholder-less f-string.
for filtering in filtering_methods:
    process_circle_matrix(
        f"results/eccDNA/real/{filtering}/ATAC-seq/matrix.csv",
        "output/eccDNA/real/ATAC-seq/bwa/AS_sorted_unknown_circle.bam",
        f"results/eccDNA/real/{filtering}/ATAC-seq/matrix_with_reads.csv",
        use_chr_prefix=True,
        N_offset=20,
        verbose=False
    )

In [None]:
# Run process_circle_matrix on the CNT matrix of every filtering level.
# The BAM path does not depend on the filtering level, so it is a plain string
# rather than a placeholder-less f-string.
for filtering in filtering_methods:
    process_circle_matrix(
        f"results/circRNA/real/{filtering}/CNT/matrix.csv",
        "output/circRNA/real/star/CNT.Aligned.sorted.out.bam",
        f"results/circRNA/real/{filtering}/CNT/matrix_with_reads.csv",
        use_chr_prefix=True,
        N_offset=20,
        verbose=False
    )

In [None]:
# Run process_circle_matrix on the RNASE matrix of every filtering level.
# The BAM path does not depend on the filtering level, so it is a plain string
# rather than a placeholder-less f-string.
for filtering in filtering_methods:
    process_circle_matrix(
        f"results/circRNA/real/{filtering}/RNASE/matrix.csv",
        "output/circRNA/real/star/RNASE.Aligned.sorted.out.bam",
        f"results/circRNA/real/{filtering}/RNASE/matrix_with_reads.csv",
        use_chr_prefix=True,
        N_offset=20,
        verbose=False
    )

In [None]:
# eccDNA_matrix is rebound to the real-data results root (a directory, not a matrix CSV).
eccDNA_matrix = "results/eccDNA/real"
# Same values as the list declared in the Save metrics section, re-declared here.
filtering_methods = ["unfilter", "filter-split", "filter-duplicates", "filter"]
# `data` holds the sample name.
data = 'Circle-Seq'

circle_diff_real(eccDNA_matrix, eccDNA_tools, filtering_methods, data)

In [None]:
# Same call as for Circle-Seq, with `data` set to the ATAC-seq sample name.
data = 'ATAC-seq'
eccDNA_matrix = "results/eccDNA/real"
circle_diff_real(eccDNA_matrix, eccDNA_tools, filtering_methods, data)

In [None]:
# circRNA real-data results root; the eccDNA cells call the same argument eccDNA_matrix.
matrix_dir = "results/circRNA/real"
# `data` holds the sample name.
data = 'CNT'

circle_diff_real(matrix_dir, circRNA_tools, filtering_methods, data)

In [None]:
# Relies on matrix_dir still being "results/circRNA/real" from the CNT cell.
data = 'RNASE'

circle_diff_real(matrix_dir, circRNA_tools, filtering_methods, data)

### Combining Methods

In [None]:
# List of directories
# One unfiltered output folder per eccDNA tool.
eccDNA_directories = [
    "data/real/eccDNA/unfilter/CIRCexplorer2",
    "data/real/eccDNA/unfilter/Circle_finder",
    "data/real/eccDNA/unfilter/Circle-Map",
    "data/real/eccDNA/unfilter/ecc_finder-bwa",
    "data/real/eccDNA/unfilter/ecc_finder-minimap2"
]
# Call the merge function once per sample: Circle-Seq, then ATAC-seq
merge_files(eccDNA_directories, data='eccDNA', file_type="Circle-Seq")
merge_files(eccDNA_directories, data='eccDNA', file_type="ATAC-seq")

In [None]:
# List of directories
# One unfiltered output folder per circRNA tool.
circRNA_directories = [
    "data/real/circRNA/unfilter/CIRCexplorer2",
    "data/real/circRNA/unfilter/circRNA_finder",
    "data/real/circRNA/unfilter/CIRIquant",
    "data/real/circRNA/unfilter/find_circ",
    "data/real/circRNA/unfilter/segemehl"   
]
# Call the function to merge the CNT files
merge_files(circRNA_directories, data='circRNA', file_type="CNT")
# Call the function to merge the RNASE files
merge_files(circRNA_directories, data='circRNA', file_type="RNASE")

In [None]:
# Same function as in the per-filtering cells, but the first argument is a BED
# file rather than a matrix.csv.
# NOTE(review): presumably the merged file produced by merge_files — confirm.
process_circle_matrix(
        "results/eccDNA/real/Circle-Seq/Circle-Seq.bed",
        "output/eccDNA/real/Circle-Seq/bwa/CS_sorted_unknown_circle.bam",
        "results/eccDNA/real/Circle-Seq/all_reads.csv",
        use_chr_prefix=False,
        N_offset=20, 
        verbose=False
    )

In [None]:
# Same function as in the per-filtering cells, but the first argument is a BED
# file rather than a matrix.csv.
# NOTE(review): presumably the merged file produced by merge_files — confirm.
process_circle_matrix(
        "results/eccDNA/real/ATAC-seq/ATAC-seq.bed",
        "output/eccDNA/real/ATAC-seq/bwa/AS_sorted_unknown_circle.bam",
        "results/eccDNA/real/ATAC-seq/all_reads.csv",
        use_chr_prefix=True,
        N_offset=20, 
        verbose=False
    )

In [None]:
# Same function as in the per-filtering cells, but the first argument is a BED
# file rather than a matrix.csv.
# NOTE(review): presumably the merged file produced by merge_files — confirm.
process_circle_matrix(
    "results/circRNA/real/CNT/CNT.bed",
    "output/circRNA/real/star/CNT.Aligned.sorted.out.bam",
    "results/circRNA/real/CNT/all_reads.csv",
    use_chr_prefix=True,
    N_offset=20, 
    verbose=False
)

In [None]:
# Same function as in the per-filtering cells, but the first argument is a BED
# file rather than a matrix.csv.
# NOTE(review): presumably the merged file produced by merge_files — confirm.
process_circle_matrix(
    "results/circRNA/real/RNASE/RNASE.bed",
    "output/circRNA/real/star/RNASE.Aligned.sorted.out.bam",
    "results/circRNA/real/RNASE/all_reads.csv",
    use_chr_prefix=True,
    N_offset=20, 
    verbose=False
)

In [None]:
# Example usage
ros_combinations_mapping = {
    "CE_CM_CF": "CE_CM_CE_CF_CM_CF",
    "CE_CM_EB": "CE_CM_CE_EB_CM_EB",
    "CE_CM_EM": "CE_CM_CE_EM_CM_EM",
    "CE_CF_EB": "CE_CF_CE_EB_CF_EB",
    "CE_CF_EM": "CE_CF_CE_EM_CF_EM",
    "CE_EB_EM": "CE_EB_CE_EM_EB_EM",
    "CM_CF_EB": "CM_CF_CM_EB_CF_EB",
    "CM_CF_EM": "CM_CF_CM_EM_CF_EM",
    "CM_EB_EM": "CM_EB_CM_EM_EB_EM",
    "CF_EB_EM": "CF_EB_CF_EM_EB_EM",
    "CE_CM_CF_EB": "CE_CM_CE_CF_CE_EB_CM_CF_CM_EM_CF_EB",
    "CE_CM_CF_EM": "CE_CM_CE_CF_CE_EM_CM_CF_CM_EM_CF_EM",
    "CE_CM_EB_EM": "CE_CM_CE_EB_CE_EM_CM_EB_CM_EM_EB_EM",
    "CE_CF_EB_EM": "CE_CF_CE_EB_CE_EM_CF_EB_CF_EM_EB_EM",
    "CM_CF_EB_EM": "CM_CF_CM_EB_CM_EM_CF_EB_CF_EM_EB_EM",
    "CE_CM_CF_EB_EM": "CE_CM_CE_CF_CE_EB_CE_EM_CM_CF_CM_EB_CM_EM_CF_EB_CF_EM_EB_EM"
}

# Tool name to abbreviation mapping
eccDNA_tools_abbreviations = {
    'CIRCexplorer2': 'CE',
    'Circle-Map': 'CM',
    'Circle_finder': 'CF',
    'ecc_finder-bwa': 'EB',
    'ecc_finder-minimap2': 'EM',
    'segemehl': 'SE',
}

In [None]:
# Run analyze_and_save_circles on the eccDNA Circle-Seq matrices of every filtering method.
# The loop variable is deliberately not named `filter`: at notebook scope that
# would overwrite the builtin of the same name for every later cell.
for filtering_method in filtering_methods:
    output_base = f"results/eccDNA/real/{filtering_method}/Circle-Seq"  # also passed as the last argument
    matrix_path = f"{output_base}/matrix.csv"
    extra_matrix_path = f"{output_base}/matrix_with_reads.csv"
    analyze_and_save_circles(matrix_path, extra_matrix_path, ros_combinations_mapping, eccDNA_tools_abbreviations, output_base)

In [None]:
# Run analyze_and_save_circles on the eccDNA ATAC-seq matrices of every filtering method.
# The loop variable is deliberately not named `filter`: at notebook scope that
# would overwrite the builtin of the same name for every later cell.
for filtering_method in filtering_methods:
    output_base = f"results/eccDNA/real/{filtering_method}/ATAC-seq"  # also passed as the last argument
    matrix_path = f"{output_base}/matrix.csv"
    extra_matrix_path = f"{output_base}/matrix_with_reads.csv"
    analyze_and_save_circles(matrix_path, extra_matrix_path, ros_combinations_mapping, eccDNA_tools_abbreviations, output_base)

In [None]:
# Rosette combination mapping adapted to the circRNA tools.
#   key:   the tools of the combination (3, 4 or 5 abbreviations joined by "_")
#   value: every unordered pair of those tools, joined by "_" in the same tool order
# Another cell of this notebook defines the same circRNA mapping again; keep
# both copies identical.
ros_combinations_mapping = {
    "CE_CF_CQ": "CE_CF_CE_CQ_CF_CQ",
    "CE_CF_FC": "CE_CF_CE_FC_CF_FC",
    "CE_CF_SE": "CE_CF_CE_SE_CF_SE",
    "CE_CQ_FC": "CE_CQ_CE_FC_CQ_FC",
    "CE_CQ_SE": "CE_CQ_CE_SE_CQ_SE",
    "CE_FC_SE": "CE_FC_CE_SE_FC_SE",
    "CF_CQ_FC": "CF_CQ_CF_FC_CQ_FC",
    "CF_CQ_SE": "CF_CQ_CF_SE_CQ_SE",
    "CF_FC_SE": "CF_FC_CF_SE_FC_SE",
    "CQ_FC_SE": "CQ_FC_CQ_SE_FC_SE",
    # Fixed: the CF/FC pair was written as "CF_SE", a tool outside this combination.
    "CE_CF_CQ_FC": "CE_CF_CE_CQ_CE_FC_CF_CQ_CF_FC_CQ_FC",
    "CE_CF_CQ_SE": "CE_CF_CE_CQ_CE_SE_CF_CQ_CF_SE_CQ_SE",
    "CE_CF_FC_SE": "CE_CF_CE_FC_CE_SE_CF_FC_CF_SE_FC_SE",
    "CE_CQ_FC_SE": "CE_CQ_CE_FC_CE_SE_CQ_FC_CQ_SE_FC_SE",
    "CF_CQ_FC_SE": "CF_CQ_CF_FC_CF_SE_CQ_FC_CQ_SE_FC_SE",
    "CE_CF_CQ_FC_SE": "CE_CF_CE_CQ_CE_FC_CE_SE_CF_CQ_CF_FC_CF_SE_CQ_FC_CQ_SE_FC_SE"
}
# Tool name to abbreviation mapping
circRNA_tools_abbreviations = {
    'CIRCexplorer2': 'CE',
    'circRNA_finder': 'CF',
    'CIRIquant': 'CQ',
    'find_circ': 'FC',
    'segemehl': 'SE',
}

# Run analyze_and_save_circles on the circRNA CNT matrices of every filtering method.
# The loop variable is deliberately not named `filter`: at notebook scope that
# would overwrite the builtin of the same name for every later cell.
for filtering_method in filtering_methods:
    output_base = f"results/circRNA/real/{filtering_method}/CNT"  # also passed as the last argument
    matrix_path = f"{output_base}/matrix.csv"
    extra_matrix_path = f"{output_base}/matrix_with_reads.csv"
    analyze_and_save_circles(matrix_path, extra_matrix_path, ros_combinations_mapping, circRNA_tools_abbreviations, output_base)

In [None]:
# Run analyze_and_save_circles on the circRNA RNASE matrices of every filtering method.
# The loop variable is deliberately not named `filter`: at notebook scope that
# would overwrite the builtin of the same name for every later cell.
for filtering_method in filtering_methods:
    output_base = f"results/circRNA/real/{filtering_method}/RNASE"  # also passed as the last argument
    matrix_path = f"{output_base}/matrix.csv"
    extra_matrix_path = f"{output_base}/matrix_with_reads.csv"
    analyze_and_save_circles(matrix_path, extra_matrix_path, ros_combinations_mapping, circRNA_tools_abbreviations, output_base)

In [None]:
# Rosette combination mapping for the eccDNA tools, passed to diff_cj_combinations.
#   key:   the tools of the combination (3, 4 or 5 abbreviations joined by "_")
#   value: every unordered pair of those tools, joined by "_" in the same tool order
# Another cell of this notebook defines the same eccDNA mapping; keep both
# copies identical.
ros_combinations_mapping = {
    "CE_CM_CF": "CE_CM_CE_CF_CM_CF",
    "CE_CM_EB": "CE_CM_CE_EB_CM_EB",
    "CE_CM_EM": "CE_CM_CE_EM_CM_EM",
    "CE_CF_EB": "CE_CF_CE_EB_CF_EB",
    "CE_CF_EM": "CE_CF_CE_EM_CF_EM",
    "CE_EB_EM": "CE_EB_CE_EM_EB_EM",
    "CM_CF_EB": "CM_CF_CM_EB_CF_EB",
    "CM_CF_EM": "CM_CF_CM_EM_CF_EM",
    "CM_EB_EM": "CM_EB_CM_EM_EB_EM",
    "CF_EB_EM": "CF_EB_CF_EM_EB_EM",
    # Fixed: the CM/EB pair was written as "CM_EM", a tool outside this combination.
    "CE_CM_CF_EB": "CE_CM_CE_CF_CE_EB_CM_CF_CM_EB_CF_EB",
    "CE_CM_CF_EM": "CE_CM_CE_CF_CE_EM_CM_CF_CM_EM_CF_EM",
    "CE_CM_EB_EM": "CE_CM_CE_EB_CE_EM_CM_EB_CM_EM_EB_EM",
    "CE_CF_EB_EM": "CE_CF_CE_EB_CE_EM_CF_EB_CF_EM_EB_EM",
    "CM_CF_EB_EM": "CM_CF_CM_EB_CM_EM_CF_EB_CF_EM_EB_EM",
    "CE_CM_CF_EB_EM": "CE_CM_CE_CF_CE_EB_CE_EM_CM_CF_CM_EB_CM_EM_CF_EB_CF_EM_EB_EM"
}

# Names of the combining methods passed to diff_cj_combinations
combining_method = ['rosette', 'union', 'intersect', 'double', 'unique']

In [None]:
# Run diff_cj_combinations for the eccDNA Circle-Seq dataset with the current
# combination mapping, filtering_methods and combining_method.
# NOTE(review): presumably this also produces the diffCJ_stats.csv read later
# in the notebook — confirm in functions/ratio.py.
diff_cj_combinations('eccDNA', 'Circle-Seq', ros_combinations_mapping, filtering_methods, combining_method)

In [None]:
# Run diff_cj_combinations for the eccDNA ATAC-seq dataset with the current
# combination mapping, filtering_methods and combining_method.
# NOTE(review): presumably this also produces the diffCJ_stats.csv read later
# in the notebook — confirm in functions/ratio.py.
diff_cj_combinations('eccDNA', 'ATAC-seq', ros_combinations_mapping, filtering_methods, combining_method)

In [None]:
# Rosette combination mapping for the circRNA tools, passed to diff_cj_combinations.
#   key:   the tools of the combination (3, 4 or 5 abbreviations joined by "_")
#   value: every unordered pair of those tools, joined by "_" in the same tool order
# Another cell of this notebook defines the same circRNA mapping; keep both
# copies identical.
ros_combinations_mapping = {
    "CE_CF_CQ": "CE_CF_CE_CQ_CF_CQ",
    "CE_CF_FC": "CE_CF_CE_FC_CF_FC",
    "CE_CF_SE": "CE_CF_CE_SE_CF_SE",
    "CE_CQ_FC": "CE_CQ_CE_FC_CQ_FC",
    "CE_CQ_SE": "CE_CQ_CE_SE_CQ_SE",
    "CE_FC_SE": "CE_FC_CE_SE_FC_SE",
    "CF_CQ_FC": "CF_CQ_CF_FC_CQ_FC",
    "CF_CQ_SE": "CF_CQ_CF_SE_CQ_SE",
    "CF_FC_SE": "CF_FC_CF_SE_FC_SE",
    "CQ_FC_SE": "CQ_FC_CQ_SE_FC_SE",
    # Fixed: the CF/FC pair was written as "CF_SE", a tool outside this combination.
    "CE_CF_CQ_FC": "CE_CF_CE_CQ_CE_FC_CF_CQ_CF_FC_CQ_FC",
    "CE_CF_CQ_SE": "CE_CF_CE_CQ_CE_SE_CF_CQ_CF_SE_CQ_SE",
    "CE_CF_FC_SE": "CE_CF_CE_FC_CE_SE_CF_FC_CF_SE_FC_SE",
    "CE_CQ_FC_SE": "CE_CQ_CE_FC_CE_SE_CQ_FC_CQ_SE_FC_SE",
    "CF_CQ_FC_SE": "CF_CQ_CF_FC_CF_SE_CQ_FC_CQ_SE_FC_SE",
    "CE_CF_CQ_FC_SE": "CE_CF_CE_CQ_CE_FC_CE_SE_CF_CQ_CF_FC_CF_SE_CQ_FC_CQ_SE_FC_SE"
}
# Run diff_cj_combinations for the circRNA CNT dataset with the current
# combination mapping, filtering_methods and combining_method.
diff_cj_combinations('circRNA', 'CNT', ros_combinations_mapping, filtering_methods, combining_method)

In [None]:
# Run diff_cj_combinations for the circRNA RNASE dataset; this relies on
# ros_combinations_mapping still holding the circRNA mapping at this point.
diff_cj_combinations('circRNA', 'RNASE', ros_combinations_mapping, filtering_methods, combining_method)

In [None]:
import matplotlib.gridspec as gridspec
def plot_diffCJ_scatterplot(list_dfs, output_path="results/eccDNA/real/diffCJ_scatterplot.png"):
    """
    Creates a multi-panel scatterplot showing Mean ΔCJ (≥) vs number of tools (n_tools)
    across combinations and methods, using filtering strategies as hue.

    The grid has one column per dataset ("Circle-Seq", "ATAC-seq", "CNT",
    "RNASE") and one row per combining method ('intersect', 'rosette',
    'double', 'union', 'unique'). The figure is saved to ``output_path``
    and then shown.

    Parameters:
        list_dfs (list of pd.DataFrame): One DataFrame per dataset, in the
            column order listed above. Each needs the columns 'Combination',
            'Mean ΔCJ (≥)' and 'Filtering', plus either 'Key' (tool
            abbreviations joined by '_', from which 'n_tools' is derived) or
            an existing 'n_tools' column. A dataset lacking any of them gets
            "Missing columns" panels instead of plots. The DataFrames passed
            in are not modified.
        output_path (str): Where the PNG is written. Defaults to the location
            that used to be hard-coded.

    Raises:
        IndexError: If list_dfs holds fewer DataFrames than there are datasets.
    """
    x_col = 'Mean ΔCJ (≥)'
    required_columns = ['Combination', x_col, 'n_tools', 'Filtering']

    list_methods = ["Circle-Seq", "ATAC-seq", "CNT", "RNASE"]
    combinations = ['intersect', 'rosette', 'double', 'union', 'unique']
    n_rows = len(combinations)
    n_cols = len(list_methods)

    palette = {
        'unfilter': '#d46014',
        'filter-split': '#ddcd3d',
        'filter-duplicates': '#064b76ff',
        'filter': '#63bdf6ff'
    }

    fig = plt.figure(figsize=(20, 8))
    gs = gridspec.GridSpec(n_rows, n_cols, figure=fig, wspace=0.2, hspace=0.15)

    # Legend entries are taken from the first panel that provides any and are
    # drawn once for the whole figure.
    all_handles = []
    all_labels = []

    for c, method in enumerate(list_methods):
        df = list_dfs[c]

        # Number of tools = number of '_'-separated abbreviations in 'Key'.
        # assign() returns a new frame, so the caller's DataFrame stays untouched.
        if 'Key' in df.columns:
            df = df.assign(n_tools=df['Key'].apply(lambda x: len(str(x).split('_'))))

        # Checked before filtering on 'Combination', so a frame without that
        # column yields "Missing columns" panels rather than a KeyError.
        has_required_columns = all(col in df.columns for col in required_columns)

        for r, combination in enumerate(combinations):
            ax = fig.add_subplot(gs[r, c])

            if not has_required_columns:
                ax.text(0.5, 0.5, "Missing columns", ha='center', va='center', fontsize=12, color='red')
                ax.set_axis_off()
                continue

            subset = df[df['Combination'] == combination]

            sns.scatterplot(
                data=subset,
                x=x_col,
                y='n_tools',
                hue='Filtering',
                palette=palette,
                s=50,
                ax=ax
            )

            # Dataset name as the title of the top row only
            if r == 0:
                ax.set_title(method, fontsize=16)

            # Combining-method name as the row label on the leftmost column only
            if c == 0:
                ax.set_ylabel(combination, fontsize=16, rotation=0, ha='right', va='center')
            else:
                ax.set_ylabel('')

            ax.set_yticks([3, 4, 5])
            ax.set_ylim([2.5, 5.5])
            ax.tick_params(axis='y', labelsize=16)

            # Only the bottom row carries an x label and x ticks
            if r == n_rows - 1:
                ax.set_xlabel(x_col, fontsize=16)
                ax.tick_params(axis='x', labelsize=16)

                # Skip custom ticks when there is no finite value to span
                x_values = subset[x_col].dropna()
                if not x_values.empty:
                    xticks = np.linspace(x_values.min(), x_values.max(), num=4)
                    ax.set_xticks(xticks)
                    ax.set_xticklabels([f"{x:.2f}" for x in xticks])
            else:
                ax.set_xticks([])
                ax.set_xticklabels([])
                ax.set_xlabel('')

            # A panel without hue levels (e.g. an empty subset) has no legend,
            # in which case get_legend() returns None.
            panel_legend = ax.get_legend()
            if panel_legend is not None:
                panel_legend.remove()

            if not all_handles:
                all_handles, all_labels = ax.get_legend_handles_labels()

    fig.legend(all_handles, all_labels, loc='upper center', bbox_to_anchor=(0.5, -0.05), ncol=4, frameon=False, fontsize=16)
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.savefig(output_path, dpi=300, bbox_inches="tight")
    plt.show()

In [None]:
# Load the diffCJ_stats.csv of each dataset in the column order used by
# plot_diffCJ_scatterplot (Circle-Seq, ATAC-seq, CNT, RNASE), then draw the figure.
list_dfs = [
    pd.read_csv(stats_csv)
    for stats_csv in (
        "results/eccDNA/real/Circle-Seq/diffCJ_stats.csv",
        "results/eccDNA/real/ATAC-seq/diffCJ_stats.csv",
        "results/circRNA/real/CNT/diffCJ_stats.csv",
        "results/circRNA/real/RNASE/diffCJ_stats.csv",
    )
]

plot_diffCJ_scatterplot(list_dfs)

In [None]:
# (diffCJ_stats.csv path, dataset label) pairs handed to process_and_plot_ratios.
# NOTE(review): these are absolute paths under /data/benchmarking/results_good,
# unlike the relative results/ paths read in the preceding cell — confirm this
# is intended.
paths = list(zip(
    (
        "/data/benchmarking/results_good/eccDNA/real/Circle-Seq/diffCJ_stats.csv",
        "/data/benchmarking/results_good/eccDNA/real/ATAC-seq/diffCJ_stats.csv",
        "/data/benchmarking/results_good/circRNA/real/CNT/diffCJ_stats.csv",
        "/data/benchmarking/results_good/circRNA/real/RNASE/diffCJ_stats.csv",
    ),
    ("Circle-Seq", "ATAC-seq", "CNT", "RNASE"),
))

df_all_ratios = process_and_plot_ratios(paths)