# Intersecting Haplotype Motif Annotations with CHM13
We take the non-B DNA annotations of the CHM13 reference genome and intersect the annotations of all haplotypes with them to determine which motifs are shared across haplotypes.

In [1]:
import pandas as pd

def process_intersections(bed_file):
    """Count, per region, how many distinct files intersect it.

    ``bed_file`` is read as a headerless, tab-separated table with nine
    columns. A region is identified by its first three columns
    (``chrom``, ``start``, ``end``); the fifth column (``filename``) names
    the file each intersection record came from.

    Returns a DataFrame with columns ``chrom``, ``start``, ``end`` and
    ``unique_haplotype_count``, ordered from the most to the least shared
    region.
    """
    column_names = [
        'chrom', 'start', 'end', 'stability1', 'filename',
        'chrom2', 'start2', 'end2', 'stability2',
    ]
    region_keys = ['chrom', 'start', 'end']

    records = pd.read_csv(bed_file, sep='\t', header=None, names=column_names)

    # One file can hit the same region several times; keep a single record
    # per (region, filename) pair before counting.
    one_per_file = records.drop_duplicates(subset=region_keys + ['filename'])

    # Number of distinct filenames observed for every region.
    per_region = one_per_file.groupby(region_keys)['filename'].nunique()
    counts = per_region.reset_index(name='unique_haplotype_count')

    # Most widely shared regions first.
    return counts.sort_values(by='unique_haplotype_count', ascending=False)

# Main script: load the intersection BED file and rank its regions by the
# number of distinct haplotype files that intersect each one.
bed_file_path = (
    '/home/alextu/scratch/results/bed_files/chm13/'
    'quadron_haplotype_intersections_chm13_query/'
    'verkko_batch123_chm13_quadron_posstrand_intersected_output.bed'
)
grouped_intersections = process_intersections(bed_file_path)

# Report the ten regions shared by the most haplotype files (heading first,
# then the table on the following lines).
print(
    "Top 10 Most Shared Positive G-Quadruplex Regions:",
    grouped_intersections.head(10),
    sep="\n",
)

Top 10 Most Shared Positive G-Quadruplex Regions:
       chrom      start        end  unique_haplotype_count
22539  chr12  133005163  133011865                      49
89108   chr8  144820876  144824591                      47
13132  chr11     692395     693434                      43
58316  chr20     321487     322359                      42
22297  chr12  131961785  131964685                      38
71594   chr4    1018653    1019280                      36
8916   chr10     511390     512817                      34
35183  chr17     941125     942260                      34
21705  chr12  121740473  121744476                      31
13510  chr11    1465130    1466290                      31


In [11]:
import pandas as pd

def extract_sample_name(filename):
    """Return the leading portion of ``filename`` up to its first underscore.

    When ``filename`` contains no underscore, the whole string is returned.
    """
    sample, _, _ = filename.partition('_')
    return sample

def process_intersections(bed_file, metadata_file):
    """Summarise, per region, the intersecting files and their sample metadata.

    ``bed_file`` is read as a headerless, tab-separated table with nine
    columns; a region is identified by ``chrom``, ``start`` and ``end`` and
    the fifth column (``filename``) names the file of each record.
    ``metadata_file`` is a tab-separated table with a header row that
    provides the columns ``Sample name``, ``Sex``, ``Superpopulation name``
    and ``Superpopulation code``.

    Returns one row per region with the columns ``chrom``, ``start``,
    ``end``, ``unique_haplotype_count``, ``haplotypes``,
    ``superpopulations``, ``superpopulation_codes`` and
    ``unique_sample_count``.
    """
    bed_columns = [
        'chrom', 'start', 'end', 'score', 'filename',
        'chrom2', 'start2', 'end2', 'score2',
    ]
    metadata_columns = [
        'Sample name', 'Sex', 'Superpopulation name', 'Superpopulation code',
    ]
    region_keys = ['chrom', 'start', 'end']

    records = pd.read_csv(bed_file, sep='\t', header=None, names=bed_columns)

    # Keep a single record per (region, filename) pair and tag it with the
    # sample name derived from the filename.
    tagged = records.drop_duplicates(subset=region_keys + ['filename']).assign(
        Sample=lambda frame: frame['filename'].apply(extract_sample_name)
    )

    metadata = pd.read_csv(metadata_file, sep='\t')

    # Left join: records whose sample has no metadata row are kept, with
    # missing values in the metadata columns.
    annotated = tagged.merge(
        metadata[metadata_columns],
        left_on='Sample',
        right_on='Sample name',
        how='left',
    )

    # Collapse to one row per region, collecting the per-file values as lists.
    summary = annotated.groupby(region_keys).agg(
        unique_haplotype_count=('filename', 'nunique'),
        haplotypes=('filename', list),
        superpopulations=('Superpopulation name', list),
        superpopulation_codes=('Superpopulation code', list),
        unique_sample_count=('Sample', 'nunique'),
    )
    return summary.reset_index()

def find_shared_regions(grouped_intersections: pd.DataFrame, min_samples: int = 5) -> pd.DataFrame:
    """Select regions found in enough samples that all share one ancestry.

    A region is kept when both conditions hold:

    * every entry of its ``superpopulations`` list is the same, non-missing
      label, and
    * its ``unique_sample_count`` is at least ``min_samples``.

    Note that the threshold applies to distinct *samples*
    (``unique_sample_count``), not to the number of haplotype files.

    Parameters:
        grouped_intersections: per-region table providing the columns
            ``superpopulations`` (a list of labels per row) and
            ``unique_sample_count``.
        min_samples: minimum number of distinct samples required
            (default 5).

    Returns:
        The rows of ``grouped_intersections`` that satisfy both conditions,
        with their original index labels.
    """
    def _has_single_known_ancestry(labels):
        distinct = set(labels)
        if len(distinct) != 1:
            return False
        (only_label,) = distinct
        # A list made up solely of missing values also collapses to one
        # element; missing metadata must not count as a shared ancestry.
        return bool(pd.notna(only_label))

    # astype(bool) keeps the mask boolean even when the table has no rows.
    same_ancestry = (
        grouped_intersections['superpopulations']
        .apply(_has_single_known_ancestry)
        .astype(bool)
    )
    enough_samples = grouped_intersections['unique_sample_count'] >= min_samples
    return grouped_intersections[same_ancestry & enough_samples]

# Main: inputs are the intersection BED file and the sample metadata table.
bed_file_path = (
    '/home/alextu/scratch/results/bed_files/chm13/'
    'quadron_haplotype_intersections_chm13_query/'
    'verkko_batch123_chm13_quadron_posstrand_intersected_output.bed'
)
metadata_file_path = '/home/alextu/scratch/igsr_sample_metadata/igsr_samples.tsv'

# Build the per-region summary (haplotype files, superpopulations, sample counts).
grouped_intersections = process_intersections(bed_file_path, metadata_file_path)

# Keep regions whose haplotypes all carry one superpopulation label and that
# were found in at least 5 distinct samples.
shared_regions = find_shared_regions(grouped_intersections)

# Display the selected regions.
print(shared_regions)

       chrom      start        end  unique_haplotype_count  \
93      chr1     469936     470018                       9   
114     chr1     486907     486944                       6   
140     chr1     520317     520447                       5   
160     chr1     565868     565926                       5   
165     chr1     567815     567861                       5   
1335    chr1    5954780    5954834                       5   
1352    chr1    5996534    5996559                       5   
8879   chr10      34292      34337                       5   
13227  chr11     889737     889788                       5   
13842  chr11    2185143    2185175                       5   
18724  chr11  132753385  132753412                       5   
18746  chr11  133367485  133367514                       5   
26188  chr14   95998057   95998119                       5   
30169  chr16     988458     988576                       5   
35165  chr17     865467     865689                       5   
35210  c