# Repeat Expansion Project: Map Molecule Distance results to Enfocus results

## Load Packages

In [535]:
# load packages
from itertools import groupby
from matplotlib_venn import venn2, venn2_circles
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy
# from enrichment_util import *

import copy
import re 
import os
import pathlib
working_dir = pathlib.Path().absolute()
os.chdir(working_dir)


%matplotlib inline
pd.options.mode.chained_assignment = None  # default='warn'
working_dir

WindowsPath('c:/Users/sshukor/OneDrive - Bionano Genomics/Documents/Repeat Expansion Project/scripts')

Input files:
- Molecule Distance: ./for_Radboud_repeat_expansion_paper/Molecule_distance_output/
- Enfocus: ./for_Radboud_repeat_expansion_paper/Enfocus/

Process:
- Iterate through Molecule Distance Rows, save QryContigID
- Iterate through Enfocus .xmap files, save MapWt IDs
- For each QryContigID, list all Map IDs associated with it.
- Create meta table of .xmap Map IDs for each molecule distance table
- For each QryContigID, choose the best Map ID based on map length and MapWt score
- Manually assign alleles to each meta table Map IDs
- Assign alleles to previously mapped QryContigIDs

### Check if all input files are there

In [536]:
# check for mol distance complete_data.csv file
# check for merge files and list their maps

In [537]:
# Walk the Enfocus output tree and collect, per sample folder:
#   * the .xmap files inside the alignmol "merge" folder
#   * the repeat_analysis_summary_for_Access.csv report
enfocus_list_1 = []        # sample IDs, one entry per "merge" folder found
enfocus_list_2 = []        # sample IDs, one entry per repeat summary CSV found
files_list = []            # per "merge" folder: its .xmap paths (joined into one string below)
repeat_analysis_list = []  # full paths of the repeat summary CSVs

# Loop through xmap tables
rootdir = r'C:\Users\sshukor\OneDrive - Bionano Genomics\Documents\Repeat Expansion Project\for_Radboud_repeat_expansion_paper\Enfocus'
n = 0  # number of "merge" folders found
# NOTE(review): the next three names are not used in this cell; kept in case
# a later cell still relies on them.
index = 5 # len(sample_list)
compiled_df = pd.DataFrame()
compiled_fp_df = pd.DataFrame()

for subdir, dirs, files in os.walk(rootdir):

    # ignore folder w/ no files
    if len(files) == 0:
        continue

    if (r"output\contigs\exp_refineFinal1\alignmol\merge") in subdir:
        # The sample folder is the second level below rootdir
        # (e.g. Enfocus\ATXN10\SUSL005\output\...). Reading it relative to
        # rootdir, rather than at a fixed position of the absolute path,
        # keeps this working when the project folder is moved.
        enfocus_list_1.append(pathlib.Path(subdir).relative_to(rootdir).parts[1])
        n += 1

        filepaths = []
        for file in files:
            if file.endswith(".xmap"):
                filepaths.append(f"{subdir}\\{file}")

        files_list.append(filepaths)
    if (r"output") in subdir:
        for file in files:
            if file.endswith("repeat_analysis_summary_for_Access.csv"):
                enfocus_list_2.append(pathlib.Path(subdir).relative_to(rootdir).parts[1])
                repeat_analysis_list.append(f"{subdir}\\{file}")


# Store each sample's .xmap paths as one ', '-joined string so they fit in a
# single table cell; load_enfocus_xmap_tables splits on the same separator.
files_list = [', '.join(ele) for ele in files_list]
print(n)
print(len(repeat_analysis_list))

120
120


In [538]:
# Walk the Molecule Distance output tree and record every *complete_data.csv
# table together with the sample folder it belongs to.
mol_dis_list = []    # sample IDs, one entry per complete_data.csv found
mol_dist_paths = []  # full paths of those CSV files

# Loop through molecule distance tables
rootdir = r'C:\Users\sshukor\OneDrive - Bionano Genomics\Documents\Repeat Expansion Project\for_Radboud_repeat_expansion_paper\Molecule_distance_output'
n = 0  # number of complete_data.csv files found


for subdir, dirs, files in os.walk(rootdir):

    # ignore folder w/ no files
    if len(files) == 0:
        continue

    for file in files:
        if file.endswith("complete_data.csv"):
            # The sample folder is the second level below rootdir
            # (e.g. Molecule_distance_output\ATXN10\SUSL005). Reading it
            # relative to rootdir avoids a hard-coded position in the
            # absolute path.
            mol_dis_list.append(pathlib.Path(subdir).relative_to(rootdir).parts[1])
            mol_dist_paths.append(f"{subdir}\\{file}")
            print(f"found:{subdir}\\{file}")
            n += 1

# Number of molecule distance tables found
print(n)

found:C:\Users\sshukor\OneDrive - Bionano Genomics\Documents\Repeat Expansion Project\for_Radboud_repeat_expansion_paper\Molecule_distance_output\ATXN10\SUSL005\ATXN10_SUSL005_complete_data.csv
found:C:\Users\sshukor\OneDrive - Bionano Genomics\Documents\Repeat Expansion Project\for_Radboud_repeat_expansion_paper\Molecule_distance_output\ATXN10\SUSL007\ATXN10_SUSL007_complete_data.csv
found:C:\Users\sshukor\OneDrive - Bionano Genomics\Documents\Repeat Expansion Project\for_Radboud_repeat_expansion_paper\Molecule_distance_output\ATXN10\SUSL008\ATXN10_SUSL008_complete_data.csv
found:C:\Users\sshukor\OneDrive - Bionano Genomics\Documents\Repeat Expansion Project\for_Radboud_repeat_expansion_paper\Molecule_distance_output\ATXN10\SUSL012\ATXN10_SUSL012_complete_data.csv
found:C:\Users\sshukor\OneDrive - Bionano Genomics\Documents\Repeat Expansion Project\for_Radboud_repeat_expansion_paper\Molecule_distance_output\ATXN10\SUSL013\ATXN10_SUSL013_complete_data.csv
found:C:\Users\sshukor\OneDriv

### All samples accounted for, each having mol distance output and enfocus intermediate maps.

### Output table of samples with input details
Sample ID, Path to .csv, Path to maps, # of maps, list of map files

In [541]:
# One row per Enfocus sample: the ', '-joined .xmap paths and their count.
df_1 = pd.DataFrame(zip(enfocus_list_1, files_list)).rename(
    columns={0: 'Sample_ID', 1: 'files'}
)
df_1['num_files'] = [len(joined_paths.split(', ')) for joined_paths in df_1['files']]
df_1

Unnamed: 0,Sample_ID,files,num_files
0,SUSL005,C:\Users\sshukor\OneDrive - Bionano Genomics\D...,4
1,SUSL007,C:\Users\sshukor\OneDrive - Bionano Genomics\D...,5
2,SUSL008,C:\Users\sshukor\OneDrive - Bionano Genomics\D...,2
3,SUSL012,C:\Users\sshukor\OneDrive - Bionano Genomics\D...,4
4,SUSL013,C:\Users\sshukor\OneDrive - Bionano Genomics\D...,4
...,...,...,...
115,RFC1_27,C:\Users\sshukor\OneDrive - Bionano Genomics\D...,4
116,RFC1_28,C:\Users\sshukor\OneDrive - Bionano Genomics\D...,2
117,RFC1_29,C:\Users\sshukor\OneDrive - Bionano Genomics\D...,2
118,RFC1_30,C:\Users\sshukor\OneDrive - Bionano Genomics\D...,4


In [542]:
# One row per sample: path of its molecule distance complete_data.csv table.
df_2 = pd.DataFrame(zip(mol_dis_list, mol_dist_paths)).rename(
    columns={0: 'Sample_ID', 1: 'tables'}
)
df_2

Unnamed: 0,Sample_ID,tables
0,SUSL005,C:\Users\sshukor\OneDrive - Bionano Genomics\D...
1,SUSL007,C:\Users\sshukor\OneDrive - Bionano Genomics\D...
2,SUSL008,C:\Users\sshukor\OneDrive - Bionano Genomics\D...
3,SUSL012,C:\Users\sshukor\OneDrive - Bionano Genomics\D...
4,SUSL013,C:\Users\sshukor\OneDrive - Bionano Genomics\D...
...,...,...
115,RFC1_27,C:\Users\sshukor\OneDrive - Bionano Genomics\D...
116,RFC1_28,C:\Users\sshukor\OneDrive - Bionano Genomics\D...
117,RFC1_29,C:\Users\sshukor\OneDrive - Bionano Genomics\D...
118,RFC1_30,C:\Users\sshukor\OneDrive - Bionano Genomics\D...


In [543]:
# One row per sample: path of its Enfocus repeat analysis summary CSV.
df_3 = pd.DataFrame(zip(enfocus_list_2, repeat_analysis_list)).rename(
    columns={0: 'Sample_ID', 1: 'repeat_summary'}
)
df_3

Unnamed: 0,Sample_ID,repeat_summary
0,SUSL005,C:\Users\sshukor\OneDrive - Bionano Genomics\D...
1,SUSL007,C:\Users\sshukor\OneDrive - Bionano Genomics\D...
2,SUSL008,C:\Users\sshukor\OneDrive - Bionano Genomics\D...
3,SUSL012,C:\Users\sshukor\OneDrive - Bionano Genomics\D...
4,SUSL013,C:\Users\sshukor\OneDrive - Bionano Genomics\D...
...,...,...
115,RFC1_27,C:\Users\sshukor\OneDrive - Bionano Genomics\D...
116,RFC1_28,C:\Users\sshukor\OneDrive - Bionano Genomics\D...
117,RFC1_29,C:\Users\sshukor\OneDrive - Bionano Genomics\D...
118,RFC1_30,C:\Users\sshukor\OneDrive - Bionano Genomics\D...


In [544]:
# Join the three per-sample tables on Sample_ID (inner joins: only samples
# present in all three tables are kept).
input_summary_df = (
    df_2
    .merge(df_1, on='Sample_ID')
    .merge(df_3, on='Sample_ID')
)

In [545]:
working_dir

WindowsPath('c:/Users/sshukor/OneDrive - Bionano Genomics/Documents/Repeat Expansion Project/scripts')

In [546]:
input_summary_df.head()

Unnamed: 0,Sample_ID,tables,files,num_files,repeat_summary
0,SUSL005,C:\Users\sshukor\OneDrive - Bionano Genomics\D...,C:\Users\sshukor\OneDrive - Bionano Genomics\D...,4,C:\Users\sshukor\OneDrive - Bionano Genomics\D...
1,SUSL007,C:\Users\sshukor\OneDrive - Bionano Genomics\D...,C:\Users\sshukor\OneDrive - Bionano Genomics\D...,5,C:\Users\sshukor\OneDrive - Bionano Genomics\D...
2,SUSL008,C:\Users\sshukor\OneDrive - Bionano Genomics\D...,C:\Users\sshukor\OneDrive - Bionano Genomics\D...,2,C:\Users\sshukor\OneDrive - Bionano Genomics\D...
3,SUSL012,C:\Users\sshukor\OneDrive - Bionano Genomics\D...,C:\Users\sshukor\OneDrive - Bionano Genomics\D...,4,C:\Users\sshukor\OneDrive - Bionano Genomics\D...
4,SUSL013,C:\Users\sshukor\OneDrive - Bionano Genomics\D...,C:\Users\sshukor\OneDrive - Bionano Genomics\D...,4,C:\Users\sshukor\OneDrive - Bionano Genomics\D...


In [547]:
# Persist the per-sample input summary; the STEP 1 and STEP 2 cells read this
# CSV back instead of using the in-memory frame.
input_summary_df.to_csv(rf"C:/Users\sshukor\OneDrive - Bionano Genomics/Documents/Repeat Expansion Project/for_Radboud_repeat_expansion_paper/Mol_distance_mapped/mol_dist_xmap_input_summary.csv", index=False)

## Test and Development

### Load example to parse mol distance .csv and map to .xmap rows

In [548]:
# load mol distance mol_d
# load .xmap xrow and compile details
# for each row of mol_d
#   match QryContigID between mol_d and xrow
#   if exist:
#       append RefContigID (map ID)
#       append size: QryStartPos - QryEndPos
#   if not:
#       append not found
# output new mol_d table 


In [549]:
test_path = input_summary_df['tables'][0]
test_path

'C:\\Users\\sshukor\\OneDrive - Bionano Genomics\\Documents\\Repeat Expansion Project\\for_Radboud_repeat_expansion_paper\\Molecule_distance_output\\ATXN10\\SUSL005\\ATXN10_SUSL005_complete_data.csv'

In [550]:
# helper function to load mol distance mol_d
def load_mol_distance_table(mol_d_path):
    """Read a molecule distance ``complete_data.csv`` table.

    Parameters
    ----------
    mol_d_path : str or path-like
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The table with every column kept as ``object`` (no numeric parsing)
        and the first CSV column used as the row index.
    """
    return pd.read_csv(mol_d_path, index_col=0, dtype=object)

load_mol_distance_table(test_path).head()

Unnamed: 0,QryContigID,Qrylabel.x,refpos.x,QryPos.x,Qrylabel.y,refpos.y,QryPos.y,distance,refstartlabelid,refendlabelid,refdistance
1,323759,21,45794058,141310.7,23,45799413,146360.0,5049,5045,5048,5355
2,1627511,20,45794058,189523.5,18,45799413,184188.9,5335,5045,5048,5355
3,1848195,13,45794058,93164.1,11,45799413,81544.9,11619,5045,5048,5355
4,1850898,13,45794058,124032.5,11,45799413,112601.9,11431,5045,5048,5355
5,2342770,26,45794058,143272.9,28,45799413,148411.1,5138,5045,5048,5355


In [551]:
test_path = input_summary_df['files'][0]
(test_path).split(', ')

['C:\\Users\\sshukor\\OneDrive - Bionano Genomics\\Documents\\Repeat Expansion Project\\for_Radboud_repeat_expansion_paper\\Enfocus\\ATXN10\\SUSL005\\output\\contigs\\exp_refineFinal1\\alignmol\\merge\\exp_refineFinal1_contig1271.xmap',
 'C:\\Users\\sshukor\\OneDrive - Bionano Genomics\\Documents\\Repeat Expansion Project\\for_Radboud_repeat_expansion_paper\\Enfocus\\ATXN10\\SUSL005\\output\\contigs\\exp_refineFinal1\\alignmol\\merge\\exp_refineFinal1_contig1272.xmap',
 'C:\\Users\\sshukor\\OneDrive - Bionano Genomics\\Documents\\Repeat Expansion Project\\for_Radboud_repeat_expansion_paper\\Enfocus\\ATXN10\\SUSL005\\output\\contigs\\exp_refineFinal1\\alignmol\\merge\\exp_refineFinal1_contig31.xmap',
 'C:\\Users\\sshukor\\OneDrive - Bionano Genomics\\Documents\\Repeat Expansion Project\\for_Radboud_repeat_expansion_paper\\Enfocus\\ATXN10\\SUSL005\\output\\contigs\\exp_refineFinal1\\alignmol\\merge\\exp_refineFinal1_contig32.xmap']

In [552]:
input_summary_df

Unnamed: 0,Sample_ID,tables,files,num_files,repeat_summary
0,SUSL005,C:\Users\sshukor\OneDrive - Bionano Genomics\D...,C:\Users\sshukor\OneDrive - Bionano Genomics\D...,4,C:\Users\sshukor\OneDrive - Bionano Genomics\D...
1,SUSL007,C:\Users\sshukor\OneDrive - Bionano Genomics\D...,C:\Users\sshukor\OneDrive - Bionano Genomics\D...,5,C:\Users\sshukor\OneDrive - Bionano Genomics\D...
2,SUSL008,C:\Users\sshukor\OneDrive - Bionano Genomics\D...,C:\Users\sshukor\OneDrive - Bionano Genomics\D...,2,C:\Users\sshukor\OneDrive - Bionano Genomics\D...
3,SUSL012,C:\Users\sshukor\OneDrive - Bionano Genomics\D...,C:\Users\sshukor\OneDrive - Bionano Genomics\D...,4,C:\Users\sshukor\OneDrive - Bionano Genomics\D...
4,SUSL013,C:\Users\sshukor\OneDrive - Bionano Genomics\D...,C:\Users\sshukor\OneDrive - Bionano Genomics\D...,4,C:\Users\sshukor\OneDrive - Bionano Genomics\D...
...,...,...,...,...,...
115,RFC1_27,C:\Users\sshukor\OneDrive - Bionano Genomics\D...,C:\Users\sshukor\OneDrive - Bionano Genomics\D...,4,C:\Users\sshukor\OneDrive - Bionano Genomics\D...
116,RFC1_28,C:\Users\sshukor\OneDrive - Bionano Genomics\D...,C:\Users\sshukor\OneDrive - Bionano Genomics\D...,2,C:\Users\sshukor\OneDrive - Bionano Genomics\D...
117,RFC1_29,C:\Users\sshukor\OneDrive - Bionano Genomics\D...,C:\Users\sshukor\OneDrive - Bionano Genomics\D...,2,C:\Users\sshukor\OneDrive - Bionano Genomics\D...
118,RFC1_30,C:\Users\sshukor\OneDrive - Bionano Genomics\D...,C:\Users\sshukor\OneDrive - Bionano Genomics\D...,4,C:\Users\sshukor\OneDrive - Bionano Genomics\D...


Inspect RFC1_16 (note: row 95 of `input_summary_df`, used in the cells below, resolves to RFC1_07 in the output)

In [553]:
input_summary_df.iloc[95]['repeat_summary']

'C:\\Users\\sshukor\\OneDrive - Bionano Genomics\\Documents\\Repeat Expansion Project\\for_Radboud_repeat_expansion_paper\\Enfocus\\RFC1\\RFC1_07\\output\\repeat_analysis_summary_for_Access.csv'

In [554]:
input_summary_df['repeat_summary'][0]

'C:\\Users\\sshukor\\OneDrive - Bionano Genomics\\Documents\\Repeat Expansion Project\\for_Radboud_repeat_expansion_paper\\Enfocus\\ATXN10\\SUSL005\\output\\repeat_analysis_summary_for_Access.csv'

In [555]:
# Helper function to load enfocus summary
def load_repeat_summary_table(repeat_summary_path):
    """Read an Enfocus ``repeat_analysis_summary_for_Access.csv`` report.

    Parameters
    ----------
    repeat_summary_path : str or path-like
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The report with every column kept as ``object`` (no numeric parsing)
        and the first CSV column used as the row index.
    """
    return pd.read_csv(repeat_summary_path, index_col=0, dtype=object)

# load_repeat_summary_table(input_summary_df['repeat_summary'][0]).head()
load_repeat_summary_table(input_summary_df.iloc[95]['repeat_summary'])
# input_summary_df.loc[input_summary_df['Sample_ID'] == 'RFC1_16']

Unnamed: 0_level_0,Sample,Sex,Chr,Start_ref,End_ref,Interval_ref,Count_repeat_ref,Repeat_unit_size,Irrelevant_ref,MapID,...,Count_repeat,P >= expansion_cutoff,Expanded_repeat,Realigned,CI_lower,CI_upper,Percentile,Repeat_spanning_coverage,Qry_contig_length,ImageText
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
FMR1,exp,male,4,39343732.0,39350590.0,6858.0,12.0,5,6799.0,12,...,10,< 0.01%,600,False,0.0,32.0,19,44.0,10448525.2,"Chromosome 4, Map12 has a calculated repeat co..."
FMR1,exp,male,4,39343732.0,39350590.0,6858.0,12.0,5,6799.0,16571,...,643,>= 99.9%,600,False,608.0,679.0,100,45.0,4385697.7,"Chromosome 4, Map16571 has a calculated repeat..."


In [556]:
# Helper function to get xmap table without preceding "#..." lines
def get_xmap_table(xmap_path):
    """Parse a tab-delimited .xmap file into a DataFrame of strings.

    The column names come from the ``#h`` line. Every other line starting
    with ``#`` (including the ``#f`` column-type line) and every blank line
    is skipped; all remaining lines are treated as data rows.

    Parameters
    ----------
    xmap_path : str or path-like
        Location of the .xmap file.

    Returns
    -------
    pandas.DataFrame
        One row per data line; all values are left as strings.

    Raises
    ------
    ValueError
        If the file contains no ``#h`` header line.
    """
    header = None
    data = []

    with open(xmap_path) as file:
        for line in file:
            if line[0:2] == "#h":
                # "#h <name>\t<name>..." -> drop the 3-character "#h " prefix
                header = line[3:].strip()
            elif line[0:1] == "#":
                # "#f" and all other comment lines carry no data
                continue
            else:
                row = line.strip()
                # a blank line would otherwise become a bogus one-cell row
                if row:
                    data.append(row)

    if header is None:
        raise ValueError(f"no '#h' header line found in {xmap_path}")

    df = pd.DataFrame([row.split('\t') for row in data])
    # The header may list more names than the data has columns; keep only
    # as many names as there are data columns.
    df.columns = header.split('\t')[:len(df.columns)]
    return df

# helper function to load .xmap xrow and compile details
def load_enfocus_xmap_tables(list_xmap_paths):
    """Load every .xmap file named in a ', '-separated string of paths.

    Parameters
    ----------
    list_xmap_paths : str
        File paths joined with ``', '``.

    Returns
    -------
    list of pandas.DataFrame
        One table per path, parsed by ``get_xmap_table``, in the order the
        paths appear in the string.
    """
    return [get_xmap_table(single_path) for single_path in list_xmap_paths.split(', ')]

for df in load_enfocus_xmap_tables(test_path):
    display(df.head(1))

Unnamed: 0,XmapEntryID,QryContigID,RefContigID,QryStartPos,QryEndPos,RefStartPos,RefEndPos,Orientation,Confidence,HitEnum,QryLen,RefLen,LabelChannel,Alignment,MapWt,MaxOutlierKB
0,1,13188804,1271,483092.7,11274.1,12135.5,484810.7,-,58.38,26M1D4M1D7M1I4M1I2D1M1D1M1D7M1D1M1D4M,656515.9,643795.8,1,"(1,57)(2,56)(3,55)(4,54)(5,53)(6,52)(7,51)(8,5...",1.0,0.885


Unnamed: 0,XmapEntryID,QryContigID,RefContigID,QryStartPos,QryEndPos,RefStartPos,RefEndPos,Orientation,Confidence,HitEnum,QryLen,RefLen,LabelChannel,Alignment,MapWt,MaxOutlierKB
0,1,17862463,1272,30410.4,512173.6,41079.3,524669.3,+,55.09,4M1D1M1D5M1I2M1D1M1D3M1D3M1D12M1I5M1D3...,516811.7,605916.8,1,"(1,2)(2,3)(3,4)(4,5)(6,6)(8,7)(9,8)(10,9)(11,1...",1.0,6.778


Unnamed: 0,XmapEntryID,QryContigID,RefContigID,QryStartPos,QryEndPos,RefStartPos,RefEndPos,Orientation,Confidence,HitEnum,QryLen,RefLen,LabelChannel,Alignment,MapWt,MaxOutlierKB
0,1,7494907,31,3455.6,377617.8,3455.6,378123.5,+,66.42,1M1D5M1I3M1I3M1D12M1D1M1D2M1D9M1D3M2D3...,384125.6,10964465.8,1,"(1,1)(3,2)(4,3)(5,4)(6,5)(7,6)(8,8)(9,9)(10,10...",1.0,0.0


Unnamed: 0,XmapEntryID,QryContigID,RefContigID,QryStartPos,QryEndPos,RefStartPos,RefEndPos,Orientation,Confidence,HitEnum,QryLen,RefLen,LabelChannel,Alignment,MapWt,MaxOutlierKB
0,1,2891422,32,134299.5,8964.5,2321.0,128101.9,-,23.95,12M1D2M1I6M1D1M,527205.6,10970732.5,1,"(1,22)(2,21)(3,20)(4,19)(5,18)(6,17)(7,16)(8,1...",1.0,0.0


In [557]:
# for each row of mol_d
#   match QryContigID between mol_d and xrow
#   if exist:
#       append RefContigID (map ID)
#       append size: QryStartPos - QryEndPos
#   if not:
#       append not found
# output new mol_d table 

# helper function to append mol distance with xmap and distance
def match_map_by_mol(mol_d, xmap):
    """Attach each molecule's MapWt from every xmap table to the molecule table.

    For every xmap table, one column named ``MapWt_<RefContigID>`` is added to
    a copy of ``mol_d`` by a left join on ``QryContigID``; molecules missing
    from that xmap get NaN.

    Parameters
    ----------
    mol_d : pandas.DataFrame
        Molecule distance table; must contain a ``QryContigID`` column whose
        values are comparable with the xmap ``QryContigID`` values.
    xmap : iterable of pandas.DataFrame
        Tables as returned by ``get_xmap_table``. Each needs the columns
        ``QryContigID``, ``RefContigID``, ``RefStartPos``, ``RefEndPos``,
        ``QryStartPos``, ``QryEndPos`` and ``MapWt``. Tables without rows are
        skipped.

    Returns
    -------
    out_df : pandas.DataFrame
        ``mol_d`` plus one ``MapWt_<map ID>`` column per non-empty xmap. The
        number and order of molecule rows are the same as in ``mol_d``.
    meta_df : pandas.DataFrame
        One row per non-empty xmap with ``Map_ID`` (first ``RefContigID`` in
        the table) and ``Map_size`` (largest ``RefEndPos`` minus smallest
        ``RefStartPos``).
    """
    meta_mapID = []
    meta_mapSize = []

    out_df = mol_d.copy()

    for x in xmap:
        # An xmap without alignment rows has no map ID to report.
        if x.empty:
            continue

        map_ID = x['RefContigID'].unique()[0]
        # Positions arrive as strings; convert before doing arithmetic.
        x = x.astype({'RefEndPos':'float', 'RefStartPos':'float', 'QryEndPos':'float', 'QryStartPos':'float'})
        map_size = x['RefEndPos'].max() - x['RefStartPos'].min()

        meta_mapID.append(map_ID)
        meta_mapSize.append(map_size)

        # Keep one row per molecule so the left join below cannot duplicate
        # molecule rows. If a molecule appears more than once in this xmap,
        # the entry with the highest MapWt is kept. The numeric column is only
        # a sort key, so the MapWt values are passed through unchanged.
        weights = x[['QryContigID', 'MapWt']].copy()
        weights['_sort_wt'] = pd.to_numeric(weights['MapWt'], errors='coerce')
        weights = (
            weights
            .sort_values('_sort_wt', ascending=False, kind='mergesort')
            .drop_duplicates(subset='QryContigID', keep='first')
            .drop(columns='_sort_wt')
            .rename(columns={'MapWt': f'MapWt_{map_ID}'})
        )

        out_df = out_df.merge(weights, on='QryContigID', how='left')

    meta_df = pd.DataFrame(
        {'Map_ID':meta_mapID,
        'Map_size': meta_mapSize}
    )

    return out_df, meta_df

# Earlier trial on the first sample (row 0), kept for reference:
# test_out = match_map_by_mol(sus5_mol_d, sus5_xmap)

# sus5_mol_d = load_mol_distance_table(input_summary_df['tables'][0])
# sus5_xmap = load_enfocus_xmap_tables(input_summary_df['files'][0])
# out_df, meta_df = match_map_by_mol(sus5_mol_d, sus5_xmap)

# Try the matching on row 95 of the input summary.
# NOTE(review): the variables are named r16_*, but the repeat_summary path
# shown for row 95 points at RFC1_07 - confirm which sample is intended.
r16_mol_d = load_mol_distance_table(input_summary_df.iloc[95]['tables'])
r16_xmap = load_enfocus_xmap_tables(input_summary_df.iloc[95]['files'])
out_df, meta_df = match_map_by_mol(r16_mol_d, r16_xmap)

In [558]:
# Helper function to split MapIDs into alleles by repeat count
def assign_allele_by_repeat_count(meta_df):
    """Return the mean repeat count over the maps listed in ``meta_df``.

    NOTE(review): the allele split announced by the function name is not
    implemented; only the mean repeat count is computed.

    Parameters
    ----------
    meta_df : pandas.DataFrame
        Map table with a ``Count_repeat`` column, as produced by
        ``assign_best_map_ID``. Entries that are not numbers (for example
        'Not in Enfocus' or 'Unknown') are ignored.

    Returns
    -------
    float
        Mean of the numeric ``Count_repeat`` values (NaN if there are none).
    """
    # get mean repeat count
    repeat_count_col = pd.to_numeric(meta_df['Count_repeat'], errors='coerce')
    mean_count = repeat_count_col.mean()
    return mean_count

In [559]:
# helper function to choose the map with highest MapWt score
def assign_best_map_ID(mol_df, in_meta_df, repeat_summary_df):
    """Pick, for every molecule, the Enfocus map with the highest MapWt.

    Only maps that have a usable repeat count in ``repeat_summary_df`` are
    candidates. When several candidate maps share the highest MapWt for a
    molecule, the map with the largest ``Map_size`` wins, because candidate
    columns are ordered by descending map size and the first maximum is taken.

    Parameters
    ----------
    mol_df : pandas.DataFrame
        Molecule table with one ``MapWt_<map ID>`` column per map, as returned
        by ``match_map_by_mol``.
    in_meta_df : pandas.DataFrame
        Map table with ``Map_ID`` and ``Map_size`` columns, as returned by
        ``match_map_by_mol``.
    repeat_summary_df : pandas.DataFrame
        Enfocus repeat summary with ``MapID`` and ``Count_repeat`` columns.

    Returns
    -------
    out_df : pandas.DataFrame
        Copy of ``mol_df`` with the candidate ``MapWt_*`` columns converted to
        float and a new ``best_mapID`` column ('Not Found' when the molecule
        has no MapWt in any candidate map).
    meta_df : pandas.DataFrame
        Copy of ``in_meta_df`` (``Map_ID`` renamed to ``MapID``), sorted by
        descending ``Map_size``, with ``Count_repeat`` added as text:
        the reported count, 'Not in Enfocus' if the map is missing from the
        summary, or 'Unknown' if the reported count equals -1.
    """
    out_df = mol_df.copy(deep=True)

    meta_df = in_meta_df.copy(deep=True)
    meta_df = meta_df.rename(columns={'Map_ID':'MapID'})
    meta_df = meta_df.merge(repeat_summary_df[['MapID', 'Count_repeat']], on='MapID', how='left')
    meta_df = meta_df.sort_values(by=['Map_size'], ascending=False)

    # Flag the -1 sentinel by numeric value. A substring replacement would
    # turn '-1.0' into 'Unknown.0' and let that map through as a candidate.
    counts = meta_df['Count_repeat']
    is_unknown = (pd.to_numeric(counts, errors='coerce') == -1).to_numpy()
    meta_df['Count_repeat'] = counts.fillna('Not in Enfocus').astype(str)
    meta_df.loc[is_unknown, 'Count_repeat'] = 'Unknown'

    enfocus_maps = meta_df.loc[~meta_df['Count_repeat'].isin(['Not in Enfocus', 'Unknown']), 'MapID']
    # dict.fromkeys keeps the size-sorted order while dropping repeated map
    # IDs (a map listed more than once in the summary).
    MapWt_cols = list(dict.fromkeys('MapWt_' + map_id for map_id in enfocus_maps))

    print(MapWt_cols)

    out_df['best_mapID'] = "Not Found"
    if MapWt_cols:
        out_df[MapWt_cols] = out_df[MapWt_cols].astype('float64')
        weights = out_df[MapWt_cols]
        # Only rows with at least one MapWt are ranked; idxmax on all-NaN
        # rows is deprecated in pandas.
        has_hit = weights.notna().any(axis=1).to_numpy()
        if has_hit.any():
            best_cols = weights[has_hit].idxmax(axis=1)
            out_df.loc[has_hit, 'best_mapID'] = (
                best_cols.astype(str).str.replace('MapWt_', '', regex=False).to_numpy()
            )

    return out_df, meta_df

# Earlier trial on the first sample (row 0), kept for reference:
# sus5_repeat_summary = load_repeat_summary_table(input_summary_df['repeat_summary'][0])
# main, meta = assign_best_map_ID(out_df, meta_df, sus5_repeat_summary)

# Run the full chain (load -> match -> pick best map) on row 95.
# NOTE(review): variables are named r16_*, but the repeat_summary path shown
# for row 95 points at RFC1_07 - confirm which sample is intended.
r16_mol_d = load_mol_distance_table(input_summary_df.iloc[95]['tables'])
r16_xmap = load_enfocus_xmap_tables(input_summary_df.iloc[95]['files'])
out_df, meta_df = match_map_by_mol(r16_mol_d, r16_xmap)
# NOTE(review): this line uses label-based .loc[95] while the lines above use
# positional .iloc[95]; they select the same row only while the index is the
# default 0..n-1 range.
r16_repeat_summary = load_repeat_summary_table(input_summary_df.loc[95]['repeat_summary'])
main, meta = assign_best_map_ID(out_df, meta_df, r16_repeat_summary)

['MapWt_12', 'MapWt_16571']


In [560]:
display(main, meta)

Unnamed: 0,QryContigID,Qrylabel.x,refpos.x,QryPos.x,Qrylabel.y,refpos.y,QryPos.y,distance,refstartlabelid,refendlabelid,refdistance,MapWt_12,MapWt_16571,best_mapID
0,3255,24,39339156,145676.1,28,39362886.5,169407,23731,7722,7725,23730.5,,,Not Found
1,113308,40,39339156,202183.3,43,39362886.5,228900.3,26717,7722,7725,23730.5,,1.000000,16571
2,204187,22,39339156,132093,19,39362886.5,105112.1,26981,7722,7725,23730.5,,1.000000,16571
3,253240,57,39339156,313348.9,54,39362886.5,289842.5,23506,7722,7725,23730.5,1.0,0.547872,12
4,258273,15,39339156,79249.9,17,39362886.5,102729.7,23480,7722,7725,23730.5,1.0,,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169,6018228,19,39339156,115696,16,39362886.5,89035.9,26660,7722,7725,23730.5,,1.000000,16571
170,6084062,23,39339156,110362.7,26,39362886.5,137003.9,26641,7722,7725,23730.5,,,Not Found
171,6091571,23,39339156,124329.2,25,39362886.5,151219.2,26890,7722,7725,23730.5,,,Not Found
172,6127818,9,39339156,51796.8,5,39362886.5,25433,26364,7722,7725,23730.5,,1.000000,16571


Unnamed: 0,MapID,Map_size,Count_repeat
0,12,10448139.2,10
1,16571,4377388.6,643


In [561]:
# def assign_allele_to_meta_df(in_meta_df):
#     meta_df = in_meta_df.copy()
#     meta_df = meta_df.loc[~meta_df['Count_repeat'].isin(['Not in Enfocus', 'Unknown']), 'MapID']

#     num_maps = len(meta_df)
#     mean_repeat = meta_df['Count_repeat'].mean()
#     median_repeat = meta_df['Count_repeat'].median()

#     if num_maps == 1:
#         meta_df['Allele'] = 1
#     elif num_maps == 2:
#         meta_df['Allele'] = [1,2]
#     elif num_maps >2:
#         meta_df['Allele']
#     else:
#         # no allele
#         meta_df['Allele'] = None

In [562]:
# test_out=match_map_by_mol(sus5_mol_d, sus5_xmap)
# test_out.to_csv(rf"C:/Users\sshukor\OneDrive - Bionano Genomics/Documents/Repeat Expansion Project/for_Radboud_repeat_expansion_paper/Mol_distance_mapped/test_susl005.tsv", sep='\t', index=False)
# Write the test result as a two-sheet workbook: molecules with their best
# map ID, and the per-map repeat information.
# NOTE(review): the file is named sus005_test_output.xlsx, but `main` and
# `meta` were last assigned from row 95 of the input summary - confirm.
# `out_path` defined here is also read by the STEP 2 cell.
out_path = rf"C:/Users\sshukor\OneDrive - Bionano Genomics/Documents/Repeat Expansion Project/for_Radboud_repeat_expansion_paper/Mol_distance_mapped/"
with pd.ExcelWriter(f'{out_path}/sus005_test_output.xlsx') as writer:  # doctest: +SKIP
    main.to_excel(writer, sheet_name='molecules_mapped')
    meta.to_excel(writer, sheet_name='repeat_info')

## Main

In [563]:
def main_map_xmap_to_enfocus(mol_dist_table_path, xmap_tables_path, repeat_summary_path):
    """Run the whole mapping for one sample.

    Loads the molecule distance table, the xmap tables and the repeat
    summary, attaches the per-map MapWt columns to the molecules, then picks
    the best map ID for each molecule.

    Parameters
    ----------
    mol_dist_table_path : str
        Path of the molecule distance CSV.
    xmap_tables_path : str
        ', '-separated paths of the sample's .xmap files.
    repeat_summary_path : str
        Path of the Enfocus repeat summary CSV.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The molecule table with ``best_mapID``, and the map table with an
        extra ``Allele`` column initialised to 'Not Assigned'.
    """
    molecules = load_mol_distance_table(mol_dist_table_path)
    xmap_tables = load_enfocus_xmap_tables(xmap_tables_path)
    repeat_summary = load_repeat_summary_table(repeat_summary_path)

    matched_molecules, map_info = match_map_by_mol(molecules, xmap_tables)
    out_df, meta_df = assign_best_map_ID(matched_molecules, map_info, repeat_summary)

    # Alleles are filled in afterwards; start every map as unassigned.
    meta_df['Allele'] = 'Not Assigned'
    return out_df, meta_df

### STEP 1: Connect molecule distance to enfocus output

Each sample's molecule distance output and Enfocus output (.xmap and repeat report summary) tables were parsed. Each molecule was matched with the map IDs it maps to in Enfocus, and the best map ID was chosen based on the best confidence (`MapWt`) score, with ties going to the largest map. A meta table of the maps formed by these molecules was also output as another tab.

In [564]:
# Re-read the input summary written earlier (columns: Sample_ID, tables,
# files, num_files, repeat_summary) and process every sample.
samples = pd.read_csv(rf"C:/Users\sshukor\OneDrive - Bionano Genomics/Documents/Repeat Expansion Project/for_Radboud_repeat_expansion_paper/Mol_distance_mapped/mol_dist_xmap_input_summary.csv")

# itertuples() puts the row index at position 0, so with the columns above:
#   s[1] = Sample_ID, s[2] = tables, s[3] = files, s[5] = repeat_summary
# (s[4] = num_files is not needed here).
for s in samples.itertuples():
    display(s[1])
    # display(s[2])
    # display(s[3])
    # display(s[5])
    sample_name = s[1]
    out_df, meta_df = main_map_xmap_to_enfocus(s[2], s[3], s[5])

    # display(out_df, meta_df)
    # out_df.to_csv(rf"C:/Users\sshukor\OneDrive - Bionano Genomics/Documents/Repeat Expansion Project/for_Radboud_repeat_expansion_paper/Mol_distance_mapped/{sample_name}_complete_data_mapped.tsv", sep='\t', index=False)

    # One workbook per sample: sheet 'molecules_mapped' holds the molecules
    # with their best map ID, sheet 'repeat_info' holds the per-map table.
    out_path = rf"C:/Users\sshukor\OneDrive - Bionano Genomics/Documents/Repeat Expansion Project/for_Radboud_repeat_expansion_paper/Mol_distance_mapped/"
    with pd.ExcelWriter(f'{out_path}/{sample_name}_complete_data_mapped.xlsx') as writer:  # doctest: +SKIP
        out_df.to_excel(writer, sheet_name='molecules_mapped')
        meta_df.to_excel(writer, sheet_name='repeat_info')

'SUSL005'

['MapWt_32', 'MapWt_31', 'MapWt_1271', 'MapWt_1272']


'SUSL007'

['MapWt_42', 'MapWt_41', 'MapWt_1271', 'MapWt_1272', 'MapWt_2740']


'SUSL008'

['MapWt_3391', 'MapWt_3392']


'SUSL012'

['MapWt_5762', 'MapWt_5761']


'SUSL013'

['MapWt_41', 'MapWt_42', 'MapWt_1371', 'MapWt_1372']


'c9orf72_08-362'

['MapWt_12']


'c9orf72_106340'

['MapWt_12', 'MapWt_11', 'MapWt_411', 'MapWt_412']


'CNBP_01'

['MapWt_6141', 'MapWt_451']


'CNBP_02'

['MapWt_21', 'MapWt_351', 'MapWt_352']


'CNBP_03'

['MapWt_31', 'MapWt_32', 'MapWt_651', 'MapWt_652', 'MapWt_3981', 'MapWt_4650']


'CNBP_04'

['MapWt_22', 'MapWt_4631', 'MapWt_352', 'MapWt_4961', 'MapWt_3892']


'CNBP_05'

['MapWt_7562', 'MapWt_322', 'MapWt_392', 'MapWt_2681']


'CNBP_06'

['MapWt_22', 'MapWt_21', 'MapWt_1212', 'MapWt_1211', 'MapWt_2152', 'MapWt_2151', 'MapWt_2772', 'MapWt_2771']


'CNBP_07'

['MapWt_21', 'MapWt_971', 'MapWt_2562']


'CNBP_08'

['MapWt_31', 'MapWt_6452', 'MapWt_351', 'MapWt_352', 'MapWt_2400']


'CNBP_10'

['MapWt_22', 'MapWt_21', 'MapWt_361', 'MapWt_362', 'MapWt_1881', 'MapWt_1882', 'MapWt_2300', 'MapWt_3470', 'MapWt_3401', 'MapWt_3402']


'CNBP_11'

['MapWt_32', 'MapWt_5741', 'MapWt_581']


'CNBP_12'

['MapWt_21', 'MapWt_1032', 'MapWt_1031', 'MapWt_4190', 'MapWt_6790']


'CNBP_13'

['MapWt_31', 'MapWt_20552', 'MapWt_972', 'MapWt_1311', 'MapWt_1312', 'MapWt_11231']


'CNBP_14'

['MapWt_21', 'MapWt_11042', 'MapWt_352', 'MapWt_351', 'MapWt_2381', 'MapWt_2382']


'CNBP_15'

['MapWt_22', 'MapWt_21', 'MapWt_1041', 'MapWt_1042', 'MapWt_2320']


'CNBP_16'

['MapWt_21', 'MapWt_22', 'MapWt_101', 'MapWt_1252', 'MapWt_1251', 'MapWt_3760']


'CNBP_17'

['MapWt_21', 'MapWt_452', 'MapWt_5241']


'CNBP_18'

['MapWt_21', 'MapWt_6152', 'MapWt_382', 'MapWt_1151', 'MapWt_1152', 'MapWt_1522', 'MapWt_1521', 'MapWt_2281', 'MapWt_2282']


'CNBP_19'

['MapWt_32', 'MapWt_31', 'MapWt_1162', 'MapWt_1161', 'MapWt_2160']


'CNBP_20'

['MapWt_21', 'MapWt_420', 'MapWt_641', 'MapWt_642', 'MapWt_2982']


'CNBP_21'

['MapWt_12', 'MapWt_331', 'MapWt_1872', 'MapWt_1871']


'CNBP_22'

['MapWt_31', 'MapWt_8402', 'MapWt_1311']


'CNBP_23'

['MapWt_32', 'MapWt_31', 'MapWt_381', 'MapWt_382', 'MapWt_3900']


'CNBP_24'

['MapWt_15802', 'MapWt_23691', 'MapWt_531', 'MapWt_532', 'MapWt_1231', 'MapWt_1232', 'MapWt_1740']


'CNBP_25'

['MapWt_22', 'MapWt_12961', 'MapWt_1221', 'MapWt_1222', 'MapWt_1531', 'MapWt_1532', 'MapWt_3020']


'CNBP_26'

['MapWt_10642', 'MapWt_15951', 'MapWt_351', 'MapWt_352', 'MapWt_2052', 'MapWt_2051']


'DMPK_01'

['MapWt_6101', 'MapWt_6102']


'DMPK_02'

['MapWt_4302', 'MapWt_4301']


'DMPK_03'

['MapWt_7641']


'DMPK_04'

['MapWt_7851', 'MapWt_7852']


'DMPK_05'

['MapWt_7932', 'MapWt_7931', 'MapWt_2390']


'DMPK_05_repeat'

['MapWt_9382', 'MapWt_18731', 'MapWt_4100']


'DMPK_06'

['MapWt_7631', 'MapWt_15232']


'DMPK_07'

['MapWt_6252', 'MapWt_12481']


'DMPK_08'

['MapWt_8392']


'DMPK_09'

['MapWt_7301', 'MapWt_14572']


'DMPK_10'

['MapWt_12712', 'MapWt_12711', 'MapWt_1940']


'DMPK_11'

['MapWt_40311', 'MapWt_26882', 'MapWt_2631', 'MapWt_2632']


'DMPK_12'

['MapWt_31151', 'MapWt_46702']


'DMPK_13'

['MapWt_16161', 'MapWt_16162', 'MapWt_2530']


'DMPK_14'

['MapWt_16731', 'MapWt_16732']


'DMPK_15'

['MapWt_20171', 'MapWt_20172', 'MapWt_4181', 'MapWt_5091', 'MapWt_4182', 'MapWt_7100', 'MapWt_7410']


'DMPK_16'

['MapWt_9722']


'DMPK_17'

['MapWt_14672', 'MapWt_14671', 'MapWt_4961', 'MapWt_4962']


'DMPK_18'

['MapWt_12422', 'MapWt_12421']


'DMPK_19'

['MapWt_15031', 'MapWt_5480']


'DMPK_20'

['MapWt_16832', 'MapWt_33631']


'DMPK_21'

['MapWt_9361', 'MapWt_9362']


'DMPK_22'

['MapWt_51', 'MapWt_52', 'MapWt_5150', 'MapWt_8160']


'DMPK_23'

['MapWt_14891', 'MapWt_14892', 'MapWt_3270', 'MapWt_4280']


'DMPK_24'

['MapWt_11361', 'MapWt_11362', 'MapWt_4482', 'MapWt_5380', 'MapWt_4481']


'DMPK_25'

['MapWt_12071', 'MapWt_6610']


'DMPK_26'

['MapWt_14212', 'MapWt_28401', 'MapWt_12440']


'DMPK_27'

['MapWt_51', 'MapWt_52']


'DMPK_28'

['MapWt_5792', 'MapWt_5791']


'DMPK_29'

['MapWt_8672', 'MapWt_17301']


'DMPK_30'

['MapWt_3941', 'MapWt_7862']


'DMPK_HvB_01'

['MapWt_5081', 'MapWt_52', 'MapWt_3311', 'MapWt_3312']


'DMPK_HvB_02'

['MapWt_52', 'MapWt_51', 'MapWt_2831', 'MapWt_2832']


'DMPK_HvB_03'

['MapWt_52', 'MapWt_4431', 'MapWt_871', 'MapWt_872']


'DMPK_RW_01'

['MapWt_6231', 'MapWt_18612', 'MapWt_2820', 'MapWt_4171', 'MapWt_4172']


'DMPK_RW_02'

['MapWt_5561', 'MapWt_11092', 'MapWt_2052', 'MapWt_2051']


'DMPK_RW_03'

['MapWt_10482', 'MapWt_15711', 'MapWt_2501', 'MapWt_2502']


'DMPK_RW_04'

['MapWt_7692', 'MapWt_11531']


'DMPK_RW_05'

['MapWt_11851', 'MapWt_11852', 'MapWt_2411', 'MapWt_2412']


'DMPK_RW_06'

['MapWt_4562', 'MapWt_9091']


'FMR1_01'

['MapWt_310', 'MapWt_2360']


'FMR1_02'

['MapWt_311']


'FMR1_03'

['MapWt_301']


'FMR1_05'

['MapWt_321', 'MapWt_322']


'FMR1_06'

['MapWt_301']


'FMR1_07'

['MapWt_301', 'MapWt_302']


'FMR1_08'

['MapWt_300']


'FMR1_09'

['MapWt_290']


'FMR1_10'

['MapWt_321', 'MapWt_13152']


'FMR1_11'

['MapWt_312', 'MapWt_311', 'MapWt_342', 'MapWt_11111']


'FMR1_12'

['MapWt_301', 'MapWt_302']


'FMR1_13'

['MapWt_302', 'MapWt_301']


'FMR1_14'

['MapWt_302', 'MapWt_301']


'FXN_01'

['MapWt_6501', 'MapWt_6502', 'MapWt_1862', 'MapWt_1861']


'FXN_02'

['MapWt_8611', 'MapWt_302', 'MapWt_301', 'MapWt_2051']


'NOP56_01'

['MapWt_51', 'MapWt_52', 'MapWt_311']


'NOP56_02'

['MapWt_41', 'MapWt_1172']


'RFC1_01'

['MapWt_11', 'MapWt_12', 'MapWt_751', 'MapWt_752']


'RFC1_02'

['MapWt_12', 'MapWt_11']


'RFC1_03'

['MapWt_11', 'MapWt_12', 'MapWt_741', 'MapWt_742']


'RFC1_04'

['MapWt_12', 'MapWt_11']


'RFC1_05'

['MapWt_12', 'MapWt_11', 'MapWt_2321', 'MapWt_2322']


'RFC1_06'

['MapWt_11', 'MapWt_12', 'MapWt_2512', 'MapWt_2511']


'RFC1_07'

['MapWt_12', 'MapWt_16571']


'RFC1_08'

['MapWt_12']


'RFC1_09'

['MapWt_12', 'MapWt_11', 'MapWt_1392', 'MapWt_1391', 'MapWt_6760']


'RFC1_10'

['MapWt_11', 'MapWt_12', 'MapWt_671', 'MapWt_672']


'RFC1_11'

['MapWt_12', 'MapWt_13181']


'RFC1_12'

['MapWt_12', 'MapWt_11', 'MapWt_661', 'MapWt_662']


'RFC1_13'

['MapWt_12', 'MapWt_11']


'RFC1_14'

['MapWt_11', 'MapWt_3120']


'RFC1_15'

['MapWt_11', 'MapWt_12', 'MapWt_1200']


'RFC1_16'

['MapWt_11', 'MapWt_12', 'MapWt_3001', 'MapWt_3002', 'MapWt_12170', 'MapWt_12180']


'RFC1_17'

['MapWt_9421', 'MapWt_32']


'RFC1_18'

['MapWt_12', 'MapWt_11', 'MapWt_1511', 'MapWt_1391', 'MapWt_1392', 'MapWt_1512', 'MapWt_2812', 'MapWt_2811']


'RFC1_19'

['MapWt_12', 'MapWt_11', 'MapWt_3102', 'MapWt_3101']


'RFC1_20'

['MapWt_12', 'MapWt_11']


'RFC1_21'

['MapWt_12', 'MapWt_11', 'MapWt_2141', 'MapWt_2142']


'RFC1_22'

['MapWt_11', 'MapWt_451', 'MapWt_452']


'RFC1_23'

['MapWt_11', 'MapWt_12']


'RFC1_24'

['MapWt_12', 'MapWt_6161', 'MapWt_1421', 'MapWt_1422']


'RFC1_25'

['MapWt_12', 'MapWt_11']


'RFC1_26'

['MapWt_11', 'MapWt_12']


'RFC1_27'

['MapWt_11', 'MapWt_12', 'MapWt_691', 'MapWt_692']


'RFC1_28'

['MapWt_9302']


'RFC1_29'

['MapWt_12', 'MapWt_11']


'RFC1_30'

['MapWt_12', 'MapWt_11', 'MapWt_402', 'MapWt_401']


'STARD7_39292'

['MapWt_7211', 'MapWt_1521']


### STEP 2: QC - Inspect Percent of molecules mapped to Enfocus

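For each sample, the map rate was calculated: the fraction of Molecule Distance molecules that were successfully mapped to an Enfocus map ID (i.e. whose `best_mapID` is not `Not Found`). A value of 1.0 (100%) means that every molecule reported by the Molecule Distance script was found in at least one Enfocus map.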

In [565]:
# STEP 2 QC: for every sample listed in the summary CSV, compute the fraction of
# molecules whose `best_mapID` is not 'Not Found', then write that fraction back
# into the same summary CSV as the `percent_mol_mapped` column.
#
# NOTE(review): `out_path` is not defined in this cell; it has to be set by an
# earlier cell. STEP 3 below reassigns `out_path`, so re-running this cell after
# STEP 3 reads the workbooks from a different folder -- confirm before re-running.
summary_csv = r"C:/Users\sshukor\OneDrive - Bionano Genomics/Documents/Repeat Expansion Project/for_Radboud_repeat_expansion_paper/Mol_distance_mapped/mol_dist_xmap_input_summary.csv"
samples = pd.read_csv(summary_csv)

QC_status = []
for s in samples.itertuples():
    # s[0] is the row index, so s[1] is the first CSV column (the sample name).
    sample_name = s[1]
    display(sample_name)
    mapped_df = pd.read_excel(f'{out_path}/{sample_name}_complete_data_mapped.xlsx', index_col=0)

    n_molecules = len(mapped_df)
    if n_molecules == 0:
        # An empty workbook used to abort the whole loop with ZeroDivisionError;
        # record NaN instead so the remaining samples are still processed.
        percent_mol_mapped = float('nan')
    else:
        n_mapped = int((mapped_df['best_mapID'] != 'Not Found').sum())
        # Stored as a fraction in [0, 1], despite the "percent" in the name.
        percent_mol_mapped = n_mapped / n_molecules

    print(percent_mol_mapped)
    QC_status.append(percent_mol_mapped)

samples['percent_mol_mapped'] = QC_status

# Overwrites the input summary file with the extra column (index not written).
samples.to_csv(summary_csv, index=False)

'SUSL005'

1.0


'SUSL007'

1.0


'SUSL008'

1.0


'SUSL012'

0.660377358490566


'SUSL013'

0.9861111111111112


'c9orf72_08-362'

0.4140625


'c9orf72_106340'

0.8235294117647058


'CNBP_01'

1.0


'CNBP_02'

0.7421875


'CNBP_03'

0.8913043478260869


'CNBP_04'

0.8343949044585988


'CNBP_05'

0.86


'CNBP_06'

0.9122807017543859


'CNBP_07'

0.7615384615384615


'CNBP_08'

0.940677966101695


'CNBP_10'

0.9555555555555556


'CNBP_11'

0.6565656565656566


'CNBP_12'

0.7261146496815286


'CNBP_13'

0.8709677419354839


'CNBP_14'

0.9365079365079365


'CNBP_15'

0.698019801980198


'CNBP_16'

0.7671957671957672


'CNBP_17'

0.42592592592592593


'CNBP_18'

0.9523809523809523


'CNBP_19'

0.8817204301075269


'CNBP_20'

0.8888888888888888


'CNBP_21'

0.7219512195121951


'CNBP_22'

0.7105263157894737


'CNBP_23'

0.9552238805970149


'CNBP_24'

0.9323308270676691


'CNBP_25'

0.981651376146789


'CNBP_26'

0.9032258064516129


'DMPK_01'

0.5963302752293578


'DMPK_02'

0.5858585858585859


'DMPK_03'

0.5575221238938053


'DMPK_04'

0.5663716814159292


'DMPK_05'

0.9428571428571428


'DMPK_05_repeat'

0.5390625


'DMPK_06'

0.543859649122807


'DMPK_07'

0.6181818181818182


'DMPK_08'

0.5892857142857143


'DMPK_09'

0.4748201438848921


'DMPK_10'

0.9156626506024096


'DMPK_11'

0.9017857142857143


'DMPK_12'

0.6666666666666666


'DMPK_13'

0.8924731182795699


'DMPK_14'

0.6111111111111112


'DMPK_15'

0.8846153846153846


'DMPK_16'

0.76


'DMPK_17'

0.84


'DMPK_18'

0.6262626262626263


'DMPK_19'

0.7209302325581395


'DMPK_20'

0.8518518518518519


'DMPK_21'

0.6153846153846154


'DMPK_22'

0.6576576576576577


'DMPK_23'

1.0


'DMPK_24'

0.7452830188679245


'DMPK_25'

0.7049180327868853


'DMPK_26'

0.6372549019607843


'DMPK_27'

0.5643564356435643


'DMPK_28'

0.41496598639455784


'DMPK_29'

0.52


'DMPK_30'

0.5614035087719298


'DMPK_HvB_01'

0.768


'DMPK_HvB_02'

0.8169014084507042


'DMPK_HvB_03'

0.6233766233766234


'DMPK_RW_01'

0.847457627118644


'DMPK_RW_02'

0.7175572519083969


'DMPK_RW_03'

0.648


'DMPK_RW_04'

0.46835443037974683


'DMPK_RW_05'

0.7481481481481481


'DMPK_RW_06'

0.46153846153846156


'FMR1_01'

0.9523809523809523


'FMR1_02'

0.5714285714285714


'FMR1_03'

0.9315068493150684


'FMR1_05'

0.7903225806451613


'FMR1_06'

0.9354838709677419


'FMR1_07'

0.5803571428571429


'FMR1_08'

0.575


'FMR1_09'

0.86


'FMR1_10'

0.8478260869565217


'FMR1_11'

0.8191489361702128


'FMR1_12'

0.4358974358974359


'FMR1_13'

0.8048780487804879


'FMR1_14'

0.828125


'FXN_01'

0.8242424242424242


'FXN_02'

0.865979381443299


'NOP56_01'

0.5315315315315315


'NOP56_02'

0.36492890995260663


'RFC1_01'

0.7073170731707317


'RFC1_02'

0.42073170731707316


'RFC1_03'

0.8253012048192772


'RFC1_04'

0.4550898203592814


'RFC1_05'

0.8671875


'RFC1_06'

0.6915422885572139


'RFC1_07'

0.4942528735632184


'RFC1_08'

0.32679738562091504


'RFC1_09'

0.8028169014084507


'RFC1_10'

0.6853146853146853


'RFC1_11'

0.8117647058823529


'RFC1_12'

0.6275862068965518


'RFC1_13'

0.47058823529411764


'RFC1_14'

0.7272727272727273


'RFC1_15'

0.62


'RFC1_16'

0.7631578947368421


'RFC1_17'

0.46107784431137727


'RFC1_18'

0.9508196721311475


'RFC1_19'

0.7686567164179104


'RFC1_20'

0.4875


'RFC1_21'

0.9009009009009009


'RFC1_22'

0.6642335766423357


'RFC1_23'

0.5524475524475524


'RFC1_24'

0.7351351351351352


'RFC1_25'

0.3967391304347826


'RFC1_26'

0.7654320987654321


'RFC1_27'

0.7380952380952381


'RFC1_28'

0.3805970149253731


'RFC1_29'

0.4375


'RFC1_30'

0.8782608695652174


'STARD7_39292'

0.6521739130434783


### STEP 3: Assign Allele to each file

After step 2, each sample's meta table was manually curated to designate alleles. Alleles were assigned based on de novo assembly and Enfocus-estimated repeat expansion counts. Then, each mapped molecule was assigned an allele based on its sample's curated meta table.

In [566]:
# STEP 3: copy the manually curated allele labels from each sample's
# `repeat_info` sheet onto the molecules in its `molecules_mapped` sheet, then
# write both sheets (plus a copy of the summary CSV) to the output folder.
in_path = r"C:/Users/sshukor/OneDrive - Bionano Genomics/Documents/Repeat Expansion Project/for_Radboud_repeat_expansion_paper/Mol_distance_mapped/05Dec2023/"
# Defined once, outside the loop; kept as a plain string under the same name
# because it was already a notebook-level variable.
out_path = r"C:/Users/sshukor/OneDrive - Bionano Genomics/Documents/Repeat Expansion Project/for_Radboud_repeat_expansion_paper/Mol_distance_mapped/Alleles_mapped_06Dec2023/"
# ExcelWriter does not create missing folders, so make sure the target exists.
os.makedirs(out_path, exist_ok=True)

samples = pd.read_csv(os.path.join(in_path, 'mol_dist_xmap_input_summary.csv'))

for s in samples.itertuples():
    # s[0] is the row index, so s[1] is the first CSV column (the sample name).
    sample_name = s[1]
    display(sample_name)
    workbook_name = f'{sample_name}_complete_data_mapped.xlsx'

    # The context manager closes the workbook handle once both sheets are read;
    # previously one handle per sample was left open.
    with pd.ExcelFile(os.path.join(in_path, workbook_name)) as xls:
        mol_df = pd.read_excel(xls, 'molecules_mapped')
        meta_df = pd.read_excel(xls, 'repeat_info')

    # Cast both key columns to the same string dtype so the lookup below
    # compares like with like.
    mol_df['best_mapID'] = mol_df['best_mapID'].astype('string')
    meta_df['MapID'] = meta_df['MapID'].astype('string')

    # MapID -> Allele lookup taken from the curated meta table. Molecules whose
    # best_mapID has no entry in the table end up with a missing Allele.
    mapping = dict(meta_df[['MapID', 'Allele']].values)
    mol_df['Allele'] = mol_df['best_mapID'].map(mapping)

    # Drop the leftover index column (presumably written by the previous step's
    # to_excel); errors='ignore' keeps the cell working if it is already gone.
    mol_df = mol_df.drop(columns=['Unnamed: 0'], errors='ignore')
    meta_df = meta_df.drop(columns=['Unnamed: 0'], errors='ignore')

    with pd.ExcelWriter(os.path.join(out_path, workbook_name)) as writer:
        mol_df.to_excel(writer, sheet_name='molecules_mapped', index=False)
        meta_df.to_excel(writer, sheet_name='repeat_info', index=False)

# Keep a copy of the (unchanged) sample summary next to the allele-mapped workbooks.
samples.to_csv(os.path.join(out_path, 'mol_dist_xmap_input_summary.csv'), index=False)

'SUSL005'

'SUSL007'

'SUSL008'

'SUSL012'

'SUSL013'

'c9orf72_08-362'

'c9orf72_106340'

'CNBP_01'

'CNBP_02'

'CNBP_03'

'CNBP_04'

'CNBP_05'

'CNBP_06'

'CNBP_07'

'CNBP_08'

'CNBP_10'

'CNBP_11'

'CNBP_12'

'CNBP_13'

'CNBP_14'

'CNBP_15'

'CNBP_16'

'CNBP_17'

'CNBP_18'

'CNBP_19'

'CNBP_20'

'CNBP_21'

'CNBP_22'

'CNBP_23'

'CNBP_24'

'CNBP_25'

'CNBP_26'

'DMPK_01'

'DMPK_02'

'DMPK_03'

'DMPK_04'

'DMPK_05'

'DMPK_05_repeat'

'DMPK_06'

'DMPK_07'

'DMPK_08'

'DMPK_09'

'DMPK_10'

'DMPK_11'

'DMPK_12'

'DMPK_13'

'DMPK_14'

'DMPK_15'

'DMPK_16'

'DMPK_17'

'DMPK_18'

'DMPK_19'

'DMPK_20'

'DMPK_21'

'DMPK_22'

'DMPK_23'

'DMPK_24'

'DMPK_25'

'DMPK_26'

'DMPK_27'

'DMPK_28'

'DMPK_29'

'DMPK_30'

'DMPK_HvB_01'

'DMPK_HvB_02'

'DMPK_HvB_03'

'DMPK_RW_01'

'DMPK_RW_02'

'DMPK_RW_03'

'DMPK_RW_04'

'DMPK_RW_05'

'DMPK_RW_06'

'FMR1_01'

'FMR1_02'

'FMR1_03'

'FMR1_05'

'FMR1_06'

'FMR1_07'

'FMR1_08'

'FMR1_09'

'FMR1_10'

'FMR1_11'

'FMR1_12'

'FMR1_13'

'FMR1_14'

'FXN_01'

'FXN_02'

'NOP56_01'

'NOP56_02'

'RFC1_01'

'RFC1_02'

'RFC1_03'

'RFC1_04'

'RFC1_05'

'RFC1_06'

'RFC1_07'

'RFC1_08'

'RFC1_09'

'RFC1_10'

'RFC1_11'

'RFC1_12'

'RFC1_13'

'RFC1_14'

'RFC1_15'

'RFC1_16'

'RFC1_17'

'RFC1_18'

'RFC1_19'

'RFC1_20'

'RFC1_21'

'RFC1_22'

'RFC1_23'

'RFC1_24'

'RFC1_25'

'RFC1_26'

'RFC1_27'

'RFC1_28'

'RFC1_29'

'RFC1_30'

'STARD7_39292'