In [22]:
import ast
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from Karsimulator_Start_Genome import get_event_chr
from COMPARISON_with_graphs import *

## Check if dependent cluster segregation is correct

Result: Passed

In [33]:
# Folder (relative path) whose entries are listed and parsed by the following cells.
data_folder = 'cluster_files_testbuild/'

In [34]:
# Collect the cluster file names in sorted order.
# os.listdir returns every directory entry, including stray ones such as
# '.DS_Store' or '.ipynb_checkpoints'; keep only names containing 'cluster_',
# which every cluster file name is expected to include.
files = sorted(name for name in os.listdir(data_folder) if 'cluster_' in name)

In [35]:
# Build one record (dict) per cluster file, combining the cluster header with
# the costs read from the matching alignment file.
data = []

for file in files:
	# Sample name: everything before the literal 'cluster' in the file name.
	file_name = file.split('cluster')[0]

	# Only the first line of the cluster file is used. Its '<...>' fields are
	# read positionally: cluster id, origin chromosomes, n_path_karsim, n_path_omkar.
	with open(os.path.join(data_folder, file)) as fp_read:
		header = fp_read.readline()
	matches = re.findall(r'<(.*?)>', header)
	if len(matches) < 4:
		raise ValueError(f'{file}: expected 4 <...> fields in the first line, found {len(matches)}')

	# NOTE: this field was previously parsed with eval(), which would execute
	# arbitrary code found in the file. literal_eval accepts Python literals only.
	origins = ast.literal_eval(matches[1])
	new_data = {'file_name': file_name,
				'cluster': matches[0],
				'n_origin_chr': len(origins),
				'origin_chr': origins,
				'n_path_karsim': int(matches[2]),
				'n_path_omkar': int(matches[3])}

	# The alignment file shares the cluster file's name up to the first '.'.
	alignment_file = file.split('.')[0] + '.alignment.txt'
	with open(os.path.join('alignment_files', alignment_file)) as fp_read:
		# First line: '<label>: <total cost>'.
		first_line = fp_read.readline().rstrip('\n')
		new_data['total_alignment_cost'] = int(first_line.split(': ')[1])
		# Every later line starting with 'alignment' carries 'cost: <int>'.
		new_data['alignment_costs'] = [
			int(line.rstrip('\n').split('cost: ')[1])
			for line in fp_read
			if line.startswith('alignment')
		]
	data.append(new_data)

In [36]:
# One row per parsed cluster file; columns are the keys of the record dicts in data.
df = pd.DataFrame(data)

In [37]:
# For each file_name, concatenate the origin_chr lists of all its clusters
# into one flat list, then turn the grouped result back into a regular frame.
merged_by_filename_df = (
	df.groupby('file_name')['origin_chr']
	.agg(lambda lists: [chrom for origins in lists for chrom in origins])
	.reset_index()
)

In [38]:
def custom_sort(chr_list):
    """Return a new list of chromosome labels in karyotype order.

    ``Chr1`` .. ``Chr22`` come first in numeric order, followed by ``ChrX``
    and then ``ChrY``. Any other label is ranked last; because ``sorted`` is
    stable, such labels keep their relative input order.
    """
    labels = [f'Chr{n}' for n in range(1, 23)] + ['ChrX', 'ChrY']
    rank = {label: position for position, label in enumerate(labels, start=1)}

    def rank_of(label):
        # Unknown labels sort after every known chromosome.
        return rank.get(label, float('inf'))

    return sorted(chr_list, key=rank_of)

# Add a column holding each row's chromosome labels reordered by custom_sort.
merged_by_filename_df['origin_chr_sorted'] = merged_by_filename_df['origin_chr'].apply(custom_sort)

In [39]:
# Number of chromosome labels collected for each file_name.
merged_by_filename_df['n_chr'] = merged_by_filename_df['origin_chr_sorted'].apply(len)

In [40]:
def check_chr_coverage(row):
	"""Return True if ``row`` is exactly a complete chromosome list.

	``row`` is compared against two reference lists: ``Chr1`` .. ``Chr22``
	plus ``ChrX``, and the same list with ``ChrY`` appended. The comparison is
	list equality, so order matters and a missing or repeated label gives False.
	"""
	autosomes = ['Chr' + str(n) for n in range(1, 23)]
	complete_sets = (autosomes + ['ChrX'], autosomes + ['ChrX', 'ChrY'])
	return any(row == expected for expected in complete_sets)

# True where the sorted labels exactly match one of the two complete chromosome lists.
merged_by_filename_df['coverage'] = merged_by_filename_df['origin_chr_sorted'].apply(check_chr_coverage)

In [41]:
# Display the merged per-file_name table.
merged_by_filename_df

Unnamed: 0,file_name,origin_chr,origin_chr_sorted,n_chr,coverage
0,23X_15q26_overgrowth_r1,"[Chr1, Chr2, Chr11, Chr12, Chr13, Chr14, Chr15...","[Chr1, Chr2, Chr3, Chr4, Chr5, Chr6, Chr7, Chr...",23,True
1,23X_1q21_recurrent_microduplication_r1,"[Chr1, Chr2, Chr12, Chr13, Chr14, Chr15, Chr16...","[Chr1, Chr2, Chr3, Chr4, Chr5, Chr6, Chr7, Chr...",23,True
2,23X_1q21_recurrent_microduplication_r2,"[Chr3, Chr4, Chr15, Chr5, Chr16, Chr17, Chr18,...","[Chr1, Chr2, Chr3, Chr4, Chr5, Chr6, Chr7, Chr...",23,True
3,23X_22q11-2_distal_deletion_r1,"[Chr1, Chr2, Chr11, Chr12, Chr13, Chr14, Chr15...","[Chr1, Chr2, Chr3, Chr4, Chr5, Chr6, Chr7, Chr...",23,True
4,23X_22q11-2_distal_deletion_r2,"[Chr2, Chr3, Chr13, Chr14, Chr15, Chr16, Chr17...","[Chr1, Chr2, Chr3, Chr4, Chr5, Chr6, Chr7, Chr...",23,True
5,23X_22q11_duplication_r2,"[Chr1, Chr2, Chr12, Chr13, Chr6, Chr15, Chr16,...","[Chr1, Chr2, Chr3, Chr4, Chr5, Chr6, Chr7, Chr...",23,True
6,23X_Angelman_r1,"[Chr2, Chr3, Chr12, Chr13, Chr14, Chr15, Chr16...","[Chr1, Chr2, Chr3, Chr4, Chr5, Chr6, Chr7, Chr...",23,True
7,23X_Angelman_r2,"[Chr2, Chr3, Chr13, Chr14, Chr15, Chr16, Chr17...","[Chr1, Chr2, Chr3, Chr4, Chr5, Chr6, Chr7, Chr...",23,True
8,23X_Cri_du_Chat_r1,"[Chr1, Chr2, Chr13, Chr14, Chr15, Chr16, Chr17...","[Chr1, Chr2, Chr3, Chr4, Chr5, Chr6, Chr7, Chr...",23,True
9,23X_Cri_du_Chat_r2,"[Chr1, Chr2, Chr12, Chr13, Chr14, Chr15, Chr16...","[Chr1, Chr2, Chr3, Chr4, Chr5, Chr6, Chr7, Chr...",23,True


In [42]:
# Count the rows whose coverage check came out False.
n_failed_coverage = merged_by_filename_df['coverage'].eq(False).sum()
n_failed_coverage

0

## Investigate whether the new OMKar build reduced the number of clusters that span multiple chromosomes

Result: No change

In [43]:
# Point data_folder at a different cluster directory; this rebinds the name used in the previous section.
data_folder = 'cluster_files_testbuild2/'

In [44]:
# Collect the cluster file names in sorted order.
# os.listdir returns every directory entry, including stray ones such as
# '.DS_Store' or '.ipynb_checkpoints'; keep only names containing 'cluster_',
# which every cluster file name is expected to include.
files = sorted(name for name in os.listdir(data_folder) if 'cluster_' in name)

In [45]:
# Build one record (dict) per cluster file, combining the cluster header with
# the costs read from the matching alignment file.
data = []

for file in files:
	# Sample name: everything before the literal 'cluster' in the file name.
	file_name = file.split('cluster')[0]

	# Only the first line of the cluster file is used. Its '<...>' fields are
	# read positionally: cluster id, origin chromosomes, n_path_karsim, n_path_omkar.
	with open(os.path.join(data_folder, file)) as fp_read:
		header = fp_read.readline()
	matches = re.findall(r'<(.*?)>', header)
	if len(matches) < 4:
		raise ValueError(f'{file}: expected 4 <...> fields in the first line, found {len(matches)}')

	# NOTE: this field was previously parsed with eval(), which would execute
	# arbitrary code found in the file. literal_eval accepts Python literals only.
	origins = ast.literal_eval(matches[1])
	new_data = {'file_name': file_name,
				'cluster': matches[0],
				'n_origin_chr': len(origins),
				'origin_chr': origins,
				'n_path_karsim': int(matches[2]),
				'n_path_omkar': int(matches[3])}

	# The alignment file shares the cluster file's name up to the first '.'.
	alignment_file = file.split('.')[0] + '.alignment.txt'
	with open(os.path.join('alignment_files', alignment_file)) as fp_read:
		# First line: '<label>: <total cost>'.
		first_line = fp_read.readline().rstrip('\n')
		new_data['total_alignment_cost'] = int(first_line.split(': ')[1])
		# Every later line starting with 'alignment' carries 'cost: <int>'.
		new_data['alignment_costs'] = [
			int(line.rstrip('\n').split('cost: ')[1])
			for line in fp_read
			if line.startswith('alignment')
		]
	data.append(new_data)

In [46]:
# Rebuild df from the records parsed in the previous cell (replaces the earlier df).
df = pd.DataFrame(data)

In [47]:
# Number of clusters whose origin list holds two or more chromosomes.
df['n_origin_chr'].ge(2).sum()

50

In [48]:
# Per-cluster difference n_path_omkar - n_path_karsim, followed by the number
# of clusters where the two path counts disagree.
df['n_path_diff'] = df['n_path_omkar'].sub(df['n_path_karsim'])
df['n_path_diff'].ne(0).sum()

19

In [49]:
# Show only the clusters whose path counts differ.
df.loc[df['n_path_diff'].ne(0)]

Unnamed: 0,file_name,cluster,n_origin_chr,origin_chr,n_path_karsim,n_path_omkar,total_alignment_cost,alignment_costs,n_path_diff
173,23X_Cri_du_Chat_r1,1,1,[Chr2],2,3,18881946,"[9500886, 19160, 9361900]",1
184,23X_Cri_du_Chat_r1,2,1,[Chr5],1,2,482396,"[89984, 392412]",1
214,23X_Cri_du_Chat_r2,9,2,"[Chr10, Chr11]",5,4,561934814,"[126999146, 300870288, 1214889, 132843435, 7056]",-1
267,23X_STS_r2,3,2,"[Chr2, Chr5]",4,5,10337022,"[5185137, 15432, 95405, 45277, 4995771]",1
269,23X_STS_r2,5,1,[Chr8],2,3,16057728,"[9122593, 25359, 6909776]",1
332,23Y_2p15-16-1_microdeletion_r2,0,1,[Chr1],2,3,10930353,"[5458997, 18587, 5452769]",1
344,23Y_2p15-16-1_microdeletion_r2,2,1,[Chr4],2,3,15177308,"[7671227, 8500, 7497581]",1
346,23Y_2p15-16-1_microdeletion_r2,3,1,[Chr6],2,3,89981517,"[89915181, 66336]",1
391,23Y_CMT1A_r1,3,1,[Chr5],1,2,482396,"[89984, 392412]",1
393,23Y_CMT1A_r1,5,2,"[Chr3, Chr7]",5,4,232634535,"[345896, 41577, 113872151, 110766683, 7608228]",-1
