In [51]:
import pandas as pd
import torch
import os
import sys
sys.path.insert(0,'../code/data_utils/')
from ConfigDataset import ConfigDataset

Load the data that was pre-processed with my own code (`ConfigDataset`), building one dataset per cell number and replicate.

In [29]:

# Nested lookup of datasets: all_config_datasets[cell_number][replicate].
# Cell numbers run from 1 to 17 with 8 left out; replicates are 0, 1 and 2.
all_config_datasets = {
    cell_idx: {
        rep_idx: ConfigDataset(
            '../data/processed_data.hdf5',
            # Selectors that differ between entries of the lookup
            cell_numbers=[cell_idx],
            replicates=[rep_idx],
            # Remaining selectors are all left at None
            geos=None,
            organisms=None,
            cell_types=None,
            chroms=None,
            # Region / batching options
            segment_length=64,
            batch_size=0,
            shuffle=True,
            allow_overlap=True,
            # Representation options
            remove_diagonal=False,
            normalize_distances=False,
            two_channels=False,
            try_GPU=False,
            mean_dist_fp=None,
            mean_sq_dist_fp=None,
        )
        for rep_idx in range(3)
    }
    for cell_idx in range(1, 18)
    if cell_idx != 8
}

# Disabled alternative, kept as a string literal so that it is never executed:
# a single ConfigDataset built with the same options as above, except that
# cell_numbers and replicates are both None.
'''
config_ds = ConfigDataset(
    '../data/processed_data.hdf5',
    segment_length=64,
    remove_diagonal=False,
    batch_size=0,
    normalize_distances=False,
    geos=None,
    organisms=None,
    cell_types=None,
    cell_numbers=None,
    chroms=None,
    replicates=None,
    shuffle=True,
    allow_overlap=True,
    two_channels=False,
    try_GPU=False,
    mean_dist_fp=None,
    mean_sq_dist_fp=None
)
'''

Load the coordinates directly from the raw Tan (2018) single-cell files, so they can be compared against the processed data.

In [19]:
# Directory holding the raw Tan single-cell data; its entries are scanned below
# (presumably one sub-directory per cell -- TODO confirm).
raw_tan_data_dir = '../data/tan_single-cell_2018/'


def get_cell_number(fp):
    """Parse the integer cell number out of the last component of a path.

    The last '/'-separated component is split on '_'; the final piece is then
    split on '-' and the text before the first '-' is converted to int, e.g.
    'some/dir/prefix_cell_12-suffix' -> 12.

    Parameters
    ----------
    fp : str
        Path to a file or directory. A trailing '/' is tolerated, so a
        directory written as 'some/dir/name_12-x/' parses the same as
        'some/dir/name_12-x'.

    Returns
    -------
    int
        The parsed cell number.

    Raises
    ------
    ValueError
        If the selected piece of the name is not an integer.
    """
    # Without stripping, a trailing slash makes the last component '' and int('') fails
    last_component = fp.rstrip('/').split('/')[-1]
    return int( last_component.split('_')[-1].split('-')[0] )

# Find the directories that actually contain a cleaned structure file,
# i.e. a file whose name includes 'impute3.round4.clean.3dg'
all_dirs = [ os.path.join(raw_tan_data_dir, d) for d in os.listdir( raw_tan_data_dir ) ]
valid_dirs = []
for d in all_dirs:

    # Plain files sitting next to the per-cell folders cannot be listed
    # (os.listdir would raise NotADirectoryError), so skip them
    if not os.path.isdir(d):
        continue

    if any('impute3.round4.clean.3dg' in f for f in os.listdir(d)):
        # Store (cell number, directory) so that sorting orders by cell number
        valid_dirs.append(
            (
                get_cell_number(d),
                d
            )
        )
valid_dirs.sort()


In [39]:
def get_raw_coords(replicate_number,directory):
    """Load the cleaned structure file of one replicate from a raw data directory.

    Candidate files are those in `directory` whose name contains
    'impute3.round4.clean.3dg'. A file tagged '_rep1_' belongs to replicate 1,
    a file tagged '_rep2_' to replicate 2, and an untagged file to replicate 0.

    Parameters
    ----------
    replicate_number : int
        0, 1 or 2 (see the tagging rule above).
    directory : str
        Directory to search, with or without a trailing '/'.

    Returns
    -------
    pandas.DataFrame
        The tab-separated file parsed with the columns
        ['Chromosome', 'Genomic_Index', 'x', 'y', 'z'].

    Raises
    ------
    FileNotFoundError
        If no cleaned structure file exists for the requested replicate.
    """
    clean_structure_files = [f for f in os.listdir(directory) if 'impute3.round4.clean.3dg' in f]

    filepath = None
    for f in clean_structure_files:

        if '_rep1_' in f:
            if replicate_number == 1:
                filepath = os.path.join(directory, f)
                break
        elif '_rep2_' in f:
            if replicate_number == 2:
                filepath = os.path.join(directory, f)
                break
        elif replicate_number == 0:
            # No break here: when several untagged files exist the last one listed is used
            filepath = os.path.join(directory, f)

    # Fail with a clear message instead of hitting an unbound `filepath` below
    if filepath is None:
        raise FileNotFoundError(
            f'No cleaned structure file for replicate {replicate_number} in {directory}'
        )

    coord_df = pd.read_csv(
        filepath,
        sep='\t',
        header=None,
        names=['Chromosome','Genomic_Index','x','y','z']
    )
    return coord_df

In [97]:
def extract_specific_region(sub_coord_df,genomic_index,region_length=64,resolution=20_000):
    """Collect the x/y/z coordinates of one genomic window, one tensor per chromosome label.

    Rows are kept when genomic_index <= Genomic_Index < genomic_index +
    resolution * region_length, and are sorted by Genomic_Index.

    Parameters
    ----------
    sub_coord_df : pandas.DataFrame
        Needs the columns 'Chromosome', 'Genomic_Index', 'x', 'y', 'z'.
    genomic_index : int
        First genomic position of the window (inclusive).
    region_length : int
        Number of bins in the window.
    resolution : int
        Width of one bin, in the units of Genomic_Index.

    Returns
    -------
    list of torch.Tensor
        One (n_rows, 3) tensor per distinct chromosome label in the window.
        Labels containing 'pat' come last; all other labels come first.
        The list is empty when no row falls inside the window.
    """
    window_end = genomic_index + resolution * region_length
    inside_window = (sub_coord_df.Genomic_Index >= genomic_index) & (sub_coord_df.Genomic_Index < window_end)
    window = sub_coord_df[inside_window].sort_values('Genomic_Index',axis=0,ignore_index=True)

    # Split the labels into the paternal copies and everything else
    paternal = []
    others = []
    for label in window.Chromosome.unique():
        xyz = torch.from_numpy(window[ window.Chromosome == label ][['x','y','z']].values)
        target = paternal if 'pat' in label else others
        target.append(xyz)

    # Non-paternal entries are placed in front, most recently seen first
    return others[::-1] + paternal

def _coords_match(processed, raw):
    """Return True when both tensors have the same shape and are element-wise close."""
    # Checking the shape first keeps torch.allclose from raising on
    # non-broadcastable inputs (e.g. a raw region with missing rows)
    return processed.shape == raw.shape and torch.allclose(processed, raw)


def compare_to_raw(cell_number, raw_directory, all_config_datasets=all_config_datasets):
    """Check every processed region of one cell against the raw coordinate files.

    For each replicate (0, 1, 2) the raw coordinates are loaded with
    get_raw_coords, and every region reported by the processed dataset's
    get_genomic_regions() is compared with the matching rows of the raw file.

    Parameters
    ----------
    cell_number : int
        Key into `all_config_datasets`.
    raw_directory : str
        Directory holding the raw structure files of this cell.
    all_config_datasets : dict
        Lookup indexed as all_config_datasets[cell_number][replicate].

    Returns
    -------
    unmatched_regions : list
        Three lists (one per replicate) of (chrom, genomic_index) tuples for
        regions whose raw and processed coordinates disagree, or whose raw
        data does not contain exactly two chromosome copies.
    n_regions : list of int
        Number of regions examined for each replicate.
    """
    unmatched_regions = [[],[],[]]
    n_regions = []

    for replicate in range(0,3):

        coord_df = get_raw_coords(replicate, raw_directory)

        processed_data = all_config_datasets[cell_number][replicate]

        genomic_regions = processed_data.get_genomic_regions()

        n_regions.append( len(genomic_regions) )

        for chrom in genomic_regions.Chromosome.unique():

            chrom_regions = genomic_regions[ genomic_regions.Chromosome == chrom ]

            # Raw rows of both copies (maternal and paternal) of this chromosome
            sub_coord_df = coord_df[ (coord_df.Chromosome == f'{chrom}(mat)') | (coord_df.Chromosome == f'{chrom}(pat)')  ]

            for _,row in chrom_regions.iterrows():
                genomic_index = row.Start
                coord_info, processed_coords = processed_data.fetch_specific_coords(chrom,genomic_index)
                raw_coords = extract_specific_region(sub_coord_df,genomic_index)

                # extract_specific_region returns a list that can hold fewer than
                # two tensors; treat that as a mismatch instead of indexing past the end
                matched = (
                    raw_coords is not None
                    and len(raw_coords) == 2
                    and _coords_match( processed_coords[coord_info.Lineage=='mat'].squeeze(), raw_coords[0] )
                    and _coords_match( processed_coords[coord_info.Lineage=='pat'].squeeze(), raw_coords[1] )
                )
                if not matched:
                    unmatched_regions[replicate].append( (chrom,genomic_index) )

    return unmatched_regions, n_regions

In [103]:
import time

# Per-cell comparison results:
# all_mismatched_regions[cell_number] = {'unmatched_regions': [...], 'n_regions': [...]},
# where both lists hold one entry per replicate (0, 1, 2).
all_mismatched_regions = {}

for cell_number, directory in valid_dirs:
    # perf_counter is a monotonic clock intended for measuring elapsed time
    start = time.perf_counter()
    unmatched_regions,n_regions = compare_to_raw(cell_number,directory)
    print(f'Cell {cell_number}: {time.perf_counter()-start:.4f}')

    # One line per replicate: cell, replicate, regions checked, regions mismatched
    for replicate_number in range(3):
        print( cell_number, replicate_number, n_regions[replicate_number], len(unmatched_regions[replicate_number]) )
    print('')
    all_mismatched_regions[cell_number] = {
        'unmatched_regions':unmatched_regions,
        'n_regions':n_regions
    }


Cell 1: 1922.0202
1 0 118163 0
1 1 118284 0
1 2 117986 0



KeyboardInterrupt: 

In [100]:
# Summarise the stored results: cell, replicate, regions checked, regions mismatched.
# Each entry of all_mismatched_regions holds two per-replicate lists under the
# keys 'n_regions' and 'unmatched_regions', so the replicates are walked by position.
for cell_number,cell_results in all_mismatched_regions.items():
    per_replicate = zip( cell_results['n_regions'], cell_results['unmatched_regions'] )
    for replicate_number,(region_count,unmatched) in enumerate(per_replicate):
        print( cell_number, replicate_number, region_count, len(unmatched) )

In [102]:
# Scratch check: the '.4f' format spec keeps four digits after the decimal point.
f'{1234.123412341234:.4f}'

'1234.1234'