In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import PIL.Image
import networkx as nx
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import string

import sys
path=r'C:\Users\31649\Documents\genome analysis\genome_topology\functions'
sys.path.append(path)

from genome_topology import open_pdb
from genome_topology import select_chrom
from genome_topology import geom_distance
from genome_topology import fractal_dimension
from genome_topology import get_matrix
from genome_topology import normalize_psc
from matrix_analysis import write_topology_matrix

## Functions

In [2]:
def SplitChromosomesIntoSegments(coordinates, n_segments):
    """Split ``coordinates`` into ``n_segments`` consecutive slices of equal length.

    Every slice holds ``len(coordinates) // n_segments`` items. The
    ``len(coordinates) % n_segments`` trailing items that do not fill a whole
    slice are left out.

    Parameters
    ----------
    coordinates : sequence
        Any object supporting ``len()`` and slicing (list, array, ...).
    n_segments : int
        Number of slices to produce; must be at least 1.

    Returns
    -------
    list
        ``n_segments`` slices of ``coordinates``. When there are fewer
        coordinates than segments, every slice is empty.

    Raises
    ------
    ValueError
        If ``n_segments`` is smaller than 1.
    """
    if n_segments < 1:
        raise ValueError(f'n_segments must be a positive integer, got {n_segments}')

    segment_length = len(coordinates) // n_segments

    # Slice by segment number rather than stepping a range by segment_length:
    # a step of 0 (fewer coordinates than segments) would make range() raise.
    return [coordinates[k * segment_length:(k + 1) * segment_length]
            for k in range(n_segments)]

def contact_indexes(pos1, pos2):
    """Pair two equally long position sequences into one array.

    ``pos1`` and ``pos2`` become the two rows of an array, which is then
    transposed, so row ``i`` of the result is ``[pos1[i], pos2[i]]``.
    """
    return np.array([pos1, pos2]).T

# Split the chromosomes into segments to produce local matrices

##  Load data
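Pick the resolution: it is the suffix of the input file names, so each sample is read from `<path>/<sample><resolution>`.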

In [3]:
# `resolution` is the suffix of the input file names: every sample is read
# from '<path>/<sample><resolution>'.
resolution = '40'
path = 'data/zoomify processed'
samples = ['Control1', 'Control2', 'Treated1', 'Treated2']

# One cleaned table per sample, in the same order as `samples`.
frames = []
for sample in samples:
    # Both chromosome columns are read as strings. Leaving 'chrom2' to type
    # inference can yield integers (or a mix of integers and strings), which
    # would never compare equal to the string names taken from 'chrom1'.
    df = pd.read_csv(f'{path}/{sample}{resolution}', sep='\t', header=None,
                     names=['chrom1', 'start1', 'end1', 'chrom2', 'start2', 'end2', 'count', 'balanced'],
                     dtype={'chrom1': 'str', 'chrom2': 'str'})

    # Drop rows with any missing value, renumber the rows, and tag each row
    # with its sample name so the tables can be told apart once concatenated.
    frames.append(df.dropna().reset_index(drop=True).assign(Sample=sample))

control1, control2, treated1, treated2 = frames

## Filter data by their number of counts
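 - Choose the quantile (`threshold_quantile`) of the counts pooled over all samples; contacts with a count below that threshold are discarded.
 - Choose the column to filter by (`count`): either `count` or `balanced` (balanced count).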

In [5]:
threshold_quantile = 0.99
count = 'count'   # column used for filtering: 'count' or 'balanced'

frames = [control1, control2, treated1, treated2]

# The threshold is a single value computed on the counts of all samples
# pooled together, so every sample is filtered against the same cut-off.
threshold = pd.concat([frame[count] for frame in frames],
                      ignore_index=True).quantile(threshold_quantile)

# Keep the rows at or above the threshold and stack the samples in one go.
# (A single concat avoids rebinding `frames` while it is being iterated and
# avoids growing the table from an empty DataFrame.)
contacts = pd.concat([frame[frame[count] >= threshold] for frame in frames])

# Number the rows 1..N across all samples; the index is named 'Index'.
contacts.index = pd.Index(np.arange(1, len(contacts) + 1), name='Index')

In [6]:
# Distinct chromosome names found in the 'chrom1' column of the first control
# sample; these are the chromosomes processed for every sample further down.
chromosomes = pd.unique(control1['chrom1'])


## Split chromosomes into segments and produce topology matrices for the segments
Pick the path for saving the matrices

In [8]:
# When True, the matrix of every segment is written to disk with
# write_topology_matrix; when False, matrices are only computed.
save_matrix = False

In [None]:
n_segments = 4
path_matrices = f'results counts/matrices/local matrices/{resolution}/{n_segments} segments'
plot_matrix = False

for sample in samples:
    print(sample)
    contacts_sample = contacts[contacts['Sample'] == sample]
    for chrom in chromosomes:
        print(f'Chromosome {chrom}')

        # Keep contacts whose two anchors both lie on this chromosome. The
        # explicit copy makes the column assignments below act on an
        # independent table instead of on a slice of `contacts_sample`.
        same_chrom = (contacts_sample['chrom1'] == chrom) & (contacts_sample['chrom2'] == chrom)
        contacts_chr = contacts_sample[same_chrom].copy()

        # Each anchor is represented by the midpoint of its [start, end] interval.
        contacts_chr['position1'] = contacts_chr['start1'] + (contacts_chr['end1'] - contacts_chr['start1']) // 2
        contacts_chr['position2'] = contacts_chr['start2'] + (contacts_chr['end2'] - contacts_chr['start2']) // 2

        # The segmented range must reach the furthest anchor of either side;
        # using 'end1' alone would leave out contacts whose second anchor lies
        # beyond the largest first-anchor coordinate.
        maxim_coord = contacts_chr[['end1', 'end2']].max().max()
        bins = np.linspace(0, maxim_coord, n_segments + 1, dtype=int)

        segments = []
        for t in range(n_segments):
            lower, upper = bins[t], bins[t + 1]
            # Half-open interval [lower, upper): a position that falls exactly
            # on a bin edge belongs to the segment starting there, instead of
            # being excluded from both neighbouring segments. Only contacts
            # with both anchors inside the same segment are kept.
            inside = (
                (contacts_chr['position1'] >= lower) & (contacts_chr['position1'] < upper)
                & (contacts_chr['position2'] >= lower) & (contacts_chr['position2'] < upper)
            )
            local_contacts = contacts_chr[inside]
            segments.append(contact_indexes(local_contacts['position1'], local_contacts['position2']))

        for j, segment in enumerate(segments):
            print(j)
            mat, psc = get_matrix(segment, 'segm')
            if save_matrix:
                write_topology_matrix(mat, path_matrices, namefile=f'{chrom}_{sample}_{j}')

            if plot_matrix:
                plt.figure()
                plt.imshow(mat)
        
        

Control1
Chromosome 1
0
1
