# Predictions for high-loop-density areas

Author: Amulya Garimella

Last updated: 2025/05/02

## Input files

In [10]:
# Path to the .mcool contact-matrix file used by the cells below.
# NOTE(review): absolute, machine-specific path — consider deriving it from a configurable data directory.
input_cool_file = "/Users/amulyagarimella/Documents/2241finalproject/data/GM12878.GSE115524.Homo_Sapiens.CTCF.b1.mcool"

## Baseline 1: linear + logistic regression

In [11]:
import cooler
import numpy as np
from sklearn.linear_model import LogisticRegression, LinearRegression
import pybedtools
import pandas as pd

In [24]:
# Chromosome-size table plus the resolution and chromosome analysed below.
# NOTE(review): this cell used to announce "64 sub-matrices", but nothing here sets
# that number — confirm the intended window count where the tiling is computed.

# Tab-separated, no header; the first column becomes the index (looked up by chromosome name below).
chrsizes = pd.read_csv("/Users/amulyagarimella/Documents/2241finalproject/HiC2MicroC/data/hg38.sizes", sep="\t", header=None, index_col=0)
# Resolution selected inside the mcool file (presumably bin size in bp — confirm).
input_res = 10000
CHRNAME = "chr1"
# First remaining column of the CHRNAME row; comes back as a NumPy scalar, not a Python int.
CHRLEN = chrsizes.loc[CHRNAME].values[0]

In [26]:
# Loop-catalog file for this sample; the file name embeds input_res, so it follows the chosen resolution.
input_loops_file = f"/Users/amulyagarimella/Documents/2241finalproject/data/loops/GM12878.GSE115524.Homo_Sapiens.CTCF.b1.{input_res}.fithichip_hp.longrange.bed"

def process_loop_catalog_dataset(dataset_path):
    """
    Load a loop-catalog file into a pandas DataFrame of anchor pairs.

    The file is read as tab-separated text without a header. Columns 0-2 hold the
    first anchor (chromosome, start, end); column 3 holds the second anchor and the
    score packed as ``chrom:start-end,score``.

    Parameters
    ----------
    dataset_path : str or path-like
        Location of the loop file.

    Returns
    -------
    pandas.DataFrame
        Columns ``chr_1, start_1, end_1, chr_2, start_2, end_2, score``; the four
        coordinate columns are integers and ``score`` is float.

    Raises
    ------
    ValueError
        If any row's fourth column does not match ``chrom:start-end,score``.
    """
    raw = pd.read_csv(dataset_path, header=None, sep='\t')

    # Parse the packed field positionally instead of splitting on every ',', ':' or '-':
    # a blanket split breaks on scores such as "1e-05" or "-3.2", whose '-' produced an
    # extra column and a length-mismatch error further down.
    partner = raw[3].str.extract(r'^([^:]+):(\d+)-(\d+),(.+)$', expand=True)
    if partner.isna().any().any():
        bad_rows = partner.index[partner.isna().any(axis=1)].tolist()
        raise ValueError(
            f"Could not parse 'chrom:start-end,score' in rows {bad_rows[:5]} of {dataset_path}"
        )

    df = pd.concat([raw.drop(columns=[3]), partner], axis=1)
    df.columns = ["chr_1", "start_1", "end_1", "chr_2", "start_2", "end_2", "score"]

    # The parsed pieces are strings; give coordinates and score numeric dtypes.
    coord_cols = ["start_1", "end_1", "start_2", "end_2"]
    df[coord_cols] = df[coord_cols].astype(int)
    df["score"] = df["score"].astype(float)
    return df

# Parsed loop table (one row per listed anchor pair); used later to count loops per window.
loops = process_loop_catalog_dataset(input_loops_file)

In [27]:
# Open the contact map at the working resolution.
clr = cooler.Cooler(f'{input_cool_file}::/resolutions/{input_res}')

# Origin and extent of an exploratory window.
# NOTE(review): `size` is computed in the same units as CHRLEN, while the disabled
# line below would use it as a matrix index — confirm the intended units before reviving it.
start = 0
size = min(input_res*100, CHRLEN)
#mat = clr.matrix(balance=False)[start:start+size, start:start+size]

In [47]:
# Tiling geometry; names ending in _bins are measured in bins.
min_submatrices = 100
chrlen_bins = CHRLEN // input_res  # floor division: a trailing partial bin is not counted

submatrix_len = input_res * 100
submatrix_len_bins = submatrix_len // input_res

# n windows of length L overlapping by o cover L*n - o*(n-1) bins; solve for o so that
# min_submatrices windows span the chromosome, and clamp at 0 when no overlap is needed.
overlap_len = max(
    (min_submatrices * submatrix_len_bins - chrlen_bins) // (min_submatrices - 1),
    0,
)
stridelen_bins = submatrix_len_bins - overlap_len

In [48]:
# Sanity check: show the chromosome length read from the sizes table.
CHRLEN

np.int64(248956422)

In [54]:
# Feature extraction: tile the chromosome's contact matrix along its diagonal and
# flatten each square tile into one feature vector.
# Labels: number of catalogued loops whose two anchors both lie inside the tile.
features = []
labels = []

# Dense raw-count matrix selector. The previous `clr.pixels()[i:i_end]` sliced ROWS of
# the sparse pixel table (i.e. 100 arbitrary pixel records), not bins i..i_end, so the
# features did not correspond to the window the labels were computed for.
matrix_selector = clr.matrix(balance=False)

# The chromosome filter does not depend on the window, so apply it once up front.
chrom_loops = loops.loc[(loops['chr_1'] == CHRNAME) & (loops['chr_2'] == CHRNAME)]

for i in np.arange(0, chrlen_bins, stridelen_bins):
    # Window bounds in bins; the last window may be shorter than submatrix_len_bins
    i_end = min(i + submatrix_len_bins, chrlen_bins)

    # Same bounds as genomic coordinates (bin index * resolution)
    start_coord = int(i*input_res)
    end_coord = int(i_end*input_res)

    # Square tile of contacts between every pair of bins in the window
    submat = matrix_selector.fetch(f"{CHRNAME}:{start_coord}-{end_coord}")

    # Zero-pad a short final tile on the bottom/right so every feature vector has
    # submatrix_len_bins**2 entries
    n_missing = submatrix_len_bins - submat.shape[0]
    if n_missing > 0:
        submat = np.pad(submat, ((0, n_missing), (0, n_missing)), 'constant')
        print(f"Submatrix {i} padded to {submatrix_len_bins} bins")

    features.append(submat.ravel())

    # Loops with both anchors fully inside [start_coord, end_coord]
    loops_in_submat = chrom_loops.loc[
        (chrom_loops['start_1'] >= start_coord) &
        (chrom_loops['end_1'] <= end_coord) &
        (chrom_loops['start_2'] >= start_coord) &
        (chrom_loops['end_2'] <= end_coord)
    ]

    labels.append(len(loops_in_submat))
    if len(loops_in_submat) > 0:
        print(f"Submatrix {i} has {len(loops_in_submat)} loops")
    else:
        print(f"Submatrix {i} has no loops")


# Convert to numpy arrays
features = np.array(features)
print(f"Created {len(features)} submatrices")
print(f"Feature shape: {features.shape}")

# Convert labels to numpy array
labels = np.array(labels)
print(f"Labels shape: {labels.shape}")

Submatrix 0 has 8 loops
Submatrix 100 has 76 loops
Submatrix 200 has 62 loops
Submatrix 300 has 20 loops
Submatrix 400 has 2 loops
Submatrix 500 has 12 loops
Submatrix 600 has 42 loops
Submatrix 700 has 14 loops
Submatrix 800 has 38 loops
Submatrix 900 has 64 loops
Submatrix 1000 has 70 loops
Submatrix 1100 has 48 loops
Submatrix 1200 has 72 loops
Submatrix 1300 has 12 loops
Submatrix 1400 has 14 loops
Submatrix 1500 has 46 loops
Submatrix 1600 has 50 loops
Submatrix 1700 has 50 loops
Submatrix 1800 has 10 loops
Submatrix 1900 has 66 loops
Submatrix 2000 has 18 loops
Submatrix 2100 has 42 loops
Submatrix 2200 has 20 loops
Submatrix 2300 has 60 loops
Submatrix 2400 has 52 loops
Submatrix 2500 has no loops
Submatrix 2600 has no loops
Submatrix 2700 has no loops
Submatrix 2800 has no loops
Submatrix 2900 has no loops
Submatrix 3000 has no loops
Submatrix 3100 has no loops
Submatrix 3200 has no loops
Submatrix 3300 has no loops
Submatrix 3400 has no loops
Submatrix 3500 has no loops
Submat

In [55]:
# Baseline: ordinary least squares mapping each window's feature vector to its loop count.

linreg = LinearRegression()
linreg.fit(X=features, y=labels)

In [57]:
# Display the estimator fitted in the previous cell.
linreg

## Baseline 2: single-layer CNN

In [9]:
import torch
import torch.nn as nn
import cooler

In [None]:
# Load a small dense patch of the contact matrix as a (1, 1, H, W) tensor.
# Uses the notebook's configured mcool and resolution rather than a placeholder path.
clr = cooler.Cooler(f'{input_cool_file}::/resolutions/{input_res}')

# First bin of the patch. This name was never defined, so the cell raised NameError
# on a fresh kernel. TODO: point it at a region of interest.
i0 = 0
mat = clr.matrix(balance=True)[i0:i0+3, i0:i0+3]

# Balanced matrices can contain NaN (e.g. bins without a balancing weight); zero them so
# they do not propagate through the convolutions.
x = torch.nan_to_num(torch.tensor(mat, dtype=torch.float32), nan=0.0)[None,None]

class LiteLoopNet(nn.Module):
    """Minimal CNN baseline that scores a single-channel contact-map patch.

    Architecture: 3x3 convolution -> ReLU -> 1x1 convolution -> global average
    pool -> sigmoid.

    Input:  tensor of shape (N, 1, H, W).
    Output: tensor of shape (N,) with one value in (0, 1) per patch.
    """
    def __init__(self):
        super().__init__()
        self.dw = nn.Conv2d(1,1,3,padding=1,groups=1)  # depthwise on 1 channel
        self.pw = nn.Conv2d(1,1,1)                     # pointwise
        self.pool = nn.AdaptiveAvgPool2d(1)

    def forward(self, x):
        # The non-linearity sits between the two convolutions. Applying ReLU to the final
        # activation (as before) made the pooled value non-negative, which pinned the
        # sigmoid output to [0.5, 1) so the model could never predict "no loop".
        hidden = nn.functional.relu(self.dw(x))
        logits = self.pool(self.pw(hidden)).flatten()  # (N, 1, 1, 1) -> (N,)
        return torch.sigmoid(logits)

# Instantiate the (untrained) baseline and run a single forward pass on the patch.
model = LiteLoopNet()
out = model(x)


## GNN

In [5]:
import cooler
import torch
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GCNConv

  Referenced from: <9BBB4F05-41BA-385E-B2A3-76313944D73A> /Users/amulyagarimella/Documents/2241finalproject/predictions_venv/lib/python3.11/site-packages/libpyg.so
  Reason: tried: '/Library/Frameworks/Python.framework/Versions/3.11/Python' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/Library/Frameworks/Python.framework/Versions/3.11/Python' (no such file), '/Library/Frameworks/Python.framework/Versions/3.11/Python' (no such file)
  Referenced from: <9BBB4F05-41BA-385E-B2A3-76313944D73A> /Users/amulyagarimella/Documents/2241finalproject/predictions_venv/lib/python3.11/site-packages/libpyg.so
  Reason: tried: '/Library/Frameworks/Python.framework/Versions/3.11/Python' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/Library/Frameworks/Python.framework/Versions/3.11/Python' (no such file), '/Library/Frameworks/Python.framework/Versions/3.11/Python' (no such file)


In [None]:
# 1. load cooler pixels for region
# Uses the notebook's configured mcool, resolution and chromosome rather than placeholders.
clr = cooler.Cooler(f'{input_cool_file}::/resolutions/{input_res}')

# Inclusive bin-id window. These names were never defined, so the cell raised NameError
# on a fresh kernel. TODO: choose the region of interest.
# NOTE(review): pixel bin ids are compared directly against i0/i1 — this presumes the
# chromosome's bins start at id 0; otherwise add the chromosome's bin offset. Confirm.
i0, i1 = 0, 99

pixels = clr.pixels().fetch(CHRNAME)
# Keep only contacts whose two ends both fall inside the window
mask = (pixels['bin1_id'].between(i0,i1) &
        pixels['bin2_id'].between(i0,i1))
sub = pixels.loc[mask]

# 2. build graph edges + node features
# Nodes are the bins i0..i1 (inclusive) re-indexed from 0; each pixel is one edge.
src = torch.as_tensor(sub['bin1_id'].to_numpy() - i0, dtype=torch.long)
dst = torch.as_tensor(sub['bin2_id'].to_numpy() - i0, dtype=torch.long)
edge_index = torch.stack([src, dst])

# Contact counts are per-edge quantities, so they are stored as edge attributes.
edge_attr = torch.as_tensor(sub['count'].to_numpy(), dtype=torch.float32).unsqueeze(1)

# Node features need one row per NODE. Previously the per-pixel counts were passed as
# `x`, giving one row per edge, which does not line up with the node ids in edge_index.
# Here each bin's feature is its total contact count within the window.
num_nodes = int(i1 - i0 + 1)
counts = edge_attr.squeeze(1)
coverage = torch.zeros(num_nodes, dtype=torch.float32)
coverage.index_add_(0, src, counts)
off_diag = src != dst  # avoid counting self-contacts twice
coverage.index_add_(0, dst[off_diag], counts[off_diag])
x = coverage.unsqueeze(1)

# NOTE(review): if the pixel table lists each bin pair only once, edges are one-directional;
# consider adding the reversed edges so messages flow both ways — confirm.
data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)

# 3. GCN definition
class LoopGNN(torch.nn.Module):
    """Two stacked GCNConv layers producing one output value per node.

    Parameters
    ----------
    in_ch : int
        Width of the input node features.
    hid_ch : int
        Width of the hidden representation after the first layer.
    """
    def __init__(self, in_ch, hid_ch):
        super().__init__()
        self.conv1 = GCNConv(in_ch, hid_ch)
        self.conv2 = GCNConv(hid_ch, 1)

    def forward(self, d):
        """Run both layers on `d.x` / `d.edge_index`, with a ReLU in between."""
        node_feats, edges = d.x, d.edge_index
        hidden = self.conv1(node_feats, edges).relu()
        return self.conv2(hidden, edges)

# Fall back to CPU when CUDA is unavailable instead of failing on a hard-coded 'cuda'.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = LoopGNN(1,16).to(device)
opt   = torch.optim.Adam(model.parameters(), lr=1e-3)
crit  = torch.nn.BCEWithLogitsLoss()

# 4. training
# NOTE(review): DataLoader is imported from torch_geometric.data above; newer PyG
# releases expose it from torch_geometric.loader — confirm against the installed version.
loader = DataLoader([data], batch_size=1)

# Placeholder graph-level label (every graph is treated as positive). TODO: use real labels.
target = torch.ones(1, device=device)

model.train()
for batch in loader:
    batch = batch.to(device)
    opt.zero_grad()
    # Graph-level score: mean of the per-node outputs, reshaped to (1,) so its shape
    # matches the target — BCEWithLogitsLoss requires identical input/target shapes.
    out = model(batch).mean().reshape(1)
    loss = crit(out, target)
    loss.backward()
    opt.step()

## gzipped data