# Predictions for high-loop-density areas

Author: Amulya Garimella

Last updated: 2025/05/02

In [238]:
import cooler
import numpy as np
from sklearn.linear_model import LogisticRegression, LinearRegression
import pybedtools
import pandas as pd
import sklearn.metrics
import scipy.stats

## Input files

In [213]:
input_cool_file = "/Users/amulyagarimella/Documents/2241finalproject/data/GM12878.GSE115524/GM12878.GSE115524.Homo_Sapiens.CTCF.b1.mcool"

## Feature and label extraction

In [214]:
# We will divide the matrix into 64 sub-matrices

chrsizes = pd.read_csv("/Users/amulyagarimella/Documents/2241finalproject/HiC2MicroC/data/hg38.sizes", sep="\t", header=None, index_col=0)
input_res = 10000

In [215]:
input_loops_file = f"/Users/amulyagarimella/Documents/2241finalproject/data/GM12878.GSE115524/loops/GM12878.GSE115524.Homo_Sapiens.CTCF.b1.{input_res}.fithichip_hp.longrange.bed"

def process_loop_catalog_dataset(dataset_path):
    """
    Process the loop catalog dataset to extract relevant columns and convert them to a PyRanges object.
    """
    df = pd.read_csv(dataset_path, header=None, sep='\t')
    extra = df[3].str.split(r',|:|-', expand=True)
    df = pd.concat([df.drop(columns=[3]), extra], axis=1)
    df.columns = ["chr_1", "start_1", "end_1", "chr_2", "start_2", "end_2", "score"]
    # Convert start and end columns to integers
    df[["start_1", "end_1", "start_2", "end_2"]] = df[["start_1", "end_1", "start_2", "end_2"]].astype(int)
    
    # Convert score to float
    df["score"] = df["score"].astype(float)
    return df

loops = process_loop_catalog_dataset(input_loops_file)

In [216]:
# extract part of cooler
clr = cooler.Cooler(f'{input_cool_file}::/resolutions/{input_res}')
#mat = clr.matrix(balance=False)[start:start+size, start:start+size]

In [263]:
# Feature extraction: divide mat into submatrices
# Labels: determine number of loops in each submatrix

CHRNAME = "chr1"

def get_features_and_labels (chrnames, res):
    features = []
    labels = []

    for chrname in chrnames:
        chrlen = chrsizes.loc[chrname].values[0]
        chrlen_bins = chrlen//res
        min_submatrices = 100
        submatrix_len = 100*res
        submatrix_len_bins = submatrix_len//res

        overlap_len = max((submatrix_len_bins*min_submatrices - chrlen_bins) // (min_submatrices - 1),0)
        stridelen_bins = submatrix_len_bins - overlap_len

        for i in np.arange(0, chrlen_bins, stridelen_bins):
            # Extract the submatrix
            i_end = min(i + submatrix_len_bins, chrlen_bins)
            #print(f"Bin {i} to {i_end}")
            
            start_coord = int(i*res)
            end_coord = int(i_end*res)
            #print(f"Start coord: {start_coord}, End coord: {end_coord}")

            counts = clr.pixels()[i:i_end]
        
            
            submat = counts["count"].values 
            
            # Create feature vector from the submatrix
            if len(submat) < submatrix_len_bins:
                # Pad the submatrix with zeros
                submat = np.pad(submat, (0, submatrix_len_bins - len(submat)), 'constant')
                print(f"Submatrix {i} padded to {submatrix_len_bins} bins")

            features.append(submat)
            # Determine the number of loops in the submatrix
            # print(loops.head())
            loops_in_submat = loops.loc[
                (loops['chr_1'] == chrname) & 
                (loops['start_1'] >= start_coord) & 
                (loops['end_1'] <= end_coord) & 
                (loops['chr_2'] == chrname) & 
                (loops['start_2'] >= start_coord) & 
                (loops['end_2'] <= end_coord)
            ]

            labels.append(len(loops_in_submat))
            if len(loops_in_submat) > 0:
                print(f"Submatrix {i} has {len(loops_in_submat)} loops")
            else:
                print(f"Submatrix {i} has no loops")

    # Convert to numpy arrays
    features = np.array(features)
    print(f"Created {len(features)} submatrices")
    print(f"Feature shape: {features.shape}")

    # Convert labels to numpy array
    labels = np.array(labels)
    print(f"Labels shape: {labels.shape}")

    return features, labels

In [297]:
chrnames = chrsizes.index.values

In [299]:
test_chrname = "chr21"
train_chrnames = [x for x in chrnames if x != test_chrname]


In [300]:
feats_train, labels_train = get_features_and_labels(train_chrnames ,input_res)

# TODO: visualize after 2500

Submatrix 0 has 8 loops
Submatrix 100 has 76 loops
Submatrix 200 has 62 loops
Submatrix 300 has 20 loops
Submatrix 400 has 2 loops
Submatrix 500 has 12 loops
Submatrix 600 has 42 loops
Submatrix 700 has 14 loops
Submatrix 800 has 38 loops
Submatrix 900 has 64 loops
Submatrix 1000 has 70 loops
Submatrix 1100 has 48 loops
Submatrix 1200 has 72 loops
Submatrix 1300 has 12 loops
Submatrix 1400 has 14 loops
Submatrix 1500 has 46 loops
Submatrix 1600 has 50 loops
Submatrix 1700 has 50 loops
Submatrix 1800 has 10 loops
Submatrix 1900 has 66 loops
Submatrix 2000 has 18 loops
Submatrix 2100 has 42 loops
Submatrix 2200 has 20 loops
Submatrix 2300 has 60 loops
Submatrix 2400 has 52 loops
Submatrix 2500 has 56 loops
Submatrix 2600 has 68 loops
Submatrix 2700 has 114 loops
Submatrix 2800 has 44 loops
Submatrix 2900 has no loops
Submatrix 3000 has 32 loops
Submatrix 3100 has 80 loops
Submatrix 3200 has 28 loops
Submatrix 3300 has 20 loops
Submatrix 3400 has 14 loops
Submatrix 3500 has 22 loops
Subma

## baseline 1: linreg

In [301]:
# Train a linear regression model

linreg = LinearRegression()
linreg.fit(feats_train, labels_train)

In [302]:
feats_test, labels_test = get_features_and_labels([test_chrname], input_res)

Submatrix 0 has no loops
Submatrix 47 has no loops
Submatrix 94 has no loops
Submatrix 141 has no loops
Submatrix 188 has no loops
Submatrix 235 has no loops
Submatrix 282 has no loops
Submatrix 329 has no loops
Submatrix 376 has no loops
Submatrix 423 has 2 loops
Submatrix 470 has 2 loops
Submatrix 517 has no loops
Submatrix 564 has no loops
Submatrix 611 has no loops
Submatrix 658 has no loops
Submatrix 705 has no loops
Submatrix 752 has 4 loops
Submatrix 799 has 4 loops
Submatrix 846 has 4 loops
Submatrix 893 has 4 loops
Submatrix 940 has no loops
Submatrix 987 has 2 loops
Submatrix 1034 has no loops
Submatrix 1081 has no loops
Submatrix 1128 has no loops
Submatrix 1175 has no loops
Submatrix 1222 has no loops
Submatrix 1269 has no loops
Submatrix 1316 has 6 loops
Submatrix 1363 has 6 loops
Submatrix 1410 has 28 loops
Submatrix 1457 has 8 loops
Submatrix 1504 has no loops
Submatrix 1551 has 4 loops
Submatrix 1598 has 4 loops
Submatrix 1645 has no loops
Submatrix 1692 has 16 loops
Su

In [309]:
predictions_test = linreg.predict(feats_test)
clipped_predictions_test = np.clip(predictions_test, 0, None)

In [316]:
sklearn.metrics.mean_squared_error(labels_test, clipped_predictions_test)

4048.5198057572456

In [319]:
scipy.stats.pearsonr(labels_test, clipped_predictions_test)

PearsonRResult(statistic=np.float64(-0.021817521489326596), pvalue=np.float64(0.8294103751890585))

In [321]:
scipy.stats.spearmanr(labels_test, clipped_predictions_test)

SignificanceResult(statistic=np.float64(-0.1231053347052325), pvalue=np.float64(0.2223801829593944))

## baseline 2: logreg

In [362]:
loop_threshold = 0.75

binary_labels_train = np.where(labels_train > np.quantile(labels_train, loop_threshold), 1, 0)

In [363]:
logreg = LogisticRegression(max_iter=10000, random_state=42, penalty='l2')
logreg.fit(feats_train, binary_labels_train)

In [364]:
binary_labels_test = np.where(labels_test > np.quantile(labels_test, loop_threshold), 1, 0)

binary_predictions_test = logreg.predict(feats_test)

binary_predictions_test_proba = logreg.predict_proba(feats_test)[:, 1]
error = sklearn.metrics.mean_squared_error(binary_labels_test, binary_predictions_test)

print(f"Binary classification accuracy: {sklearn.metrics.accuracy_score(binary_labels_test, binary_predictions_test)}")
print(f"Binary classification precision: {sklearn.metrics.precision_score(binary_labels_test, binary_predictions_test)}")
print(f"Binary classification recall: {sklearn.metrics.recall_score(binary_labels_test, binary_predictions_test)}")
print(f"Binary classification f1 score: {sklearn.metrics.f1_score(binary_labels_test, binary_predictions_test)}")
print(f"Binary classification roc_auc: {sklearn.metrics.roc_auc_score(binary_labels_test, binary_predictions_test_proba)}")
print(f"Binary classification roc_auc (labels): {sklearn.metrics.roc_auc_score(binary_labels_test, binary_predictions_test)}")
print(f"Binary classification roc_auc (predictions): {sklearn.metrics.roc_auc_score(binary_labels_test, binary_predictions_test)}")
print(f"Binary classification roc_auc (predictions probabilities): {sklearn.metrics.roc_auc_score(binary_labels_test, binary_predictions_test_proba)}")

Binary classification accuracy: 0.73
Binary classification precision: 0.25
Binary classification recall: 0.04
Binary classification f1 score: 0.06896551724137931
Binary classification roc_auc: 0.4893333333333333
Binary classification roc_auc (labels): 0.5
Binary classification roc_auc (predictions): 0.5
Binary classification roc_auc (predictions probabilities): 0.4893333333333333


In [361]:
binary_predictions_test

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

### baseline 2: single layer CNN

In [242]:
import torch
import torch.nn as nn
import cooler

In [None]:
# load sparse mat

class LiteLoopNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.dw = nn.Conv1d(100,1,3,padding=1,groups=1)  # depthwise on 1 channel
        self.pw = nn.Conv1d(1,1,1)                     # pointwise
        self.pool = nn.AdaptiveAvgPool1d(1)
        # self.dropout = nn.Dropout(p=0.5)               # added dropout layer
    
    def forward(self, x):
        x = torch.tensor(x, dtype=torch.float32)
        # Reshape each sample into (1, 100, feature_length)
        x = x.reshape(x.shape[0], 100, -1)
        
        # Process each sample through the network
        dw_out = self.dw(x)
        pw_out = self.pw(dw_out)
        
        # Apply activation and pooling
        activated = nn.functional.relu(pw_out)
        pooled = self.pool(activated).squeeze(-1)
        
        # Apply sigmoid for final prediction
        return torch.sigmoid(pooled)
    
    def train (self, train_feats, train_labels):
        # Define loss function and optimizer
        criterion = nn.BCELoss()
        optimizer = torch.optim.Adam(self.parameters(), lr=0.001)
        
        # Convert features and labels to PyTorch tensors
        features_tensor = torch.tensor(train_feats, dtype=torch.float32)
        labels_tensor = torch.tensor(train_labels, dtype=torch.float32).unsqueeze(1)
        
        # Training loop
        for epoch in range(100):
            optimizer.zero_grad()
            outputs = self(features_tensor)
            loss = criterion(outputs, labels_tensor)
            loss.backward()
            optimizer.step()
            
            if (epoch+1) % 10 == 0:
                print(f'Epoch [{epoch+1}/100], Loss: {loss.item():.4f}')




model = LiteLoopNet()




AttributeError: 'LiteLoopNet' object has no attribute 'fit'

## GNN

In [5]:
import cooler
import torch
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GCNConv

  Referenced from: <9BBB4F05-41BA-385E-B2A3-76313944D73A> /Users/amulyagarimella/Documents/2241finalproject/predictions_venv/lib/python3.11/site-packages/libpyg.so
  Reason: tried: '/Library/Frameworks/Python.framework/Versions/3.11/Python' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/Library/Frameworks/Python.framework/Versions/3.11/Python' (no such file), '/Library/Frameworks/Python.framework/Versions/3.11/Python' (no such file)
  Referenced from: <9BBB4F05-41BA-385E-B2A3-76313944D73A> /Users/amulyagarimella/Documents/2241finalproject/predictions_venv/lib/python3.11/site-packages/libpyg.so
  Reason: tried: '/Library/Frameworks/Python.framework/Versions/3.11/Python' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/Library/Frameworks/Python.framework/Versions/3.11/Python' (no such file), '/Library/Frameworks/Python.framework/Versions/3.11/Python' (no such file)


In [None]:
# 1. load cooler pixels for region
clr = cooler.Cooler('input.cool::/resolutions/10000')
pixels = clr.pixels().fetch('chr1')
mask = (pixels['bin1_id'].between(i0,i1) &
        pixels['bin2_id'].between(i0,i1))
sub = pixels[mask]

# 2. build graph edges + node features
edge_index = torch.tensor([
    sub['bin1_id'].values - i0,
    sub['bin2_id'].values - i0
], dtype=torch.long)
x = torch.tensor(sub['count'].values, dtype=torch.float32).unsqueeze(1)
data = Data(x=x, edge_index=edge_index)

# 3. GCN definition
class LoopGNN(torch.nn.Module):
    def __init__(self, in_ch, hid_ch):
        super().__init__()
        self.conv1 = GCNConv(in_ch, hid_ch)
        self.conv2 = GCNConv(hid_ch, 1)
    def forward(self, d):
        h = torch.relu(self.conv1(d.x, d.edge_index))
        return self.conv2(h, d.edge_index)

model = LoopGNN(1,16).to('cuda')
opt   = torch.optim.Adam(model.parameters(), lr=1e-3)
crit  = torch.nn.BCEWithLogitsLoss()

# 4. training
loader = DataLoader([data], batch_size=1)
for batch in loader:
    batch = batch.to('cuda')
    opt.zero_grad()
    out = model(batch).mean()            # graph-level score :contentReference[oaicite:7]{index=7}
    loss = crit(out, torch.tensor([1.],device='cuda'))
    loss.backward()
    opt.step()

## gzipped data