# Predictions for high-loop-density areas

Author: Amulya Garimella

Last updated: 2025/05/02

In [1]:
import cooler
import numpy as np
from sklearn.linear_model import LogisticRegression, LinearRegression
import pybedtools
import pandas as pd
import sklearn.metrics
import scipy.stats
import torch

ModuleNotFoundError: No module named 'sklearn'

## Input files

In [516]:
# Multi-resolution cooler (.mcool) file that the contact-matrix features are read from.
input_cool_file = "/Users/amulyagarimella/Documents/2241finalproject/data/GM12878.GSE115524/GM12878.GSE115524.Homo_Sapiens.CTCF.b1.mcool"

## Feature and label extraction

In [517]:
# Each chromosome's contact matrix is later tiled into square sub-matrices
# along the diagonal (the tiling itself is done in get_features_and_labels).

# Tab-separated, header-less table; the first column becomes the index.
chrsizes = pd.read_csv("/Users/amulyagarimella/Documents/2241finalproject/HiC2MicroC/data/hg38.sizes", sep="\t", header=None, index_col=0)
# Bin size in bp; also selects the /resolutions/<input_res> group of the .mcool file.
input_res = 10000

In [524]:
# Loop calls matching the matrix resolution (the file name embeds input_res).
input_loops_file = f"/Users/amulyagarimella/Documents/2241finalproject/data/GM12878.GSE115524/loops/GM12878.GSE115524.Homo_Sapiens.CTCF.b1.{input_res}.fithichip_hp.longrange.bed"

def process_loop_catalog_dataset(dataset_path):
    """
    Load a loop catalog in "longrange" BED layout into a pandas DataFrame.

    Each tab-separated, header-less row is expected to hold the first anchor
    in columns 0-2 (chromosome, start, end) and the second anchor plus the
    score packed into column 3 as ``chr:start-end,score``. Any columns after
    the fourth are ignored.

    Parameters
    ----------
    dataset_path : str or path-like
        Path of the loop file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``chr_1, start_1, end_1, chr_2, start_2, end_2, score``.
        The four coordinate columns are integers and ``score`` is a float.

    Raises
    ------
    ValueError
        If the fourth column of any row does not look like
        ``chr:start-end,score``.
    """
    df = pd.read_csv(dataset_path, header=None, sep='\t')

    # Parse with one anchored pattern rather than splitting on every ',',
    # ':' or '-': a plain split breaks on scores such as "1e-05" or "-0.3",
    # whose '-' would be treated as a field separator.
    extra = df[3].astype(str).str.extract(r'^(.+):(\d+)-(\d+),([^,]+)$', expand=True)
    malformed = extra.isna().to_numpy().any(axis=1)
    if malformed.any():
        first_bad = int(malformed.argmax())
        raise ValueError(
            f"Row {first_bad} of {dataset_path}: cannot parse second anchor "
            f"{df[3].iloc[first_bad]!r} (expected 'chr:start-end,score')"
        )

    # Keep only the first-anchor columns so optional trailing BED columns
    # do not break the column assignment below.
    df = pd.concat([df[[0, 1, 2]], extra], axis=1)
    df.columns = ["chr_1", "start_1", "end_1", "chr_2", "start_2", "end_2", "score"]

    # The extracted pieces are strings; give them numeric dtypes.
    coord_cols = ["start_1", "end_1", "start_2", "end_2"]
    df[coord_cols] = df[coord_cols].astype(int)
    df["score"] = df["score"].astype(float)
    return df

# Loop table: one row per loop record, holding two anchors and a score.
loops = process_loop_catalog_dataset(input_loops_file)

In [525]:
# Open the contact matrix stored at the chosen resolution inside the
# multi-resolution cooler file.
clr = cooler.Cooler(f'{input_cool_file}::/resolutions/{input_res}')
#mat = clr.matrix(balance=False)[start:start+size, start:start+size]

In [676]:
# Feature extraction: divide mat into submatrices
# Labels: determine number of loops in each submatrix

# NOTE(review): not referenced anywhere else in the visible notebook — candidate for removal.
CHRNAME = "chr1"


def get_features_and_labels(chrnames, res):
    """
    Tile each chromosome's contact matrix into square windows along the
    diagonal and count the loops that fall inside each window.

    Windows are 100 x 100 bins. The stride between consecutive windows is
    chosen so that a chromosome yields at least 100 windows where possible
    (windows overlap when the chromosome is shorter than 100 * 100 bins).
    A window truncated by the chromosome end is zero-padded to full size.

    Relies on the notebook-level globals ``chrsizes`` (chromosome lengths),
    ``clr`` (the opened cooler) and ``loops`` (the loop table).

    Parameters
    ----------
    chrnames : iterable of str
        Chromosomes to process; each must be a row label of ``chrsizes``.
    res : int
        Bin size in bp. Expected to equal the resolution ``clr`` was opened
        at, since bin indices are converted to coordinates with it.

    Returns
    -------
    tuple
        ``(features_coo, features_csr, features_csc, labels)``. The three
        feature matrices hold the same data in COO, CSR and CSC format, with
        one row per window (the window's raw counts flattened row by row).
        ``labels`` is a 1-D array with the number of loops whose two anchors
        both lie inside the corresponding window.
    """
    # The file only imports scipy.stats explicitly; make the dependency on
    # scipy.sparse explicit instead of relying on a transitive import.
    import scipy.sparse

    min_submatrices = 100
    submatrix_len_bins = 100
    n_features = submatrix_len_bins ** 2

    # The selector does not depend on the window, so build it once.
    matrix_selector = clr.matrix(sparse=True, balance=False)

    features = []
    labels = []

    for chrname in chrnames:
        chrlen = chrsizes.loc[chrname].values[0]
        chrlen_bins = chrlen // res

        overlap_len = max(
            (submatrix_len_bins * min_submatrices - chrlen_bins) // (min_submatrices - 1),
            0,
        )
        # For chromosomes of <= 100 bins the computed overlap reaches the
        # window length, which would give a zero or negative stride
        # (np.arange then raises or yields nothing). Clamp to one bin.
        stridelen_bins = max(submatrix_len_bins - overlap_len, 1)

        # Only intra-chromosomal loops on this chromosome can fall inside one
        # of its windows; filter once instead of once per window.
        chr_loops = loops.loc[(loops['chr_1'] == chrname) & (loops['chr_2'] == chrname)]

        for i in np.arange(0, chrlen_bins, stridelen_bins):
            i_end = min(i + submatrix_len_bins, chrlen_bins)
            start_coord = int(i * res)
            end_coord = int(i_end * res)

            counts = matrix_selector.fetch(f"{chrname}:{start_coord}-{end_coord}")

            if counts.shape != (submatrix_len_bins, submatrix_len_bins):
                # Window truncated by the chromosome end: embed the counts in
                # a full-size frame so every feature row has the same length.
                counts = counts.tocoo()
                counts = scipy.sparse.coo_matrix(
                    (counts.data, (counts.row, counts.col)),
                    shape=(submatrix_len_bins, submatrix_len_bins),
                )
            # Flatten row by row into a single sparse feature row.
            features.append(counts.reshape((1, n_features)))

            # A loop belongs to the window when both anchors lie inside it.
            in_window = (
                (chr_loops['start_1'] >= start_coord)
                & (chr_loops['end_1'] <= end_coord)
                & (chr_loops['start_2'] >= start_coord)
                & (chr_loops['end_2'] <= end_coord)
            )
            n_loops = int(in_window.sum())
            labels.append(n_loops)

            if n_loops > 0:
                print(f"Submatrix {i} has {n_loops} loops")
            else:
                print(f"Submatrix {i} has no loops")

    if features:
        features_coo = scipy.sparse.vstack(features, format='coo')
    else:
        # No windows at all (e.g. empty chrnames): return empty matrices
        # rather than letting vstack fail on an empty list.
        features_coo = scipy.sparse.coo_matrix((0, n_features))
    features_csr = features_coo.tocsr()
    features_csc = features_coo.tocsc()

    labels = np.array(labels, dtype=int)

    print(f"Created {len(features)} submatrices")
    print(f"Feature shape: {features_coo.shape}")
    print(f"Labels shape: {labels.shape}")

    return features_coo, features_csr, features_csc, labels

In [677]:
# Every chromosome/contig name listed in the sizes table (its index).
chrnames = chrsizes.index.values

In [678]:
# Hold out one chromosome for evaluation; every other name in chrnames is a
# training candidate.
test_chrname = "chr21"
train_chrnames = [x for x in chrnames if x != test_chrname]


In [679]:
# NOTE(review): only chr1 is used for training here; train_chrnames is not
# passed — confirm this is intentional.
feats_train_coo, feats_train_csr, feats_train_csc, labels_train = get_features_and_labels(["chr1"] ,input_res)

Submatrix 0 has 8 loops
Submatrix 100 has 76 loops
Submatrix 200 has 62 loops
Submatrix 300 has 20 loops
Submatrix 400 has 2 loops
Submatrix 500 has 12 loops
Submatrix 600 has 42 loops
Submatrix 700 has 14 loops
Submatrix 800 has 38 loops
Submatrix 900 has 64 loops
Submatrix 1000 has 70 loops
Submatrix 1100 has 48 loops
Submatrix 1200 has 72 loops
Submatrix 1300 has 12 loops
Submatrix 1400 has 14 loops
Submatrix 1500 has 46 loops
Submatrix 1600 has 50 loops
Submatrix 1700 has 50 loops
Submatrix 1800 has 10 loops
Submatrix 1900 has 66 loops
Submatrix 2000 has 18 loops
Submatrix 2100 has 42 loops
Submatrix 2200 has 20 loops
Submatrix 2300 has 60 loops
Submatrix 2400 has 52 loops
Submatrix 2500 has 56 loops
Submatrix 2600 has 68 loops
Submatrix 2700 has 114 loops
Submatrix 2800 has 44 loops
Submatrix 2900 has no loops
Submatrix 3000 has 32 loops
Submatrix 3100 has 80 loops
Submatrix 3200 has 28 loops
Submatrix 3300 has 20 loops
Submatrix 3400 has 14 loops
Submatrix 3500 has 22 loops
Subma

## Baseline 1: linear regression

### Ridge

In [680]:
sklearn.__version__

'1.6.1'

In [681]:
# NOTE(review): Lasso, ElasticNet and coo_matrix are not used in the visible cells.
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from scipy.sparse import coo_matrix

# Ridge regression baseline; it is fitted on the sparse window features in a
# later cell.
model = Ridge(
    alpha=1.0,
    solver="lsqr",          # other sparse-capable solvers: 'sparse_cg', 'sag', 'saga' ('lbfgs' requires positive=True)
    max_iter=1000,
    random_state=42         # per the scikit-learn docs, only used by the 'sag'/'saga' solvers
)


In [682]:
model.fit(feats_train_csc, labels_train)       # Ridge accepts sparse X with the 'lsqr' solver
coefs = model.coef_

In [683]:
# get_features_and_labels returns (coo, csr, csc, labels); unpack in that
# order so each name matches the sparse format it actually holds.
feats_test_coo, feats_test_csr, feats_test_csc, labels_test = get_features_and_labels([test_chrname], input_res)

Submatrix 0 has no loops
Submatrix 47 has no loops
Submatrix 94 has no loops
Submatrix 141 has no loops
Submatrix 188 has no loops
Submatrix 235 has no loops
Submatrix 282 has no loops
Submatrix 329 has no loops
Submatrix 376 has no loops
Submatrix 423 has 2 loops
Submatrix 470 has 2 loops
Submatrix 517 has no loops
Submatrix 564 has no loops
Submatrix 611 has no loops
Submatrix 658 has no loops
Submatrix 705 has no loops
Submatrix 752 has 4 loops
Submatrix 799 has 4 loops
Submatrix 846 has 4 loops
Submatrix 893 has 4 loops
Submatrix 940 has no loops
Submatrix 987 has 2 loops
Submatrix 1034 has no loops
Submatrix 1081 has no loops
Submatrix 1128 has no loops
Submatrix 1175 has no loops
Submatrix 1222 has no loops
Submatrix 1269 has no loops
Submatrix 1316 has 6 loops
Submatrix 1363 has 6 loops
Submatrix 1410 has 28 loops
Submatrix 1457 has 8 loops
Submatrix 1504 has no loops
Submatrix 1551 has 4 loops
Submatrix 1598 has 4 loops
Submatrix 1645 has no loops
Submatrix 1692 has 16 loops
Su

In [686]:
predictions_test = model.predict(feats_test_csc)
# Loop counts cannot be negative, so floor the raw regression output at zero.
clipped_predictions_test = np.clip(predictions_test, 0, None)


In [690]:
# Mean squared error of the raw predictions, then of the zero-clipped ones.
print(sklearn.metrics.mean_squared_error(labels_test, predictions_test))
print(sklearn.metrics.mean_squared_error(labels_test, clipped_predictions_test))

1036.4280688379358
999.3646711264354


In [691]:
# Linear correlation between true loop counts and clipped predictions.
scipy.stats.pearsonr(labels_test, clipped_predictions_test)

PearsonRResult(statistic=np.float64(0.5985666065793712), pvalue=np.float64(4.791922677387746e-11))

In [694]:
# Rank correlation between true loop counts and clipped predictions.
scipy.stats.spearmanr(labels_test, clipped_predictions_test)

SignificanceResult(statistic=np.float64(0.6169586457400738), pvalue=np.float64(8.192979581480794e-12))

## Baseline 2: logistic regression

In [720]:
# Quantile of the loop counts used as the positive-class cut-off. A value of
# 0 selects the minimum count, so a window is positive when it has more loops
# than the sparsest window.
loop_threshold = 0

binary_labels_train = np.where(labels_train > np.quantile(labels_train, loop_threshold), 1, 0)

In [721]:
# L2-penalised logistic regression fitted on the CSR feature matrix.
logreg = LogisticRegression(max_iter=10000, random_state=42, penalty='l2')
logreg.fit(feats_train_csr, binary_labels_train)

In [722]:
# NOTE(review): the cut-off is the quantile of the *test* labels, whereas the
# training labels use the quantile of the training labels. The two class
# definitions can differ whenever loop_threshold > 0 — confirm this is intended.
binary_labels_test = np.where(labels_test > np.quantile(labels_test, loop_threshold), 1, 0)

# Hard class predictions (0/1).
binary_predictions_test = logreg.predict(feats_test_csr)

# Predicted probability of the positive class.
binary_predictions_test_proba = logreg.predict_proba(feats_test_csr)[:, 1]
# NOTE(review): `error` is not used in the visible cells.
error = sklearn.metrics.mean_squared_error(binary_labels_test, binary_predictions_test)

print(f"Binary classification accuracy: {sklearn.metrics.accuracy_score(binary_labels_test, binary_predictions_test)}")
print(f"Binary classification balanced accuracy: {sklearn.metrics.balanced_accuracy_score(binary_labels_test, binary_predictions_test)}")

print(f"Binary classification precision: {sklearn.metrics.precision_score(binary_labels_test, binary_predictions_test)}")
print(f"Binary classification recall: {sklearn.metrics.recall_score(binary_labels_test, binary_predictions_test)}")
print(f"Binary classification f1 score: {sklearn.metrics.f1_score(binary_labels_test, binary_predictions_test)}")
# ROC AUC computed from the predicted probabilities.
print(f"Binary classification roc_auc: {sklearn.metrics.roc_auc_score(binary_labels_test, binary_predictions_test_proba)}")
# NOTE(review): the next two lines make the identical call (ROC AUC on the
# hard 0/1 predictions); the "(labels)" / "(predictions)" wording does not
# distinguish them.
print(f"Binary classification roc_auc (labels): {sklearn.metrics.roc_auc_score(binary_labels_test, binary_predictions_test)}")
print(f"Binary classification roc_auc (predictions): {sklearn.metrics.roc_auc_score(binary_labels_test, binary_predictions_test)}")
# NOTE(review): same call as the plain "roc_auc" line above.
print(f"Binary classification roc_auc (predictions probabilities): {sklearn.metrics.roc_auc_score(binary_labels_test, binary_predictions_test_proba)}")

Binary classification accuracy: 0.71
Binary classification balanced accuracy: 0.7307692307692308
Binary classification precision: 0.86
Binary classification recall: 0.6615384615384615
Binary classification f1 score: 0.7478260869565218
Binary classification roc_auc: 0.7665934065934066
Binary classification roc_auc (labels): 0.7307692307692307
Binary classification roc_auc (predictions): 0.7307692307692307
Binary classification roc_auc (predictions probabilities): 0.7665934065934066


In [725]:
# Majority-class baseline: the accuracy obtained by always predicting the
# more frequent class. `baseline` itself is the fraction of positive windows.
baseline = sum(binary_labels_test) / len(binary_labels_test)
print(max(baseline, 1 - baseline))

0.65


In [726]:
binary_predictions_test

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1,
       0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0])