In [None]:
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt

In [None]:
# Load both moment tables (tab-separated, no header row).
# CATH is used for training, ECOD for evaluation.
cath_df = pd.read_csv(
    "./data/cath_moments.tsv",
    sep='\t',
    header=None,
)
ecod_df = pd.read_csv(
    "./data/ecod_moments.tsv",
    sep='\t',
    header=None,
)

In [None]:
# DataFrame.info() prints its report and returns None, so assigning the
# result only creates a variable that is always None. Call it purely for
# the printed summary of the CATH frame.
cath_df.info()

In [None]:
# DataFrame.info() prints its report and returns None, so assigning the
# result only creates a variable that is always None. Call it purely for
# the printed summary of the ECOD frame.
ecod_df.info()

In [None]:
# Per-dataset overview: frame shape, number of distinct class labels
# (column 0), per-class frequencies, and the first rows as a preview.
cath_summary = dict(
    shape=cath_df.shape,
    unique_classes=cath_df[0].nunique(),
    class_distribution=cath_df[0].value_counts(),
    head=cath_df.head(),
)

ecod_summary = dict(
    shape=ecod_df.shape,
    unique_classes=ecod_df[0].nunique(),
    class_distribution=ecod_df[0].value_counts(),
    head=ecod_df.head(),
)

In [None]:
# Show the CATH overview dict (shape, class count, class frequencies, preview rows).
print(cath_summary)

In [None]:
# Show the ECOD overview dict (shape, class count, class frequencies, preview rows).
print(ecod_summary)

In [None]:
# Inspect the Python types stored in the first two columns of the first CATH row.
for col in range(2):
    print(type(cath_df.iloc[0, col]))

In [None]:
def plot_class_frequency(class_counts, dataset_name, bins):
    """Draw a histogram of how many proteins each class contains.

    Args:
        class_counts: Per-class frequencies, e.g. the result of
            ``value_counts()`` on the class-label column.
        dataset_name: Name inserted into the figure title.
        bins: Number of histogram bins.
    """
    plt.figure(figsize=(12, 4))
    plt.hist(class_counts, bins=bins, edgecolor='black')
    plt.title(f'Class Frequency Distribution in {dataset_name} Dataset')
    plt.xlabel('Number of proteins per class')
    plt.ylabel('Number of classes')
    plt.grid(True)
    plt.tight_layout()
    plt.show()


# Plot class distribution for CATH
cath_class_counts = cath_df[0].value_counts()
plot_class_frequency(cath_class_counts, 'CATH', bins=30)

# Plot class distribution for ECOD
ecod_class_counts = ecod_df[0].value_counts()
plot_class_frequency(ecod_class_counts, 'ECOD', bins=20)


### Visualizing Class Imbalance

Each protein in the dataset is associated with a structural class label (column 0), which identifies its 3D (1D in this case) shape category. Understanding how many proteins fall into each class is critical because the dataset is not balanced: some classes have many proteins, while others have very few.

To quantify this, we use `value_counts()` to compute the frequency of each class label and visualize the distribution with a histogram.

#### Why this is important:
During training, we will create pairs of proteins to determine structural similarity. If a particular class contains many proteins, it will generate significantly more pairs, which may bias the model toward frequently occurring classes. This can lead to overfitting and poor generalization, especially on underrepresented classes.

By plotting the class frequency distribution:
- We confirm whether class imbalance is present.
- We motivate the need for sampling strategies such as `WeightedRandomSampler` to correct for this imbalance during training.

This analysis is a key step in understanding the structure of the data and informing how we design the training process.


In [1]:
import pandas as pd
import torch

from train import train_model, test_model_on_ecod

In [2]:
# Read both moment tables (tab-separated, no header row) and discard every
# column that contains at least one missing value.
cath_df = pd.read_csv(
    "./data/cath_moments.tsv",
    sep='\t',
    header=None,
).dropna(axis="columns")
ecod_df = pd.read_csv(
    "./data/ecod_moments.tsv",
    sep='\t',
    header=None,
).dropna(axis="columns")

# Report the resulting frame shapes as a quick sanity check.
print(f"CATH: {cath_df.shape}, ECOD: {ecod_df.shape}")


CATH: (2685, 3923), ECOD: (761, 3923)


In [6]:
import pandas as pd
from cache_utils import cache_pairwise_data

# Re-read the CATH table (columns with missing values removed) and hand its
# first 2685 rows to the pairwise caching helper.
df = pd.read_csv(
    "./data/cath_moments.tsv",
    sep="\t",
    header=None,
).dropna(axis="columns")
cache_pairwise_data(
    df.head(2685),
    cache_dir="./cache/cath_buffered",
    buffer_limit_mb=100,
)

Generating pairwise data from 2685 proteins using 12 cores...
Flushed buffer 0 with 1 items → ./cache/cath_buffered/part_0.pkl
Flushed buffer 1 with 1000 items → ./cache/cath_buffered/part_1.pkl
Flushed buffer 2 with 1000 items → ./cache/cath_buffered/part_2.pkl
Flushed buffer 3 with 1000 items → ./cache/cath_buffered/part_3.pkl
Flushed buffer 4 with 1000 items → ./cache/cath_buffered/part_4.pkl
Flushed buffer 5 with 1000 items → ./cache/cath_buffered/part_5.pkl
Flushed buffer 6 with 1000 items → ./cache/cath_buffered/part_6.pkl
Flushed buffer 7 with 1000 items → ./cache/cath_buffered/part_7.pkl
Flushed buffer 8 with 1000 items → ./cache/cath_buffered/part_8.pkl
Flushed buffer 9 with 1000 items → ./cache/cath_buffered/part_9.pkl
Flushed buffer 10 with 1000 items → ./cache/cath_buffered/part_10.pkl
Flushed buffer 11 with 1000 items → ./cache/cath_buffered/part_11.pkl
Flushed buffer 12 with 1000 items → ./cache/cath_buffered/part_12.pkl
Flushed buffer 13 with 1000 items → ./cache/cath_bu

In [None]:
from cache_utils import load_cached_parts
from dataset import ProteinPairDataset

# Step 1: Load and merge buffered part_*.pkl files
# NOTE(review): load_cached_parts is a project helper; it is unpacked here as a
# (features, labels) pair — confirm the element types against cache_utils.
features, labels = load_cached_parts("./cache/cath_buffered")

# Step 2: Pass them into your dataset directly
# The merged features/labels are given to ProteinPairDataset by keyword.
dataset = ProteinPairDataset(features=features, labels=labels)

Loading 3605 cached parts from: ./cache/cath_buffered


In [7]:
# Number of unordered pairs among 2685 items: n * (n - 1) / 2
2685 * (2685 - 1) / 2

3603270.0

In [None]:
from cache_utils import cache_pairwise_data
import time

# caching proteins for fast prototyping
# Elapsed time is measured with perf_counter_ns(), a monotonic high-resolution
# clock meant for timing intervals. time_ns() is wall-clock time and can jump
# if the system clock is adjusted while the cell runs.
num_items = 2685
tic = time.perf_counter_ns()
cache_pairwise_data(cath_df.head(num_items),cache_dir="cache/test_cache", buffer_limit_mb=10)
tac = time.perf_counter_ns()
# Convert nanoseconds to milliseconds for display.
print((tac-tic)/(10**6),"ms")

In [None]:
# Disabled alternative: train logistic regression on the full CATH frame.
# model_logistic = train_model(cath_df, hidden_dim=None, num_epochs=5, batch_size=8)

# Train a small neural net on the first 150 CATH rows only.
model_fcn = train_model(
    cath_df.head(150),
    hidden_dim=64,
    num_epochs=5,
    batch_size=8,
)