In [None]:
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt

In [None]:
# Read both moment tables: tab-separated, no header row.
tsv_options = {"sep": '\t', "header": None}
cath_df = pd.read_csv("./data/cath_moments.tsv", **tsv_options)  # training data
ecod_df = pd.read_csv("./data/ecod_moments.tsv", **tsv_options)  # evaluation data

In [None]:
# DataFrame.info() prints its report and returns None, so binding its return
# value left cath_info as None. Capture the report text through `buf` so the
# variable actually holds the summary, then print it so the cell still shows it.
import io

_cath_info_buf = io.StringIO()
cath_df.info(buf=_cath_info_buf)
cath_info = _cath_info_buf.getvalue()
print(cath_info, end="")

In [None]:
# DataFrame.info() prints its report and returns None, so binding its return
# value left ecod_info as None. Capture the report text through `buf` so the
# variable actually holds the summary, then print it so the cell still shows it.
import io

_ecod_info_buf = io.StringIO()
ecod_df.info(buf=_ecod_info_buf)
ecod_info = _ecod_info_buf.getvalue()
print(ecod_info, end="")

In [None]:
def summarize_labels(df):
    """Collect basic label statistics for one moments table.

    Column 0 is used as the class label, as in the rest of this notebook.

    Args:
        df: DataFrame read from one of the moments TSV files.

    Returns:
        dict with keys ``shape`` (the frame's shape), ``unique_classes``
        (number of distinct labels), ``class_distribution`` (rows per label,
        from ``value_counts``) and ``head`` (the first rows of the frame).
    """
    class_labels = df[0]
    return {
        "shape": df.shape,
        "unique_classes": class_labels.nunique(),
        "class_distribution": class_labels.value_counts(),
        "head": df.head(),
    }


# Summarize shapes, classes, and a preview -- one helper for both tables so the
# two summaries cannot drift apart.
cath_summary = summarize_labels(cath_df)
ecod_summary = summarize_labels(ecod_df)

In [None]:
# Print each summary entry under its own label. Printing the dict directly
# embeds the multi-line Series/DataFrame reprs inside a single dict literal,
# which is hard to read.
for key, value in cath_summary.items():
    print(f"{key}:")
    print(value)
    print()

In [None]:
# Print each summary entry under its own label. Printing the dict directly
# embeds the multi-line Series/DataFrame reprs inside a single dict literal,
# which is hard to read.
for key, value in ecod_summary.items():
    print(f"{key}:")
    print(value)
    print()

In [None]:
# Show the Python type stored in the first row for each of the first two columns.
column_positions = (0, 1)
for col in column_positions:
    value = cath_df.iloc[0, col]
    print(type(value))

In [None]:
def plot_class_frequency(class_counts, dataset_name, bins):
    """Draw a histogram of class sizes for one dataset.

    Args:
        class_counts: per-class row counts (the result of ``value_counts()``
            on the label column).
        dataset_name: name inserted into the figure title.
        bins: number of histogram bins.

    The x axis is the number of proteins in a class; the y axis is how many
    classes have that size.
    """
    fig, ax = plt.subplots(figsize=(12, 4))
    ax.hist(class_counts, bins=bins, edgecolor='black')
    ax.set_title(f'Class Frequency Distribution in {dataset_name} Dataset')
    ax.set_xlabel('Number of proteins per class')
    ax.set_ylabel('Number of classes')
    ax.grid(True)
    fig.tight_layout()
    plt.show()


# Plot class distribution for CATH
cath_class_counts = cath_df[0].value_counts()
plot_class_frequency(cath_class_counts, 'CATH', bins=30)

# Plot class distribution for ECOD
ecod_class_counts = ecod_df[0].value_counts()
plot_class_frequency(ecod_class_counts, 'ECOD', bins=20)


### Visualizing Class Imbalance

Each protein in the dataset is associated with a structural class label (column 0), which identifies its 3D (1D in this case) shape category. Understanding how many proteins fall into each class is critical because the dataset is not balanced—some classes have many proteins, while others have very few.

To quantify this, we use `value_counts()` to compute the frequency of each class label and visualize the distribution with a histogram.

#### Why this is important:
During training, we will create pairs of proteins to determine structural similarity. If a particular class contains many proteins, it will generate significantly more pairs (a class with $n$ proteins yields $n(n-1)/2$ within-class pairs, so pair counts grow quadratically with class size), which may bias the model toward frequently occurring classes. This can lead to overfitting and poor generalization, especially on underrepresented classes.

By plotting the class frequency distribution:
- We confirm whether class imbalance is present.
- We motivate the need for sampling strategies such as `WeightedRandomSampler` to correct for this imbalance during training.

This analysis is a key step in understanding the structure of the data and informing how we design the training process.


In [None]:
import sys
import numpy as np

# Report the interpreter and NumPy versions in use by this kernel.
for label, version in (
    ("Python version:", sys.version),
    ("NumPy version:", np.__version__),
):
    print(label, version)


In [1]:
import pandas as pd
import torch
import importlib
import train
import time
import os

# Re-execute the already-imported `train` module so edits made to it since the
# kernel started take effect without a restart.
importlib.reload(train)

# This must come after the reload: names bound by an earlier from-import would
# still refer to the pre-reload function objects.
from train import train_model, test_model_on_ecod

In [2]:
# Load both moment tables (tab-separated, no header) and discard every column
# that contains a missing value.
cath_df, ecod_df = (
    pd.read_csv(tsv_path, sep='\t', header=None).dropna(axis=1)
    for tsv_path in ("./data/cath_moments.tsv", "./data/ecod_moments.tsv")
)

print(f"CATH: {cath_df.shape}, ECOD: {ecod_df.shape}")


CATH: (2685, 3923), ECOD: (761, 3923)


In [3]:
import pandas as pd
from cache_utils import cache_pairwise_data

# Run cache_pairwise_data on the first `num_proteins` CATH rows, unless the
# merged pickle for that subset already exists on disk.
df = (
    pd.read_csv("./data/cath_moments.tsv", sep="\t", header=None)
    .dropna(axis=1)
)
num_proteins = 200

cache_dir = f"./cache/cath_{num_proteins}"
# Despite its name, merge_dir is the path of the merged pickle file itself.
merge_dir = f"{cache_dir}/cath_merged.pkl"

merged_cache_missing = not os.path.exists(merge_dir)
if merged_cache_missing:
    cache_pairwise_data(
        df.iloc[:num_proteins],
        cache_dir=cache_dir,
        buffer_limit_mb=100,
    )

In [4]:
from cache_utils import load_cached_parts,load_and_merge_parts
from dataset import ProteinPairDataset

# Time the load with perf_counter_ns(): it is a high-resolution clock meant for
# measuring durations, whereas time_ns() is wall-clock time and can shift if
# the system clock is adjusted while the cell runs.
tic = time.perf_counter_ns()

# Step 1: load the cached part files from cache_dir (up to 16 threads) and get
# back the merged features and labels; save_path is where the merged result is
# written, per the function's log output.
features, labels = load_and_merge_parts(
    cache_dir=cache_dir,
    save_path=merge_dir,
    max_threads=16
)

tac = time.perf_counter_ns()
# Nanoseconds -> milliseconds.
print("Loaded in ",(tac-tic)/(10**6),"ms")

Loading 21 parts from ./cache/cath_200 using 16 threads...


100%|███████████████████████████████████████████| 21/21 [00:00<00:00, 23.78it/s]


 Merged total pairs: 19900
 Saved merged dataset to: ./cache/cath_200/cath_merged.pkl
Loaded in  1562.653 ms


In [5]:
# Step 2: wrap the merged features and labels in the pair dataset.
dataset = ProteinPairDataset(
    features=features,
    labels=labels,
)

In [6]:
# Sanity check against the loader's "Merged total pairs" line:
# n proteins give n * (n - 1) / 2 unordered pairs.
expected_pairs = num_proteins * (num_proteins - 1) // 2
print("Expected pairs:", expected_pairs)

Expected pairs: 19900


In [None]:
# Optional: re-import the functions explicitly
from train import train_model, test_model_on_ecod

# # Train logistic regression
# model_logistic = train_model(cath_df, hidden_dim=None, num_epochs=5, batch_size=8)
# NOTE(review): the commented-out call above passes a DataFrame positionally,
# unlike the features=/labels= keyword call below -- confirm it still matches
# train_model's signature before re-enabling it.

# Train small neural net
# Uses the merged `features` / `labels` returned by load_and_merge_parts, with
# hidden_dim=64 for 100 epochs.
# NOTE(review): the saved output of this cell shows "IOPub message rate
# exceeded" during every epoch -- presumably from per-batch progress updates
# inside train_model; consider throttling them there.
model = train_model(features=features, labels=labels, hidden_dim=64, num_epochs=100)

Training on: cpu (CPU)
Training model (hidden_dim=64) for 100 epochs...
[TensorBoard] Logging to: tensorboard_logs/baseline_h64_bs4_lr0.001_ep100_20250326-041136


Epoch 1/100:   0%|          | 0/3980 [00:00<?, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Epoch 2/100:   0%|          | 0/3980 [00:00<?, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Epoch 3/100:   0%|          | 0/3980 [00:00<?, ?it/s]

In [None]:
## Logistic Regression

In [None]:
# Logistic-regression run on the same cached pairs: hidden_dim=None, 5 epochs.
# Note: this rebinds `model`, replacing the network trained in the cell above.
model = train_model(
    features=features,
    labels=labels,
    hidden_dim=None,
    num_epochs=5,
)