In [None]:
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt

In [None]:
# Load the dataset
cath_df = pd.read_csv("./data/cath_moments.tsv", sep='\t', header=None) # For Train
ecod_df = pd.read_csv("./data/ecod_moments.tsv", sep='\t', header=None) # For eval

In [None]:
cath_info = cath_df.info()

In [None]:
ecod_info = ecod_df.info()

In [None]:
# Summarize shapes, classes, and a preview
cath_summary = {
    "shape": cath_df.shape,
    "unique_classes": cath_df[0].nunique(),
    "class_distribution": cath_df[0].value_counts(),
    "head": cath_df.head()
}

ecod_summary = {
    "shape": ecod_df.shape,
    "unique_classes": ecod_df[0].nunique(),
    "class_distribution": ecod_df[0].value_counts(),
    "head": ecod_df.head()
}

In [None]:
print(cath_summary)

In [None]:
print(ecod_summary)

In [None]:
a = [1.2,3.4,5.6],
b = [2.4,3.2,4.5]

z_ab = abs(a[i]-b[i])

2*(1.2-2.4)/1+abs(1.2)+abs(2.4)

Vector in the original dimenstion

30 proteins 
30*29/2-> pairs of proteins

In [None]:
cath_df.iloc[0]

In [None]:
for i in range(0,2):
    print(type(cath_df.iloc[0,i]))

In [None]:
# Plot class distribution for CATH
cath_class_counts = cath_df[0].value_counts()
plt.figure(figsize=(12, 4))
plt.hist(cath_class_counts, bins=30, edgecolor='black')
plt.title('Class Frequency Distribution in CATH Dataset')
plt.xlabel('Number of proteins per class')
plt.ylabel('Number of classes')
plt.grid(True)
plt.tight_layout()
plt.show()

# Plot class distribution for ECOD
ecod_class_counts = ecod_df[0].value_counts()
plt.figure(figsize=(12, 4))
plt.hist(ecod_class_counts, bins=20, edgecolor='black')
plt.title('Class Frequency Distribution in ECOD Dataset')
plt.xlabel('Number of proteins per class')
plt.ylabel('Number of classes')
plt.grid(True)
plt.tight_layout()
plt.show()


### Visualizing Class Imbalance

Each protein in the dataset is associated with a structural class label (column 0), which identifies its 3D(1D in this case) shape category. Understanding how many proteins fall into each class is critical because the dataset is not balanced—some classes have many proteins, while others have very few.

To quantify this, we use `value_counts()` to compute the frequency of each class label and visualize the distribution with a histogram.

#### Why this is important:
During training, we will create pairs of proteins to determine structural similarity. If a particular class contains many proteins, it will generate significantly more pairs, which may bias the model toward frequently occurring classes. This can lead to overfitting and poor generalization, especially on underrepresented classes.

By plotting the class frequency distribution:
- We confirm whether class imbalance is present.
- We motivate the need for sampling strategies such as `WeightedRandomSampler` to correct for this imbalance during training.

This analysis is a key step in understanding the structure of the data and informing how we design the training process.


In [None]:
import sys
import numpy as np

print("Python version:", sys.version)
print("NumPy version:", np.__version__)


In [1]:
import pandas as pd
import torch
import importlib
import train
import dataset
import time
import os
importlib.reload(train)
importlib.reload(dataset)
import random

random.seed(42)


from train import train_model, test_model_on_ecod

[INFO] Using 9 CPU threads
[INFO] Using 9 CPU threads


In [2]:
# Load datasets
cath_df = pd.read_csv("./data/cath_moments.tsv", sep='\t', header=None).dropna(axis=1)
ecod_df = pd.read_csv("./data/ecod_moments.tsv", sep='\t', header=None).dropna(axis=1)

print(f"CATH: {cath_df.shape}, ECOD: {ecod_df.shape}")
features,labels=None,None


CATH: (2685, 3923), ECOD: (761, 3923)


In [3]:
num_proteins = 2685

In [None]:
import pandas as pd
from cache_utils import cache_pairwise_data

df = pd.read_csv("./data/cath_moments.tsv", sep="\t", header=None).dropna(axis=1)


cache_dir = "./cache/cath_"+str(num_proteins)
merge_dir = cache_dir+"/cath_merged.pkl"

# if not os.path.exists(merge_dir):
#     cache_pairwise_data(df.head(num_proteins), cache_dir=cache_dir, buffer_limit_mb=100)

In [None]:
from cache_utils import load_cached_parts,load_and_merge_parts
from dataset import ProteinPairDataset

tic = time.time_ns()
# Step 1: Load and merge buffered part_*.pkl files
features, labels = load_cached_parts(cache_dir, max_threads=16)

# #Parallel Load all the parts
# features, labels = load_and_merge_parts(
#     cache_dir=cache_dir,
#     save_path=merge_dir, 
#     max_threads=16
# )

tac = time.time_ns()
print("Loaded in ",(tac-tic)/(10**6),"ms")

In [None]:
from cache_utils import load_cached_parts

# Load cached features/labels 
features, labels = load_cached_parts("./cache/cath_"+str(num_proteins))

# Check dimensions
print(f"[INFO] Loaded {len(features)} pairs with shape {features[0].shape}")

In [4]:
print("Expected pairs:",num_proteins*(num_proteins-1)//2)

Expected pairs: 3603270


In [None]:
input_dim = cath_df.shape[1]-1
streaming = True

if streaming==False:
    print("-"*20, "CACHE","-"*20)
    model = train_model(features=features, labels=labels, hidden_dim=64, input_dim=input_dim, streaming=streaming,batch_size=128,num_epochs=20,val_split=0.1)
else:
    print("-"*20, "OTG","-"*20)
    model = train_model(protein_df=cath_df.head(num_proteins), hidden_dim=64, input_dim=input_dim, streaming=streaming,batch_size=128,num_epochs=20,val_split=0.1)

In [5]:
#Load the best model
import torch
from model import ProteinClassifier
input_dim = cath_df.shape[1]-1
# Define your model architecture (must match the saved model)
model = ProteinClassifier(hidden_dim=64, input_dim=input_dim)

# Load saved weights
model.load_state_dict(torch.load("./modelData/baseline_h64_bs128_lr0.001_ep20_20250328-041027_best.pt", map_location="cpu"))

# Set model to evaluation mode
model.eval()

  model.load_state_dict(torch.load("./modelData/baseline_h64_bs128_lr0.001_ep20_20250328-041027_best.pt", map_location="cpu"))


ProteinClassifier(
  (linear1): Linear(in_features=3922, out_features=64, bias=True)
  (relu): ReLU()
  (linear2): Linear(in_features=64, out_features=1, bias=True)
)

In [6]:
test_model_on_ecod(model, ecod_df)

[INFO] Streaming init done in 0.01 seconds
[FINAL TEST on ECOD] Loss: 1393.0452 | ROC AUC: 0.950 | PR AUC: 0.670 | MCC: 0.601


(1393.045249933036, 0.9500322503509842, 0.6697077900184016, 0.6010258640442017)