In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
import umap

In [2]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over ``loader``: a training pass if ``optimizer`` is given, else evaluation.

    Returns:
        ``(mean_batch_loss, labels, predictions)`` where the loss is the plain
        mean of the per-batch losses and the two lists hold one integer class
        id per sample, in the order the loader produced them.

    Raises:
        ValueError: if ``loader`` yields no batches.
    """
    training = optimizer is not None
    model.train(training)  # train(False) is equivalent to eval()

    batch_losses, seen_labels, seen_preds = [], [], []
    with torch.set_grad_enabled(training):
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            if training:
                optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            if training:
                loss.backward()
                optimizer.step()
            batch_losses.append(loss.item())
            # Keep only the predicted class per sample instead of the full
            # logit rows; this is all the metrics need.
            seen_preds.extend(np.argmax(outputs.detach().cpu().numpy(), axis=1))
            seen_labels.extend(labels.cpu().numpy())

    if not batch_losses:
        raise ValueError("loader produced no batches; cannot compute loss or metrics")
    return np.mean(batch_losses), seen_labels, seen_preds


def _classification_metrics(labels, preds, average):
    """Return ``(accuracy, f1, precision, recall)`` for the given label/prediction lists."""
    return (
        accuracy_score(labels, preds),
        f1_score(labels, preds, average=average),
        precision_score(labels, preds, average=average),
        recall_score(labels, preds, average=average),
    )


def train_and_validate(model, train_loader, test_loader, epochs=20,
                       optimizer=None, criterion=None, lr=0.001, average='micro'):
    """Train ``model`` and print train/validation metrics after every epoch.

    The model is moved to CUDA when available, otherwise it stays on the CPU.
    Training metrics are accumulated while the model is in train mode, so
    layers such as dropout are active for them; validation metrics are
    computed in eval mode with gradients disabled.

    Args:
        model: ``nn.Module`` that maps a batch of inputs to per-class scores
            (predictions are taken as the argmax over axis 1).
        train_loader: iterable of ``(inputs, labels)`` batches used for the
            optimisation step.
        test_loader: iterable of ``(inputs, labels)`` batches evaluated after
            each training pass.
        epochs: number of passes over ``train_loader``.
        optimizer: optional optimizer already bound to ``model.parameters()``.
            When ``None`` (the default) plain SGD with learning rate ``lr`` is
            created, matching the original behaviour.
        criterion: optional loss callable ``criterion(outputs, labels)``.
            Defaults to ``nn.CrossEntropyLoss()``.
        lr: learning rate of the default SGD optimizer; ignored when
            ``optimizer`` is supplied.
        average: averaging mode forwarded to scikit-learn's F1 / precision /
            recall. With ``'micro'`` on single-label multiclass targets all
            three are equal to accuracy; pass ``'macro'`` to weight every
            class equally.

    Returns:
        None. Results are printed, two lines per epoch followed by a blank line.

    Raises:
        ValueError: if either loader yields no batches.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    if optimizer is None:
        optimizer = optim.SGD(model.parameters(), lr=lr)
    if criterion is None:
        criterion = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        train_loss, train_labels, train_preds = _run_epoch(
            model, train_loader, criterion, device, optimizer=optimizer)
        train_acc, train_f1, train_prec, train_recall = _classification_metrics(
            train_labels, train_preds, average)

        # Validation part
        val_loss, val_labels, val_preds = _run_epoch(
            model, test_loader, criterion, device)
        val_acc, val_f1, val_prec, val_recall = _classification_metrics(
            val_labels, val_preds, average)

        print(f'Epoch {epoch + 1}/{epochs} - Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Train F1: {train_f1:.4f}, Train Prec: {train_prec:.4f}, Train Recall: {train_recall:.4f}')
        print(f'Epoch {epoch + 1}/{epochs} - Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}, Val F1: {val_f1:.4f}, Val Prec: {val_prec:.4f}, Val Recall: {val_recall:.4f}')
        print()

In [3]:
# Seed torch's global RNG so DataLoader shuffling (and any model
# initialisation that runs afterwards) is repeatable across kernel restarts.
torch.manual_seed(1)

# Load the feature matrix and convert it to a float32 tensor.
X = np.load("scRNA_full.npy")
X = torch.tensor(X, dtype=torch.float32)

# Read one label per line and map each distinct label string to an integer id
# (ids follow the sorted order returned by np.unique).
with open("label.txt") as f:
    labels = [line.strip() for line in f]
assert len(labels) == X.shape[0], (
    f"label.txt has {len(labels)} labels but the feature matrix has {X.shape[0]} rows"
)
label_to_index = {label: i for i, label in enumerate(np.unique(labels))}
y = torch.tensor([label_to_index[label] for label in labels], dtype=torch.long)

# Split the dataset: 70% train / 30% held out, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
train_data = TensorDataset(X_train, y_train)
test_data = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
# Evaluation does not benefit from shuffling; a fixed order keeps the
# per-batch validation loss identical from run to run.
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)


In [4]:
# Sanity check: display the training split's shape as (n_samples, n_features).
X_train.shape

torch.Size([7022, 13822])

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class PyTorchModel(nn.Module):
    """1-D CNN classifier: two conv/max-pool stages followed by a three-layer MLP head.

    Each sample is treated as a single-channel signal of ``input_length``
    values. The head applies batch normalisation, ReLU and dropout after each
    of the two hidden linear layers and ends in a linear layer producing
    ``n_classes`` raw scores (no softmax).

    Args:
        input_length: number of values per sample. Defaults to 13822.
        n_classes: size of the output layer. Defaults to 14.

    Raises:
        ValueError: if ``input_length`` is too short to leave at least one
            position after the second pooling stage.
    """

    @staticmethod
    def _out_len(length, kernel_size, stride, padding=0):
        """Output length of a Conv1d/MaxPool1d layer (dilation 1, floor mode)."""
        return (length + 2 * padding - kernel_size) // stride + 1

    def __init__(self, input_length=13822, n_classes=14):
        super().__init__()
        self.input_length = input_length

        # First 1D convolutional layer
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=64, kernel_size=7, stride=4, padding=3)
        self.pool1 = nn.MaxPool1d(kernel_size=4, stride=4)

        # Second 1D convolutional layer
        self.conv2 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=5, stride=2, padding=2)
        self.pool2 = nn.MaxPool1d(kernel_size=4, stride=4)

        # Track the sequence length through the conv/pool stack so the first
        # linear layer can be sized to the flattened feature map.
        self.conv1_output_size = self._out_len(input_length, kernel_size=7, stride=4, padding=3)
        self.pool1_output_size = self._out_len(self.conv1_output_size, kernel_size=4, stride=4)
        self.conv2_output_size = self._out_len(self.pool1_output_size, kernel_size=5, stride=2, padding=2)
        self.pool2_output_size = self._out_len(self.conv2_output_size, kernel_size=4, stride=4)
        if self.pool2_output_size < 1:
            raise ValueError(
                f"input_length={input_length} is too short for the conv/pool stack"
            )

        # Fully connected layers
        self.fc1 = nn.Linear(128 * self.pool2_output_size, 512)
        self.fc2 = nn.Linear(512, 256)
        self.output_layer = nn.Linear(256, n_classes)

        # Regularization layers (the same Dropout module is reused after both
        # hidden layers; it holds no parameters, so sharing is safe).
        self.dropout = nn.Dropout(0.5)
        self.batch_norm1 = nn.BatchNorm1d(512)
        self.batch_norm2 = nn.BatchNorm1d(256)

    def forward(self, x):
        """Map a ``(batch_size, input_length)`` tensor to ``(batch_size, n_classes)`` scores."""
        # Add channel dimension -> (batch_size, 1, input_length)
        x = x.unsqueeze(1)

        # Convolutional layers
        x = self.pool1(F.relu(self.conv1(x)))
        x = self.pool2(F.relu(self.conv2(x)))

        # Flatten to (batch_size, 128 * pool2_output_size)
        x = x.view(x.size(0), -1)

        # Fully connected layers with dropout and batch normalization
        x = self.dropout(F.relu(self.batch_norm1(self.fc1(x))))
        x = self.dropout(F.relu(self.batch_norm2(self.fc2(x))))

        # Output layer
        return self.output_layer(x)

class ModelEnsemble(nn.Module):
    """Ensemble that averages the outputs of ``n_models`` independently built sub-models.

    Args:
        n_models: number of ensemble members; must be at least 1.
        base_model: zero-argument callable (e.g. a model class) returning a
            fresh ``nn.Module`` each time it is called.

    Raises:
        ValueError: if ``n_models`` is smaller than 1.
    """

    def __init__(self, n_models, base_model):
        super().__init__()
        # Fail fast: an empty ensemble would only blow up later inside
        # torch.stack during the first forward pass.
        if n_models < 1:
            raise ValueError(f"n_models must be >= 1, got {n_models}")
        self.models = nn.ModuleList([base_model() for _ in range(n_models)])

    def forward(self, x):
        """Feed ``x`` to every member and return the element-wise mean of their outputs."""
        member_outputs = torch.stack([member(x) for member in self.models])
        return member_outputs.mean(dim=0)

# Instantiate the ensemble with a number of models
# (each member is a separately constructed PyTorchModel).
n_models = 5
ensemble_model = ModelEnsemble(n_models, PyTorchModel)

# Define the optimizer and loss function
# NOTE(review): the train_and_validate call in this notebook does not receive
# these objects; that function creates its own SGD optimizer (lr=0.001) and its
# own CrossEntropyLoss, so this Adam optimizer is currently unused by training.
optimizer = optim.Adam(ensemble_model.parameters(), lr=0.0005)
criterion = nn.CrossEntropyLoss()

# Model summary-like printout
print(ensemble_model)


ModelEnsemble(
  (models): ModuleList(
    (0): PyTorchModel(
      (conv1): Conv1d(1, 64, kernel_size=(7,), stride=(4,), padding=(3,))
      (pool1): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
      (conv2): Conv1d(64, 128, kernel_size=(5,), stride=(2,), padding=(2,))
      (pool2): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
      (fc1): Linear(in_features=13824, out_features=512, bias=True)
      (fc2): Linear(in_features=512, out_features=256, bias=True)
      (output_layer): Linear(in_features=256, out_features=14, bias=True)
      (dropout): Dropout(p=0.5, inplace=False)
      (batch_norm1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (batch_norm2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): PyTorchModel(
      (conv1): Conv1d(1, 64, kernel_size=(7,), stride=(4,), padding=(3,))
      (pool1): MaxPool1d(kernel_size=4, stride

In [6]:
# Train the ensemble for up to 1000 epochs; train/validation metrics are printed after every epoch.
# NOTE(review): the recorded output below ends in a KeyboardInterrupt, i.e. this
# run was stopped manually before completing all epochs.
train_and_validate(ensemble_model, train_loader, test_loader, epochs=1000)

Epoch 1/1000 - Train Loss: 2.4278, Train Acc: 0.2726, Train F1: 0.2726, Train Prec: 0.2726, Train Recall: 0.2726
Epoch 1/1000 - Val Loss: 2.1872, Val Acc: 0.5047, Val F1: 0.5047, Val Prec: 0.5047, Val Recall: 0.5047

Epoch 2/1000 - Train Loss: 2.0620, Train Acc: 0.4534, Train F1: 0.4534, Train Prec: 0.4534, Train Recall: 0.4534
Epoch 2/1000 - Val Loss: 1.9358, Val Acc: 0.5970, Val F1: 0.5970, Val Prec: 0.5970, Val Recall: 0.5970

Epoch 3/1000 - Train Loss: 1.8503, Train Acc: 0.5212, Train F1: 0.5212, Train Prec: 0.5212, Train Recall: 0.5212
Epoch 3/1000 - Val Loss: 1.7761, Val Acc: 0.6429, Val F1: 0.6429, Val Prec: 0.6429, Val Recall: 0.6429

Epoch 4/1000 - Train Loss: 1.7037, Train Acc: 0.5738, Train F1: 0.5738, Train Prec: 0.5738, Train Recall: 0.5738
Epoch 4/1000 - Val Loss: 1.6504, Val Acc: 0.6731, Val F1: 0.6731, Val Prec: 0.6731, Val Recall: 0.6731

Epoch 5/1000 - Train Loss: 1.5929, Train Acc: 0.6072, Train F1: 0.6072, Train Prec: 0.6072, Train Recall: 0.6072
Epoch 5/1000 - Val 

KeyboardInterrupt: 