In [2]:
import anndata as ad

In [3]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale 
from sklearn import model_selection
import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import magic
from sklearn.decomposition import PCA

In [4]:
# Training Data
adata_gex = ad.read_h5ad("Gex_processed_training.h5ad") #gex is gene expression which are RNA
adata_adt = ad.read_h5ad("Adt_processed_training.h5ad") #adt contains info about the proteins in the cells

# Testing Data
test_gex = ad.read_h5ad("Gex_processed_testing.h5ad")
test_adt = ad.read_h5ad("Adt_processed_testing.h5ad")

In [5]:
# TRAINING DATA
# This returns the NORMALIZED COUNT of ALL RNA sequences within the cell
C = adata_gex.X

# We keep the protein datat the same because we want to predict for only the number of CD86 Proteins in the cell
C1 = adata_adt.X

# Utilize sklearn train_test_split function to split up the C array. Small array contains 10% the data of the full array
# Notice that C_small has only ~4000 row of data instead of 13000
C_small, C_large, C1_small, C1_large = train_test_split(C,C1,train_size=0.01)
C_small

# Then we want to turn it into an array so we can work on it.
X_train = C_small.toarray()
y_train = C1_small.toarray()

In [None]:
magic_operator = magic.MAGIC()
X_train = magic_operator.fit_transform(X_train)
X_test = magic_operator.fit_transoform(X_test)

Calculating MAGIC...
  Running MAGIC on 421 cells and 13953 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...


In [5]:
# This returns the NORMALIZED COUNT of the RNA sequence CD86 in EACH CELL
C_test = test_gex.X

# Same thing below except for proteins
C1_test = test_adt.X

# Then we want to turn it into an array so we can work on it
X_test = C_test.toarray()
y_test = C1_test.toarray()

In [6]:
# Hyperparameters for our network
input_size = 13953
hidden_sizes = [5000,500]
output_size = 134

# Build a feed-forward network
model = nn.Sequential(nn.Linear(input_size, hidden_sizes[0]),
                      nn.BatchNorm1d(hidden_sizes[0], affine=False),
                      nn.ReLU(),
                      nn.Linear(hidden_sizes[0], hidden_sizes[1]),
                      nn.BatchNorm1d(hidden_sizes[1], affine=False),
                      nn.Sigmoid(),
                      nn.Linear(hidden_sizes[1], output_size))
print(model)

Sequential(
  (0): Linear(in_features=13953, out_features=5000, bias=True)
  (1): BatchNorm1d(5000, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
  (2): ReLU()
  (3): Linear(in_features=5000, out_features=500, bias=True)
  (4): BatchNorm1d(500, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
  (5): Sigmoid()
  (6): Linear(in_features=500, out_features=134, bias=True)
)


In [7]:
X_train_tensor = torch.Tensor(X_train)
y_train_tensor = torch.Tensor(y_train)
X_test_tensor  = torch.Tensor(X_test)
y_test_tensor  = torch.Tensor(y_test)

mydata_train = TensorDataset(X_train_tensor,y_train_tensor)
mydata_test  = TensorDataset(X_test_tensor,y_test_tensor)

trainloader = torch.utils.data.DataLoader(mydata_train, batch_size=10, shuffle=True, drop_last=True)
testloader = torch.utils.data.DataLoader(mydata_test, batch_size=10, shuffle=True,drop_last=True)

In [8]:
y_test_tensor.numpy()

array([[1.171824  , 0.29471976, 0.9662469 , ..., 0.7485523 , 0.8265028 ,
        2.082733  ],
       [0.        , 0.59661454, 1.1975185 , ..., 2.258381  , 0.6688309 ,
        0.4343594 ],
       [0.        , 0.32750162, 1.0775542 , ..., 0.4582131 , 0.4582131 ,
        0.4582131 ],
       ...,
       [0.5736808 , 0.7711177 , 1.3115053 , ..., 0.93592685, 1.3115053 ,
        0.        ],
       [0.5026481 , 0.19696192, 0.8355951 , ..., 1.4505513 , 1.0849457 ,
        1.0085349 ],
       [0.4453852 , 0.2472863 , 0.6106549 , ..., 1.9302828 , 0.2472863 ,
        1.2598553 ]], dtype=float32)

In [9]:
%time 

criterion = nn.MSELoss()# Optimizers require the parameters to optimize and a learning rate
optimizer = optim.SGD(model.parameters(), lr=0.003)
epochs = 1
for e in range(epochs):
    running_loss = 0
    for data, target in trainloader:
    
        # Training pass
        optimizer.zero_grad()
        model.train()
        output = model(data) #<--- note this line is using the model you set up at the beginning of this section
        output = output.float()
        target = target.float()
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    else:
        print(f"Training loss: {running_loss/len(trainloader)}")

CPU times: user 2 µs, sys: 2 µs, total: 4 µs
Wall time: 7.63 µs
Training loss: 0.8776291012763977


In [10]:
predict = model(X_test_tensor)
#predict = model(X_train_tensor)

In [20]:
#criterion(predict,y_test_tensor)

tensor(0.2824, grad_fn=<MseLossBackward0>)

1. 1% of data, No Magic, Nothing else: 0.3640
2. 1% of data, No Magic, Batch Normalization: 0.2824
3. 1% of data, Magic, Batch Normalization: 0.3348
4. 1% of data, PCA, Batch Normalization: 0.

In [15]:
mean_squared_error(predict.detach().numpy(), y_test_tensor.numpy(), squared = False)

0.78214717