In [None]:
# Import libraries that are required to run your project
# You are allowed to add more libraries as you need

import os
import pickle
import random

import numpy as np
import pandas as pd
import pyBigWig
import torch
from scipy.stats import spearmanr
from tqdm import tqdm

## Work Package 1.1 - Modeling Choices & Data Pre-processing

In [None]:
from dataset import preprocess_cage_data, prepare_train_validation, prepare_test, create_dataset

path_data = "../label_data/"
path_expression = "../CAGE-train"
# ---------------------------INSERT CODE HERE---------------------------

# The preprocessing pipeline runs only when the output directory does not
# exist yet; an existing directory is treated as "already preprocessed".
needs_preprocessing = not os.path.exists(path_data)
if needs_preprocessing:
    os.makedirs(path_data)
    preprocess_cage_data(path_expression)
    prepare_train_validation(path_data)  # training + validation split
    prepare_test(path_data)              # test split

# The dataset itself is (re)built on every run of this cell.
bin_size = 250
create_dataset(path_data, 20000, bin_size)

# ---------------------------------------------------------------------- 

## Work Package 1.2 - Model Building

In [None]:
from model import *

# ---------------------------INSERT CODE HERE---------------------------

# Seed every RNG used below so that shuffling and model initialisation are
# repeatable between runs.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Load the pickled splits. The context managers make sure the file handles
# are closed again (the previous `pickle.load(open(...))` leaked them).
# NOTE: unpickling can execute arbitrary code - only load files that were
# produced by this project's own preprocessing.
with open("../data/X_train.pickle", "rb") as train_file:
    X_train = pickle.load(train_file)
with open("../data/X_val.pickle", "rb") as val_file:
    X_val = pickle.load(val_file)

# Each item is indexed below as (name, (features, label)).
train = list(X_train.items())
validation = list(X_val.items())

# Shuffle the training samples (deterministic thanks to the seed above).
random.shuffle(train)

# Training tensors.
train_names = [x[0] for x in train]
train_features = [x[1][0] for x in train]
train_labels = [x[1][1] for x in train]
train_features_nparray = np.stack(train_features, axis=0)
x_data_train = torch.from_numpy(train_features_nparray.astype('float32'))
y_data_train = torch.from_numpy(np.array(train_labels).astype('float32'))

# Validation tensors.
val_names = [x[0] for x in validation]
val_features = [x[1][0] for x in validation]
val_labels = [x[1][1] for x in validation]
val_features_nparray = np.stack(val_features, axis=0)
x_data_val = torch.from_numpy(val_features_nparray.astype('float32'))  # batch x seq_len x dim
y_data_val = torch.from_numpy(np.array(val_labels).astype('float32'))

# Prepend a row filled with -1000 at position 0 of dimension 1 of the
# training features. The dtype is given explicitly: with an integer fill
# value and no dtype, torch.full would not produce a float32 tensor and the
# concatenation would have to rely on implicit type promotion.
# NOTE(review): only the training tensor receives this extra row; x_data_val
# is handed to set_validation_data without it - confirm that
# TransformerRegressor expects exactly that.
initial_tensor = torch.full(
    (x_data_train.size(dim=0), 1, x_data_train.size(dim=2)),
    -1000.0,
    dtype=x_data_train.dtype,
)
x_data_train = torch.cat((initial_tensor, x_data_train), dim=1)
# Sanity check: position 0 of dimension 1 now holds -1000 everywhere.
assert torch.all(x_data_train[:, 0, :] == -1000)

# Tensor.to() returns a new tensor instead of moving the tensor in place, so
# the former bare `x_data_train.to(device)` / `y_data_train.to(device)` calls
# had no effect and were removed. The tensors are passed on exactly as before
# (the validation tensors were never moved either); device placement is
# presumably handled inside TransformerRegressor - TODO confirm.

# Build the model, register the validation data and train.
model = TransformerRegressor()
model.set_validation_data(x_data_val, y_data_val)
model.fit(x_data_train, y_data_train)

# Move the wrapped network to the CPU before saving its weights.
model.model.cpu()
torch.save(model.model.state_dict(), 'model_improved_100epochs_seed.pt')

# ----------------------------------------------------------------------


## Work Package 1.3 - Prediction on Test Data (Evaluation Metric)

In [None]:
# TODO:
# Using the model trained in WP 1.2, make predictions on the test data (chr 1 of cell line X3).
# Store predictions in a variable called "pred" which is a numpy array.

# Placeholder only: unless the inserted code below overwrites it, the first
# assertion at the end of this cell fails.
pred = None
# ---------------------------INSERT CODE HERE---------------------------
# NOTE(review): `test_genes` is used by the checks below and by the export
# cell, but it is not defined in any cell visible here. The inserted code
# presumably has to create it (the export cell indexes a `gene_name` column
# on it) - confirm.




# ----------------------------------------------------------------------

# Check if "pred" meets the specified constraints
assert isinstance(pred, np.ndarray), 'Prediction array must be a numpy array'
assert np.issubdtype(pred.dtype, np.number), 'Prediction array must be numeric'
# One prediction per entry of `test_genes`.
assert pred.shape[0] == len(test_genes), 'Each gene should have a unique predicted expression'

#### Store Predictions in the Required Format

In [None]:
# Write the predictions into a ZIP archive.
# Upload this zip on the project website under "Your submission".
# Zip this notebook along with the conda environment (and README, optional) and upload this under "Your code".

save_dir = 'path/to/save/output/file'  # TODO
file_name = 'gex_predicted.csv'         # PLEASE DO NOT CHANGE THIS
zip_name = "LastName_FirstName_Project1.zip" # TODO
save_path = '/'.join((save_dir, zip_name))
# pandas writes the CSV straight into a zip archive; `archive_name` is the
# name of the CSV member inside that archive.
compression_options = {"method": "zip", "archive_name": file_name}

# Attach the predictions as a new column, then export only the two required columns.
test_genes['gex_predicted'] = pred.tolist()
submission = test_genes[['gene_name', 'gex_predicted']]
submission.to_csv(save_path, compression=compression_options)