<a href="https://colab.research.google.com/github/YunseolPark/GenePredictionWorkshop/blob/main/%5B2020_01_25%5DGenePredictionWorkshop_ParkYunseol.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Translation Initiation Site Prediction Using Synthetic Datasets**


---

Prediction of translation initiation sites (TISs) can give insight into translation and the proteins synthesized by certain mRNAs. Thus, it is important for genome analysis.

We take a look at a prediction model and some feature analysis methods to study this topic in more depth.

## Building a prediction model

In [None]:
import numpy as np
import torch
import random
from torch.utils.data.dataset import Dataset

class TISDataset(Dataset):
    """
    Class to generate TIS dataset for dataloader

    Sequences are read from one positive and one negative file (one sequence
    per line). Each item is returned as a one-hot encoded tensor together
    with its class id.
    """

    # Row index of each nucleotide in the one-hot encoding
    _NUC_INDEX = {'A': 0, 'G': 1, 'C': 2, 'T': 3}

    def read_dna(self, file, class_id):
        """
        Reads DNA files and save to a list of sequences

        Args:
            file: file that contains DNA sequences, one sequence per line
            class_id: class of the sequence file (positive: 1, negative: 0)
        Return:
            List that contains tuples consisting of a sequence and class id for the given file
        """
        # The context manager closes the file even if reading fails
        with open(file) as handle:
            # Surrounding whitespace (including the newline) is stripped
            return [(line.strip(), class_id) for line in handle]

    def __init__(self, pos_data, neg_data):
        """
        Loads the positive and negative sequence files into memory

        Args:
            pos_data: file with positive sequences (labelled 1)
            neg_data: file with negative sequences (labelled 0)
        """
        # Make a list of DNA with class id, positives first
        self.dna_list = self.read_dna(pos_data, 1)      # Positive: 1
        self.dna_list.extend(self.read_dna(neg_data, 0))    # Negative: 0
        self.data_len = len(self.dna_list)

    def __getitem__(self, index):
        """
        Returns the one-hot encoded sequence and class id at the given index

        Args:
            index: position of the sequence in the dataset
        Return:
            Tuple of a float32 tensor of shape (4, sequence length), with
            rows ordered A, G, C, T, and the class id of the sequence
        Raises:
            KeyError: if the sequence contains a symbol other than A, G, C, T or N
        """
        # Read data
        dna_data, label = self.dna_list[index]
        # N is replaced by a randomly chosen nucleotide. The draw happens once
        # per call, so every N in this sequence gets the same nucleotide, and
        # the same sequence may be encoded differently on another call
        n = random.randint(0, 3)
        dna = {**self._NUC_INDEX, 'N': n}
        # Assign values: one column per position, a single 1 in the row of its nucleotide
        one_hot = np.zeros((4, len(dna_data)))
        for i, nuc in enumerate(dna_data):
            one_hot[dna[nuc], i] = 1
        # Convert numpy to tensor
        tensor_dna = torch.from_numpy(one_hot).float()
        return tensor_dna, label

    def __len__(self):
        """Returns the total number of sequences (positive and negative)"""
        return self.data_len