# 0 Data Processing

1. Initialisation
2. Language Model Data
3. Supervised Data
    - 3.1 Long non-coding RNA (lncRNA) vs Messenger RNA (mRNA)
    - 3.2 Promoter regions vs Non-promoter regions

## 1. Initialisation

In [None]:
# Load (or reload) IPython's autoreload extension.
%reload_ext autoreload
# Mode 2: re-import all modules before each cell is executed, so edits to
# local helper modules are picked up without restarting the kernel.
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
from pathlib import Path

# Candidate project roots (local and remote environments)
HOME_LOCAL  = Path('/home/jovyan/ml_genomics')
HOME_REMOTE = Path('/home/athon/')

# Active root: every data location below is resolved relative to it
HOME        = HOME_REMOTE

# Language-model data: human reference genome
HUMAN       = HOME / 'thesis/data/human/'

# Labelled data for the supervised problems
NB_DATA     = HOME / 'thesis/notebooks/2_seq_modelling/data'
LNCRNA_MRNA = NB_DATA / 'lncrna/'
PROMOTERS   = NB_DATA / 'promoters/'

# Hyperparams
C1  = 10_000  # chunking for genomic data
C2  = 2_000   # number of tokens per row

## 2. Language Model Data
Parse `GRCh38 p13` human reference genome `.fasta` file using `biopython`.

In [None]:
# pandas is imported here so that this cell runs on its own; the only other
# `import pandas` in the notebook lives in a later cell.
import pandas as pd

from utils import process_fasta

# parse raw data
# C1 / C2 are the chunking and tokens-per-row hyperparameters defined in the
# initialisation cell. NOTE(review): filter_txt='NC_' presumably restricts
# parsing to records whose identifier contains 'NC_' - confirm in utils.
RAW_DATA = HUMAN / 'GCF_000001405.39_GRCh38.p13_genomic.fna'
data = process_fasta(RAW_DATA, C1, C2, filter_txt='NC_')

# write to dataframe: one row per processed sequence, tagged with its origin
df = pd.DataFrame(data, columns=['Sequence'])
df['Source'] = 'NCBI Human'

# save to .csv (row index omitted)
df.to_csv(HUMAN / 'human_genome_data_fa.csv', index=False)

## 3. Supervised Data

### 3.1 lncRNA vs mRNA

In [3]:
import pandas as pd
from Bio import SeqIO

def parse_fasta(path):
    """Read fasta data with BioPython and return the unique sequences as a dataframe.

    Args:
        path: location of the fasta file, given either as a string or as a
            ``pathlib.Path``.

    Returns:
        A ``pandas.DataFrame`` with one row per distinct sequence and two
        columns: ``Sequence`` (the record's sequence as a string) and ``Name``
        (the file name of ``path``, extension included).
    """
    # Normalising through Path lets callers pass str or Path objects and
    # extracts the file name with the OS's own separator rules.
    path = Path(path)

    # read fasta; str() keeps this working with BioPython releases that only
    # accept file names as plain strings
    seqs = [str(record.seq) for record in SeqIO.parse(str(path), 'fasta')]

    # save to df according to name
    df = pd.DataFrame(seqs, columns=['Sequence'])
    df['Name'] = path.name

    # 'Name' is constant within a file, so this removes repeated sequences
    return df.drop_duplicates()

def partition_data(df, train_frac=0.90, random_state=None):
    """Randomly assign every row of ``df`` to a train or validation set.

    Args:
        df: dataframe to partition. Its index does not need to be unique.
        train_frac: fraction of rows placed in the training set; the number
            of training rows is ``int(len(df) * train_frac)``.
        random_state: optional seed forwarded to pandas' sampler to make the
            split reproducible.

    Returns:
        A new dataframe holding all rows of ``df`` (training rows first, in
        sampled order, followed by the validation rows in their original
        order) with an added ``set`` column containing ``'train'`` or
        ``'valid'``. Original index labels are preserved.
    """
    train_size = int(len(df) * train_frac)

    # Sample row *positions* rather than index labels: dropping by label
    # would also discard every other row sharing a sampled label, which
    # loses validation rows whenever the index is not unique (e.g. after
    # concatenating several dataframes).
    positions = pd.Series(range(len(df)), dtype='int64')
    train_pos = positions.sample(train_size, random_state=random_state).to_numpy()
    valid_pos = positions.drop(train_pos).to_numpy()

    # Explicit copies so that adding the 'set' column never touches ``df``
    train_df = df.iloc[train_pos].copy()
    valid_df = df.iloc[valid_pos].copy()

    train_df['set'] = 'train'
    valid_df['set'] = 'valid'

    return pd.concat([train_df, valid_df])

from glob import glob
# Gather every .fa file in the lncRNA / mRNA label directory
fasta_files = glob(str(LNCRNA_MRNA / '*.fa'))
fasta_files

# Parse each file into its own dataframe, stack them, and export the result
data_df = pd.concat(list(map(parse_fasta, fasta_files)))
data_df.to_csv(HUMAN/'lncRNA.csv')