In [1]:
# Run generate_polya_index_from_annotated_genome on the annotated GRCh38 GFF
# path and the poly-A index CSV path (both provided by data_dir), then print
# the value the function returns.
from data_dir import annotated_grch38_gff, polya_grch38_index_csv
from data_preparation import generate_polya_index_from_annotated_genome

index_status = generate_polya_index_from_annotated_genome(
    annotated_grch38_gff,
    polya_grch38_index_csv,
)
print(f"Generate polya index from annotated grch38 gff: {index_status}")

Generate polya index from annotated grch38 gff: True


In [2]:
"""
Generate positive set from polya index.
"""
from data_preparation import generate_polya_positive_dataset_from_index
from data_dir import polya_grch38_index_csv, polya_grch38_positive_csv

# Call the generator with the index CSV path and the positive CSV path (both
# provided by data_dir) and keep its return value so it can be reported.
positive_status = generate_polya_positive_dataset_from_index(
    polya_grch38_index_csv,
    polya_grch38_positive_csv,
)
print('Generate polya positive set from polya index: {}'.format(positive_status))
# Plain string literal: the message has no placeholders, so the f-prefix the
# line used to carry was unnecessary. The printed text is unchanged.
print("Generated samples for positive set from polya")

Generate polya positive set from polya index: True
Generated samples for positive set from polya


In [1]:
"""
Generate negative set from positive set.
"""
from data_preparation import generate_negative_dataset
from data_dir import polya_grch38_positive_csv, polya_grch38_negative_csv

# Call generate_negative_dataset with the positive and negative CSV paths
# provided by data_dir, then print the value it returns.
negative_status = generate_negative_dataset(
    polya_grch38_positive_csv,
    polya_grch38_negative_csv,
)
print(f"Generate polya negative set from positive data {negative_status}")

Generate polya negative set from positive data True


In [7]:
"""Chunks positive and negative into 512-chars-chunk."""
import os
from data_preparation import generate_chunk_csv

# All four files handled by this cell live in the same directory.
grch38_dir = os.path.join("data", "poly-a", "grch38")
positive = os.path.join(grch38_dir, "polya_positive.csv")
positive_chunk = os.path.join(grch38_dir, "polya_positive.expanded.csv")
negative = os.path.join(grch38_dir, "polya_negative.csv")
negative_chunk = os.path.join(grch38_dir, "polya_negative.expanded.csv")

# The same chunking settings are applied to both classes.
chunk_options = {"chunk_size": 512, "stride": 256}

generate_chunk_csv(positive, positive_chunk, **chunk_options)
# Kept as the final bare expression so the notebook still displays the value
# returned by this call as the cell output.
generate_chunk_csv(negative, negative_chunk, **chunk_options)

Generating kmer for <data\poly-a\grch38\polya_negative.csv>: 1908/1908                                                      

True

In [9]:
"""
Create a balanced dataset. The promoter set has 6943 instances, so sample the positive and negative data down to 6943 instances each.
"""
import os
import pandas as pd

# Upper bound on the number of rows kept per class; 6943 is the promoter size
# quoted in this cell's description.
BALANCED_SIZE = 6943
# Fixed seeds so that re-running the cell selects the same rows. The two
# classes use different seeds so they do not pick identical row positions.
POSITIVE_SAMPLE_SEED = 42
NEGATIVE_SAMPLE_SEED = 43

grch38_dir = os.path.join("data", "poly-a", "grch38")
positive_balanced = os.path.join(grch38_dir, "polya_positive.balanced.csv")
positive_chunk = os.path.join(grch38_dir, "polya_positive.expanded.csv")
negative_balanced = os.path.join(grch38_dir, "polya_negative.balanced.csv")
negative_chunk = os.path.join(grch38_dir, "polya_negative.expanded.csv")

# Read each chunk file once and reuse the frame for both counting and sampling.
positive_source_df = pd.read_csv(positive_chunk)
negative_source_df = pd.read_csv(negative_chunk)
size_pos = positive_source_df.shape[0]
size_neg = negative_source_df.shape[0]
print(f"original positive size {size_pos}")
print(f"original negative size {size_neg}")

# Both classes are sampled to the same row count. The count is capped by the
# smaller class so the result stays balanced even when a class has fewer than
# BALANCED_SIZE rows, and both balanced files are always written because the
# later split step reads them unconditionally.
balanced_size = min(BALANCED_SIZE, size_pos, size_neg)

positive_df = positive_source_df.sample(n=balanced_size, random_state=POSITIVE_SAMPLE_SEED)
positive_df.to_csv(positive_balanced, index=False)
print(f"original positive size {size_pos} => balanced positive size = {positive_df.shape[0]}")

negative_df = negative_source_df.sample(n=balanced_size, random_state=NEGATIVE_SAMPLE_SEED)
negative_df.to_csv(negative_balanced, index=False)
print(f"original negative size {size_neg} => balanced negative size = {negative_df.shape[0]}")

original positive size 14477
original negative size 14477
original positive size 14477 => balanced positive size = 6943
original negative size 14477 => balanced negative size = 6943


In [11]:
import os
from data_preparation import split_csv

# Every input and output of this cell lives in the same directory.
grch38_dir = os.path.join("data", "poly-a", "grch38")

# Split imbalanced dataset: inputs are the expanded positive/negative files.
positive_chunk = os.path.join(grch38_dir, "polya_positive.expanded.csv")
positive_chunk_train = os.path.join(grch38_dir, "polya_positive.train.csv")
positive_chunk_validation = os.path.join(grch38_dir, "polya_positive.validation.csv")
negative_chunk = os.path.join(grch38_dir, "polya_negative.expanded.csv")
negative_chunk_train = os.path.join(grch38_dir, "polya_negative.train.csv")
negative_chunk_validation = os.path.join(grch38_dir, "polya_negative.validation.csv")

# Split balanced dataset: inputs are the balanced positive/negative files.
positive_balanced = os.path.join(grch38_dir, "polya_positive.balanced.csv")
positive_balanced_train = os.path.join(grch38_dir, "polya_positive.balanced.train.csv")
positive_balanced_validation = os.path.join(grch38_dir, "polya_positive.balanced.validation.csv")
negative_balanced = os.path.join(grch38_dir, "polya_negative.balanced.csv")
negative_balanced_train = os.path.join(grch38_dir, "polya_negative.balanced.train.csv")
negative_balanced_validation = os.path.join(grch38_dir, "polya_negative.balanced.validation.csv")

# Each call receives its own [0.8, 0.2] fractions list and a
# [train, validation] pair of output paths. Call order is unchanged:
# imbalanced positive, imbalanced negative, balanced positive, balanced negative.
split_csv(positive_chunk, [0.8, 0.2], [positive_chunk_train, positive_chunk_validation])
split_csv(negative_chunk, [0.8, 0.2], [negative_chunk_train, negative_chunk_validation])
split_csv(positive_balanced, [0.8, 0.2], [positive_balanced_train, positive_balanced_validation])
split_csv(negative_balanced, [0.8, 0.2], [negative_balanced_train, negative_balanced_validation])

In [12]:
"""Merge training and validation imbalanced data"""
import os
from data_preparation import merge_csv

# Every input and output of this cell lives in the same directory.
grch38_dir = os.path.join("data", "poly-a", "grch38")

# Imbalanced data: per-class train/validation inputs and the merged outputs.
positive_chunk_train = os.path.join(grch38_dir, "polya_positive.train.csv")
positive_chunk_validation = os.path.join(grch38_dir, "polya_positive.validation.csv")
negative_chunk_train = os.path.join(grch38_dir, "polya_negative.train.csv")
negative_chunk_validation = os.path.join(grch38_dir, "polya_negative.validation.csv")
chunk_train = os.path.join(grch38_dir, "polya_train.csv")
chunk_validation = os.path.join(grch38_dir, "polya_validation.csv")

# Balanced data: per-class train/validation inputs and the merged outputs.
positive_balanced_train = os.path.join(grch38_dir, "polya_positive.balanced.train.csv")
positive_balanced_validation = os.path.join(grch38_dir, "polya_positive.balanced.validation.csv")
negative_balanced_train = os.path.join(grch38_dir, "polya_negative.balanced.train.csv")
negative_balanced_validation = os.path.join(grch38_dir, "polya_negative.balanced.validation.csv")
chunk_balanced_train = os.path.join(grch38_dir, "polya_train.balanced.csv")
chunk_balanced_validation = os.path.join(grch38_dir, "polya_validation.balanced.csv")

# Merge training and validation imbalanced data (positive listed before negative).
merge_csv([positive_chunk_train, negative_chunk_train], chunk_train)
merge_csv([positive_chunk_validation, negative_chunk_validation], chunk_validation)

# Merge training and validation balanced data (positive listed before negative).
merge_csv([positive_balanced_train, negative_balanced_train], chunk_balanced_train)
# Kept as the final bare expression so the notebook still displays the value
# returned by this call as the cell output.
merge_csv([positive_balanced_validation, negative_balanced_validation], chunk_balanced_validation)




100%|██████████| 2/2 [00:00<00:00,  7.48it/s]
100%|██████████| 2/2 [00:00<00:00, 27.68it/s]
100%|██████████| 2/2 [00:00<00:00, 10.41it/s]
100%|██████████| 2/2 [00:00<00:00, 25.78it/s]


True

In [None]:
"""
Generate kmer version for both positive and negative polya.
"""
from data_preparation import generate_kmer_csv
from data_dir import (
    polya_grch38_positive_csv,
    polya_grch38_negative_csv,
    polya_grch38_positive_kmer_csv,
    polya_grch38_negative_kmer_csv,
)

# Positive file first: call generate_kmer_csv and print the value it returns.
positive_kmer_status = generate_kmer_csv(polya_grch38_positive_csv, polya_grch38_positive_kmer_csv)
print(f"Generate kmer version of polya positive data: {positive_kmer_status}")

# Then the negative file, reported the same way.
negative_kmer_status = generate_kmer_csv(polya_grch38_negative_csv, polya_grch38_negative_kmer_csv)
print(f"Generate kmer version of polya negative data: {negative_kmer_status}")

In [1]:
"""
Generate train and validation for positive and negative
"""
from data_preparation import split_csv
import os


def _grch38_path(filename):
    """Return the path of `filename` inside data/poly-a/grch38."""
    return os.path.join("data", "poly-a", "grch38", filename)


# One fractions list is shared by both split calls below.
fractions = [0.8, 0.2]

positive_chunk = _grch38_path("polya_positive.expanded.csv")
positive_chunk_train = _grch38_path("polya_positive.expanded.train.csv")
positive_chunk_val = _grch38_path("polya_positive.expanded.validation.csv")

negative_chunk = _grch38_path("polya_negative.expanded.csv")
negative_chunk_train = _grch38_path("polya_negative.expanded.train.csv")
negative_chunk_val = _grch38_path("polya_negative.expanded.validation.csv")

# Each call passes the source file, the fractions, and a [train, validation]
# pair of output paths; the positive file is processed before the negative one.
split_csv(positive_chunk, fractions, [positive_chunk_train, positive_chunk_val])
split_csv(negative_chunk, fractions, [negative_chunk_train, negative_chunk_val])

In [5]:
"""Merge train and validation"""
from data_preparation import merge_csv
import os

# Every input and output of this cell lives in the same directory.
grch38_dir = os.path.join("data", "poly-a", "grch38")

# Input lists keep the positive file ahead of the negative one.
trains = [
    os.path.join(grch38_dir, filename)
    for filename in (
        "polya_positive.expanded.train.csv",
        "polya_negative.expanded.train.csv",
    )
]
vals = [
    os.path.join(grch38_dir, filename)
    for filename in (
        "polya_positive.expanded.validation.csv",
        "polya_negative.expanded.validation.csv",
    )
]

train_target = os.path.join(grch38_dir, "polya_train.imbalanced.csv")
val_target = os.path.join(grch38_dir, "polya_val.imbalanced.csv")

merge_csv(trains, train_target)
# Kept as the final bare expression so the notebook still displays the value
# returned by this call as the cell output.
merge_csv(vals, val_target)

100%|██████████| 2/2 [00:00<00:00,  7.46it/s]
100%|██████████| 2/2 [00:00<00:00, 42.56it/s]


True