In [1]:
# Generate polya index from genome GFF.
from data_dir import annotated_grch38_gff, polya_grch38_index_csv
from data_preparation import generate_polya_index_from_annotated_genome
print('Generate polya index from annotated grch38 gff: {}'.format(generate_polya_index_from_annotated_genome(annotated_grch38_gff, polya_grch38_index_csv)))

./data/genome/grch38/GRCh38_latest_genomic.gff
./data/poly-a/grch38/polya_index.csv
./data/poly-a/grch38/polya_positive.csv
Generate polya index from annotated grch38 gff: True


In [1]:
"""
Generate positive set from polya index.
"""
from data_preparation import generate_polya_positive_dataset_from_index
from data_dir import polya_grch38_index_csv, polya_grch38_positive_csv
print('Generate polya positive set from polya index: {}'.format(generate_polya_positive_dataset_from_index(polya_grch38_index_csv, polya_grch38_positive_csv)))

Generate polya positive set from polya index: True


In [None]:
"""
Generate negative set from positive set.
"""
from data_preparation import generate_negative_dataset
from data_dir import polya_grch38_positive_csv, polya_grch38_negative_csv
print("Generate polya negative set from positive data {}".format(generate_negative_dataset(polya_grch38_positive_csv, polya_grch38_negative_csv)))

In [None]:
"""
Generate kmer version for both positive and negative polya.
"""
from data_preparation import generate_kmer_csv
from data_dir import polya_grch38_positive_csv, polya_grch38_negative_csv, polya_grch38_positive_kmer_csv, polya_grch38_negative_kmer_csv

print("Generate kmer version of polya positive data: {}".format(generate_kmer_csv(polya_grch38_positive_csv, polya_grch38_positive_kmer_csv)))
print("Generate kmer version of polya negative data: {}".format(generate_kmer_csv(polya_grch38_negative_csv, polya_grch38_negative_kmer_csv)))

In [4]:
"""
Generate train, validation, and test set.
"""
from data_preparation import generate_datasets
from data_dir import polya_grch38_positive_kmer_csv, polya_grch38_negative_kmer_csv, polya_grch38_negative_dir, polya_grch38_positive_dir

print("Generate dataset from positive kmer: {}".format(generate_datasets(polya_grch38_positive_kmer_csv, polya_grch38_positive_dir)))
print("Generate dataset from negative kmer: {}".format(generate_datasets(polya_grch38_negative_kmer_csv, polya_grch38_negative_dir)))

Generate dataset from positive kmer: ['./data/poly-a/grch38/positive/train.csv', './data/poly-a/grch38/positive/validation.csv', './data/poly-a/grch38/positive/test.csv']
Generate dataset from negative kmer: ['./data/poly-a/grch38/negative/train.csv', './data/poly-a/grch38/negative/validation.csv', './data/poly-a/grch38/negative/test.csv']


In [5]:
"""
Generate samples from kmer positive and kmer negative data.
"""
from data_preparation import generate_sample
from data_dir import polya_grch38_positive_dir, polya_grch38_negative_dir, sample_polya_pos_dir, sample_polya_neg_dir

# Create samples.
polya_pos_srcs = [
    ("{}/train.csv".format(polya_grch38_positive_dir), "{}/train.csv".format(sample_polya_pos_dir)),
    ("{}/validation.csv".format(polya_grch38_positive_dir), "{}/validation.csv".format(sample_polya_pos_dir)),
    ("{}/test.csv".format(polya_grch38_positive_dir), "{}/test.csv".format(sample_polya_pos_dir) ),
]
polya_neg_srcs = [
    ("{}/train.csv".format(polya_grch38_negative_dir), "{}/train.csv".format(sample_polya_neg_dir)), 
    ("{}/validation.csv".format(polya_grch38_negative_dir), "{}/validation.csv".format(sample_polya_neg_dir)),
    ("{}/test.csv".format(polya_grch38_negative_dir), "{}/test.csv".format(sample_polya_neg_dir)),
]
domain = [polya_pos_srcs, polya_neg_srcs]
for dom in domain:
    for src in dom:
        src_csv = src[0]
        target_csv = src[1]
        generate_sample(src_csv, target_csv, frac_sample=0.1)


Generate sample for ./data/poly-a/grch38/positive/train.csv => ./sample/polya/positive/train.csv
Generate sample for ./data/poly-a/grch38/positive/validation.csv => ./sample/polya/positive/validation.csv
Generate sample for ./data/poly-a/grch38/positive/test.csv => ./sample/polya/positive/test.csv
Generate sample for ./data/poly-a/grch38/negative/train.csv => ./sample/polya/negative/train.csv
Generate sample for ./data/poly-a/grch38/negative/validation.csv => ./sample/polya/negative/validation.csv
Generate sample for ./data/poly-a/grch38/negative/test.csv => ./sample/polya/negative/test.csv


In [3]:
"""
Generate train, validation, and test for positive and negative data.
"""
from data_preparation import generate_datasets
from data_dir import polya_grch38_positive_csv, polya_grch38_negative_csv, sample_polya_pos_dir, sample_polya_neg_dir

print("Generate train, validation, and test for positive data: {}".format(generate_datasets(polya_grch38_positive_csv, sample_polya_pos_dir)))
print("Generate train, validation, and test for negative data: {}".format(generate_datasets(polya_grch38_negative_csv, sample_polya_neg_dir)))

Generate train, validation, and test for positive data: ['./sample/polya/positive/train.csv', './sample/polya/positive/validation.csv', './sample/polya/positive/test.csv']
Generate train, validation, and test for negative data: ['./sample/polya/negative/train.csv', './sample/polya/negative/validation.csv', './sample/polya/negative/test.csv']


In [1]:
"""
Merge positive and negative train, validation, and test data.
"""
from data_preparation import merge_csv
from data_dir import sample_polya_pos_dir, sample_polya_neg_dir, sample_polya_train_csv, sample_polya_validation_csv, sample_polya_test_csv

merge_csv([
    "{}/train.csv".format(sample_polya_pos_dir),
    "{}/train.csv".format(sample_polya_neg_dir)
], sample_polya_train_csv)
merge_csv([
    "{}/validation.csv".format(sample_polya_pos_dir),
    "{}/validation.csv".format(sample_polya_neg_dir)
], sample_polya_validation_csv)
merge_csv([
    "{}/test.csv".format(sample_polya_pos_dir),
    "{}/test.csv".format(sample_polya_neg_dir)
], sample_polya_test_csv)

True

In [1]:
"""
Merge positive and negative poly A.
"""
from data_preparation import merge_csv
from data_dir import polya_grch38_negative_kmer_csv, polya_grch38_positive_kmer_csv, polya_grch38_train_csv

files = [polya_grch38_negative_kmer_csv, polya_grch38_positive_kmer_csv]
print("Merging {} => {}: {}".format(files, polya_grch38_train_csv, merge_csv(files, polya_grch38_train_csv)))


Merging ['./data/poly-a/grch38/polya_negative_kmer.csv', './data/poly-a/grch38/polya_positive_kmer.csv'] => ./data/poly-a/grch38/train.csv: True


In [1]:
"""
Expand sample train, validation, and test data.
"""
from data_preparation import expand_files_in_dir
from data_dir import sample_polya_dir

print("Generate expanded datasets: {}".format(expand_files_in_dir(sample_polya_dir, length=510)))

Error [Errno 28] No space left on devicesv: 404/404
Error Traceback (most recent call last):
  File "w:\Research\_sequence-processing\data_preparation.py", line 993, in expand_by_sliding_window
    target_df.to_csv(target_csv, index=False)
  File "c:\.virtualenv\sequence-processing-py39\lib\site-packages\pandas\core\generic.py", line 3563, in to_csv
    return DataFrameRenderer(formatter).to_csv(
  File "c:\.virtualenv\sequence-processing-py39\lib\site-packages\pandas\io\formats\format.py", line 1180, in to_csv
    csv_formatter.save()
  File "c:\.virtualenv\sequence-processing-py39\lib\site-packages\pandas\io\formats\csvs.py", line 261, in save
    self._save()
  File "c:\.virtualenv\sequence-processing-py39\lib\site-packages\pandas\io\formats\csvs.py", line 266, in _save
    self._save_body()
  File "c:\.virtualenv\sequence-processing-py39\lib\site-packages\pandas\io\formats\csvs.py", line 304, in _save_body
    self._save_chunk(start_i, end_i)
  File "c:\.virtualenv\sequence-process