In [3]:
"""Create sample gene index for testing."""
import os
import pandas as pd

# Build the paths with os.path.join so the cell also runs on non-Windows hosts;
# the previous hard-coded "\\" separators only resolve on Windows.
gene_dir = os.path.join(
    "workspace", "seqlab", "by_genes", "gene_dir",
    "seqlab.strand-positive.kmer.stride-510",
)
df = pd.read_csv(os.path.join(gene_dir, "gene_index.csv"))
# Draw 1% of the rows. The fixed seed makes the sample file reproducible
# across re-runs of this cell.
sf = df.sample(frac=0.01, random_state=1337)
sf.to_csv(os.path.join(gene_dir, "gene_index.sample.csv"), index=False)

In [6]:
"""Make small sample from bundle sample."""
import os
import pandas as pd

# Build the paths with os.path.join so the cell also runs on non-Windows hosts;
# the previous hard-coded "\\" separators only resolve on Windows.
bundle_dir = os.path.join("workspace", "seqlab", "by_sequence", "seqlab-stride.256")
df = pd.read_csv(os.path.join(bundle_dir, "bundle.csv"))
# Seeded draw of 10 rows, so the small sample is identical on every run.
sample_df = df.sample(n=10, random_state=1337)
sample_df.to_csv(os.path.join(bundle_dir, "bundle.sample.small.csv"), index=False)

In [1]:
"""Merge promoter, ss, and polya (imbalanced data)"""
import os

# Input CSVs: one training and one validation file per dataset.
promoter = os.path.join("data", "promoter", "epd")
promoter_train = os.path.join(promoter, "promoter_train.csv")
promoter_validation = os.path.join(promoter, "promoter_validation.csv")
ss = os.path.join("data", "splice-sites", "splice-deep")
ss_train = os.path.join(ss, "ss_train.csv")
ss_validation = os.path.join(ss, "ss_validation.csv")
polya = os.path.join("data", "poly-a", "grch38")
polya_train = os.path.join(polya, "polya_train.csv")
polya_validation = os.path.join(polya, "polya_validation.csv")

# Report the status of every input, then stop before merging if any of them
# is missing. Previously the check only printed and the merge ran regardless.
missing = []
for p in [promoter_train, promoter_validation, ss_train, ss_validation, polya_train, polya_validation]:
    if os.path.exists(p):
        print(f"{p} found.")
    else:
        print(f"{p} not found.")
        missing.append(p)
if missing:
    raise FileNotFoundError(f"Cannot merge, missing input file(s): {', '.join(missing)}")

from data_preparation import merge_prom_ss_polya_csv

# Destination files of the merged training and validation data.
train_path = os.path.join("workspace", "mtl", "train.csv")
validation_path = os.path.join("workspace", "mtl", "validation.csv")

# NOTE(review): merge_prom_ss_polya_csv lives in data_preparation; presumably it
# writes the merged CSV to the given path - confirm there.
merge_prom_ss_polya_csv({
    "prom": [promoter_train],
    "ss": [ss_train],
    "polya": [polya_train]
}, train_path)

# Kept as the last top-level expression so its return value is still shown as
# the cell output.
merge_prom_ss_polya_csv({
    "prom": [promoter_validation],
    "ss": [ss_validation],
    "polya": [polya_validation]
}, validation_path)

data\promoter\epd\promoter_train.csv found.
data\promoter\epd\promoter_validation.csv found.
data\splice-sites\splice-deep\ss_train.csv found.
data\splice-sites\splice-deep\ss_validation.csv found.
data\poly-a\grch38\polya_train.csv found.
data\poly-a\grch38\polya_validation.csv found.


True

In [2]:
"""Merge promoter, ss, and polya balanced data"""
import os

# Directories holding the three balanced source datasets.
promoter = os.path.join("data", "promoter", "epd")
ss = os.path.join("data", "splice-sites", "splice-deep")
polya = os.path.join("data", "poly-a", "grch38")

# Training / validation CSV of each dataset.
promoter_train = os.path.join(promoter, "promoter_train.balanced.csv")
promoter_validation = os.path.join(promoter, "promoter_validation.balanced.csv")
ss_train = os.path.join(ss, "ss_train.balanced.csv")
ss_validation = os.path.join(ss, "ss_validation.balanced.csv")
polya_train = os.path.join(polya, "polya_train.balanced.csv")
polya_validation = os.path.join(polya, "polya_validation.balanced.csv")

# Print the status of every input and remember whether each one was found.
checkpoints = []
for p in (promoter_train, promoter_validation, ss_train, ss_validation, polya_train, polya_validation):
    is_present = os.path.exists(p)
    print(f"{p} found." if is_present else f"{p} not found.")
    checkpoints.append(is_present)

# Merge only when no input is missing.
if False not in checkpoints:
    from data_preparation import merge_prom_ss_polya_csv

    # Destination files of the merged balanced data.
    train_path = os.path.join("workspace", "mtl", "train.balanced.csv")
    validation_path = os.path.join("workspace", "mtl", "validation.balanced.csv")

    train_sources = {"prom": [promoter_train], "ss": [ss_train], "polya": [polya_train]}
    merge_prom_ss_polya_csv(train_sources, train_path)

    validation_sources = {"prom": [promoter_validation], "ss": [ss_validation], "polya": [polya_validation]}
    merge_prom_ss_polya_csv(validation_sources, validation_path)

data\promoter\epd\promoter_train.balanced.csv found.
data\promoter\epd\promoter_validation.balanced.csv found.
data\splice-sites\splice-deep\ss_train.balanced.csv found.
data\splice-sites\splice-deep\ss_validation.balanced.csv found.
data\poly-a\grch38\polya_train.balanced.csv found.
data\poly-a\grch38\polya_validation.balanced.csv found.


In [5]:
"""Create sample data."""
import pandas as pd
import os

mtl_dir = os.path.join("workspace", "mtl")
# (file stem, number of rows to draw) for each sample file to create.
for stem, n_rows in (("train", 10), ("validation", 5)):
    full_df = pd.read_csv(os.path.join(mtl_dir, f"{stem}.csv"))
    # Fixed seed so re-running the cell regenerates the same sample files.
    sample_df = full_df.sample(n=n_rows, random_state=1337)
    sample_df.to_csv(os.path.join(mtl_dir, f"{stem}.sample.csv"), index=False)


In [1]:
import os
import pandas as pd
from data_preparation import str_kmer

# Header of every generated CSV.
_columns = ["sequence", "label_prom", "label_ss", "label_polya"]
# Source files are read from workspace/mtl/original; each result is written to
# workspace/mtl under the same file name.
src_paths = [
    os.path.join("workspace", "mtl", "original", p) for p in [
        'train.balanced.csv',
        'train.csv',
        'train.sample.csv',
        'train.sample.long.csv',
        'validation.balanced.csv',
        'validation.csv',
        'validation.sample.csv'
    ]
]

for src_path in src_paths:
    print(f"Working in {src_path}")
    src_df = pd.read_csv(src_path)
    dest_path = os.path.join("workspace", "mtl", os.path.basename(src_path))
    # Mode "w" creates the file or truncates an existing one, which replaces the
    # former remove-then-create sequence. The with-block guarantees the handle
    # is closed even if str_kmer or a write raises part-way through.
    with open(dest_path, "w") as dest:
        dest.write(f"{','.join(_columns)}\n")
        # Rows are streamed straight to disk rather than accumulated in memory.
        for _, r in src_df.iterrows():
            # NOTE(review): presumably turns the sequence into 3-mer tokens -
            # confirm against str_kmer in data_preparation.
            seq_kmer = str_kmer(r["sequence"], 3)
            dest.write(f"{seq_kmer},{r['label_prom']},{r['label_ss']},{r['label_polya']}\n")


Working in workspace\mtl\original\train.balanced.csv
Working in workspace\mtl\original\train.csv
Working in workspace\mtl\original\train.sample.csv
Working in workspace\mtl\original\train.sample.long.csv
Working in workspace\mtl\original\validation.balanced.csv
Working in workspace\mtl\original\validation.csv
Working in workspace\mtl\original\validation.sample.csv


In [4]:
# Write a seeded 25% random subset of the balanced training and validation CSVs.
import os
import pandas as pd
pathdir = os.path.join("workspace", "mtl", "balanced")
# Path stems (no extension) of the files to subsample.
files = [os.path.join(pathdir, a) for a in ("train", "validation")]
for f in files:
    source_csv = f"{f}.csv"
    target_csv = f"{f}.25.csv"
    # Read, draw a quarter of the rows with a fixed seed, and store the result
    # next to the source file.
    pd.read_csv(source_csv).sample(frac=0.25, random_state=1337).to_csv(target_csv, index=False)

In [1]:
# Generate 10% of training data and validation data.
import os
import pandas as pd
pathdir = os.path.join("workspace", "mtl", "balanced")
files = ["train", "validation"]
# Path stems (no extension) of the files to subsample.
files = [os.path.join(pathdir, a) for a in files]
for f in files:
    df = pd.read_csv(f"{f}.csv")
    # Seeded draw of a tenth of the rows, written next to the source file.
    subset = df.sample(frac=0.1, random_state=1337)
    subset.to_csv(f"{f}.10.csv", index=False)

In [None]:
# Generate 10% of the gene training, validation and test bundles.
import os
import pandas as pd
pathdir = os.path.join("workspace", "seqlab", "seqlab.strand-positive.kmer.stride-510.from-index")
files = ["gene_train_bundle", "gene_validation_bundle", "gene_test_bundle"]
# Path stems (no extension) of the files to subsample.
files = [os.path.join(pathdir, a) for a in files]
for f in files:
    df = pd.read_csv(f"{f}.csv")
    # Seeded draw of a tenth of the rows, written next to the source file.
    subset = df.sample(frac=0.1, random_state=1337)
    subset.to_csv(f"{f}.10.csv", index=False)