In [51]:
import pandas
import pickle
import pathlib
cwd = pathlib.Path.cwd()
import random
import re
import numpy
import matplotlib.pyplot as plt

# Number of rows to generate for each of the three datasets.
total_exons = 10_000
total_introns = 10_000
total_ep = 20_000

# Seed both generators this notebook draws from (the stdlib `random` module and
# numpy's global RNG) so that Restart & Run All regenerates the same datasets.
random_seed = 42
random.seed(random_seed)
numpy.random.seed(random_seed)

Generates three datasets:

1) an exon dataset
2) an intron dataset
3) an equal-probability dataset with lengths drawn uniformly between 12 and 2000 (each sequence randomly labeled as an exon or an intron).

In [52]:
def gen_seq(kmer: int = 6):
    '''
    Generates a random nucleotide sequence that is ``kmer`` characters long.

    Each character comes from one call to gen_nuc(). A negative ``kmer`` is
    replaced by its absolute value and a ``kmer`` of 0 falls back to 6, so the
    returned string is never empty.

    Returns the sequence as a str.
    '''
    if kmer < 0:
        kmer = abs(kmer)
    if kmer == 0:
        kmer = 6

    # Collect the draws and join once instead of rebuilding the string on every
    # iteration. Each new draw belongs in front of the previous ones, so the
    # list is reversed before joining; the output for a given RNG state is the
    # same as with repeated prepending.
    draws = [gen_nuc() for _ in range(kmer)]

    return "".join(reversed(draws))



def gen_nuc():
    '''
    Returns one randomly chosen nucleotide letter: "A", "G", "T" or "C".

    A letter is returned rather than a number (which might seem more convenient)
    so that the sequences can be saved, inspected later, and fed to the existing
    scripts as-is.
    '''
    letters = {1: "A", 2: "G", 3: "T"}

    # Any draw that is not 1, 2 or 3 (i.e. 4) maps to "C".
    return letters.get(random.randint(1, 4), "C")
    

def gen_length(average: int = 120, sigma: int = 25, minimum: int = 1) -> int:
    '''
    Generates a sequence length, using a gaussian distribution.

    average: mean of the distribution.
    sigma:   standard deviation of the distribution.
    minimum: smallest length that may be returned. A draw that truncates to
             less than this is raised to it, so callers never receive a zero
             or negative length.

    Returns the draw truncated to an int and clamped to ``minimum``.

    By trial and error: exons -> 120 and 25, introns -> 1000, 200
    '''

    length = int(numpy.random.normal(average, scale = sigma))

    return max(length, minimum)


def gen_region():
    '''
    Picks a region label at random: "exon" on a draw of 1, "intron" otherwise.
    '''
    if random.randint(1, 2) == 1:
        return "exon"

    return "intron"

Generates Exon data

In [53]:
exon_rows = []

for _ in range(total_exons):
    seq = gen_seq(kmer = gen_length())
    # Record the length of the sequence actually produced: gen_seq() adjusts a
    # zero or negative request, so the requested value could disagree with "Seq".
    exon_rows.append((seq, len(seq), "exon"))

# Build the frame in one go rather than writing three cells per row through .loc.
# The "Classificaion" spelling is part of the saved schema and is kept as-is.
# dtype=object and the explicit integer index keep the pickled layout the same
# as an NA-prefilled frame filled via .loc (presumably what existing readers of
# the pickle expect -- TODO confirm).
random_exons = pandas.DataFrame(
    exon_rows,
    columns=["Seq", "Length", "Classificaion"],
    index=list(range(total_exons)),
    dtype=object,
)

print(random_exons.head())

                                                 Seq Length Classificaion
0  TCTAAGGTTGCTCTACAGGTGTTTATTTGACACCCACCAGAGTGCA...    141          exon
1  CTCTTGACAGACATGTCATCCCGGCATCCGGATCCCGCGTAAGGGG...    113          exon
2  GCACACTATCCCCCCACGCGGACCCGAGCTCACCAGAATTAAAGTG...    110          exon
3  ACGTACTGGTCATCCTCTCCACTAACTATCGGTATGTAATCCTCTG...    146          exon
4  AAAGTTCTATGCGCTTGTATCACTATTAGTCAAAGGTATGGCAGTT...    111          exon


Generates Intron Data

In [54]:
intron_rows = []

for _ in range(total_introns):
    seq = gen_seq(kmer = gen_length(average = 1000, sigma = 200))
    # Record the length of the sequence actually produced: gen_seq() adjusts a
    # zero or negative request, so the requested value could disagree with "Seq".
    intron_rows.append((seq, len(seq), "intron"))

# Build the frame in one go rather than writing three cells per row through .loc.
# The "Classificaion" spelling is part of the saved schema and is kept as-is.
# dtype=object and the explicit integer index keep the pickled layout the same
# as an NA-prefilled frame filled via .loc (presumably what existing readers of
# the pickle expect -- TODO confirm).
random_introns = pandas.DataFrame(
    intron_rows,
    columns=["Seq", "Length", "Classificaion"],
    index=list(range(total_introns)),
    dtype=object,
)

print(random_introns.head())

                                                 Seq Length Classificaion
0  GCAAAAGACGGCATTGGGATGTGCACTTAATTGGAGTTTAGTTCCT...    777        intron
1  GCGGTAGTGTAAGGCGTGGAATGCAGATATTAAACTTGCCTACGGT...    943        intron
2  AAGAGAGGGCACTCAAAGAGGCCGGTAATTCTTTGAGAATCTCTTA...   1411        intron
3  GACTCGCAATATCTAGCGATCTACTTCCTCAATGAACGTTCCGCTT...    973        intron
4  ATCGCTCGAGTGTGGCACGGGGCCACATCTTTCCGGGGTTAGAAGC...   1224        intron


Generates "Equal Probability" data

In [55]:
ep_rows = []

for _ in range(total_ep):
    # Per-row draw order is length, then sequence, then label, so a seeded run
    # consumes the random stream in the same order as before.
    seq = gen_seq(kmer = random.randint(12, 2000))
    ep_rows.append((seq, len(seq), gen_region()))

# Build the frame in one go rather than writing three cells per row through .loc.
# The "Classificaion" spelling is part of the saved schema and is kept as-is.
# dtype=object and the explicit integer index keep the pickled layout the same
# as an NA-prefilled frame filled via .loc (presumably what existing readers of
# the pickle expect -- TODO confirm).
random_ep = pandas.DataFrame(
    ep_rows,
    columns=["Seq", "Length", "Classificaion"],
    index=list(range(total_ep)),
    dtype=object,
)

print(random_ep.head())

                                                 Seq Length Classificaion
0  GACACGGCGGTTGTGGATCAAAATTTGCATCGACATTTGCGTTCGA...   1662          exon
1  CGCTATCTAACGAAAGGTAGGGCCGTGCGTGTGCGCTTTTGTCGAC...   1581        intron
2  GTAGCGACTACTTCGTCCTTCGAAGATCCCCGACAACACTTCGACT...   1546          exon
3  ACTAGGACGTCAAAAGGCAACAGTAAATGAGTCCACGTGCAGGCCT...    839        intron
4  ACTCCGCCGCGGGGTGAGGTTCTCTCGCTGCCGAGTCTGTGCCCTA...     87          exon


In [56]:
# Pickle the three generated frames into ../Data_Files (relative to the
# current working directory captured in `cwd`).
# NOTE(review): "Ranomd" in two of the file names looks like a typo of "Random";
# the names are left untouched because other scripts may load these exact paths.
output_dir = cwd.parent / "Data_Files"

random_exons.to_pickle(output_dir / "Random_Exon_Seq.pkl")
random_introns.to_pickle(output_dir / "Ranomd_Intron_Seq.pkl")
random_ep.to_pickle(output_dir / "Ranomd_EP_Seq.pkl")

: 