In [None]:
"""
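This notebook contains the Z-curve representation of a DNA sequence.
The idea is to create a numeric representation of each base (A, T, G, and C) in 3D space using the formulas below, where An, Cn, Gn, and Tn are the cumulative counts of each base in the first n positions of the sequence.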

xn = (An + Gn) - (Cn + Tn) as purine vs. pyrimidine,
yn = (An + Cn) - (Gn + Tn) as amino vs. keto,
zn = (An + Tn) - (Gn + Cn) as hydrogen bonds representation.

Reference can be found at https://pubmed.ncbi.nlm.nih.gov/8204213/
"""

In [4]:
# Relative path to a dataset text file.
# NOTE(review): presumably the input intended for generate_z_curve_from_file
# (tab-separated, with a header line) -- not referenced in the cells shown here; confirm.
DATASET_PATH = "data/ft/fine_tuning_sample_k-mer_3_ALPHA_BETA_DELTA_merged.txt"

# Generate array of (x, y, z) coordinate from a sequence.
# @param seq : Sequence to be converted.
# @return : Array of coordinates.
def get_z_curve(seq):
    """Walk a sequence base by base and record the cumulative Z-curve point after each base.

    After every base the three components are:
        x = (#A + #G) - (#C + #T)
        y = (#A + #C) - (#G + #T)
        z = (#A + #T) - (#G + #C)
    where each count covers the bases read so far.

    @param seq : Iterable of single-character bases; only 'A', 'T', 'G' and 'C' are accepted.
    @return : List of (x, y, z) integer tuples, one per base, in sequence order.
    @raise KeyError : If a symbol other than 'A', 'T', 'G' or 'C' is encountered.
    """
    # Incrementing one base count by 1 moves each component by exactly +1 or -1,
    # so the formulas above reduce to a fixed step per base.
    steps = {
        'A': (1, 1, 1),
        'T': (-1, -1, 1),
        'G': (1, -1, -1),
        'C': (-1, 1, -1),
    }
    x = y = z = 0
    curve = []
    for base in seq:
        dx, dy, dz = steps[base]
        x, y, z = x + dx, y + dy, z + dz
        curve.append((x, y, z))
    return curve

# Read file containing sequences and its labels.
# Convert each sequence into z-curve representation.
# @param dataset_file_path : Path to source file.
# @param target_file_path : Target file to write z-curve representation of sequences in dataset. 
#                           If empty then no file is created.
def generate_z_curve_from_file(dataset_file_path, target_file_path=False):
    """Read a tab-separated dataset and convert every sequence to its z-curve.

    The source file must start with a header line (which is skipped), followed by
    one record per line: the sequence in the first column and its label in the
    second column. Blank lines are ignored.

    When `target_file_path` is given, the result is also written to that file as
    tab-separated text: a `z_curve<TAB>label` header, then one line per record
    where the z-curve is serialized as space-separated `x,y,z` triples.

    @param dataset_file_path : Path to source file.
    @param target_file_path : Target file to write z-curve representation of sequences in dataset.
                              If empty (falsy) then no file is created.
    @return : List with one z-curve (list of (x, y, z) tuples) per record, in file order.
              An empty source file yields an empty list.
    @raise IndexError : If a non-blank record has no tab-separated label column.
    """
    z_curves = []
    labels = []

    # The context manager guarantees the handle is closed even if a record fails to parse.
    with open(dataset_file_path, 'r') as dataset_file:
        # Skip the header; the default keeps an empty file from raising StopIteration.
        next(dataset_file, None)

        for line in dataset_file:
            # Tolerate blank lines (typically a trailing newline at the end of the file).
            if not line.strip():
                continue
            fields = line.split('\t')
            seq = fields[0].strip()
            label = fields[1].strip()
            z_curves.append(get_z_curve(seq))
            labels.append(label)

    if target_file_path:
        with open(target_file_path, 'w') as target_file:
            target_file.write("z_curve\tlabel\n")
            for curve, label in zip(z_curves, labels):
                serialized = " ".join(
                    "{},{},{}".format(x, y, z) for x, y, z in curve
                )
                target_file.write("{}\t{}\n".format(serialized, label))

    return z_curves
    

In [5]:
# Test: print the list of (x, y, z) z-curve coordinates computed for a sample sequence.
s = "GTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACT"
print(get_z_curve(s))

[(1, -1, -1), (0, -2, 0), (-1, -3, 1), (-2, -2, 0), (-3, -3, 1), (-4, -2, 0), (-5, -3, 1), (-4, -2, 2), (-3, -1, 3), (-2, 0, 4), (-3, 1, 3), (-2, 0, 2), (-1, 1, 3), (0, 2, 4), (-1, 3, 3), (-2, 2, 4), (-3, 1, 5), (-4, 0, 6), (-3, 1, 7), (-2, 2, 8), (-1, 3, 9), (0, 4, 10), (-1, 3, 11), (-2, 4, 10), (-3, 3, 11), (-2, 2, 10), (-3, 1, 11), (-2, 0, 10), (-3, -1, 11), (-2, -2, 10), (-1, -3, 9), (-2, -2, 8), (-3, -3, 9), (-2, -4, 8), (-3, -5, 9), (-4, -4, 8), (-3, -3, 9), (-4, -2, 8), (-5, -3, 9), (-6, -2, 8), (-5, -3, 7), (-4, -4, 6), (-5, -3, 5), (-6, -4, 6), (-5, -5, 5), (-6, -4, 4), (-5, -3, 5), (-6, -4, 6), (-5, -5, 5), (-6, -4, 4), (-7, -5, 5), (-8, -6, 6), (-7, -5, 7), (-6, -6, 6), (-7, -7, 7), (-6, -8, 6), (-7, -7, 5), (-6, -6, 6), (-7, -5, 5), (-8, -6, 6)]
