<a href="https://colab.research.google.com/github/andrkech/GENERATIVE-METHODS-IN-GENOMICS/blob/main/Variant_Generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q biopython
import random
import csv
import pandas as pd
import numpy as np
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import tensorflow as tf

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Input files; the /content/drive prefix is presumably a Colab Google Drive
# mount -- confirm before running elsewhere.
fasta_file_path = "/content/drive/MyDrive/BIOINFORMATICS/THESIS_KECHAGIAS/DATA/INFO/TP53.fasta"
csv_file_path = "/content/drive/MyDrive/BIOINFORMATICS/THESIS_KECHAGIAS/DATA/INFO/totalVariantDataGATK_4_1_0_0.csv"
# Length (in bases) of each synthetic read passed to generate_reads.
read_size = 300
# Number of synthetic reads to draw.
num_reads = 50
# Read as a module-level global inside generate_reads to seed `random`.
seed = 2

Read a FASTA file and return its sequence as a single string (multiple records are concatenated).

In [None]:
def read_fasta(file_path):
    """Return the concatenated sequence of every record in a FASTA file.

    The records yielded by ``SeqIO.parse`` are converted to strings and
    joined, in file order, without any separator.

    Args:
        file_path: Location of the FASTA file to parse.

    Returns:
        The joined sequence string, or ``None`` if any exception is raised
        while parsing (the error message is printed).
    """
    try:
        chunks = [str(entry.seq) for entry in SeqIO.parse(file_path, "fasta")]
    except Exception as e:
        print(f"Error: {e}")
        return None

    return "".join(chunks)

In [None]:
fasta_sequence = read_fasta(fasta_file_path)

# read_fasta signals failure by returning None; stop here with a clear message
# instead of letting len(None) raise an unrelated TypeError below.
if fasta_sequence is None:
    raise RuntimeError(f"Could not read FASTA file: {fasta_file_path}")

print(fasta_sequence, "\n", len(fasta_sequence))

CTAGAGCCACCGTCCAGGGAGCAGGTAGCTGCTGGGCTCCGGGGACACTTTGCGTTCGGGCTGGGAGCGTGCTTTCCACGACGGTGACACGCTTCCCTGGATTGGGTAAGCTCCTGACTGAACTTGATGAGTCCTCTCTGAGTCACGGGCTCTCGGCTCCGTGTATTTTCAGCTCGGGAAAATCGCTGGGGCTGGGGGTGGGGCAGTGGGGACTTAGCGAGTTTGGGGGTGAGTGGGATGGAAGCTTGGCTAGAGGGATCATCATAGGAGTTGCATTGTTGGGAGACCTGGGTGTAGATGATGGGGATGTTAGGACCATCCGAACTCAAAGTTGAACGCCTAGGCAGAGGAGTGGAGCTTTGGGGAACCTTGAGCCGGCCTAAAGCGTACTTCTTTGCACATCCACCCGGTGCTGGGCGTAGGGAATCCCTGAAATAAAAGATGCACAAAGCATTGAGGTCTGAGACTTTTGGATCTCGAAACATTGAGAACTCATAGCTGTATATTTTAGAGCCCATGGCATCCTAGTGAAAACTGGGGCTCCATTCCGAAATGATCATTTGGGGGTGATCCGGGGAGCCCAAGCTGCTAAGGTCCCACAACTTCCGGACCTTTGTCCTTCCTGGAGCGATCTTTCCAGGCAGCCCCCGGCTCCGCTAGATGGAGAAAATCCAATTGAAGGCTGTCAGTCGTGGAAGTGAGAAGTGCTAAACCAGGGGTTTGCCCGCCAGGCCGAGGAGGACCGTCGCAATCTGAGAGGCCCGGCAGCCCTGTTATTGTTTGGCTCCACATTTACATTTCTGCCTCTTGCAGCAGCATTTCCGGTTTCTTTTTGCCGGAGCAGCTCACTATTCACCCGATGAGAGGGGAGGAGAGAGAGAGAAAATGTCCTTTAGGCCGGTTCCTCTTACTTGGCAGAGGGAGGCTGCTATTCTCCGCCTGCATTTCTTTTTCTGGATTACTTAGTTATGGCCTTTGCAAAGGCAGGGGTATTTGTTTT

Generate `num_reads` random reads of length `read_size` from the given sequence (i.e. random chunks of the FASTA sequence), each paired with its start position.

In [None]:
def generate_reads(sequence, read_size, num_reads, seed=None):
    """Sample fixed-length reads from random positions of a sequence.

    Args:
        sequence: The string to sample from.
        read_size: Length of every read.
        num_reads: How many reads to draw.
        seed: Optional seed for the ``random`` module. When omitted, a
            module-level variable named ``seed`` is used if one exists, which
            keeps the behaviour of callers that relied on that global.

    Returns:
        A list of ``(read, start)`` tuples, where ``start`` is the 0-based
        offset of ``read`` within ``sequence``. The list is empty when the
        sequence is shorter than ``read_size``.
    """
    if seed is None:
        # Backward compatibility: earlier versions read the seed from a
        # module-level global instead of taking it as an argument.
        seed = globals().get("seed")
    if seed is not None:
        random.seed(seed)

    # Largest start offset at which a full read still fits. Zero is valid:
    # a sequence exactly read_size long yields the whole sequence as a read.
    last_start = len(sequence) - read_size
    if last_start < 0:
        # Always return a list so callers can iterate/index uniformly.
        return []

    reads_with_positions = []
    for _ in range(num_reads):
        start = random.randint(0, last_start)
        reads_with_positions.append((sequence[start:start + read_size], start))

    return reads_with_positions

In [None]:
# Draw the synthetic reads using the notebook-level read_size / num_reads.
synthetic_reads = generate_reads(fasta_sequence, read_size, num_reads)

# Each item printed is a (read, start) tuple as built by generate_reads.
for read in synthetic_reads:
    print(read)

('ACTAAAAAATACAAAAATTAGCTGGGCGTGGTGGGTGCCTGTAATCCCAGCTATTCGGGAGGGTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCAGAGGTTGCAGTGAGCCAAGATCGTGCCACTACACTCCAGCCTGGGCGACAAGAACGAAACTCCGTCTCAAAAAAAAGGGGGGAATCATACATTATGTGCTCATTTTTGTCGGGCTTCTGTCCTTCAATGTACTGTCTGACATTCGTTCATGTTGTATATATCAGTATTTTGCTCCTTTTCATTTAGTATAGTCCATCGATTGTA', 1853)
('CTCTACTGAATGCTTTTAATTTTAATTATTTTACAGTTGGAGTATAGGGCTACCATTTTAGTGCTATTTTCTTTTTTTCTTTGTTAATTTTTGAGACAGGGACTCACACTGTTGCCCAGGCTAGAGTACAATGGCACAATCAAGGCTTACTGCAGCCTCGAACCCCTGGGCTCAAGCAGTCCTCTAGCAGCCTCACGAGTAGCTGGGATTACTCCACCACACCCAGCTAACTATTTTATTTTTTTGTATTGACAGGATCTCACTATGTTGCCCAGGCTGGTCTCAAACTGCTGGCCTCAA', 3001)
('CATCTTTTTTTTTTTTTTTAACCCCAGGGTCATGAAGATATTATCTTACATTTTCTTTTAGGACCTTTATGGTTGTAAGTTTTACAGTAAGGTCCTTGAGCCATTAATTAATTCTTAAAATTAATTGTTTATGGTGTGAGGTGTAGGAGTCAGTCTCTGGTATCTTTCCTGTATGGAAATCCAGTTATTCTGTCTCCACTTGTTGAAATAGGCTTCCTTTCTCTACTGAATGCTTTTAATTTTAATTATTTTACAGTTGGAGTATAGGGCTACCATTTTAGTGCTATTTTCTTTTTTTCT', 2781)
('GCCAAGGCAGGCAGATCACCTGAGCCCAGGAGTTCAAGACCAGCCTGGGTAACATGATGAAACCT

In [None]:
def _split_outside_quotes(line):
    """Split *line* on commas that are not enclosed in double quotes.

    A double quote toggles the "inside quotes" state unless it is directly
    preceded by a backslash. Leading and trailing double quotes are stripped
    from every resulting element.
    """
    elements = []
    current = []
    in_quotes = False

    for char in line:
        if char == '"' and (not current or current[-1] != '\\'):
            in_quotes = not in_quotes
            current.append(char)
        elif char == ',' and not in_quotes:
            elements.append(''.join(current).strip('"'))
            current = []
        else:
            current.append(char)

    # Flush the final element (there is no trailing comma to trigger it).
    elements.append(''.join(current).strip('"'))
    return elements


def read_and_process_csv(file_path):
    """Parse a variant CSV into an object array of string fields.

    Only the first ``csv`` field of each row is used; it is split again on
    commas outside double quotes. Presumably every record in the input is
    stored as one quoted field containing the real comma-separated line --
    verify against the data file.

    Args:
        file_path: Location of the CSV file.

    Returns:
        ``numpy`` array with ``dtype=object`` holding one entry per non-blank
        row, or ``None`` if the file cannot be read or parsed (the error
        message is printed).
    """
    processed_list = []

    try:
        # newline='' is what the csv module expects for correct handling of
        # newlines embedded in quoted fields.
        with open(file_path, 'r', newline='') as file:
            for row in csv.reader(file):
                # A blank line yields an empty row; skip it instead of
                # failing on row[0] and discarding the whole file.
                if not row:
                    continue
                processed_list.append(_split_outside_quotes(row[0]))

    except (OSError, UnicodeError, csv.Error) as e:
        print(f"Error: {e}")
        return None

    return np.array(processed_list, dtype=object)

In [None]:
# Parse the variant CSV; read_and_process_csv returns None if reading fails.
processed_list = read_and_process_csv(csv_file_path)

# The string literal below is a disabled debugging snippet (dumps every row
# and its length); being a bare string, it is never executed.
'''
if processed_list is not None:
    for row in processed_list:
        print(row)
        print(len(row))
else:
    print("Failed to read and process the CSV file.")
'''

'\nif processed_list is not None:\n    for row in processed_list:\n        print(row)\n        print(len(row))\nelse:\n    print("Failed to read and process the CSV file.")\n'

Keep only the SNPs (single-base REF and ALT) from the variant list.

In [None]:
def extract_snps(variants_list):
    """Filter a variant table down to single-nucleotide substitutions.

    A row is kept when both its REF field (column 4) and its ALT field
    (column 5) have length exactly one.

    Args:
        variants_list: Iterable of indexable rows.

    Returns:
        A list with the qualifying rows, in their original order.
    """
    ref_col, alt_col = 4, 5  # REF, ALT

    def is_single_base_change(record):
        # Fetch both fields up front so a short row fails the same way
        # regardless of the REF length.
        ref_allele = record[ref_col]
        alt_allele = record[alt_col]
        return len(ref_allele) == 1 and len(alt_allele) == 1

    return [record for record in variants_list if is_single_base_change(record)]

In [None]:
# Keep only rows whose REF and ALT are single bases. Note that extract_snps
# iterates its argument, so this fails if the CSV step returned None.
SNP_list = extract_snps(processed_list)

# Disabled debugging snippet kept as a bare string; it is never executed.
'''
for SNP in SNP_list:
    print(SNP)
'''

'\nfor SNP in SNP_list:\n    print(SNP)\n'

Apply a variant to a fraction of the reads that cover its position.

In [None]:
def apply_variant(reads_with_positions, variant_pos, ref_base, alt_base, vaf1, fasta_start=0, shuffle_seed=None):
    """Introduce a single-base variant into a fraction of the covering reads.

    Args:
        reads_with_positions: Sequence of ``(read, start_pos)`` tuples, with
            ``start_pos`` being the 0-based offset of the read in the
            reference sequence.
        variant_pos: Position of the variant, in the same coordinate system
            as ``fasta_start``. May be an ``int`` or a numeric string.
        ref_base: Base expected at the variant position; a selected read is
            only altered if it actually carries this base there.
        alt_base: Base written in place of ``ref_base``.
        vaf1: Fraction of the covering reads to modify (number or numeric
            string). The count is truncated towards zero.
        fasta_start: Coordinate of the first base of the reference sequence,
            subtracted from ``variant_pos``.
        shuffle_seed: Optional seed for ``random`` so the choice of reads is
            reproducible.

    Returns:
        A new list of ``(read, start_pos)`` tuples in the input order, with
        the chosen reads carrying ``alt_base``.
    """
    # Values taken from a parsed CSV arrive as strings; convert once so the
    # coverage test and the in-read offset use the same integer.
    variant_offset = int(variant_pos) - int(fasta_start)

    # Track covering reads by index, not by value, so identical
    # (read, start) tuples are counted and selected individually.
    covering = [
        index
        for index, (read, start_pos) in enumerate(reads_with_positions)
        if int(start_pos) <= variant_offset < int(start_pos) + len(read)
    ]
    num_reads_to_modify = int(len(covering) * float(vaf1))

    if shuffle_seed is not None:
        random.seed(shuffle_seed)

    random.shuffle(covering)
    selected = set(covering[:num_reads_to_modify])

    modified_reads = []
    for index, (read, start_pos) in enumerate(reads_with_positions):
        if index in selected:
            relative_pos = variant_offset - int(start_pos)
            # Leave the read untouched if it does not carry the reference
            # base at this position.
            if read[relative_pos] == ref_base:
                read = read[:relative_pos] + alt_base + read[relative_pos + 1:]
        modified_reads.append((read, start_pos))

    return modified_reads

In [None]:
variant_line = 1 # Choose which SNP to apply

# Column indices refer to a parsed CSV row: 4 and 5 are REF and ALT (the same
# indices extract_snps uses). Index 3 is passed as the variant position and
# index -3 as the fraction argument vaf1 -- presumably the POS and VAF
# columns; TODO confirm against the CSV header.
modified_reads = apply_variant(
                          synthetic_reads,
                          SNP_list[variant_line][3],
                          SNP_list[variant_line][4],
                          SNP_list[variant_line][5],
                          SNP_list[variant_line][-3],
                          # NOTE(review): presumably the genomic coordinate of
                          # the first base in the FASTA file -- confirm.
                          fasta_start=7668402,
                          shuffle_seed=2
                          )

Allocate synthetic quality scores.

In [None]:
def allocate_quality_scores(reads, quality_scores_tensor):
    """Attach a slice of quality scores to every read.

    The scores are flattened in row-major order and handed out sequentially:
    the first ``len(read)`` values go to the first read, the next ones to the
    second read, and so on.

    Args:
        reads: Sequence of ``(read, position)`` tuples.
        quality_scores_tensor: Integer scores, either an object exposing
            ``.numpy()`` (e.g. a TensorFlow tensor) or anything
            ``numpy.asarray`` accepts (array, nested list).

    Returns:
        A list of ``(read, position, quality_scores)`` tuples where
        ``quality_scores`` is a list of ``int`` as long as ``read``.

    Raises:
        ValueError: If any score is not an integer, or if the number of
            scores differs from the total number of bases in ``reads``.
    """
    # Accept tensors as well as plain arrays/lists.
    if hasattr(quality_scores_tensor, "numpy"):
        raw_scores = quality_scores_tensor.numpy()
    else:
        raw_scores = quality_scores_tensor
    quality_scores_list = np.asarray(raw_scores).flatten().tolist()

    # Check if all the values are integers
    if not all(isinstance(score, int) for score in quality_scores_list):
        raise ValueError("All quality scores must be integers.")

    # Count the bases of every read rather than assuming they all share the
    # first read's length; this also copes with an empty list of reads.
    total_bases = sum(len(read) for read, _ in reads)
    if total_bases != len(quality_scores_list):
        raise ValueError("The total number of quality scores must match the total number of bases in all reads.")

    reads_with_quality_scores = []
    idx = 0
    for read, position in reads:
        quality_scores = quality_scores_list[idx:idx + len(read)]
        reads_with_quality_scores.append((read, position, quality_scores))
        idx += len(read)

    return reads_with_quality_scores

Generate a FASTQ file from the reads and their quality scores.


In [None]:
def generate_fastq_file(reads_with_quality_scores, output_fastq_path):
    """Write reads and their Phred scores to a FASTQ file.

    Each record gets a header of the form
    ``instrument:run1:flowcell1:lane1:tile1001:pos<position>:<index> 1:N:0:sample1``
    where ``<index>`` is the read's 0-based ordinal in the input, so reads
    that share a start position still get distinct names.

    Args:
        reads_with_quality_scores: Iterable of
            ``(read, position, quality_scores)`` tuples; ``quality_scores``
            must hold one integer per base of ``read``.
        output_fastq_path: Destination file; overwritten if it exists.
    """
    def build_records():
        for index, (read, position, quality_scores) in enumerate(reads_with_quality_scores):
            yield SeqRecord(
                Seq(read),
                # No leading "@": the FASTQ writer adds it itself, so
                # including one here would produce "@@..." header lines.
                id=f"instrument:run1:flowcell1:lane1:tile1001:pos{position}:{index}",
                # The writer emits "<id> <description>" as the title line.
                description="1:N:0:sample1",
                letter_annotations={"phred_quality": quality_scores},
            )

    # Write to FASTQ file
    with open(output_fastq_path, "w") as output_handle:
        SeqIO.write(build_records(), output_handle, "fastq")
In [None]:
tf.random.set_seed(42)
# One score per base of every read. The shape is derived from the notebook
# settings so it stays in step with the reads when num_reads or read_size is
# changed (allocate_quality_scores rejects a mismatched total).
# maxval is exclusive, so scores fall in 0..39.
quality_scores_tensor = tf.random.uniform([num_reads, read_size], minval=0, maxval=40, dtype=tf.int32)

In [None]:
# Pair every (read, position) tuple with its slice of the random scores.
reads_plus_qs = allocate_quality_scores(modified_reads, quality_scores_tensor)

In [None]:
# Relative path: the file is created in the current working directory.
output_fastq_path = "synthetic.fastq"
generate_fastq_file(reads_plus_qs, output_fastq_path)