In [21]:
# Accumulate, per transcript ID, the sum of the values found in the fifth
# column of the 5'-UTR table (a transcript may appear on several rows).
# NOTE(review): the fifth column is presumably the length of one 5'-UTR
# segment, and the table is presumably single-space delimited - confirm
# against the file that produced it.
transcript_lengths = {}

utr_table_path = '/workdir/zl843/translation-start-site/Arabidopsis_thaliana.TAIR10.57.gff3.5-UTR'
with open(utr_table_path, 'r') as bed_file:
    for row in bed_file:
        fields = row.strip().split(' ')
        transcript_id, segment_length = fields[0], int(fields[4])
        # First sighting starts from 0; later rows add to the running total.
        transcript_lengths[transcript_id] = transcript_lengths.get(transcript_id, 0) + segment_length

# Map each transcript ID (first whitespace-delimited token of the FASTA
# header) to its full sequence, with all sequence lines concatenated.
transcript_sequences = {}

# Open the FASTA file for reading
with open('/workdir/zl843/translation-start-site/Arabidopsis_thaliana.TAIR10.cdna.all.fa', 'r') as fasta_file:
    current_transcript_id = None
    # Collect the lines of the current record and join them once per record
    # instead of growing a string with repeated concatenation.
    sequence_chunks = []

    for line in fasta_file:
        if line.startswith('>'):
            # A new header closes the previous record, if there was one.
            if current_transcript_id is not None:
                transcript_sequences[current_transcript_id] = ''.join(sequence_chunks)

            # Keep only the ID: drop the leading '>' and any description text.
            current_transcript_id = line[1:].split()[0]
            sequence_chunks = []
        else:
            sequence_chunks.append(line.strip())

    # Store the last record. The guard avoids inserting a bogus `None` key
    # when the file is empty or contains no header line at all.
    if current_transcript_id is not None:
        transcript_sequences[current_transcript_id] = ''.join(sequence_chunks)

**Save the sequences as a FASTA file**

In [27]:
# Write one FASTA record per transcript that has both a summed length and a
# sequence: the record holds the first `total_length` bases of the sequence.
# The file is written to the same absolute location that the ATG-search step
# reads it from, so the pipeline does not depend on the kernel's current
# working directory.
with open('/workdir/zl843/translation-start-site/UTR_output.fasta', 'w') as output_file:
    for transcript_id, total_length in transcript_lengths.items():
        # Transcripts absent from the FASTA are skipped silently.
        if transcript_id in transcript_sequences:
            # Slicing never raises: if total_length exceeds the sequence
            # length, the whole sequence is written.
            sequence = transcript_sequences[transcript_id][:total_length]
            output_file.write(f">{transcript_id}\n{sequence}\n")

**Search for ATG in the 5'-UTR sequences**

In [37]:
# Find "ATG" triplets in the codon frame anchored at the end of a sequence
def find_atg_in_sequence(sequence, sequence_id):
    """Locate "ATG" codons in the reading frame that ends at the last base.

    The sequence is walked in steps of three starting from its final three
    bases and moving toward the beginning, so only triplets in that frame are
    examined. Matching is case-sensitive.

    Args:
        sequence: Nucleotide string to scan.
        sequence_id: Identifier returned alongside the hits.

    Returns:
        ``(sequence_id, positions)`` where ``positions`` lists the 0-based
        start index of every matching triplet, ordered from the one nearest
        the end of the sequence to the one nearest the beginning; ``None``
        when there is no match (including sequences shorter than three bases).
    """
    # Start indices of the in-frame triplets: len-3, len-6, ... down to >= 0.
    codon_starts = range(len(sequence) - 3, -1, -3)
    hits = [start for start in codon_starts if sequence[start:start + 3] == "ATG"]
    return (sequence_id, hits) if hits else None

# Read the 5'-UTR FASTA file record by record and collect, for every record
# with at least one hit, the tuple returned by find_atg_in_sequence.
fasta_file = "/workdir/zl843/translation-start-site/UTR_output.fasta"
results = []

with open(fasta_file, "r") as file:
    sequence_id = None
    sequence = ""
    for line in file:
        line = line.strip()
        if line.startswith(">"):
            # A new header closes the previous record, so scan it now.
            if sequence_id is not None:
                result = find_atg_in_sequence(sequence, sequence_id)
                if result:
                    results.append(result)
            # The whole header after '>' is used as the record ID.
            sequence_id = line[1:]
            sequence = ""
        else:
            sequence += line

# The last record has no following header to trigger its scan.
if sequence_id is not None:
    result = find_atg_in_sequence(sequence, sequence_id)
    if result:
        results.append(result)

# Write one "<id>\t<position>" line per hit. The file goes to the same
# absolute location that the coordinate-update step reads it from, so the
# pipeline does not depend on the kernel's current working directory.
output_file = "/workdir/zl843/translation-start-site/atg_positions.txt"
with open(output_file, "w") as output:
    for sequence_id, positions in results:
        for position in positions:
            output.write(f"{sequence_id}\t{position}\n")


**Output the ATG start site in the 5'-UTR**

In [56]:
# Load the mRNA table: five tab-separated columns per row, in the order
# chrom, start, end, strand, transcript ID. Rows are keyed by transcript ID.
bed_file = "/workdir/zl843/translation-start-site/Arabidopsis_thaliana.TAIR10.57.gff3.mRNA.bed"
transcripts = {}
with open(bed_file, 'r') as f:
    for row in f:
        chrom, start, end, strand, transcript_id = row.strip().split('\t')
        transcripts[transcript_id] = (chrom, int(start), int(end), strand)

# For every (transcript ID, position) pair, move one boundary of the
# transcript interval: the start on '+' strand, the end on any other strand.
# NOTE(review): this arithmetic treats the transcript as one contiguous
# genomic span; presumably offsets would be wrong where the 5'-UTR crosses an
# intron - confirm. The "- 1" on the '+' branch presumably converts a 1-based
# start to a 0-based one - verify against the coordinate system of the table.
length_file = "/workdir/zl843/translation-start-site/atg_positions.txt"
output = []
with open(length_file, 'r') as f:
    for row in f:
        transcript_id, position_text = row.strip().split('\t')
        position = int(position_text)
        # Raises KeyError if the transcript is missing from the mRNA table.
        chrom, start, end, strand = transcripts[transcript_id]

        if strand == '+':
            start += position - 1
        else:
            end -= position

        output.append((chrom, start, end, transcript_id, ".", strand))

# Emit the adjusted intervals as six tab-separated columns per line.
output_file = "/workdir/zl843/translation-start-site/ATG_in_5-UTR.bed"
with open(output_file, 'w') as f:
    for entry in output:
        f.write('\t'.join(str(field) for field in entry) + '\n')

print(f"Updated data written to {output_file}")


Updated data written to /workdir/zl843/translation-start-site/ATG_in_5-UTR.bed
