# Preprocess Input Transcripts

We want to convert the FASTA file of transcript sequences, together with the transcript coordinates, into a Python-friendly file format (a single CSV file).

In [1]:
from Bio import SeqIO
import pandas as pd

In [32]:
# Root directory of the annotation release; every other path is built from it.
DATA_PATH = '/mnt/geofflab/SNP_barcoding/Lvar_annotations_v3_Jan2021/'

# Input: tab-separated transcript coordinates.
LOCATIONS_PATH = f'{DATA_PATH}annotations/transcript_coords.txt'
# Input: transcript sequences in FASTA format.
SEQUENCES_PATH = f'{DATA_PATH}Lvar.braker.pasa.transcripts.fasta'
# Output: combined coordinates + sequences table.
SAVE_PATH = f'{DATA_PATH}transcripts.csv'

In [8]:
# Read transcript coords
# The file is parsed as tab-separated with no header row, so the column names
# are supplied here; the column at position 4 ('gene') becomes the row index.
coord_columns = ['chromosome', 'start', 'end', 'direction', 'gene']
transcipt_coords = pd.read_csv(
    LOCATIONS_PATH,
    sep='\t',
    header=None,
    names=coord_columns,
    index_col=4,
)

In [27]:
# Read sequences
# Collect, in file order, the ID and sequence of every FASTA record whose ID
# is present in the index of the coordinates table; other records are skipped.
transcripts = []
sequences = []

# Open the FASTA file in a context manager so the handle is closed as soon as
# parsing finishes (a bare open() passed to SeqIO.parse is never closed).
with open(SEQUENCES_PATH) as fasta_handle:
    fasta_sequences = SeqIO.parse(fasta_handle, 'fasta')

    for fasta in fasta_sequences:
        if fasta.id in transcipt_coords.index:
            transcripts.append(fasta.id)
            sequences.append(str(fasta.seq))

# Add the sequence data to the coords file
# Only the rows whose index label was found in the FASTA file are assigned.
transcipt_coords.loc[transcripts, 'sequence'] = sequences

In [30]:
# Convert chromosomes into ints for ease of use
# Each label is split on the literal 'chr' and the piece after it is parsed
# as an integer, so a label such as 'chr7' becomes 7.
chromosome_labels = transcipt_coords['chromosome']
transcipt_coords['chromosome'] = chromosome_labels.map(
    lambda label: int(label.split('chr')[1])
)

In [35]:
# Save the final result
# Write the combined coordinates + sequence table to SAVE_PATH as CSV.
transcipt_coords.to_csv(path_or_buf=SAVE_PATH)