# Preprocess Input Transcripts

We want to convert the FASTA files that contain the transcript information into a Python-friendly file format.

In [1]:
from Bio import SeqIO
import pandas as pd
import anndata

In [2]:
# Root folder of the project data; every other path is derived from it.
DATA_FOLDER = '/mnt/geofflab/SNP_barcoding/'

# Annotation release folder holding both inputs and the generated output.
DATA_PATH = f'{DATA_FOLDER}Lvar_annotations_v3_Jan2021/'

# Inputs
LOCATIONS_PATH = f'{DATA_PATH}annotations/transcript_coords.txt'
SEQUENCES_PATH = f'{DATA_PATH}Lvar.braker.pasa.transcripts.fasta'
COUNTS_PATH = f'{DATA_FOLDER}LV_counts_10hpf.csv'

# Output
SAVE_PATH = f'{DATA_PATH}transcripts.csv'

In [3]:
# Load the tab-separated transcript coordinate table. The file is read without a
# header row, so the column names are supplied here; the fifth column ('gene')
# becomes the index.
coord_columns = ['chromosome', 'start', 'end', 'direction', 'gene']
transcipt_coords = pd.read_csv(
    LOCATIONS_PATH,
    sep='\t',
    header=None,
    names=coord_columns,
    index_col=4,
)

In [4]:
# Read sequences, keeping only the records whose id appears in the coords index
transcripts = []
sequences = []

# Open the FASTA file through a context manager so the handle is closed as soon
# as parsing finishes (the parser is lazy, so it must be consumed inside the block).
with open(SEQUENCES_PATH) as fasta_handle:
    fasta_sequences = SeqIO.parse(fasta_handle, 'fasta')

    for fasta in fasta_sequences:
        if fasta.id in transcipt_coords.index:
            transcripts.append(fasta.id)
            sequences.append(str(fasta.seq))

# Add the sequence data to the coords table; only the rows matched above are assigned
transcipt_coords.loc[transcripts, 'sequence'] = sequences

In [5]:
# Convert chromosomes into ints for ease of use.
# The value is passed through str() and the last piece after splitting on 'chr'
# is taken, so 'chr12' -> 12 and an already-converted 12 -> 12. This keeps the
# cell safe to re-run: the column is overwritten in place, and a plain
# x.split(...) would fail on the ints produced by a previous run.
transcipt_coords['chromosome'] = transcipt_coords['chromosome'].apply(
    lambda x: int(str(x).split('chr')[-1])
)

In [6]:
# Read counts and restrict our transcripts to only those we have data for
counts = pd.read_csv(COUNTS_PATH, index_col=0)

# Build the overlap in the order the ids appear in the coords table. Going through
# list(set(...)) would give an arbitrary order that can change between kernel
# sessions, making the row order of the result non-reproducible.
counts_ids = set(counts.index)
overlap = [gene for gene in transcipt_coords.index.unique() if gene in counts_ids]

# The filtered copy is stored as `transcript_coords`; the unfiltered
# `transcipt_coords` table is left untouched.
transcript_coords = transcipt_coords.loc[overlap, :].copy()

In [8]:
# Display the counts table loaded from COUNTS_PATH
counts

Unnamed: 0_level_0,Name,AAACCCAGTAGAGACC.1,AAACCCAGTGAGAACC.1,AAACCCAGTGCCTGCA.1,AAACCCAGTGCTGTCG.1,AAACCCAGTTTCGTGA.1,AAACCCATCTGCCCTA.1,AAACGAAAGATCGGTG.1,AAACGAACAGCGTATT.1,AAACGAAGTTGTTGTG.1,...,TTTGGTTCAATGGCCC.1,TTTGGTTGTCCTTGTC.1,TTTGGTTGTGTGGACA.1,TTTGGTTTCAAGTGTC.1,TTTGGTTTCCGATTAG.1,TTTGTTGAGAATACAC.1,TTTGTTGGTACTGACT.1,TTTGTTGGTCTCCTGT.1,TTTGTTGGTTCTCCTG.1,TTTGTTGTCGACGTCG.1
Lv_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
LVA_1.t1,LVA_1.t1:Sp-Unk_5,1,1,1,1,3,1,1,1,1,...,2,2,2,1,3,1,1,5,5,3
LVA_10.t1,LVA_10.t1:none,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
LVA_1000.t1,LVA_1000.t1:Sp-Kctd1_2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
LVA_10002.t1,LVA_10002.t1:Sp-PolppL_64,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
LVA_10004.t1,LVA_10004.t1:Sp-Hypp_2701,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LVA_m26308.t1,LVA_m26308.t1:Sp-Actinin,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
LVA_m28240.t1,LVA_m28240.t1:Sp-PolypL_2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
LVA_m29644.t1,LVA_m29644.t1:Sp-Endrvt22,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
LVA_m36108.t1,LVA_m36108.t1:Sp-Hypp_2410,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Save the final result.
# Write `transcript_coords` (the table restricted to transcripts that have counts),
# not the unfiltered `transcipt_coords`, so the filtering step is not discarded.
transcript_coords.to_csv(SAVE_PATH)