### Attach sampling dates from a TSV file to FASTA sequences based on ID matching

In [2]:
# Step 1: Build a dictionary that maps each taxon name (the key) to its sampling date.
# Each input line is tab-separated: field 0 is the taxon name, field 1 is the date.
# Date format is YYYY-MM-DD

date_seq_dict = {}

with open("control_dates_envRTprot.txt", 'rU') as f:  # rU allows \n and \r to designate newline
    for line in f:
        # Skip blank lines (e.g. a trailing empty line at the end of the file);
        # they have no second field and would otherwise raise IndexError below.
        if not line.strip():
            continue
        split_line = line.split("\t")
        # Each taxon gets its own inner dict so sequences can be added under other keys later.
        date_seq_dict[split_line[0]] = {'date': split_line[1].strip()}

In [3]:
# Show the taxon names that were loaded, then the full name -> {'date': ...} mapping.
print(list(date_seq_dict))
print(date_seq_dict)

['H0725135', 'T0205131', 'K0829045', 'Y1215137', 'T0526205', 'Pailin_C10346', 'T0527136', 'R0306224', 'O0409117', 'R0201074', 'J0430219', 'R0222042', 'R0221192', 'J0330035', 'H0729036', 'H0728201', 'O0820158', 'O0708154', 'Pailin_C20233', 'H0915199', 'Pailin_C10216', 'R0131317', 'Y0916057', 'J0420156', 'W0530014', 'R0222053', 'K0526031']
{'H0725135': {'date': '1997-07-25'}, 'T0205131': {'date': '2009-02-05'}, 'K0829045': {'date': '2000-08-29'}, 'Y1215137': {'date': '2014-12-15'}, 'T0526205': {'date': '2009-05-26'}, 'Pailin_C10346': {'date': '2009-07-10'}, 'T0527136': {'date': '2009-05-27'}, 'R0306224': {'date': '2007-03-06'}, 'O0409117': {'date': '2004-04-09'}, 'R0201074': {'date': '2007-02-01'}, 'J0430219': {'date': '1999-04-30'}, 'R0222042': {'date': '2007-02-22'}, 'R0221192': {'date': '2007-02-21'}, 'J0330035': {'date': '1999-03-30'}, 'H0729036': {'date': '1997-07-29'}, 'H0728201': {'date': '1997-07-28'}, 'O0820158': {'date': '2004-08-20'}, 'O0708154': {'date': '2004-07-08'}, 'Paili

In [4]:
from Bio import SeqIO

# Add env control sequences to the dictionary based on taxon names.
# The with-block closes the file handle even if parsing raises partway through.
with open('env_controls_aligned.fasta', 'rU') as env_file:
    env_dict = SeqIO.to_dict(SeqIO.parse(env_file, "fasta"))

# Store each upper-cased sequence under the 'env' key of its taxon's entry.
# A key of env_dict that is missing from date_seq_dict raises KeyError here.
for taxon, record in env_dict.items():
    date_seq_dict[taxon]['env'] = record.seq.upper()

In [5]:
# Add prot control sequences to the dictionary based on taxon names.
# The with-block closes the file handle even if parsing raises partway through.
with open('prot_controls_aligned.fasta', "rU") as prot_file:
    prot_dict = SeqIO.to_dict(SeqIO.parse(prot_file, "fasta"))

# Store each upper-cased sequence under the 'prot' key of its taxon's entry.
# A key of prot_dict that is missing from date_seq_dict raises KeyError here.
for taxon, record in prot_dict.items():
    date_seq_dict[taxon]['prot'] = record.seq.upper()

In [6]:
# Add RT control sequences to the dictionary based on taxon names.
# The with-block closes the file handle even if parsing raises partway through.
with open('RT_controls_aligned.fasta', "rU") as RT_file:
    RT_dict = SeqIO.to_dict(SeqIO.parse(RT_file, "fasta"))

# Store each upper-cased sequence under the 'RT' key of its taxon's entry.
# A key of RT_dict that is missing from date_seq_dict raises KeyError here.
for taxon, record in RT_dict.items():
    date_seq_dict[taxon]['RT'] = record.seq.upper()

In [7]:
# Sanity checks on the assembled dictionary, using one taxon as a spot check.

check_taxon = 'R0222053'

print(len(date_seq_dict))  # Should be 27
# Date should match the excel file data for this taxon.
print(date_seq_dict[check_taxon]['date'])
# Sequence lengths should match the alignment lengths in geneious (env, then prot, then RT).
for gene in ('env', 'prot', 'RT'):
    print(len(date_seq_dict[check_taxon][gene]))

27
2007-02-22
915
393
771


In [8]:
# Dated ENV control fasta records: header is >taxon|date|env, followed by the env sequence.
for taxon, entry in date_seq_dict.items():
    header = '|'.join(('>' + taxon, str(entry['date']), 'env'))
    print(header + '\n' + str(entry['env']))

>H0725135|1997-07-25|env
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CAGCTGAACACATCTGTAGAAATTAATTGTACAAGACCCAACAACAATACAAGAAAAAGAATCCGTATCCAGAGAGGACCAGGGAGAGCATTTGTTACAATAGGAAA---AATAGGAAATATGAGACAAGCACATTGTAACATTAGTAGAGCAAAATGGAATAACACTTTAAAACAGATAGCTAGCAAATTAAGAGAACAATTTGGAAATAATAAA------ACAATAATCTTT------AAGCAATCCT---CAGGAGGGGACCCAGAAATTGTAACGCACAGTTTTAATTGTGGAGGGGAATTTTTCTACTGTAATTCAACACAACTGTTTAAT---------AGTACTTGGTTTA------------ATAGTACTTGGAGTACTGAAGGGTCAAATAACACTGA------AGGAAGTGACACAATCACCCTCCCATGCAGAATAAAACAAATTATAAACATGTGGCAGAAAGTAGGAAAAGCAATGTATGCCCCTCCCATCAGTGGACAAATTAGATGTTCATCAAATATTACAGGGCTGCTATTAACAAGAGATGGTGGTAATAGCAACAATGAGTCCGAGATCTTCAGACCTGGAGGAGGAGATATGAGGGACAATTGGAGAAGTGAATTATATAAATATAAAGTAGTAAAAATTGAACCATTAGGAGTAGCACCCACCAAGGCAAAGAGAAGAGTGGTGCAGAGAGAAAAAAGA
>T0205131|2009-02-05|env
------GCGATTCTAAAGTGTAATGCTAAGAATT

In [73]:
# Dated prot control fasta records: header is >taxon|date|prot, followed by the prot sequence.
for taxon, entry in date_seq_dict.items():
    header = '|'.join(('>' + taxon, str(entry['date']), 'prot'))
    print(header + '\n' + str(entry['prot']))

>H0725135|1997-07-25|prot
CCTCAAATCACTCTTTGGCAACGACCCCTTGTCACAGTAAAAATAGGAGGACAGCTAAAAGAAGCTCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGATATAAATTTACCAGGAAAATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAGGTAAGGCAATATGATCAGATACTTATAGAAATTTGTGGAAAAAAGGCTATAGGTACAGTATTGGTAGGACCTACACCTGTCRACATAATTGGACGAAATATGTTGACTCAGATTGGTTGTACTTTAAATTTCCCAATTAGTCCTATTAACACTGTACCAGTAACATTAAAGCCAGGAATGGATGGACCAAAAGTTAAACAGTGGCCATTGACAGAAGAAAAA------
>T0205131|2009-02-05|prot
CCTCAAATCACTCTTTGGCAACGACCCCTTGTCACARTAAARRTAGGRGGACAGCTRAAAGAAGCTCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGATATAAATTTGCCAGGRAAATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATTAAGGTAAGGCAATATGATCAGRTACTTATAGAAATTTGTGGRAAAAAAGCTATAGGTACAGTRTTAGTAGGACCTACACCTGTCAACATAATTGGACGAAATATGTTGACYCARATTGGTTGTACTTTAAATTTCCCAATTAGTCCTATTGACACTGTACCAGTAACATTAAAGCCAGGAATGGATGGACCAAAGGTTAAACAGTGGCCATTGACAGAAGAA---------
>K0829045|2000-08-29|prot
CCTCAAATCACTCTTTGGCAACGACCCCTTGTCACAATAAAAATAGGAGGACAGCTGAAAGAAGCTCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGATATAAATTTGCCAGGAAAATGGAAACCAAA

In [74]:
# Dated RT control fasta records: header is >taxon|date|RT, followed by the RT sequence.
for taxon, entry in date_seq_dict.items():
    header = '|'.join(('>' + taxon, str(entry['date']), 'RT'))
    print(header + '\n' + str(entry['RT']))

>H0725135|1997-07-25|RT
CCCATTAGTCCTATTAACACTGTACCAGTAACATTAAAGCCAGGAATGGATGGACCAAAAGTTAAACAGTGGCCATTGACAGAAGAAAAAATAAAAGCATTAACAGAAATTTGTAAAGAGATGGAAAAGGAAGGAAAAATTTCAAAAATTGGGCCTGAAAATCCATACAATACTCCAGTATTTGCTATAAAGAAAAAGGACAGCACCAAATGGAGGAAATTAGTGGATTTCAGAGAGCTCAATAAAAGAACTCAGGACTTTTGGGAAGTTCAATTAGGAATACCGCATCCAGCAGGTTTAAAAARGAAAAAATCAATAACAGTACTAGATGTGGGAGATGCATATTTTTCAGTTCCTTTAGATGAAAGCTTTAGAAAGTATACTGCATTCACCATACCTAGTATAAATAATGAGACACCAGGAATTAGATATCAGTACAATGTGCTGCCACAGGGATGGAAAGGATCACCGGCAATATTCCAGTGYAGCATGACAAAAATCTTAGAGCCCTTTAGAATAAAAAATCCAGAAATGGTTATCTATCAATACATGGATGACTTGTATGTAGGATCTGATTTAGAAATAGGGCAGCACAGAATAAAAATAGAGGAGCTAAGAGCTCATYTATTGAGCTGGGGATTTACTACACCAGACAAAAAGCATCAGAAGGATCCTCCATTCCTTTGGATGGGATATGAACTCCATCCTGACAGATGGACAGTCCAGCCTATAGAACTGCCWGAAAAAGACAGCTGGACTGTCAAT------
>T0205131|2009-02-05|RT
---------------------------------------------ATGGATGGACCAAAGGTTAAACAGTGGCCATTGACAGAAGAAAAAATAAAAGCATTAACAGAAATTTGTAAAGAGATGGAAAAGGAAGGAAAAATYTCAAAAATTGGGCCTGAAAATCCATACAATACTCCAGTA

In [10]:
# Also produce fasta records with all the sequences concatenated together, as prot+RT+env.
# This is genomic ordering 5' to 3'
for taxon, entry in date_seq_dict.items():
    header = '|'.join(('>' + taxon, str(entry['date']), 'protRTenv'))
    # Join the three gene sequences in prot, RT, env order with nothing between them.
    concatenated = ''.join(str(entry[gene]) for gene in ('prot', 'RT', 'env'))
    print(header + '\n' + concatenated)

>H0725135|1997-07-25|protRTenv
CCTCAAATCACTCTTTGGCAACGACCCCTTGTCACAGTAAAAATAGGAGGACAGCTAAAAGAAGCTCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGATATAAATTTACCAGGAAAATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAGGTAAGGCAATATGATCAGATACTTATAGAAATTTGTGGAAAAAAGGCTATAGGTACAGTATTGGTAGGACCTACACCTGTCRACATAATTGGACGAAATATGTTGACTCAGATTGGTTGTACTTTAAATTTCCCAATTAGTCCTATTAACACTGTACCAGTAACATTAAAGCCAGGAATGGATGGACCAAAAGTTAAACAGTGGCCATTGACAGAAGAAAAA------CCCATTAGTCCTATTAACACTGTACCAGTAACATTAAAGCCAGGAATGGATGGACCAAAAGTTAAACAGTGGCCATTGACAGAAGAAAAAATAAAAGCATTAACAGAAATTTGTAAAGAGATGGAAAAGGAAGGAAAAATTTCAAAAATTGGGCCTGAAAATCCATACAATACTCCAGTATTTGCTATAAAGAAAAAGGACAGCACCAAATGGAGGAAATTAGTGGATTTCAGAGAGCTCAATAAAAGAACTCAGGACTTTTGGGAAGTTCAATTAGGAATACCGCATCCAGCAGGTTTAAAAARGAAAAAATCAATAACAGTACTAGATGTGGGAGATGCATATTTTTCAGTTCCTTTAGATGAAAGCTTTAGAAAGTATACTGCATTCACCATACCTAGTATAAATAATGAGACACCAGGAATTAGATATCAGTACAATGTGCTGCCACAGGGATGGAAAGGATCACCGGCAATATTCCAGTGYAGCATGACAAAAATCTTAGAGCCCTTTAGAATAAAAAATCCAGAAATGGTTATCTATCAATACATGGATGACTTGTATGTAGGATCTGAT