# Parse HCV dates from text files into fasta headers

In [68]:
# Step 1: map each 1b control taxon name (the dict key) to its sampling date.
# Dates are kept as text in YYYY-MM-DD format.
#
# Note: the FASTA files used alongside these dates are already aligned.

hcv1b_dict = {}

# 'rU' opens the file so that both \n and \r are treated as line endings.
with open("ControlDates_HCV_1b.txt", 'rU') as dates_file:
    for row in dates_file:
        columns = row.split("\t")
        taxon_name = columns[1]              # second tab-separated column
        sampling_date = columns[2].strip()   # third column, surrounding whitespace removed
        hcv1b_dict[taxon_name] = {'date': sampling_date}

In [69]:
# Dump the full taxon -> {'date': ...} mapping for a visual check.
print(hcv1b_dict)

{'T0507223-NS5B-C': {'date': '2009-05-07'}, 'R000006': {'date': '2003-01-15'}, 'T0610213-NS5B-C': {'date': '2009-06-10'}, 'T0601149-NS5B-C': {'date': '2009-06-01'}, 'Y1018062-Ns5b': {'date': '2014-10-18'}, 'R000028': {'date': '2003-01-10'}, 'Y1125155-Ns5b': {'date': '2014-11-25'}, 'R00010': {'date': '2003-01-17'}, 'T0406156-NS5B-C': {'date': '2009-04-06'}, 'X0115890-Ns5b': {'date': '2013-01-15'}, 'W0629891-Ns5b': {'date': '2012-06-29'}, 'R001291': {'date': '2003-03-21'}, 'T0430212-NS5B-C': {'date': '2009-04-30'}, 'Y0314061-Ns5b': {'date': '2014-03-14'}, 'T0507219-NS5B-C': {'date': '2009-05-07'}, 'R004855': {'date': '2002-10-02'}, 'T0423048-N-NS5B': {'date': '2009-04-23'}, 'T0331074-NS5B-H': {'date': '2009-03-31'}, 'R000887': {'date': '2003-03-01'}, 'Y1024164-Ns5b': {'date': '2014-10-24'}, 'W1207890-Ns5b': {'date': '2012-12-07'}, 'X021489-Ns5b': {'date': '2013-02-14'}, 'T0602266-NS5B-H': {'date': '2009-06-02'}, 'Y0321083-Ns5b': {'date': '2014-03-21'}, 'T0617310-NS5B-H': {'date': '2009-0

In [70]:
from Bio import SeqIO

# Attach each 1b control sequence to the entry of the taxon with the same name.
# The FASTA record ids are used directly as keys into hcv1b_dict, so a record
# whose id has no date entry raises KeyError.
#
# The file is read inside a `with` block so the handle is closed even when
# parsing fails; the records are fully loaded into a dict before it closes.
with open('Controls_HCV_1b_ALN.fasta', 'rU') as HCV1b_controls:
    HCV1b_controls_dict = SeqIO.to_dict(SeqIO.parse(HCV1b_controls, "fasta"))

for key, record in HCV1b_controls_dict.items():
    # Sequences are stored upper-cased under the '1b' field.
    hcv1b_dict[key]['1b'] = record.seq.upper()

In [71]:
# Spot-check one control taxon: its date, its sequence and the sequence length.
hcv1b_sample = hcv1b_dict['T0507223-NS5B-C']
print(hcv1b_sample['date'])
print(hcv1b_sample['1b'])
print(len(hcv1b_sample['1b']))

2009-05-07
GAAAAGCCCTATGGGCTTCTCGTATGACACTCGCTG--CTTTGACTCAACAGTCACTGAGAGCGACATCCGTGTTGAGGAGTCAATCTACCAATGTTGTGACTTGGCCCCCGAAGCCAGACAGGCCATAA-AGTCGCTCACAGAGCGGCTCTACATCGGGGGTCCCCTGACTAATTCAAAAGGGCAGAACTGCGGTTATCGCCGGTGCCGCGCGAGCGGCGTGCTGACGACTAGCTGCGGCAATACCCTCACATGCTACTTGAAAGCCACTGCGGCCTGTCGAGCTGCAAAGCTCCAGGACTGCACGATGCTCGTGAACGGAGACGACCTTGTCGTTATCTGCGA-AAGCGCGGGAACCCAGGAGGATGCGGCGAGCCTACGAGTCTTCACGGAGGCTATGACTAGGTACTCTGCCCCCCCCGGGGACCCGCCCCAACCAGAATACGAC
449


In [83]:
# Map each 6e control taxon name (the dict key) to its sampling date.
# Dates are kept as text in YYYY-MM-DD format.

hcv6e_dict = {}

# 'rU' opens the file so that both \n and \r are treated as line endings.
with open("ControlDates_HCV_6e.txt", 'rU') as dates_file:
    for row in dates_file:
        columns = row.split("\t")
        taxon_name = columns[1]              # second tab-separated column
        sampling_date = columns[2].strip()   # third column, surrounding whitespace removed
        hcv6e_dict[taxon_name] = {'date': sampling_date}

In [84]:
# Dump the full taxon -> {'date': ...} mapping for a visual check.
print(hcv6e_dict)

{'W0731890-Ns5b': {'date': '2012-07-31'}, 'T0424331-NS5B-H': {'date': '2009-04-24'}, 'E002159': {'date': '2002-12-27'}, 'Y0410027-Ns5b': {'date': '2014-04-10'}, 'E000269': {'date': '2003-01-25'}, 'R006424': {'date': '2002-12-24'}, 'R006232': {'date': '2002-12-20'}, 'R004936': {'date': '2002-10-11'}, 'R006235': {'date': '2002-12-17'}, 'T0408250-NS5B-H': {'date': '2009-04-08'}, 'E002278': {'date': '2003-01-04'}, 'X0423890-Ns5b': {'date': '2013-04-23'}, 'R005898': {'date': '2002-11-29'}, 'T0529154-NS5B-H': {'date': '2009-05-29'}, 'T0519292-NS5B-H': {'date': '2009-05-19'}, 'T0331017-NS5B-H': {'date': '2009-03-31'}, 'V1118916-Ns5b': {'date': '2011-11-18'}, 'R000032': {'date': '2003-01-15'}, 'R006192': {'date': '2002-12-13'}, 'R006427': {'date': '2002-12-24'}}


In [85]:
# Attach each 6e control sequence to the entry of the taxon with the same name.
#
# The file is read inside a `with` block so the handle is closed even when
# parsing fails; the records are fully loaded into a dict before it closes.
with open('Controls_HCV_6e_ALN.fasta', 'rU') as HCV6e_controls:
    HCV6e_controls_dict = SeqIO.to_dict(SeqIO.parse(HCV6e_controls, "fasta"))

for key, record in HCV6e_controls_dict.items():
    # FASTA records whose id has no date entry are skipped rather than added.
    # Membership is tested on the dict itself (hash lookup) instead of on the
    # list returned by .keys().
    if key in hcv6e_dict:
        # Sequences are stored upper-cased under the '6e' field.
        hcv6e_dict[key]['6e'] = record.seq.upper()

In [86]:
# Spot-check one control taxon: its date, its sequence and the sequence length.
hcv6e_sample = hcv6e_dict['W0731890-Ns5b']
print(hcv6e_sample['date'])
print(hcv6e_sample['6e'])
print(len(hcv6e_sample['6e']))

2012-07-31
--------------ACACAAGTCACAGAGCGCGACATTCAAACTGAACATTCCATCTACCAGTGCTGCCAGTTGGAGCCGGTTGCACGGAAGGCCATCACTTCTCTCACTGAYCGACTGTATTGYGGTGGGCCCATGTTTAACTCGAAAGGGCAAGCATGCGGAACTCGCAGATGCAGGGCCAGTGGTGTCTTGACTACCAGCTTGGGCAATACCCTGACATGCTACCTGAAAGCACAGGCYGCRTGTAGRGCTGCCGGGCTCAAGAAYTTTGACATGTTGGTCTGCGGAGACGATCTCGTCATTATTTCAGAGAGTTTGGGGGTCTCGGAGGACGCTAGTGCACTGCRAGGTTYCACRGAARCCATGACWAGAGA
376


In [51]:
# Build the date/sequence dictionaries for the Roka isolates.
# First the 1b set: taxon name (the dict key) -> sampling date.

roka_hcv1b_dict = {}

# 'rU' opens the file so that both \n and \r are treated as line endings.
with open("RokaDates_HCV_1b.txt", 'rU') as dates_file:
    for row in dates_file:
        columns = row.split("\t")
        taxon_name = columns[1]              # second tab-separated column
        sampling_date = columns[2].strip()   # third column, surrounding whitespace removed
        roka_hcv1b_dict[taxon_name] = {'date': sampling_date}

In [52]:
# Dump the full taxon -> {'date': ...} mapping for a visual check.
print(roka_hcv1b_dict)

{'NCHADS137': {'date': '2014-12-17'}, 'NCHADS229': {'date': '2015-01-14'}, 'NCHADS232': {'date': '2015-01-13'}, 'NCHADS117': {'date': '2014-12-17'}, 'NCHADS115': {'date': '2014-12-17'}, 'NCHADS110': {'date': '2014-12-15'}, 'NCHADS152': {'date': '2014-12-18'}, 'NCHADS207': {'date': '2014-12-26'}, 'NCHADS055': {'date': '2014-12-15'}, 'NCHADS054': {'date': '2014-12-15'}, 'NCHADS053': {'date': '2014-12-15'}, 'NCHADS074': {'date': '2014-12-15'}, 'NCHADS051': {'date': '2014-12-15'}, 'NCHADS050': {'date': '2014-12-15'}, 'NCHADS159': {'date': '2014-12-18'}, 'NCHADS196': {'date': '2014-12-23'}, 'NCHADS213': {'date': '2015-01-06'}, 'NCHADS192': {'date': '2014-12-22'}, 'NCHADS235': {'date': '2015-01-08'}, 'NCHADS191': {'date': '2014-12-22'}, 'NCHADS206': {'date': '2014-12-26'}, 'NCHADS018': {'date': '2014-12-16'}, 'NCHADS165': {'date': '2014-12-19'}, 'NCHADS121': {'date': '2014-12-17'}, 'NCHADS072': {'date': '2014-12-15'}, 'NCHADS105': {'date': '2014-12-15'}, 'NCHADS156': {'date': '2014-12-18'}, 

In [53]:
# Attach each Roka 1b sequence to the entry of the taxon with the same name.
# The FASTA record ids are used directly as keys into roka_hcv1b_dict, so a
# record whose id has no date entry raises KeyError.
#
# The file is read inside a `with` block so the handle is closed even when
# parsing fails; the records are fully loaded into a dict before it closes.
with open('Roka_HCV_1b_ALN.fasta', 'rU') as HCV1b_roka:
    HCV1b_roka_dict = SeqIO.to_dict(SeqIO.parse(HCV1b_roka, "fasta"))

for key, record in HCV1b_roka_dict.items():
    # Sequences are stored upper-cased under the '1b' field.
    roka_hcv1b_dict[key]['1b'] = record.seq.upper()

In [60]:
# Spot-check one Roka taxon: its date, its sequence and the sequence length.
roka_hcv1b_sample = roka_hcv1b_dict['NCHADS137']
print(roka_hcv1b_sample['date'])
print(roka_hcv1b_sample['1b'])
print(len(roka_hcv1b_sample['1b']))

2014-12-17
------------CTCAACAGTCACTGAGAACGATATCCGTGTTGAGGAGTCAATCTACCAATGTTGTGACTTGGCCCCCGAGGCCAGACAGGCTATAAGGTCGCTCACCGAGCGGCTTTATATTGGGGGCCCCCTGACTAATTCAAAAGGGCAGAACTGCGGCTATCGCCGRTGCCGCGCCAGCGGCGTGCTGACGACTAGCTGTGGTAATACCCTTACATGTTACTTGAAGGCCTCTGCAGCCTGTCGAGCTGCAAAGCTCCAGGACTGCACGATGCTCGTGTGCGGAGACGACCTCGTCGTTATCTGTGAAAGTGCAGGAACCCAGGAGGACGCGGCGAGCCTACGAGTCTTCACGGA---
363


In [55]:
# Same again for the Roka 6e set: taxon name (the dict key) -> sampling date.
roka_hcv6e_dict = {}

# 'rU' opens the file so that both \n and \r are treated as line endings.
with open("RokaDates_HCV_6e.txt", 'rU') as dates_file:
    for row in dates_file:
        columns = row.split("\t")
        taxon_name = columns[1]              # second tab-separated column
        sampling_date = columns[2].strip()   # third column, surrounding whitespace removed
        roka_hcv6e_dict[taxon_name] = {'date': sampling_date}

In [56]:
# Dump the full taxon -> {'date': ...} mapping for a visual check.
print(roka_hcv6e_dict)

{'NCHADS087': {'date': '2014-12-15'}, 'NCHADS178': {'date': '2014-12-22'}, 'NCHADS224': {'date': '2014-12-31'}, 'NCHADS016': {'date': '2014-12-16'}, 'NCHADS014': {'date': '2014-12-16'}, 'NCHADS170': {'date': '2014-12-19'}, 'NCHADS034': {'date': '2014-12-16'}, 'NCHADS019': {'date': '2014-12-16'}, 'NCHADS209': {'date': '2014-12-26'}, 'NCHADS024': {'date': '2014-12-16'}, 'NCHADS222': {'date': '2014-12-29'}, 'NCHADS223': {'date': '2014-12-29'}, 'NCHADS071': {'date': '2014-12-15'}, 'NCHADS070': {'date': '2014-12-15'}, 'NCHADS150': {'date': '2014-12-18'}, 'NCHADS128': {'date': '2014-12-17'}, 'NCHADS052': {'date': '2014-12-15'}, 'NCHADS079': {'date': '2014-12-15'}, 'NCHADS008': {'date': '2014-12-16'}, 'NCHADS158': {'date': '2014-12-18'}, 'NCHADS009': {'date': '2014-12-16'}, 'NCHADS194': {'date': '2014-12-22'}, 'NCHADS195': {'date': '2014-12-22'}, 'NCHADS234': {'date': '2015-01-08'}, 'NCHADS221': {'date': '2014-12-29'}, 'NCHADS204': {'date': '2014-12-24'}, 'NCHADS212': {'date': '2015-01-06'}, 

In [57]:
# Attach each Roka 6e sequence to the entry of the taxon with the same name.
# The FASTA record ids are used directly as keys into roka_hcv6e_dict, so a
# record whose id has no date entry raises KeyError.
#
# The file is read inside a `with` block so the handle is closed even when
# parsing fails; the records are fully loaded into a dict before it closes.
with open('Roka_HCV_6e_ALN.fasta', 'rU') as HCV6e_roka:
    HCV6e_roka_dict = SeqIO.to_dict(SeqIO.parse(HCV6e_roka, "fasta"))

for key, record in HCV6e_roka_dict.items():
    # Sequences are stored upper-cased under the '6e' field.
    roka_hcv6e_dict[key]['6e'] = record.seq.upper()

In [59]:
# Spot-check one Roka taxon: its date, its sequence and the sequence length.
roka_hcv6e_sample = roka_hcv6e_dict['NCHADS087']
print(roka_hcv6e_sample['date'])
print(roka_hcv6e_sample['6e'])
print(len(roka_hcv6e_sample['6e']))

2014-12-15
------------CTGTTTTGACTCAACTGTCACAGAGCGCGACATTCAGACAGAACGCGACATCTATCAGTGCTGCCAGTTAGAGCCCGCAGCACGGAAAGCCATCACATCGCTCACTGACCGACTGTACTGTGGCGGCCCCATGTTTAACTCTAAGGGTCAGGCATGTGGGTACCGCAGATGCAGGGCCAGCGGCGTGTTAACCACCAGCCTAGGCAACACACTGACTTGCTACCTGAAAGCTCAGGCGGCGTGCAAGGCCGCTGGGCTGAGGGACTTTGACATGTTGGTCTGCGGAGACGATCTTGTCGTTATTTCGGAGAGTGTGGGGGTTTCGGAGGATGCTAGTGCGCTGAGAGCTTTCACGGA------------
381


In [72]:
# Print the 1b controls as FASTA, with headers of the form '>name|date|1b'.
for taxon, entry in hcv1b_dict.items():
    fasta_header = '|'.join(('>' + taxon, str(entry['date']), '1b'))
    print(fasta_header + '\n' + str(entry['1b']))

>T0507223-NS5B-C|2009-05-07|1b
GAAAAGCCCTATGGGCTTCTCGTATGACACTCGCTG--CTTTGACTCAACAGTCACTGAGAGCGACATCCGTGTTGAGGAGTCAATCTACCAATGTTGTGACTTGGCCCCCGAAGCCAGACAGGCCATAA-AGTCGCTCACAGAGCGGCTCTACATCGGGGGTCCCCTGACTAATTCAAAAGGGCAGAACTGCGGTTATCGCCGGTGCCGCGCGAGCGGCGTGCTGACGACTAGCTGCGGCAATACCCTCACATGCTACTTGAAAGCCACTGCGGCCTGTCGAGCTGCAAAGCTCCAGGACTGCACGATGCTCGTGAACGGAGACGACCTTGTCGTTATCTGCGA-AAGCGCGGGAACCCAGGAGGATGCGGCGAGCCTACGAGTCTTCACGGAGGCTATGACTAGGTACTCTGCCCCCCCCGGGGACCCGCCCCAACCAGAATACGAC
>R000006|2003-01-15|1b
-----------------------------------------TGACTCAACGGTCACTGAGAGTGACATCCGTGTTGAGGAGTCAATCTACCAATGTTGTGACTTGGCCCCCGAAGCCAGACAGGCCATAA-GGTCGCTCACAGAGCGGCTCTATATCGGGGGCCCCTTGACTAATTCAAAAGGGCAGAACTGTGGTTATCGCCGGTGCCGCGCCAGCGGCGTGCTGACGACTAGCTGCGGTAATACCCTCACATGTTACTTGAAGGCCTCTGCGGCCTGTCGAGCTGCGAAGCTCCAGGACTGCACGATGCTCGTGAACGGAGACGACCTTGTCGTTATCTGTGA-AAGCGCGGGAACCCAAGAGGATGCGGCGAGCCTACG-------------------------------------------------------------------
>T0610213-NS5B-C|2009-06-10|1b
---------------

In [88]:
# Print the 6e controls as FASTA, with headers of the form '>name|date|6e'.

for taxon, entry in hcv6e_dict.items():
    fasta_header = '|'.join(('>' + taxon, str(entry['date']), '6e'))
    print(fasta_header + '\n' + str(entry['6e']))

>W0731890-Ns5b|2012-07-31|6e
--------------ACACAAGTCACAGAGCGCGACATTCAAACTGAACATTCCATCTACCAGTGCTGCCAGTTGGAGCCGGTTGCACGGAAGGCCATCACTTCTCTCACTGAYCGACTGTATTGYGGTGGGCCCATGTTTAACTCGAAAGGGCAAGCATGCGGAACTCGCAGATGCAGGGCCAGTGGTGTCTTGACTACCAGCTTGGGCAATACCCTGACATGCTACCTGAAAGCACAGGCYGCRTGTAGRGCTGCCGGGCTCAAGAAYTTTGACATGTTGGTCTGCGGAGACGATCTCGTCATTATTTCAGAGAGTTTGGGGGTCTCGGAGGACGCTAGTGCACTGCRAGGTTYCACRGAARCCATGACWAGAGA
>T0424331-NS5B-H|2009-04-24|6e
-TCGCTGTTTTGGATCACAAGTCACGGAGCGCGATGTCCAGACGGAGCATGACATCTAYCAGTGCTGCCAGTTGGAGCCCGCAGCACGGACAGCCATTACAGCGCTCACTGACCGATTGTATTGCGGTGGTCCCATGTTTAACTCTAAAGGTCAGGCATGTGGATACCGTAGRTGCAGGGCCAGTGGCGTYTTGACCACCAGCCTGGGCAATACTCTGACTTGCTACTTGAAGGCTCAAGCGGCATGCAGGGCTGCCGGGCTGAAAGATTTTGACATGCTGGTCTGCGGAGACGACCTTGTCGTTATTTCGGAGAGTTTGGGGGTCTCGGAGGACACTAGTGCACTGCGAGTTTT-ACAGAACCATTGACCAGA--
>E002159|2002-12-27|6e
----CTGTTTTGACTCAACGGTCACAGAGCGCGACATTCATACAGAGCACGACATCTACCAATGCTGCGAATTAGAGCCCGCAGCACGGAAAGCYATTACATCGCTTACTGACCGGCTATACTGTGGTGGCCCCATGGTTAACTCTAAGGGTCAGGCATGTGG

In [89]:
# Print the Roka 6e isolates as FASTA, with headers of the form '>name|date|6e'.

for taxon, entry in roka_hcv6e_dict.items():
    fasta_header = '|'.join(('>' + taxon, str(entry['date']), '6e'))
    print(fasta_header + '\n' + str(entry['6e']))

>NCHADS087|2014-12-15|6e
------------CTGTTTTGACTCAACTGTCACAGAGCGCGACATTCAGACAGAACGCGACATCTATCAGTGCTGCCAGTTAGAGCCCGCAGCACGGAAAGCCATCACATCGCTCACTGACCGACTGTACTGTGGCGGCCCCATGTTTAACTCTAAGGGTCAGGCATGTGGGTACCGCAGATGCAGGGCCAGCGGCGTGTTAACCACCAGCCTAGGCAACACACTGACTTGCTACCTGAAAGCTCAGGCGGCGTGCAAGGCCGCTGGGCTGAGGGACTTTGACATGTTGGTCTGCGGAGACGATCTTGTCGTTATTTCGGAGAGTGTGGGGGTTTCGGAGGATGCTAGTGCGCTGAGAGCTTTCACGGA------------
>NCHADS178|2014-12-22|6e
---------------TTTTGACTCAACTGTCACAGAGCGCGACATTCAGACAGAACACGACATCTACCAGTGCTGCCAGTTAGAGCCCGCAGCACGGAAAGCCATTACATCGCTCACTGACCGACTGTACTGTGGCGGCCCCATGCTTAACTCTAAGGGTCAGGCATGTGGGTACCGCAGATGCAGGGCCAGTGGCGTGTTAACCACCAGCCTAGGCAATACTCTGACTTGCTACCTGAAAGCTCAGGCAGCGTGTAGRGCCGCTGGGCTGAAGGACTTTGACATGTTGGTCTGCGGAGACGATCTCGTCGTTATTTCGGAGAGTGTGGGGGTTTCGGAGGACGCTAGTGCGCTGAGAGCTTTCACGGA------------
>NCHADS224|2014-12-31|6e
---------------TTTTGACTCAACTGTCACAGAGCGCGACATTCAGACAGAACACGACATTTATCAGTGCTGCCAGTTAGAGCCCGCAGCACAGAAAGCCATTACGTCGCTCACTGACCGACTGTACTGTGGCGGCCCCATGTTTAACTCTAAGGGTC

In [90]:
# Print the Roka 1b isolates as FASTA, with headers of the form '>name|date|1b'.

for taxon, entry in roka_hcv1b_dict.items():
    fasta_header = '|'.join(('>' + taxon, str(entry['date']), '1b'))
    print(fasta_header + '\n' + str(entry['1b']))

>NCHADS137|2014-12-17|1b
------------CTCAACAGTCACTGAGAACGATATCCGTGTTGAGGAGTCAATCTACCAATGTTGTGACTTGGCCCCCGAGGCCAGACAGGCTATAAGGTCGCTCACCGAGCGGCTTTATATTGGGGGCCCCCTGACTAATTCAAAAGGGCAGAACTGCGGCTATCGCCGRTGCCGCGCCAGCGGCGTGCTGACGACTAGCTGTGGTAATACCCTTACATGTTACTTGAAGGCCTCTGCAGCCTGTCGAGCTGCAAAGCTCCAGGACTGCACGATGCTCGTGTGCGGAGACGACCTCGTCGTTATCTGTGAAAGTGCAGGAACCCAGGAGGACGCGGCGAGCCTACGAGTCTTCACGGA---
>NCHADS229|2015-01-14|1b
------TTTTGACTCAACAGTCACTGAGAACGATATCCGTGTTGAGGAGTCAATCTACCAATGTTGTGACTTGGCCCCCGAGGCCAGACAGGCTATAAGGTCGCTCACCGAGCGGCTTTATATTGGGGGCCCCCTGACTAATTCAAAAGGGCAGAACTGCGGCTATCGCCGGTGCCGCGCCAGCGGCGTGCTGACGACTAGCTGTGGTAATACCCTTACATGTTACTTGAAGGCCTCTGCAGCCTGTCGAGCTGCAAAGCTCCAGGACTGCACGATGCTCGTGTGCGGAGACGACCTCGTCGTTATCTGTGAAAGTGCAGGAACCCAGGAGGACGCGGCGAGCCTACGAGTCTTCACGGA---
>NCHADS232|2015-01-13|1b
------TTTTGACTCAACAGTCACTGAGAACGATATCCGTGTTGAGGAGTCAATCTACCAATGTTGTGACTTGGCCCCCGAGGCCAGACAGGCTATAAGGTCGCTCACCGAGCGGCTTTATATTGGGGGCCCCCTGACTAATTCAAAAGGGCAGAACTGCGGCTATCGCCGGTGCCGCGCCAGCGGCGTGCTGACGA