## What this script is for:
The goal of this script is to subsample the dataset of 619 U events so that there is an equitable number of samples per year, per geographic range, and per host.

In [92]:
# Read the current dataset (FASTA) into a nested dictionary keyed by taxon name:
#     {taxon: {'year': <int>, 'sequence': <str>}}
# Header lines look like ">1998_mG001U_UC_165"; the text before the first
# underscore is parsed as the sampling year.
allUevents_dict = {}
with open("UEventsFASTA.txt", 'rU') as f: #rU allows \n and \r to designate newline
    for line in f:
        stripped = line.strip()
        if not stripped:
            # Blank lines carry no data. Skipping them keeps a stray empty line
            # from replacing the sequence that was just read with ''.
            continue
        if line.startswith(">"):
            taxon = stripped.replace('>','')
            year = int(taxon.split("_")[0])
            allUevents_dict[taxon] = {'year': year}
        else:
            # Append instead of overwrite so that a sequence wrapped over
            # several lines is stored whole rather than as its last line only.
            record = allUevents_dict[taxon]
            record['sequence'] = record.get('sequence', '') + stripped

#print allUevents_dict

{'1998_mG001U_UC_165': {'sequence': 'GATTCCAGCCAAGAGATAAAAGGTCACCTCTTTGTTGATAAAATCTCCAATCGAGTCGTGAAGGCAACGAGCTACGGACACCACCCTTGGGGACTGCATCGGGCCTGTATGATTGAATTCTGTGGGAAACAGTGGATACGGACAGATCTCGGTGACCTGATATCTGTCGAATACAATTCTGGAGCAGAAATCCTCTCGTTCCCGAAGTGTGAGGACAAGACGGTGGGGATGAGGGGAAACTTGGATGACTTTGCCTATCTAGATGATCTGGTGAAGGCCTCTGAGAGCAGAGAGGAATGTCTT', 'year': 1998}, '1994_mG043U_UP_87': {'sequence': 'GATTCCAGCCAAGAGATAAAAGGTCACCTCTTTGTTGATAAAATCTCCAATCGAGTCGTGAAGGCAACGAGCTACGGACACCACCCCTGGGGACTGCATCGGGCCTGTATGATTGAATTCTGTGGGAAACAGTGGATACGGACAGATCTCGGTGACCTGATATCTGTCGAATACAATTCCGGAGCAGAAATCCTCTCGTTCCCGAAGTGTGAGGACAAGACGGTGGGGATGAGGGGAAATTTGGATGACTTTGCCTATCTAGACGATCTGGTGAAGGCCTCTGAGAGCAGAGAGGAATGTCTT', 'year': 1994}, '1998_mG001U_UC_162': {'sequence': 'GATTCCAGCCAAGAGATAAAAGGTCACCTCTTTGTTGATAAAATCTCCAATCGAGTCGTGAAGGCAACGAGCTACGGACACCACCCTTGGGGACTGCATCGGGCCTGTATGATTGAATTCTGTGGGAAACAGTGGATACGGACAGATCTCGGTGACCTGATATCTGTCGAATACAATTCTGGAGCAGAAATCCTCTCGTTCCCGAAGTGTGAGGACAAGACGGTGGGGATGAGGGGAAACTTGGATGACT

In [49]:
def _read_trait_map(path):
    """Read a two-column, tab-separated trait map into a dictionary.

    Each non-blank line is expected to be "<taxon name>\t<trait value>"
    (a BEAST style trait map with the header line already removed).

    path -- path of the trait map file to read.

    Returns a dict mapping the first column to the whitespace-stripped
    second column.  Blank lines are skipped; a non-blank line without a tab
    still raises IndexError, as it is malformed input.
    """
    trait_by_taxon = {}
    with open(path, 'rU') as handle:
        for line in handle:
            if not line.strip():
                # A trailing or stray empty line should not abort the read.
                continue
            fields = line.split('\t')
            trait_by_taxon[fields[0]] = fields[1].strip()
    return trait_by_taxon


#Make a dictionary that links the taxon name with the geographic range (given a BEAST style trait map with headers removed)
geo_dict = _read_trait_map("GeoRanges.txt")

#print geo_dict

#Do the same thing for host species (given BEAST style trait map with headers removed)
host_dict = _read_trait_map("HostSpp.txt")

#print host_dict

In [93]:
# Fold the host and geographic-range traits into each taxon's record so that
# year, sequence, host and range all live in one structure.
# A taxon missing from either trait map raises KeyError.

for taxon_name, record in allUevents_dict.items():
    record['host'] = host_dict[taxon_name]
    record['range'] = geo_dict[taxon_name]

#print allUevents_dict
#print allUevents_dict['2002_mG001U_UC_255']
#print allUevents_dict['2002_mG001U_UC_255']['host']
#print allUevents_dict['2002_mG001U_UC_255']['range']
#print allUevents_dict['2002_mG001U_UC_255']['year']

In [103]:
# Convert the nested dictionary into a pandas DataFrame purely as a convenient
# way to write it out as CSV: orient='index' turns each taxon (outer key) into
# one row, and the inner keys (year, sequence, host, range) become columns.

import pandas as pd

allUEvents_df=pd.DataFrame.from_dict(allUevents_dict,orient='index')
# Name the index so the taxon column gets the header 'taxon_name' in the CSV.
allUEvents_df.index.name = 'taxon_name'
#print allUEvents_df

# Written to the working directory; the next cell reads this file back in.
allUEvents_df.to_csv('allUEvents_forSubSamp.csv')

In [108]:
# Load the exported CSV back in as a list of per-row dictionaries.
import csv
import random

full_data = []

# DictReader yields one dict per row, keyed by the CSV header; every value
# (including 'year') comes back as a string.
with open('allUEvents_forSubSamp.csv') as csvfile:
    full_data.extend(csv.DictReader(csvfile))

# Quick look at the first record to confirm the expected fields are present.
print(full_data[0])

{'taxon_name': '1971_mG018U_UP_1', 'host': 'mykiss', 'sequence': 'GATTCCAGCCAAGAGATAAAAGGTCACCTCTTTGTTGATAAAATCTCCAATCGAGTCGTGAAGGCAACGAGCTACGGACACCACCCCTGGGGACTGCATCAGGCCTGTATGATTGAATTCTGTGGGAAACAGTGGATACGGACAGACCTCGGTGACCTGATATCTGTCGAATACAATTCCGGAGCAGAAATCCTCTCGTTCCCGAAGTGTGAGGACAAGACGGTGGGGATGAGGGGAAACTTGGATGACTTTGCCTATCTAGACGATCTGGTGAAGGCCTCTGAGAGCAGAGAGGAATGTCTT', 'range': 'CRB', 'year': '1971'}


In [126]:
# Group the records into bins, one bin per observed combination of
# year, host and geographic range.

hostgeoyear_binned_data = {}

for record in full_data:
    # The bin key is a (year, host, range) tuple of the three defining variables.
    bin_key = (record['year'], record['host'], record['range'])
    hostgeoyear_binned_data.setdefault(bin_key, []).append(record)

#print len(hostgeoyear_binned_data.keys())
#print hostgeoyear_binned_data.keys()

#test out how this works a little bit
#for key in hostgeoyear_binned_data.keys():
  #  if 'nondom' in key:
    #    print key

In [176]:
# Maximum number of sequences to keep per (year, host, range) bin.
# This is an arbitrary threshold that can be tuned; the subsampling cell keeps
# bins at or below this size whole and randomly samples larger ones down to it.
n_desired_samples = 5

# Fix the random seed so that re-running the subsampling gives the same draws.
random.seed(9223444)

In [177]:
# Cap every (year, host, range) bin at n_desired_samples records.
# Bins larger than the cap are randomly sampled without replacement; smaller
# bins are kept whole.  The draws depend on the seed set earlier and on the
# order in which bins are visited, so the loop structure is left as is.
subsampled_data = {}

for key, listvalues in hostgeoyear_binned_data.items():
    if len(listvalues) > n_desired_samples:
        subsampled_data[key] = random.sample(listvalues, n_desired_samples)
    else:
        subsampled_data[key] = listvalues

n_seqs_sampled = sum(len(records) for records in subsampled_data.values())
n_seqs_available = sum(len(records) for records in hostgeoyear_binned_data.values())

total_hostgeoyear_combos = len(subsampled_data) #should be the same as length of binned_data.keys. YAY!
print(n_seqs_sampled)
print(n_desired_samples * total_hostgeoyear_combos)
# Fraction of the full dataset retained.  Computed from the data instead of
# hard-coded counts, so it stays correct when the threshold or input changes.
if n_seqs_available:
    print(float(n_seqs_sampled) / n_seqs_available)
else:
    print(0.0)
# if sampling intensity is equal for all bins the first two numbers should be the same
# I know that they won't be though because lots of early bins will only have 1 sample.
# try to get them reasonably close while still sampling reasonably well.


380
750
0.613893376414


In [178]:
# Emit every subsampled record in FASTA format: a '>' header line with the
# taxon name followed by the sequence on the next line.

for sampled_records in subsampled_data.values():
    for record in sampled_records:
        print('>' + record['taxon_name'] + '\n' + record['sequence'])

>1989_mG001U_UC_57
GATTCCAGCCAAGAGATAAAAGGTCACCTCTTTGTTGATAAAATCTCCAATCGAGTCGTGAAGGCAACGAGCTACGGACACCACCCTTGGGGACTGCATCGGGCCTGTATGATTGAATTCTGTGGGAAACAGTGGATACGGACAGATCTCGGTGACCTGATATCTGTCGAATACAATTCTGGAGCAGAAATCCTCTCGTTCCCGAAGTGTGAGGACAAGACGGTGGGGATGAGGGGAAACTTGGATGACTTTGCCTATCTAGATGATCTGGTGAAGGCCTCTGAGAGCAGAGAGGAATGTCTT
>1984_mG003U_UP_23
GATTCCAGCCAAGAGATAAAAGGTCACCTCTTTGTTGATAAAATCTCCAATCGAGTCGTGAAGGCAACGAGCTACGGACACCACCCCTGGGGACTGCATCGGGCCTGTATGATTGAATTCTGTGGGAAACAGTGGATACGGACAGATCTCGGTGACCTGATATCTGTCGAATACAATTCCGGAGCAGAAATCCTCTCGTTCCCGAAGTGTGAGGACAAGACGGTGGGGATGAGGGGAAACTTGGATGACTTTGCCTATCTAGACGATCTGGTGAAGGCCTCTGAGAGCAGAGAGGAATGTCTT
>2007_mG177U_UC_533
GATTCCAGCCAAGAGATAAAAGGTCACCTCTTTGTTGAAAAAATCTCCAATCGAGTCGTGAAGGCAACGAGCTACGGACACCACCCTTGGGGACTGCATCGGGCCTGTATGATTGAATTCTGTGGGAAACAGTGGATACGGACAGATCTCGGTGACCTGATATCTGTCGAATACAATTCTGGAGCAGAAATCCTCTCGTTCCCGAAGTGTGAGGACAAGACGGTGGGGATGAGGGGAAACTTGGATGACTTTGCCTATCTGGATGATCTGGTGAAGGCCTCTGAGAGCAGAGAGGAATGTCTT
>2007_mG198U_UC_528
GATTCCAGCC

In [179]:
# The subsampled sequences need matching trait maps, so regenerate them here.

# HOST trait map: a header row, then one "taxon<TAB>host" line per record.
print("taxon" + '\t' + "species")
for sampled_records in subsampled_data.values():
    for record in sampled_records:
        print(record['taxon_name'] + '\t' + record['host'])

taxon	species
1989_mG001U_UC_57	nerka
1984_mG003U_UP_23	chin
2007_mG177U_UC_533	chin
2007_mG198U_UC_528	chin
2007_mG174U_UC_520	chin
2007_mG237U_UC_523	chin
2007_mG151U_UC_519	chin
2012_mG265U_UP_1092	nondom
2012_mG265U_UP_1080	nondom
2012_mG250U_UP_1088	nondom
2012_mG050UmG250U_UP_1085	nondom
2012_mG050U_UP_1086	nondom
2006_mG174U_UC_475	nondom
1995_mG002U_UP_103	nerka
1995_mG017U_UP_114	nerka
2003_mG194U_UC_312	nerka
2003_mG001U_UC_329	mykiss
2003_mG001U_UC_340	mykiss
2003_mG001U_UC_323	mykiss
2003_mG150U_UC_309	mykiss
2003_mG001U_UC_322	mykiss
1975_mG057U_UC_6	nerka
1999_mG001U_UC_186	mykiss
1999_mG001U_UC_187	mykiss
1999_mG001U_UC_195	mykiss
1999_mG033U_UC_196	mykiss
1999_mG289U_UC_184	mykiss
2003_mG184U_UP_296	nondom
1990_mG051U_UP_68	nerka
1986_mG003U_UP_31	nerka
1986_mG054U_UP_32	nerka
2006_mG147U_UC_511	chin
2006_mG001U_UC_485	chin
2006_mG151U_UC_479	chin
2006_mG001U_UC_473	chin
2006_mG001U_UC_505	chin
1989_mG001U_UC_59	mykiss
1987_mG001U_UC_38	mykiss
1987_mG001U_UC_42	mykiss
2

In [159]:
# GEOGRAPHIC RANGE trait map: a header row, then one "taxon<TAB>range" line
# per subsampled record.
print("taxon" + '\t' + "range")
for sampled_records in subsampled_data.values():
    for record in sampled_records:
        print(record['taxon_name'] + '\t' + record['range'])

taxon	range
1989_mG001U_UC_57	CRB
1984_mG003U_UP_23	CRB
2007_mG177U_UC_533	CRB
2007_mG198U_UC_528	CRB
2007_mG174U_UC_520	CRB
2007_mG237U_UC_523	CRB
2007_mG151U_UC_519	CRB
2012_mG265U_UP_1092	coastal
2012_mG265U_UP_1080	coastal
2012_mG250U_UP_1088	coastal
2012_mG050UmG250U_UP_1085	coastal
2012_mG050U_UP_1086	coastal
2006_mG174U_UC_475	CRB
1995_mG002U_UP_103	coastal
1995_mG017U_UP_114	coastal
2003_mG194U_UC_312	CRB
2003_mG001U_UC_329	CRB
2003_mG001U_UC_340	CRB
2003_mG001U_UC_323	CRB
2003_mG150U_UC_309	CRB
2003_mG001U_UC_322	CRB
1975_mG057U_UC_6	CRB
1999_mG001U_UC_186	CRB
1999_mG001U_UC_187	CRB
1999_mG001U_UC_195	CRB
1999_mG033U_UC_196	CRB
1999_mG289U_UC_184	CRB
2003_mG184U_UP_296	coastal
1990_mG051U_UP_68	coastal
1986_mG003U_UP_31	coastal
1986_mG054U_UP_32	coastal
2006_mG147U_UC_511	CRB
2006_mG001U_UC_485	CRB
2006_mG151U_UC_479	CRB
2006_mG001U_UC_473	CRB
2006_mG001U_UC_505	CRB
1989_mG001U_UC_59	CRB
1987_mG001U_UC_38	CRB
1987_mG001U_UC_42	CRB
2007_mG032U_UC_515	coastal
2007_mG147U_UC_516	

In [186]:
# Summaries of the subsampled data, to see how equitable the sampling is.
# Each bin key is a (year, host, range) tuple, so "label in key" is an exact
# match against one of those three fields.  Each *_count list holds the
# per-bin record counts for the matching bins; the printed value is their sum.

# records from the CRB range
CRB_count = [len(records) for bin_key, records in subsampled_data.items()
             if 'CRB' in bin_key]
print('observations in CRB: ' + str(sum(CRB_count)))

# records from the coastal range
coastal_count = [len(records) for bin_key, records in subsampled_data.items()
                 if 'coastal' in bin_key]
print('observations in coastal range: ' + str(sum(coastal_count)))

# records in chinook
chin_count = [len(records) for bin_key, records in subsampled_data.items()
              if 'chin' in bin_key]
print('observations in chinook: ' + str(sum(chin_count)))

# records in sockeye/kokanee
nerka_count = [len(records) for bin_key, records in subsampled_data.items()
               if 'nerka' in bin_key]
print('observations in nerka: ' + str(sum(nerka_count)))

# records in mykiss (steelhead and rainbow trout)
mykiss_count = [len(records) for bin_key, records in subsampled_data.items()
                if 'mykiss' in bin_key]
print('observations in mykiss: ' + str(sum(mykiss_count)))

observations in CRB: 266
observations in coastal range: 114
observations in chinook: 119
observations in nerka: 107
observations in mykiss: 117


In [191]:
# Split the full subsampled dataset into UC- and UP-specific files.
# UP FASTA: only records whose taxon name contains 'UP'.
for sampled_records in subsampled_data.values():
    for record in sampled_records:
        if 'UP' not in record['taxon_name']:
            continue
        print('>' + record['taxon_name'] + '\n' + record['sequence'])

>1984_mG003U_UP_23
GATTCCAGCCAAGAGATAAAAGGTCACCTCTTTGTTGATAAAATCTCCAATCGAGTCGTGAAGGCAACGAGCTACGGACACCACCCCTGGGGACTGCATCGGGCCTGTATGATTGAATTCTGTGGGAAACAGTGGATACGGACAGATCTCGGTGACCTGATATCTGTCGAATACAATTCCGGAGCAGAAATCCTCTCGTTCCCGAAGTGTGAGGACAAGACGGTGGGGATGAGGGGAAACTTGGATGACTTTGCCTATCTAGACGATCTGGTGAAGGCCTCTGAGAGCAGAGAGGAATGTCTT
>2012_mG265U_UP_1092
GATTCCAGCCAAGAGATAAAAGGTCACCTCTTTGTTGATAAAATCTCCAATCGAGCCGTGAAGGCAACGAGCTACGGACACCACCCCTGGGGACTGCATCGGGCCTGTATGATTGAATTCTGTGGGAAACAGTGGATACGGACAGATCTCGGTGACCTGATATCTGTCGAATACAATTCCGGATCAGAAATCCTCTCGTTCTCGAAGTGTGAGGACAAGACGGTGGGGATGAGGGGAAACTTGGATGACTTTGCCTATCTAGACGATCTGGTGAAGGCCTCTGAGAGCAGAGAAGAATGTCTT
>2012_mG265U_UP_1080
GATTCCAGCCAAGAGATAAAAGGTCACCTCTTTGTTGATAAAATCTCCAATCGAGCCGTGAAGGCAACGAGCTACGGACACCACCCCTGGGGACTGCATCGGGCCTGTATGATTGAATTCTGTGGGAAACAGTGGATACGGACAGATCTCGGTGACCTGATATCTGTCGAATACAATTCCGGATCAGAAATCCTCTCGTTCTCGAAGTGTGAGGACAAGACGGTGGGGATGAGGGGAAACTTGGATGACTTTGCCTATCTAGACGATCTGGTGAAGGCCTCTGAGAGCAGAGAAGAATGTCTT
>2012_mG250U_UP_1088
GATTCC

In [192]:
# UC FASTA: only records whose taxon name contains 'UC'.
for sampled_records in subsampled_data.values():
    for record in sampled_records:
        if 'UC' not in record['taxon_name']:
            continue
        print('>' + record['taxon_name'] + '\n' + record['sequence'])

>1989_mG001U_UC_57
GATTCCAGCCAAGAGATAAAAGGTCACCTCTTTGTTGATAAAATCTCCAATCGAGTCGTGAAGGCAACGAGCTACGGACACCACCCTTGGGGACTGCATCGGGCCTGTATGATTGAATTCTGTGGGAAACAGTGGATACGGACAGATCTCGGTGACCTGATATCTGTCGAATACAATTCTGGAGCAGAAATCCTCTCGTTCCCGAAGTGTGAGGACAAGACGGTGGGGATGAGGGGAAACTTGGATGACTTTGCCTATCTAGATGATCTGGTGAAGGCCTCTGAGAGCAGAGAGGAATGTCTT
>2007_mG177U_UC_533
GATTCCAGCCAAGAGATAAAAGGTCACCTCTTTGTTGAAAAAATCTCCAATCGAGTCGTGAAGGCAACGAGCTACGGACACCACCCTTGGGGACTGCATCGGGCCTGTATGATTGAATTCTGTGGGAAACAGTGGATACGGACAGATCTCGGTGACCTGATATCTGTCGAATACAATTCTGGAGCAGAAATCCTCTCGTTCCCGAAGTGTGAGGACAAGACGGTGGGGATGAGGGGAAACTTGGATGACTTTGCCTATCTGGATGATCTGGTGAAGGCCTCTGAGAGCAGAGAGGAATGTCTT
>2007_mG198U_UC_528
GATTCCAGCCAAGAGATAAAAGGTCACCTCTTTGTTGATAAAATCTCCAATCGAGTCGTGAAGGCAACGAGCTACGGACACCACCCCTGGGGACTGCATCGGGCCTGTATGATTGAATTCTGTGGGAAACAGTGGATACGGACAGATCTCGGTGACCTGATATCTGTCGAATACAATTCTGGAGCAGAAATCCTCTCGTTCCCGAAGTGTGAGGACAAGACGGTGGGGATGAGGGGAAACTTGGATGACTTTGCCTATCTGGATGATCTGGTGAAGGCCTCTGAGAGCAGAGAGGAATGTCTT
>2007_mG174U_UC_520
GATTCCAGC

In [193]:
# GEOGRAPHIC RANGE trait map restricted to the UP subsampled events
# (taxon names containing 'UP').
print("taxon" + '\t' + "range")
for sampled_records in subsampled_data.values():
    for record in sampled_records:
        taxon_name = record['taxon_name']
        if 'UP' in taxon_name:
            print(taxon_name + '\t' + record['range'])

taxon	range
1984_mG003U_UP_23	CRB
2012_mG265U_UP_1092	coastal
2012_mG265U_UP_1080	coastal
2012_mG250U_UP_1088	coastal
2012_mG050UmG250U_UP_1085	coastal
2012_mG050U_UP_1086	coastal
1995_mG002U_UP_103	coastal
1995_mG017U_UP_114	coastal
2003_mG184U_UP_296	coastal
1990_mG051U_UP_68	coastal
1986_mG003U_UP_31	coastal
1986_mG054U_UP_32	coastal
2004_mG002U_UP_347	coastal
2004_mG268U_UP_342	coastal
2010_mG050U_UP_660	CRB
1981_mG003U_UP_15	coastal
2009_mG050U_UP_580	CRB
2012_mG050UmG270U_UP_949	coastal
2012_mG050U_UP_952	coastal
2012_mG050U_UP_1040	coastal
2012_mG050U_UP_928	coastal
2012_mG285U_UP_1039	coastal
2006_mG148U_UP_472	coastal
2006_mG184U_UP_468	coastal
2006_mG269U_UP_469	coastal
1988_mG028U_UP_52	CRB
1988_mG050U_UP_47	CRB
1983_mG045U_UP_21	coastal
1988_mG051U_UP_50	coastal
1992_mG002U_UP_78	coastal
1987_mG002U_UP_41	CRB
2000_mG002U_UP_212	coastal
2000_mG003U_UP_211	coastal
1989_mG035U_UP_58	coastal
1989_mG053U_UP_61	coastal
1993_mG019U_UP_84	coastal
2010_mG050U_UP_653	CRB
2013_mG274U_

In [194]:
# GEOGRAPHIC RANGE trait map restricted to the UC subsampled events
# (taxon names containing 'UC').
print("taxon" + '\t' + "range")
for sampled_records in subsampled_data.values():
    for record in sampled_records:
        taxon_name = record['taxon_name']
        if 'UC' in taxon_name:
            print(taxon_name + '\t' + record['range'])

taxon	range
1989_mG001U_UC_57	CRB
2007_mG177U_UC_533	CRB
2007_mG198U_UC_528	CRB
2007_mG174U_UC_520	CRB
2007_mG237U_UC_523	CRB
2007_mG151U_UC_519	CRB
2006_mG174U_UC_475	CRB
2003_mG194U_UC_312	CRB
2003_mG001U_UC_329	CRB
2003_mG001U_UC_340	CRB
2003_mG001U_UC_323	CRB
2003_mG150U_UC_309	CRB
2003_mG001U_UC_322	CRB
1975_mG057U_UC_6	CRB
1999_mG001U_UC_186	CRB
1999_mG001U_UC_187	CRB
1999_mG001U_UC_195	CRB
1999_mG033U_UC_196	CRB
1999_mG289U_UC_184	CRB
2006_mG147U_UC_511	CRB
2006_mG001U_UC_485	CRB
2006_mG151U_UC_479	CRB
2006_mG001U_UC_473	CRB
2006_mG001U_UC_505	CRB
1989_mG001U_UC_59	CRB
1987_mG001U_UC_38	CRB
1987_mG001U_UC_42	CRB
2007_mG032U_UC_515	coastal
2007_mG147U_UC_516	coastal
2007_mG147U_UC_541	coastal
1993_mG032U_UC_82	CRB
2001_mG001U_UC_225	CRB
2001_mG001U_UC_227	CRB
2001_mG001U_UC_246	CRB
2010_mG151U_UC_737	CRB
2009_mG001U_UC_571	CRB
2009_mG032U_UC_610	CRB
2009_mG001U_UC_601	CRB
2009_mG032U_UC_611	CRB
2009_mG151U_UC_578	CRB
1991_mG032U_UC_76	coastal
2009_mG185U_UC_618	CRB
1988_mG001U_UC

In [195]:
# HOST trait map restricted to the UP subsampled events
# (taxon names containing 'UP').
print("taxon" + '\t' + "host")
for sampled_records in subsampled_data.values():
    for record in sampled_records:
        taxon_name = record['taxon_name']
        if 'UP' in taxon_name:
            print(taxon_name + '\t' + record['host'])

taxon	host
1984_mG003U_UP_23	chin
2012_mG265U_UP_1092	nondom
2012_mG265U_UP_1080	nondom
2012_mG250U_UP_1088	nondom
2012_mG050UmG250U_UP_1085	nondom
2012_mG050U_UP_1086	nondom
1995_mG002U_UP_103	nerka
1995_mG017U_UP_114	nerka
2003_mG184U_UP_296	nondom
1990_mG051U_UP_68	nerka
1986_mG003U_UP_31	nerka
1986_mG054U_UP_32	nerka
2004_mG002U_UP_347	nerka
2004_mG268U_UP_342	nerka
2010_mG050U_UP_660	nondom
1981_mG003U_UP_15	nerka
2009_mG050U_UP_580	mykiss
2012_mG050UmG270U_UP_949	nerka
2012_mG050U_UP_952	nerka
2012_mG050U_UP_1040	nerka
2012_mG050U_UP_928	nerka
2012_mG285U_UP_1039	nerka
2006_mG148U_UP_472	nerka
2006_mG184U_UP_468	nerka
2006_mG269U_UP_469	nerka
1988_mG028U_UP_52	chin
1988_mG050U_UP_47	chin
1983_mG045U_UP_21	nerka
1988_mG051U_UP_50	nerka
1992_mG002U_UP_78	mykiss
1987_mG002U_UP_41	chin
2000_mG002U_UP_212	nondom
2000_mG003U_UP_211	nondom
1989_mG035U_UP_58	chin
1989_mG053U_UP_61	chin
1993_mG019U_UP_84	nerka
2010_mG050U_UP_653	nerka
2013_mG274U_UP_1093	nerka
2013_mG274U_UP_1094	nerka
20

In [196]:
# HOST trait map restricted to the UC subsampled events
# (taxon names containing 'UC').
print("taxon" + '\t' + "host")
for sampled_records in subsampled_data.values():
    for record in sampled_records:
        taxon_name = record['taxon_name']
        if 'UC' in taxon_name:
            print(taxon_name + '\t' + record['host'])

taxon	host
1989_mG001U_UC_57	nerka
2007_mG177U_UC_533	chin
2007_mG198U_UC_528	chin
2007_mG174U_UC_520	chin
2007_mG237U_UC_523	chin
2007_mG151U_UC_519	chin
2006_mG174U_UC_475	nondom
2003_mG194U_UC_312	nerka
2003_mG001U_UC_329	mykiss
2003_mG001U_UC_340	mykiss
2003_mG001U_UC_323	mykiss
2003_mG150U_UC_309	mykiss
2003_mG001U_UC_322	mykiss
1975_mG057U_UC_6	nerka
1999_mG001U_UC_186	mykiss
1999_mG001U_UC_187	mykiss
1999_mG001U_UC_195	mykiss
1999_mG033U_UC_196	mykiss
1999_mG289U_UC_184	mykiss
2006_mG147U_UC_511	chin
2006_mG001U_UC_485	chin
2006_mG151U_UC_479	chin
2006_mG001U_UC_473	chin
2006_mG001U_UC_505	chin
1989_mG001U_UC_59	mykiss
1987_mG001U_UC_38	mykiss
1987_mG001U_UC_42	mykiss
2007_mG032U_UC_515	mykiss
2007_mG147U_UC_516	mykiss
2007_mG147U_UC_541	mykiss
1993_mG032U_UC_82	nerka
2001_mG001U_UC_225	nondom
2001_mG001U_UC_227	nondom
2001_mG001U_UC_246	nondom
2010_mG151U_UC_737	nondom
2009_mG001U_UC_571	nerka
2009_mG032U_UC_610	mykiss
2009_mG001U_UC_601	mykiss
2009_mG032U_UC_611	mykiss
2009_mG