In [108]:
import pandas as pd
import numpy as np
from Bio import SeqIO

In [109]:
agm_titers_df = pd.read_csv('/Users/Sidney/nextstrain/dengue/data/smith2015/agm_1month_titers.csv', index_col=0, comment='#')
agm_titers_df.dropna(how='all', inplace=True)
smith_key_df = pd.read_csv('/Users/Sidney/nextstrain/dengue/data/smith2015/final-supplemental-files/Fig1A-key-for-tree-names-virus-names.txt', header=0, sep='\t',index_col=0)
smith_strain_acc = { s['fullname'] : s['genbank'] for i,s in smith_key_df.iterrows()}
smith_sequences = { s.description : s for s in SeqIO.parse('/Users/Sidney/nextstrain/dengue/data/smith2015/final-supplemental-files/Fig1A-aligned-nucleotide-sequences.FASTA', 'fasta')}
vdb_acc_strains = { s.description.split('|')[1] : s.description.split('|')[0].split('.')[0] for s in SeqIO.parse('/Users/Sidney/nextstrain/fauna/data/dengue.fasta', 'fasta')}

for s in smith_strain_acc.keys():
    smith_strain_acc[s.replace('-', '').replace('_', '')] = smith_strain_acc[s]
smith_strain_acc['DENV2/Tonga/1974-Tonga-74']='AY744147'
smith_strain_acc['DENV3/Fiji/1992-29472'] = 'L11422'
smith_strain_acc['DENV2/Senegal/1970/Sendak_H D_0674'] = smith_strain_acc['DENV2/Senegal/1970/Sendak_H']
smith_sequences['DENV3/Fiji/1992-29472'] = smith_sequences['DENV3/Fiji/1992-29472-L11422-I']

In [110]:
def build_canonical_name(sero, country, strain_id, year):
    return ('%s/%s/%s/%s'%(sero, country, strain_id, year)).strip().upper()

def pull_virus_smithmetadata(strain):
    serotype, country, yearstrain = strain.split('/', 2)
    yearstrain = yearstrain.replace('/', '-')
    year = yearstrain.split('-')[0]
    strain = ''.join(yearstrain.split('-')[1:]).replace('_', '')
    return serotype, country, strain, year

def pull_sera_smithmetadata(strain):
    serotype, country, year, strain = strain.split('_', 3)
    serotype = 'DENV'+serotype[-1]
    strain = strain.replace('_', '')
    return serotype, country, strain, year

def fix_smith_strain(strain, type='virus'):
    '''
    Given metadata annotations in doc, make new strain names like
    DENV1234/country/ID/year
    where ID is derived from the original strain ID with metadata redundancies removed
    '''
    ## Pull data, try the obvious first
    
    if type == 'virus':
        sero, country, strain_id, year = pull_virus_smithmetadata(strain) # Pull metadata from pre-processed annotations
    else:
        sero, country, strain_id, year = pull_sera_smithmetadata(strain)
    strain = build_canonical_name(sero, country, strain_id, year)
    return strain

In [122]:
def format_smith_upload(smith_strain_list):
    missing_records = []
    for s in smith_strain_list:
        row = {}
        NA_fields = ['Species', 'Isolate Name', 'Georegion', 'Author', 'Sampling City', 'Pubmed ID']
        for n in NA_fields:
            row[n] = None
        if '/' in s:
            serotype, country, strain, year = pull_virus_smithmetadata(s)
        else:
            serotype, country, strain, year = pull_sera_smithmetadata(s)
        row['Accession'] = smith_strain_acc[s]
        row['Name'] = fix_smith_strain(s)
        row['Start'] = 935
        row['Stop'] = 2413
        row['Segment'] = 'E'
        row['Organism'] = 'dengue virus '+serotype[-1]
        row['Country'] = country
        row['Sampling Year'] = year
        row['Species'] = 'dengue virus'
        try:
            row['Sequence'] = str(smith_sequences[s].seq)
        except:
            print 'sequence not found for %s'%s
            print sorted(smith_sequences.keys())
        missing_records.append(row)
#     pd.DataFrame(missing_vdb_viruses).to_csv('smith_viruses.tsv', sep='\t')

def convert_smith_vdb_virus_strains(smith_strains):
    smith_vdb_strains = {}
    not_in_key = []
    not_in_vdb = []
    for smith_strain in smith_strains:
        try: # Have an accession that's in vdb?
            smith_vdb_strains[smith_strain] = vdb_acc_strains[smith_strain_acc[smith_strain]]
        except:
            if smith_strain in smith_strain_acc and smith_strain_acc[smith_strain] not in vdb_acc_strains: # Have an accession not in vdb?
                not_in_vdb.append(smith_strain)
                smith_vdb_strains[smith_strain] = fix_smith_strain(smith_strain)
            else: # Missing accession altogether?
                try:
                    smith_vdb_strains[smith_strain] = vdb_acc_strains[smith_strain_acc[smith_strain.replace('-', '').replace('_', '')]]
                except:
                    not_in_key.append(smith_strain)
    if not_in_key != []: 
        print '\n\nThese strain names did not match an accession number in the key:', sorted(not_in_key)
        print '\n\nKey strain names:', sorted(smith_strain_acc.items())
        
    return smith_vdb_strains, not_in_key

smith_vdb_virus_strains, missing_vdb_viruses = convert_smith_vdb_virus_strains(agm_titers_df.index.values)
vdb_smith_virus_strains = { v: k for k,v in smith_vdb_virus_strains.items() }

In [124]:
def convert_smithsera_smithvirus_strain(strain):
    serotype, country, strain, year = pull_sera_smithmetadata(strain)
    return serotype+'/'+country+'/'+year+strain

def convert_smith_vdb_sera_strains(smith_strains):
    smith_vdb_strains = {}
    not_in_vdb = []
    not_in_key = []
    
    for smith_strain in smith_strains:
        fixed_smith_strain = fix_smith_strain(smith_strain, type='sera')
        
        if smith_strain in smith_vdb_virus_strains: # seen smith strain before?
            smith_vdb_strains[smith_strain] = smith_vdb_virus_strains[smith_strain]
            
        elif fixed_smith_strain in smith_vdb_virus_strains: # seen malformatted version of name, made same vdb name before?
            smith_vdb_strains[smith_strain] = fixed_smith_strain
            
        elif smith_strain in smith_strain_acc: # do we have an accession number?
            try:
                smith_vdb_strains[smith_strain] = vdb_acc_strains[smith_strain_acc[smith_strain]] # is the acc in vdb?
            except:
                not_in_vdb.append(smith_strain)
                smith_vdb_strains[smith_strain] = fix_smith_strain(smith_strain)
                
        elif smith_strain.replace('-', '').replace('_', '') in vdb_acc_strains: # do we almost have an accession number?
            try:
                smith_vdb_strains[smith_strain] = vdb_acc_strains[smith_strain_acc[smith_strain.replace('_', '').replace('_', '')]] # is the acc in vdb?
                smith_strain_acc[smith_strain] = smith_strain_acc[smith_strain.replace('_', '').replace('_', '')]
            except:
                not_in_vdb.append(smith_strain)
                smith_vdb_strains[smith_strain] = fix_smith_strain(smith_strain)
        elif convert_smithsera_smithvirus_strain(smith_strain) in smith_strain_acc: # do we have an acc for the OTHER formatting found in this document?
            try:
                smith_vdb_strains[smith_strain] = vdb_acc_strains[smith_strain_acc[convert_smithsera_smithvirus_strain(smith_strain)]] # is the acc in vdb?
                smith_strain_acc[smith_strain] = smith_strain_acc[convert_smithsera_smithvirus_strain(smith_strain)]
            except:
                not_in_vdb.append(smith_strain)
                smith_vdb_strains[smith_strain] = fix_smith_strain(smith_strain)
        else:
            not_in_key.append(smith_strain)

    if not_in_key != []: 
        print '\n\nThese strain names did not match an accession number in the key:', sorted(not_in_key)
        print '\n\nKey strain names:', sorted(smith_strain_acc.items())
    return smith_vdb_strains, not_in_vdb

smith_vdb_sera_strains, missing_vdb_seraviruses = convert_smith_vdb_virus_strains(agm_titers_df.columns.values)
missing_vdb_viruses += missing_vdb_seraviruses
print '\n\n\n missing'
print missing_vdb_viruses

format_smith_upload(missing_vdb_viruses)
print 'These viruses were not found in vdb.\nTo upload, run `fauna$ python vdb/dengue_upload --fname smith_viruses.tsv --ftype tsv -v dengue -db vdb`\n', not_in_vdb

# agm_titers_df.rename(columns = virus_strains_smith_vdb, index = sera_strains_smith_vdb)
# df --> line list


######### NOTE FOR TOMORROW -- FIX YOUR SERA NAME : ACCESSION KEY ISSUE, it'll solve the rest ###############



These strain names did not match an accession number in the key: ['DEN1_Bolivia_2010_FSB_3363', 'DEN1_Burma_2005_61117', 'DEN1_Cambodia_2003_GenBankGQ868619', 'DEN1_Nauru_1974_NIHvaccine', 'DEN1_Peru_2000_IQT_6152', 'DEN1_PuertoRico_2006_BID_V852', 'DEN1_Venezuela_2000_OBT_1298', 'DEN1_VietNam_2008_BID_V1937', 'DEN2_Cambodia_2009_D2T0601085_KH09_KSP', 'DEN2_Malaysia_2008_DKD811', 'DEN2_Nicaragua_2005_BID_V533', 'DEN2_Nicaragua_2006_BID_V571', 'DEN2_Peru_1996_IQT2913', 'DEN2_Senegal_2003_Sendak_HD_0674', 'DEN2_Tonga_1974_NIHvaccine', 'DEN2_Vietnam_2003_DF_670', 'DEN2_Vietnam_2006_32_135_2', 'DEN2_Vietnam_2006_BID_V735', 'DEN3_Burma_2008_80931', 'DEN3_Cambodia_2011_V0907330', 'DEN3_Fiji_1992__', 'DEN3_Nicaragua_2009_608', 'DEN3_PuertoRico_1963_PRS_228762', 'DEN3_PuertoRico_2006_429965', 'DEN3_Vietnam_2006_BID_V1329', 'DEN3_Vietnam_2007_BID_V1817', 'DEN4_Brazil_2012_BR_12', 'DEN4_Burma_2008_81087', 'DEN4_Cambodia_2010_U0811386', 'DEN4_Cambodia_2011_V0624301', 'DEN4_Indonesia_1973_M30153

KeyError: 'DEN1_Nauru_1974_NIHvaccine'