In [None]:
import pandas as pd

In [None]:
# Show the column labels of `data`.
# NOTE(review): `data` is only assigned in a later cell of this notebook (the
# `pd.read_csv` call), so on a fresh kernel this cell raises NameError unless
# that cell has been run first -- consider moving this inspection below the load.
data.columns

In [None]:
# Column headers (left) and the snake_case names this notebook uses for them
# (right). Entries that are commented out are not renamed; presumably they are
# further headers of the same table that are currently unused.
rename_map = {
    'Organism Name': 'name',
    'Organism Groups': 'taxonomy_string',
    # 'Level': 'assembly_completeness',
    # 'Size(Mb)': 'total_length',
    # 'GC%': 'gc_percent',
    'Replicons': 'replicons_string',
    # 'Scaffolds': 'scaffold_tally',
    # 'CDS': 'cds_tally',
    # 'Release Date': 'release_date',
    # 'GenBank FTP': 'genbank_ftp_url',
    'RefSeq FTP': 'refseq_ftp_url',
    # 'Genes': 'gene_tally',
    # 'Host': 'host_name',
    # 'Modify Date': 'modify_date',
}



In [None]:
# Load the genome table. A column headed '#Organism Name' is first renamed to
# 'Organism Name' so that the subsequent `rename_map` pass can pick it up.
data = (
    pd.read_csv('meta/ncbi_genomes.csv')
    .rename(columns={'#Organism Name': 'Organism Name'})
    .rename(columns=rename_map)
)

def orgname_to_id(name):
    """Build a normalized genome_id from an NCBI organism name.

    The name is split on single spaces. The first word is reduced to its
    first alphabetic character, everything from a standalone '=' word onward
    is discarded, and the modifier words 'str.', 'substr.', 'subsp.', 'bv.'
    and 'biovar' are dropped. The surviving words are joined with '_'; then
    ':', '-', '/', '=' and '.' are turned into '_' while '(', ')' and "'"
    are removed.

    Raises:
        IndexError: if the first word contains no alphabetic character.
        AssertionError: if the result contains anything other than
            alphanumerics and '_'; its args are ``(name, result)``.
    """
    first_word, *remaining = name.split(' ')

    # Keep only the letters of the genus word, then abbreviate it to its initial.
    genus_letters = ''.join(ch for ch in first_word if ch.isalpha())
    tokens = [genus_letters[0], *remaining]

    # A standalone '=' presumably introduces a synonym; keep only what precedes it.
    if '=' in tokens:
        tokens = tokens[:tokens.index('=')]

    # Modifier words that carry no identifying information.
    skipped = ('str.', 'substr.', 'subsp.', 'bv.', 'biovar')
    joined = '_'.join(tok for tok in tokens if tok not in skipped)

    # One pass: separator-like characters become '_', brackets and apostrophes vanish.
    cleaned = joined.translate(str.maketrans(':-/=.', '_____', "()'"))

    # Anything still outside [alphanumeric, '_'] is an unexpected character.
    assert all(ch.isalnum() or ch == '_' for ch in cleaned), (name, cleaned)

    return cleaned

# Derive a genome_id from every organism name; the ids must not collide.
data['genome_id'] = data['name'].map(orgname_to_id)
assert data['genome_id'].is_unique

data

In [None]:
def parse_replicon(r):
    """Parse a single '<name>:<accessions>' replicon entry.

    Args:
        r: One replicon entry; surrounding whitespace is ignored. The part
            before the ':' is the replicon name, the part after it is either
            '<refseq>/<genbank>' or a single accession, which is treated as
            the GenBank id.

    Returns:
        A tuple ``(name, replicon_type, refseq_id, genbank_id)`` where
        ``replicon_type`` is 'chromosome' or 'plasmid' and ``refseq_id`` is
        '' when only one accession is given.

    Raises:
        ValueError: if the entry does not contain exactly one ':', if the
            accessions contain more than one '/', or if the name starts with
            neither 'chromosome' nor 'plasmid'.
    """
    r = r.strip()
    name, accessions = r.split(':')
    if name.startswith('chromosome'):
        replicon_type = 'chromosome'
    elif name.startswith('plasmid'):
        replicon_type = 'plasmid'
    else:
        # Previously this case left `replicon_type` unbound and surfaced as an
        # UnboundLocalError at the return; fail with a message that names the
        # offending entry instead.
        raise ValueError(
            'unrecognized replicon type for {!r} (in entry {!r}); '
            "expected a name starting with 'chromosome' or 'plasmid'".format(name, r)
        )
    if '/' in accessions:
        refseq_id, genbank_id = accessions.split('/')
    else:
        # A lone accession is taken to be the GenBank id.
        genbank_id = accessions
        refseq_id = ''
    return name, replicon_type, refseq_id, genbank_id

def flatten_replicons(x):
    """Yield one flat record per replicon listed on a genome row.

    `x` must expose `genome_id` and `replicons_string`; the latter is a
    ';'-separated list of entries, each handed to `parse_replicon`. Every
    entry is parsed before the first record is yielded, so a malformed entry
    raises before any record of that genome is emitted. Each record is
    ``(genome_id,) + parse_replicon(entry)``.
    """
    owner = x.genome_id
    # Parse eagerly so errors surface before anything is yielded.
    parsed = list(map(parse_replicon, x.replicons_string.split(';')))
    yield from ((owner,) + fields for fields in parsed)
        
# Collect one record per (genome, replicon) pair, then build the table in one go.
replicon_records = []
for _, genome_row in data.iterrows():
    replicon_records.extend(flatten_replicons(genome_row))

replicon_columns = ['genome_id', 'replicon_name', 'replicon_type', 'refseq_id', 'genbank_id']
replicons = pd.DataFrame(replicon_records, columns=replicon_columns)
replicons