In [72]:
import functools as ft
import numpy as np
import pandas as pd
import multiprocessing as mp
import re
import sys
import csv
import collections as co
import itertools
import difflib

In [129]:
class extractor(object):
    """Turn a delimited sequence file into header, strain and genome tables.

    Typical use, in this order:

    1. ``fill_dicts(worldfile)``   - load the place-name lookup tables.
    2. ``init_matrix(infile)``     - pre-allocate one result row per file line.
    3. ``input_sequences(infile)`` - parse the records and fill the rows.
    4. ``get_genomes()`` / ``get_strains()`` / ``get_header()`` - read results.

    The string ``'null'`` is used everywhere as the "no value" placeholder.
    """

    # Rows read per pandas chunk when streaming the input files.
    CHUNK_SIZE = 10000
    # Minimum difflib similarity ratio for a place name to count as a match.
    MATCH_CUTOFF = 0.9

    def __init__(self):
        """Create an extractor with empty lookup tables and empty results."""
        # name -> list of [city, subcountry, country] triples
        self.cities = co.defaultdict(list)
        self.subcountries = co.defaultdict(list)
        self.countries = co.defaultdict(list)

        # Key lists of the three tables, in the form difflib needs.
        self.cities_names = []
        self.subcountries_names = []
        self.countries_names = []

        self.matrix_genome = np.empty((0, 0, ), dtype=object)
        self.matrix_strain = np.empty((0, 0, ), dtype=object)
        self.matrix_head = np.empty((0, 0, ), dtype=object)

        self.index_accession = np.empty(0, dtype=object)
        self.index_strain = np.empty(0, dtype=object)

        # token -> resolved [city, subcountry, country]; see destination().
        self._destination_cache = {}

    def init_matrix(self, infile):
        """Pre-allocate the result arrays with one row per line of ``infile``.

        Must be called before ``input_sequences`` for the same file, because
        that method writes into these arrays by row number.

        NOTE(review): the row count is the number of physical lines. pandas
        skips blank lines by default, so a file containing blank lines would
        leave trailing rows unfilled (``None``) - confirm inputs have none.
        """
        # Context manager so the handle is closed as soon as counting is done.
        with open(infile) as handle:
            lines = sum(1 for _ in handle)

        self.matrix_genome = np.empty((lines, 1, ), dtype=object)
        self.matrix_strain = np.empty((lines, 6, ), dtype=object)
        self.matrix_head = np.empty((lines, 2, ), dtype=object)

        self.index_accession = np.empty(lines, dtype=object)
        self.index_strain = np.empty(lines, dtype=object)

    def fill_dicts(self, worldfile):
        """Load the place-name lookup tables from a comma-separated file.

        The file must have a header row and exactly four columns in the order
        city, country, subcountry, geonameid (the last one is ignored).
        One entry is appended per row, so a sub-country or country name
        accumulates one (possibly repeated) entry per city it contains;
        ``destination`` only ever uses the first entry of a name.
        """
        data = pd.read_csv(worldfile, chunksize=self.CHUNK_SIZE, sep=',',
                           na_filter=False)

        for split in data:
            for city, country, subcountry, _geonameid in split.itertuples(
                    index=False, name=None):
                self.cities[city].append([city, subcountry, country])
                self.subcountries[subcountry].append(['null', subcountry, country])
                self.countries[country].append(['null', 'null', country])

        self.cities_names = list(self.cities.keys())
        self.subcountries_names = list(self.subcountries.keys())
        self.countries_names = list(self.countries.keys())

        # The tables changed, so previously resolved tokens may be stale.
        self._destination_cache.clear()

    def destination(self, entry):
        """Resolve a place-name token to ``[city, subcountry, country]``.

        The token is fuzzy-matched (difflib, cutoff ``MATCH_CUTOFF``) against
        sub-country names first, then city names, then country names; the
        first table that yields a match wins and the first entry stored under
        the matched name is returned. Levels that are unknown are ``'null'``;
        if nothing matches, or the token is empty, all three are ``'null'``.

        Results are memoised per token because the same place names recur
        across many records and each lookup scans every known name. A fresh
        list is returned on every call so callers may modify it freely.
        """
        # An empty token would otherwise "match" an empty name in the tables
        # (the lookup file is read with na_filter=False) and return a bogus
        # location.
        if not entry:
            return ['null', 'null', 'null']

        if entry not in self._destination_cache:
            self._destination_cache[entry] = self._lookup(entry)

        return list(self._destination_cache[entry])

    def _lookup(self, entry):
        """Uncached fuzzy lookup behind ``destination``."""
        search_order = (
            (self.subcountries_names, self.subcountries),
            (self.cities_names, self.cities),
            (self.countries_names, self.countries),
        )

        for names, table in search_order:
            match = difflib.get_close_matches(entry, names, 1, self.MATCH_CUTOFF)
            if not match:
                continue
            # .get() avoids inserting a key into the defaultdict should the
            # name list and the table ever disagree.
            candidates = table.get(match[0])
            if candidates:
                return candidates[0]

        return ['null', 'null', 'null']

    @staticmethod
    def _split_strain(strain):
        """Split a '/'-separated strain name into (spec, place tokens, year).

        The first part is the spec, the last part is the year and everything
        in between is returned as candidate place tokens. A strain without
        any '/' has no year and no place tokens. A year that is not purely
        decimal becomes ``'null'``; a two-digit year is prefixed with '19'
        (so '05' becomes '1905').
        """
        parts = strain.split('/')

        spec = parts[0]
        year = parts[-1] if len(parts) > 1 else 'null'
        places = parts[1:-1]

        if year.isdecimal():
            if len(year) == 2:
                year = '19' + year
        else:
            year = 'null'

        return spec, places, year

    def _locate(self, places):
        """Return the first non-null location among the place tokens."""
        pos = ['null', 'null', 'null']

        for place in places:
            pos = self.destination(place)
            if any(item != 'null' for item in pos):
                break

        return pos

    def input_sequences(self, infile):
        """Parse ``infile`` and fill the arrays created by ``init_matrix``.

        ``infile`` is a headerless ';'-separated file with two columns: a
        '|'-separated record header and the sequence. Of the header fields,
        index 0 is the accession (its first character is dropped), 1 the
        strain name, 2 the segment and 7 the host; other fields are ignored.

        Per record the method stores:

        * ``matrix_genome``: [sequence]
        * ``matrix_strain``: [spec, city, subcountry, country, year, host]
        * ``matrix_head``:   [strain, segment]
        * ``index_accession`` / ``index_strain``: accession and strain name

        Raises:
            IndexError: if ``init_matrix`` was not called for this file first,
                or a record header has fewer than eight '|' fields.
        """
        data = pd.read_csv(infile, chunksize=self.CHUNK_SIZE, sep=';',
                           na_filter=False, header=None)

        for chunk in data:
            # The chunk index keeps counting across chunks, so it doubles as
            # the row number in the pre-allocated arrays.
            for line, info, read in chunk.itertuples(index=True, name=None):

                head = info.split('|')

                accession = head[0][1:]
                strain = head[1]
                segment = head[2]
                host = head[7]

                spec, places, year = self._split_strain(strain)
                pos = self._locate(places)

                self.matrix_genome[line] = np.array([read])
                self.matrix_strain[line] = np.array(
                    [spec, pos[0], pos[1], pos[2], year, host])
                self.matrix_head[line] = np.array([strain, segment])
                self.index_accession[line] = accession
                self.index_strain[line] = strain

    def get_genomes(self):
        """Return the sequences as a one-column DataFrame indexed by accession."""
        return pd.DataFrame(self.matrix_genome, index=self.index_accession)

    def get_strains(self):
        """Return the strain table indexed by strain name.

        Columns are spec, city, subcountry, country, year, host. Rows whose
        column values are all identical are dropped (the index is not part of
        the comparison).
        """
        return pd.DataFrame(self.matrix_strain,
                            index=self.index_strain).drop_duplicates()

    def get_header(self):
        """Return [strain, segment] per record, indexed by accession."""
        return pd.DataFrame(self.matrix_head, self.index_accession)

In [130]:
# Create the extractor; its lookup tables and result arrays start out empty.
x = extractor()

In [131]:
# Load the city / sub-country / country lookup tables from cities.csv.
x.fill_dicts('cities.csv')

In [132]:
# Pre-allocate the result arrays with one row per line of B_HA.csv.
x.init_matrix('B_HA.csv')

In [133]:
# Parse B_HA.csv into the arrays pre-allocated by init_matrix above.
x.input_sequences('B_HA.csv')

In [139]:
# Show the header table: strain name and segment, indexed by accession.
x.get_header()

Unnamed: 0,0,1
LC033391,B/Isahaya/13I004/2014,1
KT854634,B/Alabama/01/2015,1
KX615827,B/Alabama/01/2016,1
CY218618,B/Alabama/01/2017,1
MN637988,B/Alabama/01/2019,1
...,...,...
CY018763,B/Victoria/02/1987,1
CY018659,B/Victoria/504/2000,1
CY018843,B/Wellington/01/1994,1
CY018771,B/Yamagata/16/1988,1
