In [7]:
import re
import numpy as np
import pandas as pd
import multiprocessing as mp
import glob
from Bio import SeqIO
import sys
import csv
import collections
import itertools
import difflib

In [8]:
class wrapper(object):
    
    def __init__(self, ingene):

        self.dict_seg = collections.defaultdict(dict)
        self.dict_seg['A'] = {"[( ]P[B]?2[ .,)]":"1", "[ ]?POLYMERASE( PROTEIN| GENE)?( BASIC| B)( PROTEIN| GENE)? 2[,. ]":"1", 
                              "[( ]P[B]?1[ .,)]":"2", "[ ]?POLYMERASE( PROTEIN| GENE)?( BASIC| B)( PROTEIN| GENE)? 1[,. ]":"2", 
                              "[( ]PA[ .,)]":"3", "[ ]?POLYMERASE( PROTEIN| GENE)?( ACIDIC| A| ACID)( PROTEIN| GENE)?[,. ]":"3", 
                              "[( ]HA[ .,)]":"4", "[ ]?H[A]?EMAGGLUTININ[,. ]":"4", 
                              "[( ]NP[ .,)]":"5", "[ ]?NUCLEOPROTEIN[,. ]":"5", "[ ]?NUCLEOCAPSID[,. ]":"5",
                              "[( ]NA[ .,)]":"6", "[ ]?NEURAMINIDASE[,. ]":"6",
                              "[( ]M[12]?[ .,)]":"7", "[ ]?MATRIX[., ]":"7", "[ ]MEMBRANE?[., ]":"7",
                              "[( ]NS[1]?[ .,)]":"8", "[( ]NEP[ .,)]":"8", "[ ]?NON-STRUCTURAL[., ]":"8", "[ ]?NONSTRUCTURAL[., ]":"8", "[ ]?NUCLEAR EXPORT[., ]":"8"}

        self.dict_seg['B'] = {"[( ]P[B]?1[ .,)]":"1", "[ ]?POLYMERASE( PROTEIN| GENE)?( BASIC| B)( PROTEIN| GENE)? 1[,. ]":"1", 
                              "[( ]P[B]?2[ .,)]":"2", "[ ]?POLYMERASE( PROTEIN| GENE)?( BASIC| B)( PROTEIN| GENE)? 2[,. ]":"2", 
                              "[( ]PA[ .,)]":"3", "[ ]?POLYMERASE( PROTEIN| GENE)?( ACIDIC| A| ACID)( PROTEIN| GENE)?[,. ]":"3", 
                              "[( ]HA[ .,)]":"4", "[ ]?H[A]?EMAGGLUTININ[,. ]":"4", 
                              "[( ]NP[ .,)]":"5", "[ ]?NUCLEOPROTEIN[,. ]":"5", "[ ]?NUCLEOCAPSID[,. ]":"5",
                              "[( ]NA[ .,)]":"6", "[ ]?NEURAMINIDASE[,. ]":"6",
                              "[( ][B]?M[12]?[ .,)]":"7", "[ ]?MATRIX[., ]":"7", "[ ]MEMBRANE?[., ]":"7",
                              "[( ]NS[12]?[ .,)]":"8", "[( ]NEP[ .,)]":"8", "[ ]?NON-STRUCTURAL[., ]":"8", "[ ]?NONSTRUCTURAL[., ]":"8", "[ ]?NUCLEAR EXPORT[., ]":"8"}

        self.dict_seg['C'] = {"[( ]P[B]?2[ .,)]":"1", "[ ]?POLYMERASE( PROTEIN| GENE)?( BASIC| B)( PROTEIN| GENE)? 2[,. ]":"1", 
                              "[( ]P[B]?1[ .,)]":"2", "[ ]?POLYMERASE( PROTEIN| GENE)?( BASIC| B)( PROTEIN| GENE)? 1[,. ]":"2", 
                              "[( ]P[A3][ .,)]":"3", "[ ]?POLYMERASE( PROTEIN| GENE)?( ACIDIC| A| ACID)( PROTEIN| GENE)?[,. ]":"3", 
                              "[( ]H[AE][F]?[ .,)]":"4", "[ ]?H[A]?EMAGGLUTININ[,. ]":"4", 
                              "[( ]NP[ .,)]":"5", "[ ]?NUCLEOPROTEIN[,. ]":"5", "[ ]?NUCLEOCAPSID[,. ]":"5",
                              "[( ][C]?M[12]?[ .,)]":"6", "[ ]?MATRIX[., ]":"6", "[ ]MEMBRANE?[., ]":"6",
                              "[( ]NS[1]?[ .,)]":"7", "[( ]NEP[ .,)]":"7", "[ ]?NON-STRUCTURAL[., ]":"7", "[ ]?NONSTRUCTURAL[., ]":"7", "[ ]?NUCLEAR EXPORT[., ]":"7"}
        self.gb_vrl = SeqIO.index_db(ingene + "gbvrl.idx", glob.glob(ingene + "gbvrl*.seq"), "genbank")
        self.matrix = np.empty((0, 0, ), dtype=object)
        
        
    def search(self, accession):

        vers = 1
        while True:
            try:
                entry = self.gb_vrl[accession + '.' + str(vers)].description
                return entry
            except KeyError:
                if vers >= 10:
                    entry = 'Null'
                    return entry
                else:
                    vers = vers + 1


    def quest (self, question, mode):
        if mode == 1:    
            while True:
                rem = input(question)
                if rem == "y":
                    return rem
                elif rem == "n":
                    return rem
                else:
                    print("Wrong input. [y/n]")
        else:
            rem = "y"
            return rem


    def genome(self, infile, mode, silent):

        rows = sum(1 for l in open(infile))
        self.matrix = np.empty((rows, 8, ), dtype=object)
        
        data = pd.read_csv(infile, chunksize = 10000, sep = ',', na_filter = False, header = None)
        
        for chunk in data:

            for line, info, read in chunk.itertuples(index=True, name=None):

                row = list(map(str, info.split('|')))
                row[0] = row[0][1:]
                
                if row[8] == 'Pass':
                    entry = self.search(row[0]).upper()

                    if entry != 'Null':   

                        try:
                            for num_key in ["1", "2", "3", "4", "5", "6", "7", "8"]:
                                if re.search(rf"SE(T|G)MENT[: ]+{num_key}", entry):
                                    num = num_key
                                    if num:
                                        break
                            if 'num' not in locals():
                                for seg_key in self.dict_seg[row[4]]:
                                    if re.search(rf"{seg_key}", entry):
                                        num = self.dict_seg[row[4]].get(seg_key)
                                        if num:
                                            break

                                if 'num' not in locals():   
                                    rem = self.quest(row[0][1:] 
                                                + ": No protein information found. Remove? [y/n].\nIRD entry: segment number: " 
                                                + row[2] + ".\nGenBank entry: " + entry + ".", mode)
                                    if rem == "y":
                                        if silent == 0:  
                                            print(row[0][1:] + ": Removed.")  
                                    elif rem == "n":
                                        self.matrix[line] = np.array([row[0], row[1], row[2], row[4], row[5], row[6], row[7], read])

                                else:
                                    if num == row[2]:
                                        self.matrix[line] = np.array([row[0], row[1], row[2], row[4], row[5], row[6], row[7], read])
                                    else:
                                        rem = self.quest(row[0][1:] 
                                                    + ": Wrong segment number. Change? [y/n].\nIRD entry: segment number: " 
                                                    + row[2] + ".\nGenBank entry segment number: " + num + ".", mode)
                                        if rem == "y":
                                            row[2] = num
                                            if silent == 0:  
                                                print(row[0][1:] + ": Segment number changed.")
                                            self.matrix[line] = np.array([row[0], row[1], row[2], row[4], row[5], row[6], row[7], read])
                                        elif rem == "n":
                                            self.matrix[line] = np.array([row[0], row[1], row[2], row[4], row[5], row[6], row[7], read])

                            else:
                                if num == row[2]:
                                    self.matrix[line] = np.array([row[0], row[1], row[2], row[4], row[5], row[6], row[7], read])
                                else:
                                    rem = self.quest(row[0][1:] 
                                                + ": Wrong segment number. Change? [y/n].\nIRD entry: segment number: " 
                                                + row[2] + ".\nGenBank entry segment number: " + num + ".", mode)
                                    if rem == "y":
                                        row[2] = num
                                        if silent == 0:  
                                            print(row[0][1:] + ": Segment number changed.")  
                                        self.matrix[line] = np.array([row[0], row[1], row[2], row[4], row[5], row[6], row[7], read])
                                    elif rem == "n":
                                        self.matrix[line] = np.array([row[0], row[1], row[2], row[4], row[5], row[6], row[7], read])

                        except:
                            rem = self.quest(row[0][1:] 
                                        + ": No protein information found. Remove? [y/n].\nIRD entry: segment number: " 
                                        + row[2] + ".\nGenBank entry: " + entry + ".", mode)
                            if rem == "y":
                                if silent == 0:  
                                    print(row[0][1:] + ": Removed.")  
                            elif rem == "n":
                                self.matrix[line] = np.array([row[0], row[1], row[2], row[4], row[5], row[6], row[7], read])

                    else:
                        rem = self.quest(row[0][1:] 
                                    + ": No GenBank entry found. Remove? [y/n]. \nIRD entry: segment number: " 
                                    + row[2] + ".", mode)
                        if rem == "y":
                            if silent == 0:  
                                print(row[0][1:] + ": Removed.")  
                        elif rem == "n":
                            self.matrix[line] = np.array([row[0], row[1], row[2], row[4], row[5], row[6], row[7], read])

                #else:
                    #rem = quest(row[0][1:] 
                    #            + ": Not passed curation. Remove? [y/n].", mode)
                    #if rem == "y":
                        #if silent == 0:  
                            #print(row[0][1:] + ": Removed.")  
                    #elif rem == "n":
                    #    yield row[0] + "|" + row[1] + "|" + row[2] + "|" + row[3]
                    #pass

                    #to many not curation pass so omit this dialog
                    
                if 'num' in locals():
                    del num
                if 'entry' in locals():
                    del entry
                if 'rem' in locals():
                    del rem
                    
    def get_matrix(self):
        
        colnames = ['accession','strain','segment', 'organism', 'subtype', 'year', 'host', 'genome']
        curated = pd.DataFrame(self.matrix, columns = colnames)
        curated.set_index(['accession','strain','subtype'], inplace = True)
        
        return(curated)

In [9]:
def main():
    
    x = wrapper('../../../Desktop/Masterthesis_V4/ISU_Database/genbank/')
    x.genome('Input/A_HA.csv', 1, 0)
    matrix = x.get_matrix()

In [10]:
if __name__ == '__main__':

        main()

30276: No protein information found. Remove? [y/n].
IRD entry: segment number: 4.
GenBank entry: INFLUENZA A VIRUS (A/SWINE/GERMANY/2/1981(H1N1)) MRNA FOR HEMAGGLUTININ.n
X350190: No protein information found. Remove? [y/n].
IRD entry: segment number: 4.
GenBank entry: NULL.n
