# build_GCAGCF_InsectGenomes
## Author : Savandara Besse

- Created date: 03-08-2017
- Modified date: 04-10-2017

#### **Description** : Build a csv file containing all insect genomes based on Genebank and Refseq assemblies

In [2]:
import sys
import os
from Bio import Entrez
Entrez.email = 'savandara.besse@gmail.com'

In [3]:
f = open('refseq_assembly_summary.txt')
refseq_data = [x.strip() for x in f.readlines()]
f.close()

In [4]:
insectDict = {}

for line in refseq_data[2:]:
  line = line.split('\t')
  tax_id = line[6]
  handle = Entrez.efetch(db="taxonomy", id=tax_id)
  record = Entrez.read(handle)

  for index in range(len(record)) : 
    lineage_ex = record[index]["LineageEx"] 

    for Dict in lineage_ex :
      if Dict["Rank"] == "class" and Dict["ScientificName"] == "Insecta" :
        insectDict[tax_id] = {}
        insectDict[tax_id]["sp_name"] = line[7]
        insectDict[tax_id]["genome_id"] = line[0]
      
        for Dict in lineage_ex :
            if Dict["Rank"] == "family" :
                insectDict[tax_id]["family_name"] = Dict["ScientificName"]
                insectDict[tax_id]["family_id"] = Dict["TaxId"]
            if Dict["Rank"] == "order" : 
                insectDict[tax_id]["order_name"] = Dict["ScientificName"]
                insectDict[tax_id]["order_id"] = Dict["TaxId"]
     

In [5]:
len(insectDict)

98

In [6]:
already_exist = []
for key in insectDict.keys():
    already_exist.append(key)
    

In [16]:
f = open('genbank_assembly_summary.txt')
genbank_data = [x.strip() for x in f.readlines()]
f.close()

In [17]:
for line in genbank_data[1:] :
    line = line.split('\t')
    tax_id = line[1]
    
    if line[5] == "Insects" :
        if not tax_id in already_exist :
            insectDict[tax_id] = {}
            insectDict[tax_id]["sp_name"] = line[0]
            insectDict[tax_id]["genome_id"] = line[8]
            
            handle = Entrez.efetch(db="taxonomy", id=tax_id)
            record = Entrez.read(handle) 
            
            for index in range(len(record)) :
                lineage_ex = record[index]["LineageEx"] 
                
                for Dict in lineage_ex :
                    if Dict["Rank"] == "family" :
                        insectDict[tax_id]["family_name"] = Dict["ScientificName"]
                        insectDict[tax_id]["family_id"] = Dict["TaxId"]
                    if Dict["Rank"] == "order" : 
                        insectDict[tax_id]["order_name"] = Dict["ScientificName"]
                        insectDict[tax_id]["order_id"] = Dict["TaxId"]
                if "family_name" not in insectDict[tax_id]:
                    insectDict[tax_id]["family_name"] = "N/A"
                    insectDict[tax_id]["family_id"] = "N/A"
                if "order_name" not in insectDict[tax_id]:
                    insectDict[tax_id]["order_name"] = "N/A"
                    insectDict[tax_id]["order_id"] = "N/A"

In [18]:
len(insectDict)

248

In [19]:
import pandas as pd 

In [20]:
df = pd.DataFrame.from_dict(insectDict, orient='index')

In [21]:
df[["sp_name","family_id", "family_name", "order_id", "order_name","genome_id"]]
df.sort_values(['order_name','order_id','family_name','family_id'])

Unnamed: 0,sp_name,order_name,genome_id,family_id,order_id,family_name
6973,Blattella germanica,Blattodea,GCA_000762945.1,1049651,85823,Ectobiidae
114634,Nasutitermes exitiosus,Blattodea,GCA_001404035.1,46569,85823,Termitidae
136037,Zootermopsis nevadensis,Blattodea,GCA_000696155.1,7501,85823,Termopsidae
224129,Agrilus planipennis,Coleoptera,GCF_000699045.1,50527,7041,Buprestidae
217634,Anoplophora glabripennis,Coleoptera,GCF_000390285.1,34667,7041,Cerambycidae
7539,Leptinotarsa decemlineata,Coleoptera,GCA_000500325.1,27439,7041,Chrysomelidae
50550,Priacma serrata,Coleoptera,GCA_000281835.1,50548,7041,Cupedidae
57062,Hypothenemus hampei,Coleoptera,GCA_001012855.1,7042,7041,Curculionidae
77166,Dendroctonus ponderosae,Coleoptera,GCF_000355655.1,7042,7041,Curculionidae
116153,Aethina tumida,Coleoptera,GCF_001937115.1,116151,7041,Nitidulidae


In [22]:
df.to_csv('GCA_GCF_insectGenomes.csv', sep=',', header=True, index_label='tax_id')