# Draw gene maps

In [5]:
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
import pandas as pd
import os

## Get homologous proteins across genomes with HMMER

In [3]:
def read_tblout(tblout, max_evalue=0.01):
    """Load an HMMER ``--tblout`` table and keep the best hit per target.

    Only the first five whitespace-separated fields of every non-comment
    line are used: target name, target accession, query name, query
    accession and full-sequence E-value.

    Parameters
    ----------
    tblout : str or path-like
        Path to the tabular HMMER output file.
    max_evalue : float, default 0.01
        Hits whose full-sequence E-value is not strictly below this value
        are discarded.

    Returns
    -------
    pandas.DataFrame
        Columns ``acc``, ``target``, ``query`` and ``full_eval``, with one
        row per target (its lowest E-value hit), ordered by ``acc`` then
        ``target``. ``acc`` is the target name without its last
        ``_``-separated suffix (e.g. ``NC_042083.1_orf10`` ->
        ``NC_042083.1``). The frame is empty if the file has no hit rows.
    """
    raw_columns = ['target', 'target_acc', 'query', 'query_acc', 'full_eval']
    out_columns = ['acc', 'target', 'query', 'full_eval']

    try:
        hits = pd.read_csv(tblout,
                           sep=r'\s+',  # replaces the deprecated delim_whitespace=True
                           comment='#',
                           header=None,
                           # only the leading fields are needed; restricting the
                           # columns also tolerates rows with extra trailing fields
                           usecols=list(range(len(raw_columns))))
    except pd.errors.EmptyDataError:
        # a search without any hit leaves only comment lines in the table
        return pd.DataFrame(columns=out_columns)

    hits.columns = raw_columns

    # genome accession = target name minus the trailing "_<suffix>";
    # `n` has to be passed by keyword (positional use fails on pandas >= 2.0)
    hits = hits.assign(acc=hits['target'].str.rsplit('_', n=1, expand=True)[0])
    hits = hits[out_columns]

    hits = hits[hits['full_eval'] < max_evalue]

    # select best hit for annotation: smallest E-value first, so that
    # keep='first' retains the most significant model for each target
    hits = hits.sort_values(by=['acc', 'target', 'full_eval'],
                            ascending=[True, True, True])
    hits = hits.drop_duplicates(subset=['target'], keep='first')

    return hits

### NCBI

In [12]:
# HMMER hits for the NCBI genomes
df_ncbi_hits = read_tblout('../data/tecti_genomes/NCBI/hmmer/all_prd_models.tbl')

# genome metadata, without the rows whose genus is '-' or 'Alphatectivirus'
df_ncbi_metadata = pd.read_csv('../data/tecti_genomes/NCBI/tectivirus_metadata.tsv', sep='\t')
df_ncbi_metadata = df_ncbi_metadata[
    ~df_ncbi_metadata['genus'].isin(['-', 'Alphatectivirus'])
]

# number of hits per genome (rows) and model (columns); '-' where there is none
pivot_ncbi_hits = (
    df_ncbi_hits
    .groupby(['acc', 'query'])
    .size()
    .unstack(fill_value='-')
    .reset_index()
)
pd.merge(df_ncbi_metadata, pivot_ncbi_hits, on='acc', how='left')

Unnamed: 0,acc,genus,species_name,isolate,len,host_class,host_sp,I,III,IX,X,XVIII,XX,XXXII,XXXIV,gpi,gpx
0,NC_011523.1,Betatectivirus,Betatectivirus AP50,Bacillus phage AP50,14398,Bacilli,Bacillus anthracis,-,-,1,-,-,-,-,-,-,-
1,NC_005258.1,Betatectivirus,Betatectivirus Bam35,Bacillus phage Bam35c,14935,Bacilli,Bacillus thuringiensis,-,-,1,-,-,-,-,-,-,-
2,NC_006945.1,Betatectivirus,Betatectivirus GIL16,Bacillus phage GIL16c,14844,Bacilli,Bacillus thuringiensis,-,-,1,-,-,-,-,-,-,-
3,MZ089978.1,Betatectivirus,Betatectivirus sato,Bacillus phage Sato,14852,Bacilli,Bacillus cereus,-,-,1,-,-,-,-,-,-,-
4,MZ089979.1,Betatectivirus,Betatectivirus sole,Bacillus phage Sole,14444,Bacilli,Bacillus cereus VD166,-,-,1,-,-,-,-,-,-,-
5,NC_022094.1,Betatectivirus,Betatectivirus Wip1,Bacillus phage Wip1,14319,Bacilli,Bacillus anthracis,-,-,1,-,-,-,-,-,-,-
6,NC_055059.1,Deltatectivirus,Deltatectivirus forthebois,Streptomyces phage Forthebois,18251,Actinomycetia,Streptomyces scabiei,1,-,-,-,-,-,1,1,-,-
7,NC_055060.1,Deltatectivirus,Deltatectivirus wheeheim,Streptomyces phage WheeHeim,18266,Actinomycetia,Streptomyces scabiei,1,-,-,-,-,-,1,1,-,-
8,NC_055061.1,Epsilontectivirus,Epsilontectivirus toil,Rhodococcus phage Toil,17253,Actinomycetia,Rhodococcus opacus,1,-,1,-,-,-,1,-,-,-
9,NC_042083.1,Gammatectivirus,Gammatectivirus GC1,Gluconobacter phage GC1,16523,Alphaproteobacteria,Gluconobacter cerinus,1,1,1,1,1,1,1,1,1,1


### JGI

In [13]:
# HMMER hits for the JGI IMG/VR genomes
df_jgi_hits = read_tblout('../data/tecti_genomes/JGI_IMGVR/hmmer/all_prd_models.tbl')

# genome metadata
df_jgi_metadata = pd.read_csv('../data/tecti_genomes/JGI_IMGVR/JGI_metadata.tsv', sep='\t')

# number of hits per genome (rows) and model (columns); '-' where there is none
pivot_jgi_hits = (
    df_jgi_hits
    .groupby(['acc', 'query'])
    .size()
    .unstack(fill_value='-')
    .reset_index()
)
pd.merge(df_jgi_metadata, pivot_jgi_hits, on='acc', how='left')

Unnamed: 0,acc,img_taxon_id,sample_name,len,loc,I,III,IX,V,VI,...,XVIII,XX,XXII,XXXI,XXXII,XXXIII,XXXIV,gpi,gpq,gpx
0,Ga0500005_002616,3300050116,Peatland microbial communities from Stordalen ...,19730,sweden,1,1,1,1,-,...,1,-,-,-,1,-,-,-,-,-
1,Ga0500006_002682,3300050117,Peatland microbial communities from Stordalen ...,20094,sweden,1,1,1,1,-,...,1,1,-,-,1,-,-,1,-,-
2,Ga0500006_012856,3300050117,Peatland microbial communities from Stordalen ...,5558,sweden,-,1,1,-,-,...,-,1,-,-,-,-,-,1,-,-
3,Ga0500009_001431,3300050120,Peatland microbial communities from Stordalen ...,17036,sweden,1,1,1,-,-,...,1,1,1,1,-,-,1,1,-,1
4,Ga0500011_004634,3300050122,Peatland microbial communities from Stordalen ...,4396,sweden,-,1,1,1,-,...,-,1,-,-,-,-,-,1,-,-
5,Ga0500012_004241,3300050123,Peatland microbial communities from Stordalen ...,4422,sweden,-,1,1,-,-,...,1,1,1,-,-,-,-,1,-,-
6,Ga0500013_005094,3300050124,Peatland microbial communities from Stordalen ...,4816,sweden,-,1,1,-,-,...,1,1,1,-,2,-,-,1,-,-
7,Ga0500017_0003996,3300050128,Peatland microbial communities from Stordalen ...,11142,sweden,-,1,1,-,-,...,1,1,-,-,1,-,-,1,-,-
8,Ga0500018_001608,3300050129,Peatland microbial communities from Stordalen ...,18742,sweden,1,1,1,1,-,...,1,1,-,-,1,-,-,1,-,-
9,Ga0500022_002121,3300050132,Peatland microbial communities from Stordalen ...,18125,sweden,1,1,1,1,-,...,1,-,-,-,1,-,-,1,-,-


### Yutin

In [14]:
# HMMER hits for the Yutin genomes
df_yutin_hits = read_tblout('../data/tecti_genomes/Yutin/hmmer/all_prd_models.tbl')

# genome metadata
df_yutin_metadata = pd.read_csv('../data/tecti_genomes/Yutin/yutin_metadata.tsv', sep='\t')

# number of hits per genome (rows) and model (columns); '-' where there is none
pivot_yutin_hits = (
    df_yutin_hits
    .groupby(['acc', 'query'])
    .size()
    .unstack(fill_value='-')
    .reset_index()
)
pd.merge(df_yutin_metadata, pivot_yutin_hits, on='acc', how='left')

Unnamed: 0,acc,environment,len,I,III,IX,VII,X,XII,XIII,XVIII,XXXII,XXXIV,gpi,gpx
0,LNFM01013825.1,Activated carbon metagenome,10153,-,1,1,-,1,1,-,1,2,-,1,-
1,FRDC01003407.1,freshwater metagenome,17409,1,1,1,1,1,-,-,-,1,-,-,-
2,LNFM01009513.1,Activated carbon metagenome,14103,1,1,1,1,1,-,-,-,-,1,-,-
3,JRYJ01001167.1,Activated sludge metagenome,17872,1,1,1,-,-,-,1,1,-,1,-,1


## Draw first version with clinker

In [15]:
# clinker command line, assembled from its space-separated parts
clinker_cmd = ' '.join([
    'clinker',
    '../data/clinker/gb/*.gb',
    '-p ../data/clinker/clinker.html',
])

In [16]:
# Show the assembled command; this cell only prints it, nothing is executed here.
print(clinker_cmd)

clinker ../data/clinker/gb/*.gb -p ../data/clinker/clinker.html


## Show only the hits for the genomes drawn with clinker

In [17]:
# accessions used to subset the hit tables in the cells below
clinker_genomes = [
    'Ga0500009_001431',
    'Ga0505706_00067931',
    'Ga0505706_00072834',
    'Ga0505706_00084197',
    'NC_042083.1',
]

In [21]:
# NCBI hits restricted to the accessions listed in clinker_genomes
df_ncbi_hits.loc[df_ncbi_hits['acc'].isin(clinker_genomes)]

Unnamed: 0,acc,target,query,full_eval
0,NC_042083.1,NC_042083.1_orf10,gpi,0.00039
19,NC_042083.1,NC_042083.1_orf11,XX,1.9e-06
6,NC_042083.1,NC_042083.1_orf12,III,1.1e-51
1,NC_042083.1,NC_042083.1_orf14,gpx,0.00025
18,NC_042083.1,NC_042083.1_orf16,XVIII,4.3e-10
28,NC_042083.1,NC_042083.1_orf17,XXXIV,1.4e-11
20,NC_042083.1,NC_042083.1_orf18,XXXII,4.2e-19
2,NC_042083.1,NC_042083.1_orf2,I,5.4e-132
15,NC_042083.1,NC_042083.1_orf8,X,3.6e-11
7,NC_042083.1,NC_042083.1_orf9,IX,9.5e-55


In [20]:
# JGI hits restricted to the accessions listed in clinker_genomes
df_jgi_hits.loc[df_jgi_hits['acc'].isin(clinker_genomes)]

Unnamed: 0,acc,target,query,full_eval
186,Ga0500009_001431,Ga0500009_001431_orf10,XIII,3e-08
295,Ga0500009_001431,Ga0500009_001431_orf12,XXXI,9.5e-10
157,Ga0500009_001431,Ga0500009_001431_orf15,X,3e-13
96,Ga0500009_001431,Ga0500009_001431_orf17,IX,1.1000000000000001e-60
24,Ga0500009_001431,Ga0500009_001431_orf18,gpi,2.6e-05
243,Ga0500009_001431,Ga0500009_001431_orf19,XX,1e-07
66,Ga0500009_001431,Ga0500009_001431_orf20,III,1.1e-72
265,Ga0500009_001431,Ga0500009_001431_orf21,XXII,1.1e-08
36,Ga0500009_001431,Ga0500009_001431_orf22,gpx,1e-05
217,Ga0500009_001431,Ga0500009_001431_orf24,XVIII,7.9e-07
