In [1]:
import os
from sys import platform
from Bio import SeqIO

# Species handled by this notebook and the directory name used for each one
# when building download URLs.
species = ['human', 'mouse']
salias = {
    'human': 'Homo_sapiens',
    'mouse': 'Mus_musculus',
}

# Each locus paired with the segment letters available for it; expanded below
# into gene names such as 'IGHV' or 'TRBJ', keeping this exact order.
_LOCUS_SEGMENTS = [
    ('IGH', 'VDJ'),
    ('IGK', 'VJ'),
    ('IGL', 'VJ'),
    ('TRB', 'VDJ'),
    ('TRA', 'VJ'),
    ('TRD', 'VDJ'),
    ('TRG', 'VJ'),
]
genes = [locus + segment for locus, segments in _LOCUS_SEGMENTS for segment in segments]

# Select the bundled makeblastdb executable that matches the host OS.
# `startswith` covers both 'linux' and the legacy 'linux2' value.
if platform.startswith('linux'):
    MAKEBLASTDB_CMD = './makeblastdb_linux'
elif platform == 'darwin':
    MAKEBLASTDB_CMD = './makeblastdb_darwin'
elif platform == 'win32':
    # Raising a plain string is itself a TypeError in Python 3; raise a real exception.
    raise RuntimeError('Cannot run on Windows')
else:
    # Fail here instead of with a NameError on MAKEBLASTDB_CMD further down.
    raise RuntimeError(f'Unsupported platform: {platform!r}')

def get_url(s, g):
    """Return the IMGT V-QUEST reference-directory FASTA URL for a species and gene.

    `s` is a key of `salias` (its value becomes the species directory in the URL);
    the first two characters of `g` select the sub-directory and `g` itself names
    the `.fasta` file.
    """
    base = 'https://www.imgt.org/download/V-QUEST/IMGT_V-QUEST_reference_directory'
    species_dir = salias[s]
    group_dir = g[:2]
    return f'{base}/{species_dir}/{group_dir}/{g}.fasta'

def get_name(s, g):
    """Return the database name '<species>.<segment>' for a gene.

    The segment is the fourth character of `g`, e.g. 'V' for 'IGHV'.
    """
    segment = g[3]
    return f'{s}.{segment}'

Build database according to official instructions from [NCBI](https://ncbi.github.io/igblast/)

In [2]:
# One database per (species, segment letter), e.g. 'human.V'; every gene whose
# name maps to the same get_name() value is pooled into the same file.
db_files = {get_name(s, g) for s in species for g in genes}

for s in species:
    for g in genes:
        # `-O -` makes wget write to stdout, which the shell appends (`>>`) to the
        # pooled .imgt file for this species/segment.
        status = os.system(f'wget -q {get_url(s, g)} -O ->> {get_name(s, g)}.imgt')
        if status != 0:
            # Best effort: keep going, but do not hide the failure.
            print(f'WARNING: download failed for {s} {g} (status {status})')

# Sorted so the processing order is the same on every run.
for f in sorted(db_files):
    status = os.system(f'./edit_imgt_file.pl {f}.imgt > {f}')
    if status != 0:
        print(f'WARNING: edit_imgt_file.pl failed for {f} (status {status})')

    # we have duplicates here: keep only the first record for each distinct sequence
    seen = set()
    records = []
    for record in SeqIO.parse(f, 'fasta'):
        # Key on the plain sequence text so the comparison does not depend on how
        # the installed Biopython hashes Seq objects.
        key = str(record.seq)
        if key not in seen:
            seen.add(key)
            records.append(record)
    SeqIO.write(records, f, 'fasta')
    os.remove(f'{f}.imgt')

    # build database the same way as in internal_data
    status = os.system(f'{MAKEBLASTDB_CMD} -parse_seqids -dbtype nucl -in {f} >/dev/null 2>&1')
    if status != 0:
        # makeblastdb output is discarded above, so this is the only failure signal.
        print(f'WARNING: makeblastdb failed for {f} (status {status})')
    os.remove(f)

Test

In [3]:
os.environ['IGDATA'] = os.path.abspath('..')
species = 'human'
cmd = ' '.join([f'-c_region_db $IGDATA/database/ncbi_{species}_c_genes',
f'-germline_db_V $IGDATA/database/{species}.V',
f'-germline_db_D $IGDATA/database/{species}.D',
f'-germline_db_J $IGDATA/database/{species}.J',
f'-organism {species}',
f'-auxiliary_data $IGDATA/optional_file/{species}_gl.aux',
f'-show_translation -outfmt 19'])
seq = "CAGGCTGAGGACGAGGCTGATTATTACTGCAGTTCATATAGAGGCAGCGCCACTTTCGAGGTGGTGTTCGGCGGAG"
!cd ../.. && echo "{seq}" | bin/igblastn_darwin {cmd}

sequence_id	sequence	sequence_aa	locus	stop_codon	vj_in_frame	v_frameshift	productive	rev_comp	complete_vdj	d_frame	v_call	d_call	j_call	c_call	sequence_alignment	germline_alignment	sequence_alignment_aa	germline_alignment_aa	v_alignment_start	v_alignment_end	d_alignment_start	d_alignment_end	j_alignment_start	j_alignment_end	c_alignment_start	c_alignment_end	v_sequence_alignment	v_sequence_alignment_aa	v_germline_alignment	v_germline_alignment_aa	d_sequence_alignment	d_sequence_alignment_aa	d_germline_alignment	d_germline_alignment_aa	j_sequence_alignment	j_sequence_alignment_aa	j_germline_alignment	j_germline_alignment_aa	c_sequence_alignment	c_sequence_alignment_aa	c_germline_alignment	c_germline_alignment_aa	fwr1	fwr1_aa	cdr1	cdr1_aa	fwr2	fwr2_aa	cdr2	cdr2_aa	fwr3	fwr3_aa	fwr4	fwr4_aa	cdr3	cdr3_aa	junction	junction_length	junction_aa	junction_aa_length	v_score	d_score	j_score	c_score	v_cigar	d_cigar	j_cigar	c_cigar	v_support	d_support	j_support	c_support	v_identity	d_identity	j_ide