In [1]:
import tarfile
import json
import pandas as pd

In [2]:
# Read every MIBiG JSON entry straight out of the tar archive into `data`.
with tarfile.open("mibig_json_4.0.tar", "r") as tar:
    members = tar.getmembers()
    # Total number of archive members (JSON or otherwise).
    print(len(members))

    # Only members whose name ends in '.json' are parsed.
    json_files = list(filter(lambda member: member.name.endswith('.json'), members))
    # json.load reads directly from the file object that extractfile returns.
    data = [json.load(tar.extractfile(entry)) for entry in json_files]


3014


In [3]:
# Collect one summary record per non-retired BGC that has at least one
# compound with a non-empty 'structure' value.
bgc_data = []
for entry in data:
    # Retired entries are rejected before their compounds are inspected.
    if entry.get('status') == 'retired':
        continue

    # First non-empty structure among the entry's compounds, or None.
    structure = next(
        (cpd.get('structure') for cpd in entry.get('compounds', []) if cpd.get('structure')),
        None,
    )
    if not structure:
        continue

    bgc_data.append({
        'accession': entry.get('accession'),
        'quality': entry.get('quality'),
        'completeness': entry.get('completeness'),
        'first_compound': structure,
        'taxonomy': entry.get('taxonomy'),
        'status': entry.get('status'),
    })


In [4]:
# One row per record in bgc_data; columns come from the record dict keys.
df = pd.DataFrame(bgc_data)

In [5]:
# Count the rows whose first_compound value is missing.
df['first_compound'].isna().sum()

np.int64(0)

In [6]:
# Display the frame built from bgc_data.
df

Unnamed: 0,accession,quality,completeness,first_compound,taxonomy,status
0,BGC0000001,questionable,complete,CC1C[C@]23OC(=O)C4=C2OC1C(O)C3\C=C/C(=O)[C@@H]...,"{'name': 'Verrucosispora maris AB-18-032', 'nc...",active
1,BGC0000002,questionable,unknown,CCCC(O[C@H]1C[C@](C)(N)[C@H](O)[C@H](C)O1)C(C)...,"{'name': 'Kutzneria albida DSM 43870', 'ncbiTa...",active
2,BGC0000003,questionable,unknown,CCC(C)C(C(=O)OC(/C=C/C=C/C=C/C(=O)O)C1(CO1)C)O...,"{'name': 'Alternaria alternata', 'ncbiTaxId': ...",active
3,BGC0000004,questionable,unknown,COC1=C2C3=C(C(=O)OCC3)C(=O)OC2=C4[C@H]5C=CO[C@...,"{'name': 'Aspergillus oryzae', 'ncbiTaxId': 5062}",active
4,BGC0000006,questionable,unknown,COc1cc2c(c3oc(=O)c4c(c13)CCC4=O)[C@@H]1C=CO[C@...,"{'name': 'Aspergillus flavus', 'ncbiTaxId': 5059}",active
...,...,...,...,...,...,...
2110,BGC0003164,questionable,complete,NCCC[C@H](N)CC(=O)N[C@@H]1[C@H](O)[C@@H](OC(N)...,"{'name': 'Streptomyces fd1-xmd', 'ncbiTaxId':...",pending
2111,BGC0003165,high,complete,CN(C1C(O)[C@@H](OC(N)=O)C(CO)OC1\N=C1/NC2C(N1)...,"{'name': 'Streptomyces luteocolor BD-12', 'ncb...",pending
2112,BGC0003168,high,complete,CO[C@@H]([C@@H](O)[C@@H](O)[C@@H]1CO)[C@H](O1)...,"{'name': 'Streptomyces sp. WAC1420', 'ncbiTaxI...",pending
2113,BGC0003169,high,complete,O=C(N[C@@H](CCNC1=O)C(N[C@@H](CCN)C(N[C@H](CC2...,"{'name': 'Paenibacillus polymyxa PKB1', 'ncbiT...",pending


In [7]:
from rdkit import Chem

In [8]:
def canonicalize_smiles(smiles):
    """Return the canonical, non-isomeric SMILES string for ``smiles``.

    The input is parsed with RDKit and written back with
    ``isomericSmiles=False``, so stereochemistry is not encoded in the
    returned string.

    Args:
        smiles: SMILES string to canonicalize.

    Returns:
        The canonical SMILES string produced by ``Chem.MolToSmiles``.

    Raises:
        ValueError: if RDKit cannot parse ``smiles``.
    """
    molecule = Chem.MolFromSmiles(smiles)
    # MolFromSmiles signals a parse failure by returning None rather than
    # raising; fail here with the offending input instead of letting
    # MolToSmiles reject the None with an opaque argument error.
    if molecule is None:
        raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")
    return Chem.MolToSmiles(molecule, canonical=True, isomericSmiles=False)

# Overwrite first_compound with the result of canonicalize_smiles for each row.
df['first_compound'] = df['first_compound'].apply(canonicalize_smiles)



In [9]:
# Remove duplicate SMILES, keeping the most complete BGC for each structure.
#
# The completeness labels are ranked explicitly: a descending lexicographic
# sort would place 'unknown' ahead of 'complete' and keep the wrong row.
# Labels other than 'complete' / 'unknown' rank in between.
# NOTE(review): confirm the full label set against the MIBiG schema.
completeness_rank = {'complete': 0, 'unknown': 2}
df = (
    df.assign(_completeness_rank=df['completeness'].map(lambda label: completeness_rank.get(label, 1)))
      # Accession breaks ties so the surviving row is deterministic.
      .sort_values(by=['_completeness_rank', 'accession'])
      # drop_duplicates keeps whole rows; groupby(...).first() would instead
      # take the first non-null value per column and could mix fields from
      # different BGCs into a single row.
      .drop_duplicates(subset='first_compound', keep='first')
      .drop(columns='_completeness_rank')
      # Order rows by SMILES, as the groupby-based version did.
      .sort_values(by='first_compound')
      .reset_index(drop=True)
)

In [10]:
# Flag every row whose first_compound occurs more than once
# (keep=False marks all occurrences), then tabulate the flags.
duplicated = df['first_compound'].duplicated(keep=False)
duplicated.value_counts()

first_compound
False    1816
Name: count, dtype: int64

In [11]:
# Restrict the frame to these four columns, in this order.
df = df[[ 'accession' ,'first_compound', 'quality', 'completeness']]

In [12]:
# Write the table to CSV without the index column.
df.to_csv('mibig_active_bgc.csv', index=False)

In [13]:
# List the members of the GenBank archive; the TarInfo objects remain usable
# as lookup handles after the archive is closed.
with tarfile.open('mibig_gbk_4.0.tar', 'r') as tar:
    members = tar.getmembers()

# Map each '.gbk' member's base file name (with '.gbk' removed) to its TarInfo.
gbk_files = {
    entry.name.rsplit('/', 1)[-1].replace('.gbk', ''): entry
    for entry in members
    if entry.name.endswith('.gbk')
}

# Keep only the accessions present in df, in df order; accessions with no
# matching '.gbk' member are left out.
filtered_bgcs = {
    accession: gbk_files[accession]
    for accession in df['accession']
    if accession in gbk_files
}

In [14]:
from Bio import SeqIO
from io import TextIOWrapper
import os

In [15]:
# Write one FASTA file per selected BGC into mibig_fasta/, containing the
# sequence of that BGC's GenBank record under a '>accession' header.
os.makedirs('mibig_fasta', exist_ok=True)

with tarfile.open("mibig_gbk_4.0.tar", "r") as tar:
    for bgc_id, tarinfo in filtered_bgcs.items():
        file_obj = tar.extractfile(tarinfo)
        if file_obj is None:
            # extractfile returns None for members that are not regular
            # files or links; there is nothing to parse for those.
            continue

        # Decode explicitly so parsing does not depend on the platform's
        # locale encoding; undecodable bytes are replaced rather than
        # aborting the whole export. Leaving the `with` block closes the
        # wrapper and, through it, the extracted member handle.
        with TextIOWrapper(file_obj, encoding="utf-8", errors="replace") as handle:
            # SeqIO.read expects exactly one record in the handle.
            gbk_record = SeqIO.read(handle, "genbank")

        dna_seq = str(gbk_record.seq)

        fasta_filename = f"mibig_fasta/{bgc_id}.fasta"
        with open(fasta_filename, 'w') as f:
            f.write(f">{bgc_id}\n")
            f.write(f"{dna_seq}\n")