In [1]:
import os
from Bio import SeqIO
import pandas as pd

In [2]:
# Input directories that hold the FASTA files for each data source.
mibig_fastas_path, prism_fastas_path = (
    'mibig_data/mibig_fasta/',
    'prism4_data/prism_fasta/',
)

In [3]:
def report_multi_record_fastas(fasta_dir, suffix='.fna'):
    """Print a message for every ``suffix`` file in ``fasta_dir`` that does not hold exactly one record.

    Parameters
    ----------
    fasta_dir : str
        Directory whose entries are scanned (non-recursively).
    suffix : str
        Only file names ending with this suffix are checked.

    Returns
    -------
    list of str
        Names of the files whose record count differs from 1.
    """
    flagged = []
    n_checked = 0
    # Sort so the report order does not depend on the arbitrary os.listdir order.
    for file in sorted(os.listdir(fasta_dir)):
        if not file.endswith(suffix):
            continue
        n_checked += 1
        with open(os.path.join(fasta_dir, file), 'r') as f:
            # Count lazily instead of materialising every record in a list.
            n_records = sum(1 for _ in SeqIO.parse(f, 'fasta'))
        if n_records != 1:
            print(f"File {file} has {n_records} sequences, expected 1.")
            flagged.append(file)
    if n_checked == 0:
        # Without this message a directory with no matching files passes silently.
        print(f"No '{suffix}' files found in {fasta_dir}; nothing was checked.")
    return flagged


for fasta_dir in (mibig_fastas_path, prism_fastas_path):
    report_multi_record_fastas(fasta_dir)

In [4]:
# Create one combined FASTA file per data source.
mibig_output_file = 'mibig_combined.fasta'
prism_output_file = 'prism_combined.fasta'


def combine_fasta_dir(fasta_dir, output_file):
    """Write every FASTA record found in the files of ``fasta_dir`` into ``output_file``.

    Every directory entry is parsed as FASTA; no extension filter is applied
    (NOTE(review): the validation cell only inspects '.fna' files, so confirm
    that the directories contain nothing but FASTA files).

    Parameters
    ----------
    fasta_dir : str
        Directory whose entries are read (non-recursively).
    output_file : str
        Path of the combined FASTA file; it is overwritten.

    Returns
    -------
    int
        Number of records written.
    """
    n_written = 0
    with open(output_file, 'w') as outfile:
        # os.listdir returns entries in arbitrary order; sorting makes the
        # combined file (and any later first-occurrence deduplication) reproducible.
        for file in sorted(os.listdir(fasta_dir)):
            with open(os.path.join(fasta_dir, file), 'r') as f:
                n_written += SeqIO.write(SeqIO.parse(f, 'fasta'), outfile, 'fasta')
    return n_written


for fasta_dir, output_file in (
    (mibig_fastas_path, mibig_output_file),
    (prism_fastas_path, prism_output_file),
):
    combine_fasta_dir(fasta_dir, output_file)

In [5]:
# Count the records in each per-source combined file.
mibig_count, prism_count = (
    sum(1 for _record in SeqIO.parse(source_path, 'fasta'))
    for source_path in (mibig_output_file, prism_output_file)
)
print(f"Total sequences in {mibig_output_file}: {mibig_count}")
print(f"Total sequences in {prism_output_file}: {prism_count}")

# Join the two per-source files into one, MIBiG records first.
combined_output_file = 'combined_sequences.fasta'
with open(combined_output_file, 'w') as merged_handle:
    for source_path in (mibig_output_file, prism_output_file):
        with open(source_path, 'r') as source_handle:
            # SeqIO.write accepts an iterable of records, so the parser is streamed straight through.
            SeqIO.write(SeqIO.parse(source_handle, 'fasta'), merged_handle, 'fasta')

Total sequences in mibig_combined.fasta: 1673
Total sequences in prism_combined.fasta: 1281


In [6]:
import subprocess

# Remove records with duplicate sequences (case-insensitive) using seqkit.
# An argument list avoids the shell and its redirect, and check=True raises
# CalledProcessError on a non-zero exit instead of continuing silently with
# an empty or partial output file.
subprocess.run(
    [
        "seqkit", "rmdup",
        "--by-seq", "--ignore-case",
        "--out-file", "combine_sequences_deduplicated.fasta",
        "combined_sequences.fasta",
    ],
    check=True,
)

[INFO][0m 706 duplicated records removed


CompletedProcess(args='seqkit rmdup --by-seq --ignore-case combined_sequences.fasta > combine_sequences_deduplicated.fasta', returncode=0)

In [7]:
# get all names for combined sequences deduplicated to a list
names = []
for record in SeqIO.parse('combine_sequences_deduplicated.fasta', 'fasta'):
    names.append(record.id)
names = list(names)

In [8]:
# Load the per-source BGC tables.
mibig_df, prism_df = (
    pd.read_csv(csv_path)
    for csv_path in ('mibig_data/mibig_active_bgc.csv', 'prism4_data/prism_bgc.csv')
)

In [9]:
# Give both tables the same column names and tag each row with its source.
# Non-mutating chains plus errors='ignore' keep this cell safe to re-run:
# the original in-place drop raised KeyError on a second execution because
# the columns were already gone.
mibig_df = (
    mibig_df
    .rename(columns={'first_compound': 'SMILES'})
    .drop(columns=['completeness', 'quality'], errors='ignore')
    .assign(source='mibig')
)

prism_df = (
    prism_df
    .rename(columns={'True SMILES': 'SMILES', 'Cluster': 'accession'})
    .assign(source='prism')
)

In [10]:
# Stack the MIBiG and PRISM tables row-wise under a fresh 0..n-1 index.
df = pd.concat(objs=[mibig_df, prism_df], axis=0, ignore_index=True)

In [11]:
#check how many names from the list names are in the dataframe
df['name_in_list'] = df['accession'].isin(names)

In [12]:
# Show how many rows did / did not match a deduplicated record id.
# (As a bare expression in the middle of the cell the counts were never displayed.)
print(df['name_in_list'].value_counts())
# Keep only the rows whose accession is in the list; .copy() detaches the
# result from the unfiltered frame.
df = df[df['name_in_list']].copy()

In [13]:
# Display the filtered frame (rows whose accession matched a deduplicated record id).
df

Unnamed: 0,accession,SMILES,source,name_in_list
0,BGC0001464,Brc1[nH]c(Br)c(Br)c1Br,mibig,True
1,BGC0000892,C#CC#CC#CC#CC=CC=CC(O)CCCCC(=O)O,mibig,True
2,BGC0002804,C#CC#CC#CC#CC=CCCCCCCCC(=O)O,mibig,True
3,BGC0001897,C#CC#CC=C=CC1OC1C(O)C=CC1CCC(=O)O1,mibig,True
4,BGC0001994,C#CC(O)C(N)C(=O)O,mibig,True
...,...,...,...,...
2940,streptoseomycin.fasta,COC1CC2C=CC3CC4CC(O)C3C2(O4)C(C)=CC2COC(=O)CCC...,prism,True
2943,Thanamycin.fasta,CC=C1NC(=O)C(C(C)O)NC(=O)C(Cc2cccc3[nH]ccc23)N...,prism,True
2944,tiancimycin_BGC0001378.fasta,Cc1cc2c(c3c1C(=O)c1ccc(C(=O)O)c(O)c1C3=O)NC1C#...,prism,True
2948,titanium_padanamide.fasta,COCC(=O)NC(C(=O)N1NCCCC1C(=O)NC(Cc1ccccc1)C(O)...,prism,True


In [14]:
# Drop the helper flag column and save the cleaned table without the index.
bgc_data_cleaned = df.drop('name_in_list', axis=1)
bgc_data_cleaned.to_csv(path_or_buf='bgc_data_cleaned.csv', index=False)