In [1]:
%load_ext autoreload
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from IPython.display import display

import duckdb
import pandas as pd
from pathlib import Path
import logging
from Bio import SeqIO
from datetime import datetime
import time

from typing import List, Optional, Union, Dict, Set

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [49]:
SAMPLES = ['SRR12068547', 'SRR12068550']
DUCKDB_PATHS = [
    Path('/mnt/data3/planter_outputs/SRR12068547/SRR12068547.duckdb'),
    Path('/mnt/data3/planter_outputs/SRR12068550/SRR12068550.duckdb')
]

def display_tables(duckdb_path):
    if not Path(duckdb_path).exists():
        raise FileNotFoundError(f'{duckdb_path} does not exist')
    with duckdb.connect(duckdb_path) as conn:
        db = conn.execute("PRAGMA show_tables;").fetchdf()
        tables = db['name'].tolist()

        for table in tables:
            print(f"First 3 rows of {table}:")
            df = conn.execute(f"SELECT * FROM {table}").fetchdf()
            print(f'{table} has {df.shape[0]} rows and {df.shape[1]} columns')
            display(df.head(3))

display_tables(DUCKDB_PATHS[0])
display_tables(DUCKDB_PATHS[1])

First 3 rows of annotations:
annotations has 251 rows and 10 columns


Unnamed: 0,seqhash_id,seed_ortholog,evalue,score,eggnog_ogs,max_annot_lvl,cog_category,description,preferred_name,sample_id
0,v1_DLS_0076d08a43a612188f215c23180111039b05c1b...,265311.Mfl224,1.93e-06,53.5,"COG1564@1|root,COG1564@2|Bacteria,3WTAN@544448...",544448|Tenericutes,H,"Thiamin pyrophosphokinase, vitamin B1 binding ...",thiN,SRR12068547
1,v1_DLS_01dd7c83e5a125c8fdd52f600ed3662e13d57e7...,265311.Mfl040,3.74e-45,155.0,"COG0022@1|root,COG0022@2|Bacteria,3WSYT@544448...",544448|Tenericutes,C,dehydrogenase e1 component,pdhB,SRR12068547
2,v1_DLS_02340e54c7bf3192db0a8f6d8bad893802d33fb...,265311.Mfl489,1.3e-29,120.0,"2EWBJ@1|root,33PQA@2|Bacteria,3WT4V@544448|Ten...",544448|Tenericutes,-,-,-,SRR12068547


First 3 rows of cluster_members:
cluster_members has 0 rows and 2 columns


Unnamed: 0,seqhash_id,cluster_id


First 3 rows of clusters:
clusters has 0 rows and 3 columns


Unnamed: 0,cluster_id,representative_seqhash_id,size


First 3 rows of ec_numbers:
ec_numbers has 130 rows and 2 columns


Unnamed: 0,seqhash_id,ec_number
0,v1_DLS_05fc6dc2fcc79581a85e20a59498e2ca4389b43...,1.2.4.4
1,v1_DLS_116aa24751cc0c03d972d124b07632ab5a2db0e...,3.4.23.36
2,v1_DLS_166a64b63f322440c5a41bc6219f0a942373c2b...,5.4.2.12


First 3 rows of go_terms:
go_terms has 110 rows and 2 columns


Unnamed: 0,seqhash_id,go_term
0,v1_DLS_01dd7c83e5a125c8fdd52f600ed3662e13d57e7...,GO:0140032
1,v1_DLS_1016173829ac36ac2333823a671a085347bddb2...,GO:0005515
2,v1_DLS_27169752765a0e3a22d9727f0dcb6ece7c520a9...,GO:0005622


First 3 rows of kegg_info:
kegg_info has 0 rows and 6 columns


Unnamed: 0,seqhash_id,kegg_ko,kegg_pathway,kegg_module,kegg_reaction,kegg_rclass


First 3 rows of schema_version:
schema_version has 3 rows and 3 columns


Unnamed: 0,version,migration_name,applied_at
0,1,000_schema_version.sql,2025-02-11 02:04:08.915
1,2,001_initial_schema.sql,2025-02-11 02:04:08.915
2,3,002_add_indexes.sql,2025-02-11 02:04:08.915


First 3 rows of sequences:
sequences has 437 rows and 7 columns


Unnamed: 0,seqhash_id,sequence,sample_id,assembly_date,is_representative,repseq_id,length
0,v1_DLS_0076d08a43a612188f215c23180111039b05c1b...,MXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...,SRR12068547,2025-02-11 02:04:12.774568,False,v1_DLS_0076d08a43a612188f215c23180111039b05c1b...,160
1,v1_DLS_00dba5ffc2660fb5748dc31acb772b2ad64d737...,MKPKWFXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...,SRR12068547,2025-02-11 02:04:12.774589,False,v1_DLS_00dba5ffc2660fb5748dc31acb772b2ad64d737...,229
2,v1_DLS_01dd7c83e5a125c8fdd52f600ed3662e13d57e7...,MVIRMPMGGGIRALEHHSEALEAIFAHIPGIKTVMPSTPYDTKGLL...,SRR12068547,2025-02-11 02:04:12.774599,False,v1_DLS_01dd7c83e5a125c8fdd52f600ed3662e13d57e7...,117


First 3 rows of sra_metadata:
sra_metadata has 1 rows and 14 columns


Unnamed: 0,sample_id,organism,study_title,study_abstract,bioproject,biosample,library_strategy,library_source,library_selection,library_layout,instrument,run_spots,run_bases,run_published
0,SRR12068547,Mesoplasma florum,Transcriptome profiling of Mesoplasma florum s...,Mesoplasma florum is a near-minimal bacterium ...,PRJNA641121,SAMN15341344,,,,,ILLUMINA,,,


First 3 rows of annotations:
annotations has 320 rows and 10 columns


Unnamed: 0,seqhash_id,seed_ortholog,evalue,score,eggnog_ogs,max_annot_lvl,cog_category,description,preferred_name,sample_id
0,v1_DLS_0045223251933025812ef4b1f3f2c589a13ffa3...,265311.Mfl097,3.6200000000000003e-93,288.0,"COG0444@1|root,COG4608@1|root,COG0444@2|Bacter...",544448|Tenericutes,P,Belongs to the ABC transporter superfamily,oppF,SRR12068550
1,v1_DLS_009e48be93dbf3e6aa24c3efcc2cd6582490192...,265311.Mfl111,2.3900000000000002e-113,326.0,"COG0711@1|root,COG0711@2|Bacteria,3WTD0@544448...",544448|Tenericutes,C,"Component of the F(0) channel, it forms part o...",atpF,SRR12068550
2,v1_DLS_00c863de08751623595b1ec55aa36ce3cd37216...,265311.Mfl111,2.3900000000000002e-113,326.0,"COG0711@1|root,COG0711@2|Bacteria,3WTD0@544448...",544448|Tenericutes,C,"Component of the F(0) channel, it forms part o...",atpF,SRR12068550


First 3 rows of cluster_members:
cluster_members has 0 rows and 2 columns


Unnamed: 0,seqhash_id,cluster_id


First 3 rows of clusters:
clusters has 0 rows and 3 columns


Unnamed: 0,cluster_id,representative_seqhash_id,size


First 3 rows of ec_numbers:
ec_numbers has 144 rows and 2 columns


Unnamed: 0,seqhash_id,ec_number
0,v1_DLS_0c8eb4715304e15ccd1af0a087202a1e20b3c28...,5.99.1.3
1,v1_DLS_119a5ee2390475746fff8bdab1c8c9a5d6bdc03...,2.7.1.76
2,v1_DLS_1717e40d619e5ed64ed6baf2d7c5de070f0c551...,2.7.1.76


First 3 rows of go_terms:
go_terms has 214 rows and 2 columns


Unnamed: 0,seqhash_id,go_term
0,v1_DLS_203caeecec00ae2e766e821b8d4b5aac54085dc...,GO:0005575
1,v1_DLS_4082c0cc7a2f556d8672eb7d893e708426e0dd4...,GO:0005575
2,v1_DLS_5ac4e49f5163173bbecbc5e8047e483c30ccb2d...,GO:0044237


First 3 rows of kegg_info:
kegg_info has 0 rows and 6 columns


Unnamed: 0,seqhash_id,kegg_ko,kegg_pathway,kegg_module,kegg_reaction,kegg_rclass


First 3 rows of schema_version:
schema_version has 3 rows and 3 columns


Unnamed: 0,version,migration_name,applied_at
0,1,000_schema_version.sql,2025-02-11 02:04:08.913
1,2,001_initial_schema.sql,2025-02-11 02:04:08.913
2,3,002_add_indexes.sql,2025-02-11 02:04:08.913


First 3 rows of sequences:
sequences has 519 rows and 7 columns


Unnamed: 0,seqhash_id,sequence,sample_id,assembly_date,is_representative,repseq_id,length
0,v1_DLS_0045223251933025812ef4b1f3f2c589a13ffa3...,MLPEHLSRYPHEFSGGQRQRIGIARALVMKPSFVVCDEPISALDVS...,SRR12068550,2025-02-11 02:04:10.514353,False,v1_DLS_0045223251933025812ef4b1f3f2c589a13ffa3...,148
1,v1_DLS_009e48be93dbf3e6aa24c3efcc2cd6582490192...,MIFFAETQTAGVPEIITSLFPNLPNFIAHVIATIVLVVILSKLMYK...,SRR12068550,2025-02-11 02:04:10.514371,False,v1_DLS_009e48be93dbf3e6aa24c3efcc2cd6582490192...,178
2,v1_DLS_00c863de08751623595b1ec55aa36ce3cd37216...,MNNYLXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...,SRR12068550,2025-02-11 02:04:10.514381,False,v1_DLS_00c863de08751623595b1ec55aa36ce3cd37216...,150


First 3 rows of sra_metadata:
sra_metadata has 1 rows and 14 columns


Unnamed: 0,sample_id,organism,study_title,study_abstract,bioproject,biosample,library_strategy,library_source,library_selection,library_layout,instrument,run_spots,run_bases,run_published
0,SRR12068550,Mesoplasma florum,Transcriptome profiling of Mesoplasma florum s...,Mesoplasma florum is a near-minimal bacterium ...,PRJNA641121,SAMN15341341,,,,,ILLUMINA,,,


In [46]:
# Path to your schema SQL file
schema_sql_path = Path('../planter/database/schema/migrations/001_initial_schema.sql')

# Path for the master (merged) DuckDB
master_db_path = '/tmp/merged.duckdb'

def merge_duckdbs(
    duckdb_paths: List[Union[str, Path]],
    master_db_path: Union[str, Path],
    schema_sql_path: Union[str, Path]
) -> None:
    """
    Merge multiple DuckDB databases into a master DuckDB.
    
    Parameters:
      duckdb_paths (List[Union[str, Path]]): List of paths to source DuckDB files.
      master_db_path (Union[str, Path]): Path to the master (merged) DuckDB.
      schema_sql_path (Union[str, Path]): Path to the SQL file defining the schema.
    
    The function:
      - Creates (or opens) the master database.
      - Executes the schema SQL to create tables if they don't exist.
      - Iterates through each source database, attaches it,
        and inserts data into the master tables in dependency order.
      - Uses INSERT OR IGNORE to avoid duplicate key errors.
      - Detaches each source database after merging.
    """
    
    master_db_path = str(master_db_path)
    schema_sql_path = Path(schema_sql_path)
    
    # Read the schema SQL
    schema_sql = schema_sql_path.read_text()
    
    with duckdb.connect(master_db_path) as master_conn:
        # Set up the schema in the master database
        master_conn.execute(schema_sql)
        
        # Process each source DuckDB
        for i, source_db in enumerate(duckdb_paths):
            alias = f"db{i}"
            source_db_str = str(source_db)
            print(f"Attaching {source_db_str} as {alias}...")
            master_conn.execute(f"ATTACH '{source_db_str}' AS {alias};")
            
            # Insert data in dependency order
            master_conn.execute(f"""
                INSERT OR IGNORE INTO sra_metadata
                SELECT * FROM {alias}.sra_metadata;
            """)
            master_conn.execute(f"""
                INSERT OR IGNORE INTO sequences
                SELECT * FROM {alias}.sequences;
            """)
            master_conn.execute(f"""
                INSERT OR IGNORE INTO annotations
                SELECT * FROM {alias}.annotations;
            """)
            master_conn.execute(f"""
                INSERT OR IGNORE INTO go_terms
                SELECT * FROM {alias}.go_terms;
            """)
            master_conn.execute(f"""
                INSERT OR IGNORE INTO ec_numbers
                SELECT * FROM {alias}.ec_numbers;
            """)
            master_conn.execute(f"""
                INSERT OR IGNORE INTO kegg_info
                SELECT * FROM {alias}.kegg_info;
            """)
            master_conn.execute(f"""
                INSERT OR IGNORE INTO clusters
                SELECT * FROM {alias}.clusters;
            """)
            master_conn.execute(f"""
                INSERT OR IGNORE INTO cluster_members
                SELECT * FROM {alias}.cluster_members;
            """)
            
            # Optionally, merge schema_version if needed:
            # master_conn.execute(f"""
            #     INSERT OR IGNORE INTO schema_version
            #     SELECT * FROM {alias}.schema_version;
            # """)
            
            master_conn.execute(f"DETACH {alias};")
            print(f"Finished merging {source_db_str}\n")
        
        # Optional commit; DuckDB auto-commits by default.
        master_conn.commit()
    
    print("All databases have been merged into:", master_db_path)
    return master_db_path

master_db_path = merge_duckdbs(DUCKDB_PATHS, master_db_path, schema_sql_path)
display_tables(master_db_path)

Attaching /mnt/data3/planter_outputs/SRR12068547/SRR12068547.duckdb as db0...
Finished merging /mnt/data3/planter_outputs/SRR12068547/SRR12068547.duckdb

Attaching /mnt/data3/planter_outputs/SRR12068550/SRR12068550.duckdb as db1...
Finished merging /mnt/data3/planter_outputs/SRR12068550/SRR12068550.duckdb

All databases have been merged into: /tmp/merged.duckdb
First 3 rows of annotations:
annotations has 44023 rows and 10 columns


Unnamed: 0,seqhash_id,seed_ortholog,evalue,score,eggnog_ogs,max_annot_lvl,cog_category,description,preferred_name,sample_id
0,v1_DLS_0076d08a43a612188f215c23180111039b05c1b...,265311.Mfl224,1.93e-06,53.5,"COG1564@1|root,COG1564@2|Bacteria,3WTAN@544448...",544448|Tenericutes,H,"Thiamin pyrophosphokinase, vitamin B1 binding ...",thiN,SRR12068547
1,v1_DLS_01dd7c83e5a125c8fdd52f600ed3662e13d57e7...,265311.Mfl040,3.74e-45,155.0,"COG0022@1|root,COG0022@2|Bacteria,3WSYT@544448...",544448|Tenericutes,C,dehydrogenase e1 component,pdhB,SRR12068547
2,v1_DLS_02340e54c7bf3192db0a8f6d8bad893802d33fb...,265311.Mfl489,1.3e-29,120.0,"2EWBJ@1|root,33PQA@2|Bacteria,3WT4V@544448|Ten...",544448|Tenericutes,-,-,-,SRR12068547


First 3 rows of cluster_members:
cluster_members has 0 rows and 2 columns


Unnamed: 0,seqhash_id,cluster_id


First 3 rows of clusters:
clusters has 0 rows and 3 columns


Unnamed: 0,cluster_id,representative_seqhash_id,size


First 3 rows of ec_numbers:
ec_numbers has 16492 rows and 2 columns


Unnamed: 0,seqhash_id,ec_number
0,v1_DLS_05fc6dc2fcc79581a85e20a59498e2ca4389b43...,1.2.4.4
1,v1_DLS_116aa24751cc0c03d972d124b07632ab5a2db0e...,3.4.23.36
2,v1_DLS_166a64b63f322440c5a41bc6219f0a942373c2b...,5.4.2.12


First 3 rows of go_terms:
go_terms has 1994038 rows and 2 columns


Unnamed: 0,seqhash_id,go_term
0,v1_DLS_01dd7c83e5a125c8fdd52f600ed3662e13d57e7...,GO:0140032
1,v1_DLS_1016173829ac36ac2333823a671a085347bddb2...,GO:0005515
2,v1_DLS_27169752765a0e3a22d9727f0dcb6ece7c520a9...,GO:0005622


First 3 rows of kegg_info:
kegg_info has 0 rows and 6 columns


Unnamed: 0,seqhash_id,kegg_ko,kegg_pathway,kegg_module,kegg_reaction,kegg_rclass


First 3 rows of schema_version:
schema_version has 0 rows and 3 columns


Unnamed: 0,version,migration_name,applied_at


First 3 rows of sequences:
sequences has 85232 rows and 7 columns


Unnamed: 0,seqhash_id,sequence,sample_id,assembly_date,is_representative,repseq_id,length
0,v1_DLS_0076d08a43a612188f215c23180111039b05c1b...,MXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...,SRR12068547,2025-02-11 02:04:12.774568,False,v1_DLS_0076d08a43a612188f215c23180111039b05c1b...,160
1,v1_DLS_00dba5ffc2660fb5748dc31acb772b2ad64d737...,MKPKWFXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...,SRR12068547,2025-02-11 02:04:12.774589,False,v1_DLS_00dba5ffc2660fb5748dc31acb772b2ad64d737...,229
2,v1_DLS_01dd7c83e5a125c8fdd52f600ed3662e13d57e7...,MVIRMPMGGGIRALEHHSEALEAIFAHIPGIKTVMPSTPYDTKGLL...,SRR12068547,2025-02-11 02:04:12.774599,False,v1_DLS_01dd7c83e5a125c8fdd52f600ed3662e13d57e7...,117


First 3 rows of sra_metadata:
sra_metadata has 8 rows and 14 columns


Unnamed: 0,sample_id,organism,study_title,study_abstract,bioproject,biosample,library_strategy,library_source,library_selection,library_layout,instrument,run_spots,run_bases,run_published
0,SRR12068547,Mesoplasma florum,Transcriptome profiling of Mesoplasma florum s...,Mesoplasma florum is a near-minimal bacterium ...,PRJNA641121,SAMN15341344,,,,,ILLUMINA,,,
1,SRR12068550,Mesoplasma florum,Transcriptome profiling of Mesoplasma florum s...,Mesoplasma florum is a near-minimal bacterium ...,PRJNA641121,SAMN15341341,,,,,ILLUMINA,,,
2,ERR12954319,Heribaudiella fluviatilis,Sequencing data of the Phaeoexplorer project,This project collects genomic and transcriptom...,PRJEB72149,SAMEA115094190,,,,,ILLUMINA,,,


In [47]:
duckdb_paths = list(Path('/mnt/data3/planter_outputs').rglob('*RR*.duckdb'))
master_db_path = merge_duckdbs(duckdb_paths, master_db_path, schema_sql_path)
display_tables(master_db_path)


Attaching /mnt/data3/planter_outputs/ERR12954319/ERR12954319.duckdb as db0...
Finished merging /mnt/data3/planter_outputs/ERR12954319/ERR12954319.duckdb

Attaching /mnt/data3/planter_outputs/SRR12068548/SRR12068548.duckdb as db1...
Finished merging /mnt/data3/planter_outputs/SRR12068548/SRR12068548.duckdb

Attaching /mnt/data3/planter_outputs/SRR12068552/SRR12068552.duckdb as db2...
Finished merging /mnt/data3/planter_outputs/SRR12068552/SRR12068552.duckdb

Attaching /mnt/data3/planter_outputs/SRR12068551/SRR12068551.duckdb as db3...
Finished merging /mnt/data3/planter_outputs/SRR12068551/SRR12068551.duckdb

Attaching /mnt/data3/planter_outputs/SRR12068549/SRR12068549.duckdb as db4...
Finished merging /mnt/data3/planter_outputs/SRR12068549/SRR12068549.duckdb

Attaching /mnt/data3/planter_outputs/SRR12068550/SRR12068550.duckdb as db5...
Finished merging /mnt/data3/planter_outputs/SRR12068550/SRR12068550.duckdb

Attaching /mnt/data3/planter_outputs/ERR2040767/ERR2040767.duckdb as db6...


Unnamed: 0,seqhash_id,seed_ortholog,evalue,score,eggnog_ogs,max_annot_lvl,cog_category,description,preferred_name,sample_id
0,v1_DLS_0076d08a43a612188f215c23180111039b05c1b...,265311.Mfl224,1.93e-06,53.5,"COG1564@1|root,COG1564@2|Bacteria,3WTAN@544448...",544448|Tenericutes,H,"Thiamin pyrophosphokinase, vitamin B1 binding ...",thiN,SRR12068547
1,v1_DLS_01dd7c83e5a125c8fdd52f600ed3662e13d57e7...,265311.Mfl040,3.74e-45,155.0,"COG0022@1|root,COG0022@2|Bacteria,3WSYT@544448...",544448|Tenericutes,C,dehydrogenase e1 component,pdhB,SRR12068547
2,v1_DLS_02340e54c7bf3192db0a8f6d8bad893802d33fb...,265311.Mfl489,1.3e-29,120.0,"2EWBJ@1|root,33PQA@2|Bacteria,3WT4V@544448|Ten...",544448|Tenericutes,-,-,-,SRR12068547


First 3 rows of cluster_members:
cluster_members has 0 rows and 2 columns


Unnamed: 0,seqhash_id,cluster_id


First 3 rows of clusters:
clusters has 0 rows and 3 columns


Unnamed: 0,cluster_id,representative_seqhash_id,size


First 3 rows of ec_numbers:
ec_numbers has 31634 rows and 2 columns


Unnamed: 0,seqhash_id,ec_number
0,v1_DLS_05fc6dc2fcc79581a85e20a59498e2ca4389b43...,1.2.4.4
1,v1_DLS_116aa24751cc0c03d972d124b07632ab5a2db0e...,3.4.23.36
2,v1_DLS_166a64b63f322440c5a41bc6219f0a942373c2b...,5.4.2.12


First 3 rows of go_terms:
go_terms has 3756450 rows and 2 columns


Unnamed: 0,seqhash_id,go_term
0,v1_DLS_01dd7c83e5a125c8fdd52f600ed3662e13d57e7...,GO:0140032
1,v1_DLS_1016173829ac36ac2333823a671a085347bddb2...,GO:0005515
2,v1_DLS_27169752765a0e3a22d9727f0dcb6ece7c520a9...,GO:0005622


First 3 rows of kegg_info:
kegg_info has 0 rows and 6 columns


Unnamed: 0,seqhash_id,kegg_ko,kegg_pathway,kegg_module,kegg_reaction,kegg_rclass


First 3 rows of schema_version:
schema_version has 0 rows and 3 columns


Unnamed: 0,version,migration_name,applied_at


First 3 rows of sequences:
sequences has 149833 rows and 7 columns


Unnamed: 0,seqhash_id,sequence,sample_id,assembly_date,is_representative,repseq_id,length
0,v1_DLS_0076d08a43a612188f215c23180111039b05c1b...,MXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...,SRR12068547,2025-02-11 02:04:12.774568,False,v1_DLS_0076d08a43a612188f215c23180111039b05c1b...,160
1,v1_DLS_00dba5ffc2660fb5748dc31acb772b2ad64d737...,MKPKWFXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...,SRR12068547,2025-02-11 02:04:12.774589,False,v1_DLS_00dba5ffc2660fb5748dc31acb772b2ad64d737...,229
2,v1_DLS_01dd7c83e5a125c8fdd52f600ed3662e13d57e7...,MVIRMPMGGGIRALEHHSEALEAIFAHIPGIKTVMPSTPYDTKGLL...,SRR12068547,2025-02-11 02:04:12.774599,False,v1_DLS_01dd7c83e5a125c8fdd52f600ed3662e13d57e7...,117


First 3 rows of sra_metadata:
sra_metadata has 9 rows and 14 columns


Unnamed: 0,sample_id,organism,study_title,study_abstract,bioproject,biosample,library_strategy,library_source,library_selection,library_layout,instrument,run_spots,run_bases,run_published
0,SRR12068547,Mesoplasma florum,Transcriptome profiling of Mesoplasma florum s...,Mesoplasma florum is a near-minimal bacterium ...,PRJNA641121,SAMN15341344,,,,,ILLUMINA,,,
1,SRR12068550,Mesoplasma florum,Transcriptome profiling of Mesoplasma florum s...,Mesoplasma florum is a near-minimal bacterium ...,PRJNA641121,SAMN15341341,,,,,ILLUMINA,,,
2,ERR12954319,Heribaudiella fluviatilis,Sequencing data of the Phaeoexplorer project,This project collects genomic and transcriptom...,PRJEB72149,SAMEA115094190,,,,,ILLUMINA,,,


In [51]:
display_tables(master_db_path)

First 3 rows of annotations:
annotations has 88743 rows and 10 columns


Unnamed: 0,seqhash_id,seed_ortholog,evalue,score,eggnog_ogs,max_annot_lvl,cog_category,description,preferred_name,sample_id
0,v1_DLS_0076d08a43a612188f215c23180111039b05c1b...,265311.Mfl224,1.93e-06,53.5,"COG1564@1|root,COG1564@2|Bacteria,3WTAN@544448...",544448|Tenericutes,H,"Thiamin pyrophosphokinase, vitamin B1 binding ...",thiN,SRR12068547
1,v1_DLS_01dd7c83e5a125c8fdd52f600ed3662e13d57e7...,265311.Mfl040,3.74e-45,155.0,"COG0022@1|root,COG0022@2|Bacteria,3WSYT@544448...",544448|Tenericutes,C,dehydrogenase e1 component,pdhB,SRR12068547
2,v1_DLS_02340e54c7bf3192db0a8f6d8bad893802d33fb...,265311.Mfl489,1.3e-29,120.0,"2EWBJ@1|root,33PQA@2|Bacteria,3WT4V@544448|Ten...",544448|Tenericutes,-,-,-,SRR12068547


First 3 rows of cluster_members:
cluster_members has 0 rows and 2 columns


Unnamed: 0,seqhash_id,cluster_id


First 3 rows of clusters:
clusters has 0 rows and 3 columns


Unnamed: 0,cluster_id,representative_seqhash_id,size


First 3 rows of ec_numbers:
ec_numbers has 31634 rows and 2 columns


Unnamed: 0,seqhash_id,ec_number
0,v1_DLS_05fc6dc2fcc79581a85e20a59498e2ca4389b43...,1.2.4.4
1,v1_DLS_116aa24751cc0c03d972d124b07632ab5a2db0e...,3.4.23.36
2,v1_DLS_166a64b63f322440c5a41bc6219f0a942373c2b...,5.4.2.12


First 3 rows of go_terms:
go_terms has 3756450 rows and 2 columns


Unnamed: 0,seqhash_id,go_term
0,v1_DLS_01dd7c83e5a125c8fdd52f600ed3662e13d57e7...,GO:0140032
1,v1_DLS_1016173829ac36ac2333823a671a085347bddb2...,GO:0005515
2,v1_DLS_27169752765a0e3a22d9727f0dcb6ece7c520a9...,GO:0005622


First 3 rows of kegg_info:
kegg_info has 0 rows and 6 columns


Unnamed: 0,seqhash_id,kegg_ko,kegg_pathway,kegg_module,kegg_reaction,kegg_rclass


First 3 rows of schema_version:
schema_version has 0 rows and 3 columns


Unnamed: 0,version,migration_name,applied_at


First 3 rows of sequences:
sequences has 149833 rows and 7 columns


Unnamed: 0,seqhash_id,sequence,sample_id,assembly_date,is_representative,repseq_id,length
0,v1_DLS_0076d08a43a612188f215c23180111039b05c1b...,MXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...,SRR12068547,2025-02-11 02:04:12.774568,False,v1_DLS_0076d08a43a612188f215c23180111039b05c1b...,160
1,v1_DLS_00dba5ffc2660fb5748dc31acb772b2ad64d737...,MKPKWFXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...,SRR12068547,2025-02-11 02:04:12.774589,False,v1_DLS_00dba5ffc2660fb5748dc31acb772b2ad64d737...,229
2,v1_DLS_01dd7c83e5a125c8fdd52f600ed3662e13d57e7...,MVIRMPMGGGIRALEHHSEALEAIFAHIPGIKTVMPSTPYDTKGLL...,SRR12068547,2025-02-11 02:04:12.774599,False,v1_DLS_01dd7c83e5a125c8fdd52f600ed3662e13d57e7...,117


First 3 rows of sra_metadata:
sra_metadata has 9 rows and 14 columns


Unnamed: 0,sample_id,organism,study_title,study_abstract,bioproject,biosample,library_strategy,library_source,library_selection,library_layout,instrument,run_spots,run_bases,run_published
0,SRR12068547,Mesoplasma florum,Transcriptome profiling of Mesoplasma florum s...,Mesoplasma florum is a near-minimal bacterium ...,PRJNA641121,SAMN15341344,,,,,ILLUMINA,,,
1,SRR12068550,Mesoplasma florum,Transcriptome profiling of Mesoplasma florum s...,Mesoplasma florum is a near-minimal bacterium ...,PRJNA641121,SAMN15341341,,,,,ILLUMINA,,,
2,ERR12954319,Heribaudiella fluviatilis,Sequencing data of the Phaeoexplorer project,This project collects genomic and transcriptom...,PRJEB72149,SAMEA115094190,,,,,ILLUMINA,,,


In [81]:
repseq_clusters = pd.read_csv('/mnt/data3/planter_outputs/cluster/newClusterDB.tsv', delimiter='\t', header=None, names=['repseq_id', 'seqhash_id'])
repseq_clusters.head()
repseq_clusters.shape
repseq_clusters['repseq_id'].nunique()

Unnamed: 0,repseq_id,seqhash_id
0,v1_DLS_693b34809e671efcc491066951624176abdec2e...,v1_DLS_693b34809e671efcc491066951624176abdec2e...
1,v1_DLS_693b34809e671efcc491066951624176abdec2e...,v1_DLS_693b34809e671efcc491066951624176abdec2e...
2,v1_DLS_693b34809e671efcc491066951624176abdec2e...,v1_DLS_754e0acb8250a3a8f1f6f8ca380d0fb660f7008...
3,v1_DLS_693b34809e671efcc491066951624176abdec2e...,v1_DLS_d1e220b01aa8d311108099006dccc45340eddd1...
4,v1_DLS_08d72bfa8cd748621c47c5ab9976e310ad1393f...,v1_DLS_08d72bfa8cd748621c47c5ab9976e310ad1393f...


(73065, 2)

56341

In [74]:
def update_master_db_with_cluster_mapping(master_db_path: str, tsv_path: str, inplace=False):
    """
    Update the 'sequences' table in the master DuckDB using a cluster mapping TSV.
    
    Parameters:
        master_db_path (str): Path to the master DuckDB file.
        tsv_path (str): Path to the TSV file containing cluster mappings.
                        The TSV should have two columns (no header):
                          - Column 0: repseq_id (the representative seqhash)
                          - Column 1: seqhash_id (the member seqhash)
                          
    The function updates the 'repseq_id' column in the 'sequences' table such that
    for every row where sequences.seqhash_id matches the TSV's member, repseq_id is set
    to the TSV's repseq_id. It also sets is_representative to TRUE for sequences that are representatives.
    """
    # Connect to the master DuckDB.
    con = duckdb.connect(master_db_path)
    
    # Create a temporary table by reading the TSV file.
    # We assume the TSV has no header and uses tab as the separator.
    con.execute(f"""
    CREATE TEMPORARY TABLE new_clusters AS 
      SELECT * FROM read_csv_auto('{tsv_path}', header=False, sep='\t', names=['repseq_id', 'seqhash_id'])
      AS (repseq_id VARCHAR, seqhash_id VARCHAR);
    """)
    
    # Update the sequences table: set repseq_id from new_clusters for matching seqhash_id.
    # con.execute("""
    # UPDATE sequences
    # SET repseq_id = new_clusters.repseq_id
    # FROM new_clusters
    # WHERE sequences.seqhash_id = new_clusters.seqhash_id;
    # """)
    
    # # Optionally, update the is_representative flag.
    # # For every sequence that appears as a repseq_id in the new_clusters table, mark it as representative.
    con.execute("""
    UPDATE sequences
    SET is_representative = TRUE
    WHERE seqhash_id IN (
        SELECT DISTINCT repseq_id FROM new_clusters
    );
    """)
    
    con.close()
    print("Master DB updated using cluster mapping.")

update_master_db_with_cluster_mapping(master_db_path, '/mnt/data3/planter_outputs/cluster/newClusterDB.tsv')

Master DB updated using cluster mapping.


In [75]:
display_tables(master_db_path)

First 3 rows of annotations:
annotations has 88743 rows and 10 columns


Unnamed: 0,seqhash_id,seed_ortholog,evalue,score,eggnog_ogs,max_annot_lvl,cog_category,description,preferred_name,sample_id
0,v1_DLS_0076d08a43a612188f215c23180111039b05c1b...,265311.Mfl224,1.93e-06,53.5,"COG1564@1|root,COG1564@2|Bacteria,3WTAN@544448...",544448|Tenericutes,H,"Thiamin pyrophosphokinase, vitamin B1 binding ...",thiN,SRR12068547
1,v1_DLS_01dd7c83e5a125c8fdd52f600ed3662e13d57e7...,265311.Mfl040,3.74e-45,155.0,"COG0022@1|root,COG0022@2|Bacteria,3WSYT@544448...",544448|Tenericutes,C,dehydrogenase e1 component,pdhB,SRR12068547
2,v1_DLS_02340e54c7bf3192db0a8f6d8bad893802d33fb...,265311.Mfl489,1.3e-29,120.0,"2EWBJ@1|root,33PQA@2|Bacteria,3WT4V@544448|Ten...",544448|Tenericutes,-,-,-,SRR12068547


First 3 rows of cluster_members:
cluster_members has 0 rows and 2 columns


Unnamed: 0,seqhash_id,cluster_id


First 3 rows of clusters:
clusters has 0 rows and 3 columns


Unnamed: 0,cluster_id,representative_seqhash_id,size


First 3 rows of ec_numbers:
ec_numbers has 31634 rows and 2 columns


Unnamed: 0,seqhash_id,ec_number
0,v1_DLS_05fc6dc2fcc79581a85e20a59498e2ca4389b43...,1.2.4.4
1,v1_DLS_116aa24751cc0c03d972d124b07632ab5a2db0e...,3.4.23.36
2,v1_DLS_166a64b63f322440c5a41bc6219f0a942373c2b...,5.4.2.12


First 3 rows of go_terms:
go_terms has 3756450 rows and 2 columns


Unnamed: 0,seqhash_id,go_term
0,v1_DLS_01dd7c83e5a125c8fdd52f600ed3662e13d57e7...,GO:0140032
1,v1_DLS_1016173829ac36ac2333823a671a085347bddb2...,GO:0005515
2,v1_DLS_27169752765a0e3a22d9727f0dcb6ece7c520a9...,GO:0005622


First 3 rows of kegg_info:
kegg_info has 0 rows and 6 columns


Unnamed: 0,seqhash_id,kegg_ko,kegg_pathway,kegg_module,kegg_reaction,kegg_rclass


First 3 rows of schema_version:
schema_version has 0 rows and 3 columns


Unnamed: 0,version,migration_name,applied_at


First 3 rows of sequences:
sequences has 149833 rows and 7 columns


Unnamed: 0,seqhash_id,sequence,sample_id,assembly_date,is_representative,repseq_id,length
0,v1_DLS_0076d08a43a612188f215c23180111039b05c1b...,MXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...,SRR12068547,2025-02-11 02:04:12.774568,True,v1_DLS_0076d08a43a612188f215c23180111039b05c1b...,160
1,v1_DLS_00dba5ffc2660fb5748dc31acb772b2ad64d737...,MKPKWFXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...,SRR12068547,2025-02-11 02:04:12.774589,True,v1_DLS_00dba5ffc2660fb5748dc31acb772b2ad64d737...,229
2,v1_DLS_01dd7c83e5a125c8fdd52f600ed3662e13d57e7...,MVIRMPMGGGIRALEHHSEALEAIFAHIPGIKTVMPSTPYDTKGLL...,SRR12068547,2025-02-11 02:04:12.774599,True,v1_DLS_01dd7c83e5a125c8fdd52f600ed3662e13d57e7...,117


First 3 rows of sra_metadata:
sra_metadata has 9 rows and 14 columns


Unnamed: 0,sample_id,organism,study_title,study_abstract,bioproject,biosample,library_strategy,library_source,library_selection,library_layout,instrument,run_spots,run_bases,run_published
0,SRR12068547,Mesoplasma florum,Transcriptome profiling of Mesoplasma florum s...,Mesoplasma florum is a near-minimal bacterium ...,PRJNA641121,SAMN15341344,,,,,ILLUMINA,,,
1,SRR12068550,Mesoplasma florum,Transcriptome profiling of Mesoplasma florum s...,Mesoplasma florum is a near-minimal bacterium ...,PRJNA641121,SAMN15341341,,,,,ILLUMINA,,,
2,ERR12954319,Heribaudiella fluviatilis,Sequencing data of the Phaeoexplorer project,This project collects genomic and transcriptom...,PRJEB72149,SAMEA115094190,,,,,ILLUMINA,,,


In [84]:
with duckdb.connect(master_db_path) as con:
    df = con.execute("""
    SELECT m.organism, s.seqhash_id, s.sequence
    FROM sequences s
    JOIN sra_metadata m ON s.sample_id = m.sample_id
    WHERE s.is_representative = TRUE;
    """).fetchdf()

In [85]:
df

Unnamed: 0,organism,seqhash_id,sequence
0,Mesoplasma florum,v1_DLS_0076d08a43a612188f215c23180111039b05c1b...,MXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...
1,Mesoplasma florum,v1_DLS_00dba5ffc2660fb5748dc31acb772b2ad64d737...,MKPKWFXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...
2,Mesoplasma florum,v1_DLS_01dd7c83e5a125c8fdd52f600ed3662e13d57e7...,MVIRMPMGGGIRALEHHSEALEAIFAHIPGIKTVMPSTPYDTKGLL...
3,Mesoplasma florum,v1_DLS_02340e54c7bf3192db0a8f6d8bad893802d33fb...,MYDFIPITVPNNNXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...
4,Mesoplasma florum,v1_DLS_02340e54c7bf3192db0a8f6d8bad893802d33fb...,MFYTTTKKQFHLLKDYKLNWQXXXXXXXXXXXXXXXXXXXXXXXXX...
...,...,...,...
56336,Ascophyllum nodosum,v1_DLS_708463fc43ff8b43c03f893d2a71dba98a21425...,MFSSVFDKVVSATAPDALATFPPRAALPVPPSTLCCLSFVDRFLTH...
56337,Ascophyllum nodosum,v1_DLS_70854bbb50bcc05f9a40dcbb148cc5e4c903011...,MRKPILVPYWSTWNYSIRIVSDQRSEGVCVVLGSPPFAVRGTSSLS...
56338,Ascophyllum nodosum,v1_DLS_708728a6e86ec522e7e23e2e24113ad92934efa...,MSGLTRGETAEPVSRDQILRLEGIQERRAAARQANRDAAMDRKERE...
56339,Ascophyllum nodosum,v1_DLS_708728a6e86ec522e7e23e2e24113ad92934efa...,MTAGRRVKPATMIDRAIAVVRDFNPATQTVDSYADDALLGGNSKTS...
