In [2]:
%load_ext autoreload
%autoreload 2
import logging
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

import duckdb
import pandas as pd

In [3]:
from planter.database.utils.duckdb_utils import (
    extract_representative_sequences, 
    create_duckdb,
    merge_duckdbs,
    validate_duckdb_schema,
    update_clusters
)

samples = ['SRR12068547', 'SRR12068548', 'SRR12068549', 'SRR12068550', 'SRR12068551', 'SRR12068552']
outdir = Path('/mnt/data4/recombia.planter')
cluster_path = '/mnt/data4/planter_outputs/tmp/newClusterDB.tsv'

# 1. Create duckdbs for Mesoplasma samples
for sample in samples:
    duckdb_path = outdir / f'{sample}/{sample}.duckdb'
    if duckdb_path.exists():
        # remove the duckdb file
        duckdb_path.unlink()
    create_duckdb(
        sample_id=sample,
        outdir=outdir,
        duckdb_out=outdir / f'{sample}/{sample}.duckdb'
    )

# 2. Merge all duckdbs into a master duckdb
merge_duckdbs(
    duckdb_paths=[outdir / f'{sample}/{sample}.duckdb' for sample in samples],
    master_db_path=outdir / 'master.duckdb',
    schema_sql_path=Path('../planter/database/schema/migrations/004_add_gene_protein_map.sql'),
    upgrade_schema=True,
    target_schema_version=None
)

# 3. Validate the master duckdb
validate_duckdb_schema(outdir / 'master.duckdb')

# 4. Add cluster info to the master duckdb
update_clusters(
    db_path=outdir / 'master.duckdb',
    tsv_path=cluster_path,
    backup_first=True
)


Validating database: /mnt/data4/recombia.planter/master.duckdb

=== Tables in database ===
Table: annotations - 2049722 rows
Table: clusters - 428254 rows
Table: cluster_members - 433839 rows
Table: ec_numbers - 571387 rows
Table: expression - 3091553 rows
Table: gene_protein_map - 745417 rows
Table: go_terms - 74845016 rows
Table: kegg_info - 1626 rows
Table: schema_version - 0 rows
Table: sequences - 2489980 rows
Table: sra_metadata - 100 rows

=== Schema for each table ===

Schema for annotations:
  seqhash_id (VARCHAR) PRIMARY KEY
  seed_ortholog (VARCHAR)
  evalue (DOUBLE)
  score (DOUBLE)
  eggnog_ogs (VARCHAR)
  max_annot_lvl (VARCHAR)
  cog_category (VARCHAR)
  description (VARCHAR)
  preferred_name (VARCHAR)
  sample_id (VARCHAR)

Schema for clusters:
  cluster_id (VARCHAR) PRIMARY KEY
  representative_seqhash_id (VARCHAR)
  size (INTEGER)

Schema for cluster_members:
  seqhash_id (VARCHAR) PRIMARY KEY
  cluster_id (VARCHAR)

Schema for ec_numbers:
  seqhash_id (VARCHAR) PRIMA

Starting cluster update for database: /mnt/data4/recombia.planter/master.duckdb
Using clustering data from: /mnt/data4/planter_outputs/tmp/newClusterDB.tsv
Creating backup at: /mnt/data4/recombia.planter/master.duckdb.backup


Annotated proteins: 622261
Genes of annotated proteins: 622261
Genes with both annotation and expression: 622259

=== Cluster Data ===
Total clusters: 428254
Average cluster size: 1.01
Largest cluster size: 11
Unique sequences in clusters: 433839
Total cluster membership records: 433839

Top 3 largest clusters:
  Cluster v1_DLS_b6cf04a60dfba05cfefdf7b30e5096b124df4862e8ca750b224ce458186859b2.p1: 11 members
  Cluster v1_DLS_9840696f6ab23bf92c6f0052abb5a1155bb216e1fd9f4cf72640dc61d0d1067f.p1: 10 members
  Cluster v1_DLS_9cc2c904a6081b260551b7098f155bbd47bed2065f5df13581bf137cec7793e6.p1: 9 members


Beginning database transaction
Dropping existing cluster tables...
Recreating cluster tables...
Loading cluster data from TSV...
Loaded 434196 entries from clustering TSV
Identifying missing sequences...
Found 405 unique sequences missing from database
Found 97 unique representatives missing from database
Examples of missing sequences:
  1. v1_DLS_949373a9e71f2f214edd35ea996aa191e86d68078100881964c9ae62a807bdfc.p1
  2. v1_DLS_980f3743865237442263468769fa1d181375a50a71cbc2a8f4d9b5653c61631e.p1
  3. v1_DLS_f849657514ab54a94eafc43e9269145cb965b63f4fc5c9dae3f03f93d39d3e29.p1
  4. v1_DLS_fad3debbef4f903e627bf3302904961b8a2e0881b1413428a37f724f889654e0.p3
  5. v1_DLS_e43154b19a472f3a71380e77f39d01c164328ebd8ceeaff426b375e21dc96444.p1
  6. v1_DLS_ea79808b029b69dd603e99ce798c5b9babd7b7fa57934542d938192d0feeeab4.p1
  7. v1_DLS_5f9281e47aef25e07b74bf6c464e6d1febca8efceb8b5d4526d5ac98a5ee2322.p2
  8. v1_DLS_b9c34104a087753049bf5ff737c06fdc880308e775d72d943695eff2481b0650.p1
  9. v1_DLS_26eecf7a81

'/mnt/data4/recombia.planter/master.duckdb.backup'

In [4]:
merge_duckdbs(
    duckdb_paths=[outdir / f'{sample}/{sample}.duckdb' for sample in samples],
    master_db_path=outdir / 'master.duckdb.backup',
    schema_sql_path=Path('../planter/database/schema/migrations/004_add_gene_protein_map.sql'),
    upgrade_schema=True,
    target_schema_version=None
)

Using existing master database at /mnt/data4/recombia.planter/master.duckdb.backup


Merging databases using schema version 2
Existing tables in master database: go_terms, cluster_members, gene_protein_map, sra_metadata, kegg_info, annotations, ec_numbers, clusters, sequences, schema_version, expression
Attaching /mnt/data4/recombia.planter/SRR12068547/SRR12068547.duckdb as db0...
Source database schema version: 2
Tables in source database: go_terms, cluster_members, gene_protein_map, sra_metadata, kegg_info, annotations, ec_numbers, clusters, sequences, schema_version, expression
  - Inserted rows in sra_metadata: 100
  - Inserted rows in sequences: 2489980
  - Inserted rows in annotations: 2049722
  - Inserted rows in go_terms: 74845016
  - Inserted rows in ec_numbers: 571387
  - Inserted rows in clusters: 428254
  - Inserted rows in cluster_members: 433839
Processed 398 rows for gene_protein_map individually
  - Inserted rows in gene_protein_map: 745417
  - Inserted rows in expression: 3091553
Adapting kegg_info schema: matching 6 columns
Target columns: ['seqhash_i

'/mnt/data4/recombia.planter/master.duckdb.backup'

In [3]:
def check_tables_in_db(db_path):
    """
    Check which tables exist in a DuckDB database.
    
    Args:
        db_path: Path to the DuckDB file
        
    Returns:
        Dictionary of table names and their row counts
    """
    db_path = Path(db_path)
    
    if not db_path.exists():
        print(f"Database file does not exist: {db_path}")
        return {}
    
    try:
        with duckdb.connect(str(db_path)) as conn:
            # Get all table names
            tables = conn.execute(
                "SELECT name FROM sqlite_master WHERE type='table'"
            ).fetchall()
            
            # Create a dictionary to store table info
            table_info = {}
            
            # For each table, get row count and column info
            for table_row in tables:
                table_name = table_row[0]
                
                # Get row count
                try:
                    row_count = conn.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]
                except Exception as e:
                    row_count = f"Error: {str(e)}"
                
                # Get column info
                try:
                    columns = conn.execute(f"PRAGMA table_info({table_name})").fetchall()
                    column_names = [col[1] for col in columns]
                except Exception as e:
                    column_names = [f"Error: {str(e)}"]
                
                # Store table info
                table_info[table_name] = {
                    "row_count": row_count,
                    "columns": column_names
                }
            
            return table_info
    except Exception as e:
        print(f"Error connecting to database: {str(e)}")
        return {}
    
# db_path = "/mnt/data4/recombia.planter/SRR12068547/SRR12068547.duckdb"
db_path = '/mnt/data4/recombia.planter/master.duckdb'
check_tables_in_db(db_path)

{'annotations': {'row_count': 2049722,
  'columns': ['seqhash_id',
   'seed_ortholog',
   'evalue',
   'score',
   'eggnog_ogs',
   'max_annot_lvl',
   'cog_category',
   'description',
   'preferred_name',
   'sample_id']},
 'clusters': {'row_count': 433785,
  'columns': ['cluster_id', 'representative_seqhash_id', 'size']},
 'cluster_members': {'row_count': 2489980,
  'columns': ['seqhash_id', 'cluster_id']},
 'ec_numbers': {'row_count': 571387, 'columns': ['seqhash_id', 'ec_number']},
 'expression': {'row_count': 3091553,
  'columns': ['gene_seqhash_id',
   'sample_id',
   'tpm',
   'num_reads',
   'effective_length']},
 'gene_protein_map': {'row_count': 744572,
  'columns': ['gene_seqhash_id', 'protein_seqhash_id']},
 'go_terms': {'row_count': 74845016, 'columns': ['seqhash_id', 'go_term']},
 'kegg_info': {'row_count': 0,
  'columns': ['seqhash_id',
   'kegg_ko',
   'kegg_pathway',
   'kegg_module',
   'kegg_reaction',
   'kegg_rclass']},
 'schema_version': {'row_count': 0,
  'colum

In [85]:
db_path

'/mnt/data4/recombia.planter/master.duckdb'

In [None]:
cluster_path = '/mnt/data4/planter_outputs/tmp/newClusterDB.tsv'
from planter.database.utils.duckdb_utils import (
    update_clusters
)

update_clusters(
    db_path=db_path,
    tsv_path=cluster_path,
    backup_first=True
)


In [None]:
import pandas as pd
clusters = pd.read_csv(cluster_path, sep='\t', header=None, names=['cluster_id', 'seqhash_id'])
clusters[clusters['cluster_id'] == 'v1_DLS_42f03501d692b370647a4bac7059434916aa1d21968fca917aa214f32f2ced17.p1'].shape

(35, 2)

In [74]:
def debug_foreign_key_issues(
    db_path: Union[str, Path],
    problematic_id: str = "v1_DLS_19696ccb77b302cd0cdff9203ea61ab6436f11d2972fe64be97b00a5961373da.p1"
) -> None:
    """
    Debug foreign key constraint issues with a specific cluster ID.
    
    Args:
        db_path: Path to the DuckDB database
        problematic_id: The specific ID causing constraint issues
    """
    db_path = str(db_path)
    con = duckdb.connect(db_path)
    
    try:
        print(f"Debugging foreign key constraints for ID: {problematic_id}")
        
        # 1. First, check if this ID exists in the clusters table
        cluster_exists = con.execute(
            f"SELECT COUNT(*) FROM clusters WHERE cluster_id = '{problematic_id}'"
        ).fetchone()[0]
        print(f"ID exists in clusters table: {cluster_exists > 0}")
        
        # 2. Get all tables in the database
        tables = con.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall()
        print(f"Found {len(tables)} tables in the database")
        
        # 3. Check each table to see if it references clusters
        references = []
        for table in tables:
            table_name = table[0]
            if table_name == 'clusters':
                continue
                
            try:
                # Check if the table has foreign keys to clusters
                fk_info = con.execute(f"PRAGMA foreign_key_list({table_name})").fetchall()
                for fk in fk_info:
                    if fk[2] == 'clusters':
                        ref_table = table_name
                        ref_col = fk[3]  # Referenced column in clusters
                        local_col = fk[4]  # Local column in this table
                        references.append((ref_table, local_col, ref_col))
                        print(f"Table {ref_table} references clusters.{ref_col} via {local_col}")
            except Exception as e:
                print(f"Error checking foreign keys for table {table_name}: {str(e)}")
        
        # 4. For each table that references clusters, check if our problematic ID is referenced
        found_references = False
        for ref_table, local_col, ref_col in references:
            try:
                if ref_col == 'cluster_id':
                    # Check if this table references our problematic ID
                    ref_count = con.execute(
                        f"SELECT COUNT(*) FROM {ref_table} WHERE {local_col} = '{problematic_id}'"
                    ).fetchone()[0]
                    
                    if ref_count > 0:
                        found_references = True
                        print(f"FOUND REFERENCE: Table {ref_table} has {ref_count} rows referencing the problematic ID")
                        
                        # Get sample rows to understand the reference
                        sample_rows = con.execute(
                            f"SELECT * FROM {ref_table} WHERE {local_col} = '{problematic_id}' LIMIT 3"
                        ).fetchall()
                        print(f"Sample rows from {ref_table}:")
                        for row in sample_rows:
                            print(f"  {row}")
            except Exception as e:
                print(f"Error checking references in table {ref_table}: {str(e)}")
        
        if not found_references:
            print("No explicit references to the problematic ID were found.")
            print("This might indicate a schema issue or a constraint that's not properly detected.")
            
        # 5. Check if the ID might be in a different format or location
        print("\nChecking for ID in different formats:")
        for table in tables:
            table_name = table[0]
            try:
                # Get column names for this table
                columns = con.execute(f"PRAGMA table_info({table_name})").fetchall()
                column_names = [col[1] for col in columns]
                
                # Check each column that might contain IDs
                for col in column_names:
                    if "id" in col.lower() or "hash" in col.lower() or "key" in col.lower():
                        # Look for exact match
                        exact_match = con.execute(
                            f"SELECT COUNT(*) FROM {table_name} WHERE {col} = '{problematic_id}'"
                        ).fetchone()[0]
                        
                        # Look for partial match
                        partial_match = con.execute(
                            f"SELECT COUNT(*) FROM {table_name} WHERE {col} LIKE '%{problematic_id}%'"
                        ).fetchone()[0]
                        
                        if exact_match > 0:
                            print(f"Table {table_name}.{col} contains exact match: {exact_match} rows")
                        elif partial_match > 0:
                            print(f"Table {table_name}.{col} contains partial match: {partial_match} rows")
            except Exception as e:
                print(f"Error checking table {table_name} columns: {str(e)}")
                
        # 6. Check if the clusters table has the expected structure
        print("\nClusters table structure:")
        cluster_cols = con.execute("PRAGMA table_info(clusters)").fetchall()
        for col in cluster_cols:
            print(f"  {col}")
            
        print("\nCluster_members table structure:")
        member_cols = con.execute("PRAGMA table_info(cluster_members)").fetchall()
        for col in member_cols:
            print(f"  {col}")
        
    except Exception as e:
        print(f"Error during debugging: {str(e)}")
    finally:
        con.close()

debug_foreign_key_issues(db_path)

Debugging foreign key constraints for ID: v1_DLS_19696ccb77b302cd0cdff9203ea61ab6436f11d2972fe64be97b00a5961373da.p1
ID exists in clusters table: True
Found 12 tables in the database
Error checking foreign keys for table annotations: Catalog Error: Pragma Function with name foreign_key_list does not exist!
Did you mean "force_checkpoint"?
Error checking foreign keys for table cluster_members: Catalog Error: Pragma Function with name foreign_key_list does not exist!
Did you mean "force_checkpoint"?
Error checking foreign keys for table ec_numbers: Catalog Error: Pragma Function with name foreign_key_list does not exist!
Did you mean "force_checkpoint"?
Error checking foreign keys for table expression: Catalog Error: Pragma Function with name foreign_key_list does not exist!
Did you mean "force_checkpoint"?
Error checking foreign keys for table expression_backup: Catalog Error: Pragma Function with name foreign_key_list does not exist!
Did you mean "force_checkpoint"?
Error checking fore

In [6]:
def check_sequence_id(
    db_path: Union[str, Path],
    tsv_path: Union[str, Path],
    seqhash_id: str = "v1_DLS_a0a80e8508b66a50baba10818dc3b384cc13f0efb454dde8fc2c0dba1c936b19.p1"
) -> None:
    """
    Check if a specific sequence ID exists in the database and clustering TSV.
    
    Args:
        db_path: Path to the DuckDB database
        tsv_path: Path to the TSV file with cluster info
        seqhash_id: The specific sequence ID to check
    """
    db_path = str(db_path)
    tsv_path = str(tsv_path)
    
    print(f"Checking for sequence ID: {seqhash_id}")
    
    # Check database tables
    con = duckdb.connect(db_path)
    try:
        # Check if this ID exists in sequences table
        exists_in_sequences = con.execute(
            f"SELECT COUNT(*) FROM sequences WHERE seqhash_id = '{seqhash_id}'"
        ).fetchone()[0]
        
        print(f"ID exists in sequences table: {exists_in_sequences > 0}")
        
        if exists_in_sequences > 0:
            # Get details about this sequence
            seq_details = con.execute(
                f"SELECT * FROM sequences WHERE seqhash_id = '{seqhash_id}'"
            ).fetchone()
            print(f"Sequence details: {seq_details}")
        
        # Check TSV file for this ID
        con.execute(f"""
            CREATE TEMP TABLE clustering_check AS
            SELECT 
                column0 AS representative_seqhash_id,
                column1 AS seqhash_id
            FROM read_csv_auto('{tsv_path}', sep='\t', header=FALSE)
        """)
        
        # Check if ID appears as a sequence in clusters
        as_sequence = con.execute(
            f"SELECT COUNT(*) FROM clustering_check WHERE seqhash_id = '{seqhash_id}'"
        ).fetchone()[0]
        
        # Check if ID appears as a representative
        as_representative = con.execute(
            f"SELECT COUNT(*) FROM clustering_check WHERE representative_seqhash_id = '{seqhash_id}'"
        ).fetchone()[0]
        
        print(f"ID appears in TSV as sequence: {as_sequence > 0} ({as_sequence} times)")
        print(f"ID appears in TSV as representative: {as_representative > 0} ({as_representative} times)")
        
        # Get sample of problematic entries from TSV
        if as_sequence > 0:
            sample = con.execute(
                f"SELECT * FROM clustering_check WHERE seqhash_id = '{seqhash_id}' LIMIT 3"
            ).fetchall()
            print(f"Sample entries from TSV where ID is sequence:")
            for entry in sample:
                print(f"  {entry}")
        
        # Count total sequences in TSV not in sequences table
        missing_seqs = con.execute("""
            SELECT COUNT(DISTINCT tc.seqhash_id) 
            FROM clustering_check tc
            LEFT JOIN sequences s ON tc.seqhash_id = s.seqhash_id
            WHERE s.seqhash_id IS NULL
        """).fetchone()[0]
        
        print(f"Total sequences in TSV missing from sequences table: {missing_seqs}")
        
        # Get sample of missing sequences
        if missing_seqs > 0:
            sample_missing = con.execute("""
                SELECT DISTINCT tc.seqhash_id 
                FROM clustering_check tc
                LEFT JOIN sequences s ON tc.seqhash_id = s.seqhash_id
                WHERE s.seqhash_id IS NULL
                LIMIT 10
            """).fetchall()
            
            print("Sample of missing sequences:")
            for i, (seq_id,) in enumerate(sample_missing, 1):
                print(f"  {i}. {seq_id}")
    
    except Exception as e:
        print(f"Error during check: {str(e)}")
    finally:
        try:
            con.execute("DROP TABLE IF EXISTS clustering_check")
        except:
            pass
        con.close()

check_sequence_id(db_path, cluster_path)

Checking for sequence ID: v1_DLS_a0a80e8508b66a50baba10818dc3b384cc13f0efb454dde8fc2c0dba1c936b19.p1
ID exists in sequences table: False
ID appears in TSV as sequence: True (1 times)
ID appears in TSV as representative: False (0 times)
Sample entries from TSV where ID is sequence:
  ('v1_DLS_76400afa768372fd4118749f12b4252d972ea5024d4a582eaae1406cdf2c409b.p1', 'v1_DLS_a0a80e8508b66a50baba10818dc3b384cc13f0efb454dde8fc2c0dba1c936b19.p1')
Total sequences in TSV missing from sequences table: 2585
Sample of missing sequences:
  1. v1_DLS_0aa8c2e83469e1ffc35fcd62036e912dc065345e35af76618ad10d60c93254a3.p1
  2. v1_DLS_535ef37f0da2d881934117e345d4b080562c095bf4261380f2e186568bf6ce93.p1
  3. v1_DLS_ad38a37bc806cbc7f948fa82d627b090ebd1dd34198b0690c51207a76b2c64ed.p1
  4. v1_DLS_5e3b94322da686cedfffbf7ad8b27321fb217fb3036f886eb871e70d16898a1a.p1
  5. v1_DLS_f973093a0658b4dddb4aefcece0e9bf8368a87a5b3708d8ee2a50330bdb478fb.p2
  6. v1_DLS_b4948245d0398f054a1a41528136e3577c125a65504f2009ad6ea667c89e