In [2]:
%load_ext autoreload
%autoreload 2
import logging
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

import duckdb
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

from planter.database.schema.schema_version import (SCHEMA_VERSIONS,
                                                    ensure_compatibility,
                                                    get_db_schema_version)


In [1]:
from planter.database.utils.duckdb_utils import (
    extract_representative_sequences, 
    merge_duckdbs, 
    update_duckdb_with_cluster_info,
    create_duckdb
)


In [26]:
# SRA run accessions processed in this notebook; later cells reuse `samples`
# and `outdir`, so both stay module-level names.
samples = [
    'SRR12068547',
    'SRR12068548',
    'SRR12068549',
    'SRR12068550',
    'SRR12068551',
    'SRR12068552',
]
outdir = Path('/mnt/data4/recombia.planter')

# One DuckDB file per sample, written to <outdir>/<sample>/<sample>.duckdb.
for sample in samples:
    create_duckdb(
        sample_id=sample,
        outdir=outdir,
        duckdb_out=outdir.joinpath(sample, f'{sample}.duckdb'),
    )

In [27]:
import duckdb
import pandas as pd
from pathlib import Path

def _format_stat(value, spec=""):
    """Format an aggregate value, rendering SQL NULL (Python ``None``) as ``N/A``.

    ``AVG``/``MAX`` over an empty table return NULL, and ``format(None, ".2f")``
    raises ``TypeError``; this helper keeps the report printable in that case.
    """
    if value is None:
        return "N/A"
    return format(value, spec)


def _quote_identifier(name):
    """Return ``name`` as a double-quoted SQL identifier (embedded quotes doubled)."""
    return '"' + str(name).replace('"', '""') + '"'


def _report_tables(con):
    """Print every table with its row count and return the list of table names."""
    print("\n=== Tables in database ===")
    rows = con.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall()
    table_names = [row[0] for row in rows]
    for table_name in table_names:
        count = con.execute(
            f"SELECT COUNT(*) FROM {_quote_identifier(table_name)}"
        ).fetchone()[0]
        print(f"Table: {table_name} - {count} rows")
    return table_names


def _report_schemas(con, table_names):
    """Print column name, type and primary-key flag for each table."""
    print("\n=== Schema for each table ===")
    for table_name in table_names:
        print(f"\nSchema for {table_name}:")
        # PRAGMA table_info takes the table name as a string literal; single
        # quotes are doubled so unusual table names cannot break the statement.
        literal = str(table_name).replace("'", "''")
        schema = con.execute(f"PRAGMA table_info('{literal}')").fetchall()
        for col in schema:
            # col[1] = column name, col[2] = type, col[5] = primary-key flag.
            print(f"  {col[1]} ({col[2]}){' PRIMARY KEY' if col[5] > 0 else ''}")


def _check_gene_protein_map(con):
    """Print gene/protein mapping totals and the genes mapped to the most proteins."""
    gene_protein_stats = con.execute("""
        SELECT 
            COUNT(DISTINCT gene_seqhash_id) AS total_genes,
            COUNT(DISTINCT protein_seqhash_id) AS total_proteins,
            COUNT(*) AS total_relationships
        FROM gene_protein_map
    """).fetchone()

    print(f"Total genes: {gene_protein_stats[0]}")
    print(f"Total proteins: {gene_protein_stats[1]}")
    print(f"Total gene-protein relationships: {gene_protein_stats[2]}")

    # Genes with more than one protein, largest first.
    multi_protein_genes = con.execute("""
        SELECT gene_seqhash_id, COUNT(protein_seqhash_id) as protein_count
        FROM gene_protein_map
        GROUP BY gene_seqhash_id
        HAVING COUNT(protein_seqhash_id) > 1
        ORDER BY COUNT(protein_seqhash_id) DESC
        LIMIT 5
    """).fetchall()

    if multi_protein_genes:
        print("\nTop 5 genes with multiple proteins:")
        for gene_id, protein_count in multi_protein_genes:
            print(f"  Gene {gene_id} has {protein_count} proteins")
            # Bound parameter instead of string interpolation: the id is data,
            # not SQL, and must not be able to alter the statement.
            proteins = con.execute(
                """
                SELECT protein_seqhash_id 
                FROM gene_protein_map 
                WHERE gene_seqhash_id = ?
                LIMIT 3
                """,
                [gene_id],
            ).fetchall()
            for protein in proteins:
                print(f"    - {protein[0]}")


def _check_expression(con):
    """Print expression coverage and how it links to gene-protein mappings."""
    expr_stats = con.execute("""
        SELECT 
            COUNT(DISTINCT gene_seqhash_id) AS genes_with_expression,
            COUNT(DISTINCT sample_id) AS samples_with_expression,
            AVG(tpm) AS avg_tpm
        FROM expression
    """).fetchone()

    print(f"Genes with expression data: {expr_stats[0]}")
    print(f"Samples with expression data: {expr_stats[1]}")
    # AVG is NULL when the table is empty.
    print(f"Average TPM: {_format_stat(expr_stats[2], '.2f')}")

    gene_expr_protein = con.execute("""
        SELECT 
            COUNT(DISTINCT e.gene_seqhash_id) AS genes_with_expr_and_protein,
            COUNT(DISTINCT gpm.protein_seqhash_id) AS proteins_linked_to_expr
        FROM expression e
        JOIN gene_protein_map gpm ON e.gene_seqhash_id = gpm.gene_seqhash_id
    """).fetchone()

    print(f"Genes with both expression and protein mappings: {gene_expr_protein[0]}")
    print(f"Proteins linked to genes with expression: {gene_expr_protein[1]}")


def _check_annotations(con):
    """Print annotation totals and their linkage to sequences, genes and expression."""
    anno_stats = con.execute("""
        SELECT 
            COUNT(*) AS total_annotations,
            COUNT(DISTINCT sample_id) AS samples_with_annotations
        FROM annotations
    """).fetchone()

    print(f"Total annotated sequences: {anno_stats[0]}")
    print(f"Samples with annotations: {anno_stats[1]}")

    # LEFT JOIN keeps annotated proteins whose gene has no expression row.
    anno_gene_expr = con.execute("""
        SELECT 
            COUNT(DISTINCT a.seqhash_id) AS annotated_proteins,
            COUNT(DISTINCT gpm.gene_seqhash_id) AS genes_of_annotated_proteins,
            COUNT(DISTINCT e.gene_seqhash_id) AS genes_with_annotation_and_expression
        FROM annotations a
        JOIN sequences s ON a.seqhash_id = s.seqhash_id
        JOIN gene_protein_map gpm ON s.seqhash_id = gpm.protein_seqhash_id
        LEFT JOIN expression e ON gpm.gene_seqhash_id = e.gene_seqhash_id
    """).fetchone()

    print(f"Annotated proteins: {anno_gene_expr[0]}")
    print(f"Genes of annotated proteins: {anno_gene_expr[1]}")
    print(f"Genes with both annotation and expression: {anno_gene_expr[2]}")


def _check_clusters(con):
    """Print cluster totals, membership totals and the three largest clusters."""
    cluster_stats = con.execute("""
        SELECT 
            COUNT(DISTINCT cluster_id) AS total_clusters,
            AVG(size) AS avg_cluster_size,
            MAX(size) AS largest_cluster
        FROM clusters
    """).fetchone()

    print(f"Total clusters: {cluster_stats[0]}")
    # AVG/MAX are NULL for an empty clusters table (the normal state of a
    # per-sample database before cluster info is loaded).
    print(f"Average cluster size: {_format_stat(cluster_stats[1], '.2f')}")
    print(f"Largest cluster size: {_format_stat(cluster_stats[2])}")

    member_stats = con.execute("""
        SELECT 
            COUNT(DISTINCT seqhash_id) AS unique_members,
            COUNT(*) AS total_membership_records
        FROM cluster_members
    """).fetchone()

    print(f"Unique sequences in clusters: {member_stats[0]}")
    print(f"Total cluster membership records: {member_stats[1]}")

    top_clusters = con.execute("""
        SELECT cluster_id, size
        FROM clusters
        ORDER BY size DESC
        LIMIT 3
    """).fetchall()

    if top_clusters:
        print("\nTop 3 largest clusters:")
        for cluster in top_clusters:
            print(f"  Cluster {cluster[0]}: {cluster[1]} members")


def _run_section(con, title, label, check):
    """Print a section header and run ``check(con)``, reporting database errors.

    Only ``duckdb.Error`` is caught (e.g. a table missing from an older
    schema), so one failing section does not abort the report while genuine
    programming errors still surface.
    """
    print(f"\n=== {title} ===")
    try:
        check(con)
    except duckdb.Error as e:
        print(f"Error checking {label}: {e}")


def validate_duckdb_schema(db_path):
    """Validate schema and relationships in the database.

    Prints a report to stdout covering, in order: row count per table, column
    schema per table, gene-protein mapping statistics, expression statistics,
    annotation statistics and cluster statistics.

    Args:
        db_path: Location of the DuckDB database file (``str`` or path-like).

    Returns:
        None. The report is printed rather than returned.

    Raises:
        duckdb.Error: If the database cannot be opened or the table listing /
            schema queries fail. Errors inside the four statistics sections
            are reported in the output instead of being raised.
    """
    print(f"Validating database: {db_path}")

    # The report only reads, so open read-only: no write lock is taken and a
    # mistyped path fails instead of creating a new empty database file.
    with duckdb.connect(str(db_path), read_only=True) as con:
        # 1. Tables and row counts
        table_names = _report_tables(con)

        # 2. Column schema of each table
        _report_schemas(con, table_names)

        # 3-6. Relationship and content checks; each is isolated so a missing
        # table in one section does not hide the others.
        _run_section(con, "Gene-Protein Relationships", "gene-protein relationships",
                     _check_gene_protein_map)
        _run_section(con, "Expression Data", "expression data", _check_expression)
        _run_section(con, "Annotation Data", "annotation data", _check_annotations)
        _run_section(con, "Cluster Data", "cluster data", _check_clusters)

# Usage: validate the first per-sample database. The path is derived from the
# `outdir` and `samples` values defined in the database-creation cell so the
# location is not duplicated as a second hard-coded literal.
db_path = str(outdir / samples[0] / f"{samples[0]}.duckdb")
validate_duckdb_schema(db_path)

Validating database: /mnt/data4/recombia.planter/SRR12068547/SRR12068547.duckdb

=== Tables in database ===
Table: annotations - 251 rows
Table: clusters - 0 rows
Table: cluster_members - 0 rows
Table: ec_numbers - 114 rows
Table: expression - 365 rows
Table: gene_protein_map - 398 rows
Table: go_terms - 267 rows
Table: schema_version - 0 rows
Table: sequences - 398 rows
Table: sra_metadata - 1 rows

=== Schema for each table ===

Schema for annotations:
  seqhash_id (VARCHAR) PRIMARY KEY
  seed_ortholog (VARCHAR)
  evalue (DOUBLE)
  score (DOUBLE)
  eggnog_ogs (VARCHAR)
  max_annot_lvl (VARCHAR)
  cog_category (VARCHAR)
  description (VARCHAR)
  preferred_name (VARCHAR)
  sample_id (VARCHAR)

Schema for clusters:
  cluster_id (VARCHAR) PRIMARY KEY
  representative_seqhash_id (VARCHAR)
  size (INTEGER)

Schema for cluster_members:
  seqhash_id (VARCHAR) PRIMARY KEY
  cluster_id (VARCHAR) PRIMARY KEY

Schema for ec_numbers:
  seqhash_id (VARCHAR) PRIMARY KEY
  ec_number (VARCHAR) PRIMARY KEY
  ... [output truncated]

In [28]:
# Merge every per-sample database into <outdir>/master.duckdb.
# NOTE(review): the recorded run of this cell failed with
# `CatalogException: Table with name "expression_backup" already exists!`.
# Presumably the migration script passed as `schema_sql_path` is applied to a
# database that already holds that table (e.g. a master.duckdb left over from
# an earlier run) — TODO confirm against merge_duckdbs and the migration SQL.
merge_duckdbs(
    duckdb_paths=[
        outdir.joinpath(sample_id, f'{sample_id}.duckdb')
        for sample_id in samples
    ],
    master_db_path=outdir.joinpath('master.duckdb'),
    schema_sql_path=Path('../planter/database/schema/migrations/004_add_gene_protein_map.sql'),
    upgrade_schema=True,
    target_schema_version=None,
)

CatalogException: Catalog Error: Table with name "expression_backup" already exists!