In [3]:
# Reload edited project modules automatically so changes to the planter
# package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Show the value of every top-level expression statement in a cell,
# not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# Explicit rich-display helper; the cells below call it inside `with` blocks.
from IPython.display import display

import duckdb
import pandas as pd
from pathlib import Path
import logging
from Bio import SeqIO
from datetime import datetime
import time

from typing import List, Optional, Union, Dict, Set

# Notebook-level logger, with root logging configured at INFO level
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


# Build initial database

We use the `SequenceDBBuilder` class from our `planter.database.builder` module to build the database.

TODO: Add logic to only insert sequences that don't already exist in the database.

TODO: There appears to be some duplication (with slight differences) in how the context managers are used; the two `with` statements should be able to be combined into one.

In [9]:
from planter.database.builder import SequenceDBBuilder
# Pipeline output directory and the DuckDB file that the builder writes to
OUTPUT_DIR = Path("/mnt/data2/planter_outputs")
DB_PATH = "/mnt/data2/planter_outputs/planter-test.duckdb"
# CLUSTER_TSV = "/mnt/data/planter_outputs/clusters.tsv"  # if you have clustering results

# SRA IDs to process. Alternative selections are kept below, commented out.
#
# Small subset:
# sample_ids = ['ERR9123871', 'ERR9123872', 'ERR9123874', 'ERR9123875', 'ERR9123876']
#
# Larger list:
# sample_ids = [
#     'ERR9123871', 'ERR9123872', 'ERR9123874', 'ERR9123875', 'ERR9123876', 
#     'ERR9123877', 'ERR9123878', 'ERR9123879', 'ERR9123880', 'ERR9123881', 
#     'ERR9123882', 'SRR10444679', 'SRR10444680', 'SRR10444681', 'SRR10444682', 
#     'SRR10444683', 'SRR10444684', 'SRR11011255', 'SRR11011256', 'SRR11011257', 
#     'SRR11011258', 'SRR11011259', 'SRR11011260', 'SRR12068547', 'SRR128113', 
#     'SRR128114', 'SRR13765006', 'SRR14292007', 'SRR14292008', 'SRR18070778', 
#     'SRR18070779', 'SRR18070780', 'SRR18070781', 'SRR18070782', 'SRR18070783', 
#     'SRR18070784', 'SRR18070785', 'SRR18070786', 'SRR18070787', 'SRR18070788', 
#     'SRR18070789', 'SRR18070790', 'SRR18070791', 'SRR18070792', 'SRR18070793', 
#     'SRR18070794', 'SRR18070795', 'SRR18735292', 'SRR19034772', 'SRR19034773', 
#     'SRR19619612', 'SRR19619613', 'SRR19619614', 'SRR22271585', 'SRR22271586', 
#     'SRR22271587', 'SRR22271588', 'SRR22271589', 'SRR22904707', 'SRR24974225', 
#     'SRR24974226', 'SRR24974227', 'SRR24974228', 'SRR25582085', 'SRR29366264', 
#     'SRR29366265', 'SRR29366266', 'SRR5489198', 'SRR5992919', 'SRR5992920', 
#     'SRR6048009', 'SRR8859643', 'SRR8859644', 'SRR8859645', 
#     'SRR8859646', 'SRR8859647', 'SRR8859648'
# ]
sample_ids = ['ERR9123871']

with SequenceDBBuilder(DB_PATH, output_dir=OUTPUT_DIR) as builder:
    # Load sequences and annotations for the selected samples,
    # then show the per-sample build report.
    results = builder.build_database(sample_ids)
    display(results)

    # Clustering results can optionally be loaded here when a cluster TSV exists:
    # if Path(CLUSTER_TSV).exists():
    #     builder.load_clusters_from_tsv(CLUSTER_TSV)

    # Database-wide summary after the build
    summary = builder.get_database_summary()
    display(summary)



INFO:planter.database.builder:Applying migration: 000_schema_version.sql
INFO:planter.database.builder:Applying migration: 001_initial_schema.sql
INFO:planter.database.builder:Applying migration: 002_add_indexes.sql
INFO:planter.database.builder:Fetching metadata for ERR9123871
INFO:planter.database.utils.sra:Fetching metadata for ERR9123871


INFO:planter.database.builder:Loading sequences from /mnt/data2/planter_outputs/ERR9123871/transdecoder/ERR9123871.pep
INFO:planter.database.builder:Loaded 30281 new sequences for ERR9123871
INFO:planter.database.builder:Loading annotations from /mnt/data2/planter_outputs/ERR9123871/eggnog/ERR9123871.emapper.annotations
INFO:planter.database.builder:Processed ERR9123871: 30281 sequences, 27884 annotations, 0 duplicates


Unnamed: 0,sample_id,status,sequences_loaded,annotations_loaded,duplicates,error
0,ERR9123871,success,30281,27884,0,


Unnamed: 0,total_sequences,total_samples,representative_sequences,annotated_sequences,sequences_with_go,sequences_with_ec,total_clusters,avg_sequence_length,min_sequence_length,max_sequence_length
0,30281,1,0,27884,15140,6682,0,441.07,100,3243


# Query

## Get the database summary

In [2]:
from planter.database.query_manager import DatabaseManager
# DuckDB file queried by the cells in this section
db_path = "/mnt/data2/planter_outputs/planter2.duckdb"

# Open the database, compute the summary table, and show it
with DatabaseManager(db_path) as db_manager:
    queries = db_manager.query_manager
    summary = queries.database_summary()
    display(summary)

Unnamed: 0,total_sequences,total_samples,representative_sequences,annotated_sequences,sequences_with_go,sequences_with_ec,total_clusters,avg_sequence_length,min_sequence_length,max_sequence_length
0,1985570,77,0,1637348,785398,387029,0,430.94,100,7993


## See what tables and schemas are present in the database

In [3]:
with DatabaseManager(db_path) as db_manager:
    con = db_manager.con

    # List every table in the database
    tables = con.execute("SHOW TABLES;").fetchdf()
    print("Tables in database:")
    print(tables)

    # For each table, look up its column definitions and show them
    for table in tables['name']:
        print(f"\nSchema for table: {table}")
        schema = con.execute(f"PRAGMA table_info('{table}');").fetchdf()
        display(schema)

Tables in database:
              name
0      annotations
1  cluster_members
2         clusters
3       ec_numbers
4         go_terms
5        kegg_info
6   schema_version
7        sequences
8     sra_metadata

Schema for table: annotations


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,seqhash_id,VARCHAR,True,,True
1,1,seed_ortholog,VARCHAR,False,,False
2,2,evalue,DOUBLE,False,,False
3,3,score,DOUBLE,False,,False
4,4,eggnog_ogs,VARCHAR,False,,False
5,5,max_annot_lvl,VARCHAR,False,,False
6,6,cog_category,VARCHAR,False,,False
7,7,description,VARCHAR,False,,False
8,8,preferred_name,VARCHAR,False,,False
9,9,sample_id,VARCHAR,True,,False



Schema for table: cluster_members


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,seqhash_id,VARCHAR,True,,True
1,1,cluster_id,VARCHAR,True,,False



Schema for table: clusters


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,cluster_id,VARCHAR,True,,True
1,1,representative_seqhash_id,VARCHAR,True,,False
2,2,size,INTEGER,True,,False



Schema for table: ec_numbers


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,seqhash_id,VARCHAR,True,,True
1,1,ec_number,VARCHAR,True,,True



Schema for table: go_terms


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,seqhash_id,VARCHAR,True,,True
1,1,go_term,VARCHAR,True,,True



Schema for table: kegg_info


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,seqhash_id,VARCHAR,True,,True
1,1,kegg_ko,VARCHAR,False,,False
2,2,kegg_pathway,VARCHAR,False,,False
3,3,kegg_module,VARCHAR,False,,False
4,4,kegg_reaction,VARCHAR,False,,False
5,5,kegg_rclass,VARCHAR,False,,False



Schema for table: schema_version


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,version,INTEGER,True,,True
1,1,migration_name,VARCHAR,True,,False
2,2,applied_at,TIMESTAMP,False,CURRENT_TIMESTAMP,False



Schema for table: sequences


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,seqhash_id,VARCHAR,True,,True
1,1,sequence,VARCHAR,True,,False
2,2,sample_id,VARCHAR,True,,False
3,3,assembly_date,TIMESTAMP,True,,False
4,4,is_representative,BOOLEAN,True,CAST('f' AS BOOLEAN),False
5,5,length,INTEGER,True,,False



Schema for table: sra_metadata


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,sample_id,VARCHAR,True,,True
1,1,organism,VARCHAR,False,,False
2,2,study_title,VARCHAR,False,,False
3,3,study_abstract,VARCHAR,False,,False
4,4,bioproject,VARCHAR,False,,False
5,5,biosample,VARCHAR,False,,False
6,6,library_strategy,VARCHAR,False,,False
7,7,library_source,VARCHAR,False,,False
8,8,library_selection,VARCHAR,False,,False
9,9,library_layout,VARCHAR,False,,False


## Get sequence annotations for seqhash IDs

In [1]:
from planter.database.query_manager import DatabaseManager
import pandas as pd

# DuckDB file to query
db_path = "/mnt/data2/planter_outputs/planter2.duckdb"

# Test some seqhash IDs from your MMseqs results
test_ids = [
   "v1_DLS_8813cfa2d04ae4cf4c316e335a01c9d81b66255681bc3e51a98cd00dc5563466.p2",
   "v1_DLS_416333688207b36e52865182ab219b807e55f9b9cc56f30bb249fd4faa38e3da.p1",
   "v1_DLS_9ec0dd0c4e615b3c23b1b8e475cd0dc87f96b87d66fb13d2d033dc87dc652ca1.p1",
   "v1_DLS_633f05f5d947805a8eae5b38eea7cdd090f48c3dc1e3ce89eb0b6732042591aa.p1",
   "v1_DLS_700011b382d7c61a71117db9be1cfde6ad9435a22f74e60d769f57737d13c55b.p1"
]

with DatabaseManager(db_path) as db:
    # The ID list is passed wrapped in a one-element tuple as `values`.
    annotations = db.query_manager.sequence_annotations(values=(test_ids,))
    # display() renders a plain str as its repr (quotes and a literal "\n"),
    # so print the heading and rich-display only the result table.
    print("\nAnnotations:")
    display(annotations)

'\nAnnotations:'

Unnamed: 0,seqhash_id,sample_id,organism,description,preferred_name,cog_category,go_terms,ec_numbers,kegg_ko,kegg_pathway
0,v1_DLS_633f05f5d947805a8eae5b38eea7cdd090f48c3...,SRR10444679,Xanthoria parietina,PKS_DH,-,Q,,,,
1,v1_DLS_8813cfa2d04ae4cf4c316e335a01c9d81b66255...,SRR128114,Digitalis purpurea,Belongs to the UDP-glycosyltransferase family,-,CG,,2.4.1.210,,
2,v1_DLS_416333688207b36e52865182ab219b807e55f9b...,SRR24974225,Digitalis purpurea,Belongs to the UDP-glycosyltransferase family,-,CG,,,,


## Search sequences

You can parameterize the search.

The example below filters by GO term and sequence length; filters for sample ID and COG category are also shown but are commented out.

In [5]:
from planter.database.query_manager import DatabaseManager

db_path = "/mnt/data2/planter_outputs/planter2.duckdb"
output_dir = "/mnt/data2/planter_outputs"

# Example filter values; only the GO terms are used by the active filters below
sample_ids = ["SRR18070780", "SRR10444679"]
cog_categories = ['Q']  # desired COG categories
go_terms = ['GO:0075109']  # example GO terms to search

# Search filters. Commented-out entries are optional filters left disabled.
params = {
    # "sample_id_condition": sample_ids,  # e.g., "s.sample_id IN ('SRR18070780', 'SRR18070781')"
    # "cog_category_condition": ['M', 'Q'],
    "go_term_condition": go_terms,  # pass a list of GO terms
    "min_length_condition": "s.length >= 100",
    "max_length_condition": "s.length <= 1000",
    "description_condition": None,
    "organism_condition": None,
    # "limit": 10,
}

with DatabaseManager(db_path) as db_manager:
    # Run the parameterized search and show the matching sequences
    results = db_manager.query_manager.search_sequences(params=params)
    display(results)

    # Further query examples, left disabled:
    #
    # call go_term_summary dynamically
    # go_summary = db_manager.query_manager.go_term_summary(None, None, 5)
    # print("\nGO term summary:")
    # display(go_summary)
    #
    # call organism_summary dynamically
    # organism_summary = db_manager.query_manager.organism_summary()
    # print("\nOrganism summary:")
    # display(organism_summary)
    # print(dir(db_manager.query_manager))
    # db_manager.query_manager.search_sequences('v1_DLS_31412ec4347c212e6892097053de8dc39cd53341988080fd7b80866c35840a0a.p1')
    

Unnamed: 0,seqhash_id,sample_id,length,is_representative,description,preferred_name,cog_category,evalue,seed_ortholog,max_annot_lvl,cluster_id,cluster_size,organism,study_title,bioproject,biosample
0,v1_DLS_878049f1c7d690c2a475148c24aa6215193325f...,SRR5992919,367,False,U3 snoRNA binding,BMS1,J,3.06e-13,653948.CCA25717,2759|Eukaryota,,,Zophobas atratus,De novo transcriptome assembly of the Malpighi...,PRJNA400859,SAMN07581518
1,v1_DLS_fc5046ddce2a1362613dcc9c97083bfd3e89179...,SRR10444679,502,False,structural molecule activity,-,A,1.34e-08,122962.PRO_0000039759,464095|Picornavirales,,,Xanthoria parietina,Xanthoria parietina 46-1 Gene Expression Profi...,PRJNA584076,SAMN13173247
2,v1_DLS_e5f14c12609ce0b15d873580594e6e6aa4766eb...,SRR5992920,110,False,Ribosome biogenesis protein (BMS1,-,J,1.29e-20,110365.A0A023B5C2,5794|Apicomplexa,,,Zophobas atratus,De novo transcriptome assembly of the Malpighi...,PRJNA400859,SAMN07581519
3,v1_DLS_81d3635e002a71c427b5ec350907c012f8dcd00...,SRR10444679,491,False,structural molecule activity,-,A,2.55e-33,928300.PRO_0000398370,464095|Picornavirales,,,Xanthoria parietina,Xanthoria parietina 46-1 Gene Expression Profi...,PRJNA584076,SAMN13173247


# Add new pipeline result

In [4]:
from planter.database.builder import SequenceDBBuilder
# Pipeline output directory and the test DuckDB file to update
OUTPUT_DIR = Path("/mnt/data2/planter_outputs")
DB_PATH = "/mnt/data2/planter_outputs/planter-test.duckdb"

# Sample to add to the existing database
new_sample_ids = ['ERR4235136']

with SequenceDBBuilder(DB_PATH, output_dir=OUTPUT_DIR) as builder:
    new_results = builder.update_database(new_sample_ids)


INFO:planter.database.builder:Fetching metadata for ERR4235136
INFO:planter.database.utils.sra:Fetching metadata for ERR4235136
INFO:planter.database.builder:Loading sequences from /mnt/data2/planter_outputs/ERR4235136/transdecoder/ERR4235136.pep
INFO:planter.database.builder:Loaded 42569 new sequences for ERR4235136
INFO:planter.database.builder:Loading annotations from /mnt/data2/planter_outputs/ERR4235136/eggnog/ERR4235136.emapper.annotations
INFO:planter.database.builder:Added ERR4235136: 42569 sequences, 31086 annotations
