In [None]:
import pyskim
import pandas as pd
from pysradb.search import SraSearch

# Query SRA to find relevant sample accessions

## Define query term

In [None]:
# Search term, assembled from adjacent string literals so each clause sits on
# its own line: an organism term ANDed with a group that accepts any of three
# file types (cram, bam or fastq).
query = (
    "txid2697049[Organism:noexp] AND "
    '("filetype cram"[Properties] '
    'OR "filetype bam"[Properties] '
    'OR "filetype fastq"[Properties])'
)

# Upper bound on the number of returned records (used as `return_max` in the
# search cell).
max_query_num = 1_000_000

## Search

In [None]:
# Configure the search: the free-text query defined above, limited to the
# "illumina" platform and capped at `max_query_num` results.
# NOTE(review): verbosity=3 is presumably the most detailed output level of
# pysradb's SraSearch — confirm against the pysradb documentation.
instance = SraSearch(
    query=query,
    platform="illumina",
    return_max=max_query_num,
    verbosity=3,
)

# Run the search, then fetch the collected results for the following cells.
instance.search()
df_search = instance.get_df()

## Save output

In [None]:
# Write the complete search result to disk without the index column.
df_search.to_csv(path_or_buf="sra_search.csv.gz", index=False)

# Display the first row as a quick look at the available columns.
df_search.head(n=1)

## Save all run accessions

In [None]:
# Collect the names of all columns matching the run-accession pattern.
run_accession_columns = list(
    df_search.filter(regex="run_?.*_accession").columns
)
run_accession_columns

In [None]:
# Gather the distinct, non-missing values of every run-accession column into
# a single set (the union of the per-column unique values).
accession_set = set().union(
    *(
        df_search[column_name].dropna().unique().tolist()
        for column_name in run_accession_columns
    )
)

In [None]:
# Report how many distinct accessions were collected into `accession_set`.
print(f"Found {len(accession_set)} accessions")

In [None]:
# Sort the accessions so the output file is deterministic, then write them as
# a single "accession" column without the index.
accession_series = pd.Series(sorted(accession_set), name="accession")
accession_series.to_csv("accession_list.csv", index=False)

## Quick overview

In [None]:
# Pass the full search-result frame to pyskim for an overview.
# NOTE(review): the exact output format depends on pyskim — see its docs.
pyskim.skim(df_search)