Parse the command-line arguments, then connect to the SQL database.

In [None]:
import argparse

# Command-line interface of the notebook. Every input is a required positional
# argument (in the order declared below); each value is then copied into a
# module-level variable so the SQL cells can reference it via {{name}} templating.
# All values are kept as strings, exactly as argparse delivers them.
parser = argparse.ArgumentParser(description='Import args from snakemake into iPython')

parser.add_argument('ref_parent_file', help='Parquet file holding the calls of the reference parent')
parser.add_argument('alt_parent_file', help='Parquet file holding the calls of the alternate parent')
parser.add_argument('ref_parent_name', help='value of the sample column identifying the reference parent')
parser.add_argument('alt_parent_name', help='value of the sample column identifying the alternate parent')
parser.add_argument('ref_qual_cutoff', help='calls must have quality strictly above this value')
parser.add_argument('ref_depth_cutoff', help='calls must have depth strictly above this value')
parser.add_argument('ref_out_file', help='path of the Parquet file written at the end of the notebook')
parser.add_argument('db', help='database connection passed to the sql magic')

args = parser.parse_args()

ref_parent_file = args.ref_parent_file
alt_parent_file = args.alt_parent_file
# The sample names must be bound as well: the ref_parent / alt_parent views
# filter on {{ref_parent_name}} and {{alt_parent_name}}.
ref_parent_name = args.ref_parent_name
alt_parent_name = args.alt_parent_name
ref_qual_cutoff = args.ref_qual_cutoff
ref_depth_cutoff = args.ref_depth_cutoff
ref_out_file = args.ref_out_file
db = args.db

In [None]:
import duckdb
import pandas as pd
import ipywidgets
import os

# Earlier SqlMagic settings, left disabled (note autopandas was True here).
# %config SqlMagic.autopandas = True
# %config SqlMagic.feedback = False
# %config SqlMagic.displaycon = False

# Load the extension that provides the %sql / %%sql magics used below.
%load_ext sql

# Active SqlMagic settings. autopandas off: results are not auto-converted to
# pandas DataFrames. feedback / displaycon off presumably silence the status
# messages and the connection echo - confirm against the installed extension.
%config SqlMagic.autopandas = False 
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

# Open the connection given by the `db` variable (the `db` command-line argument).
%sql {{db}}

In [43]:
%%sql
-- Turn off the preserve_insertion_order setting for this connection, so the
-- engine is not required to keep rows in insertion order. Presumably done to
-- lower memory use on the large Parquet reads and writes below (TODO confirm).
SET preserve_insertion_order = false;

Success


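Define the locations of the references and samples, and the quality/depth cutoffs. These values now come from the command-line arguments parsed in the first cell; the original snakemake bindings are kept below, commented out, for reference.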

In [44]:
# ref_parent_file = snakemake.input['ref']
# alt_parent_file = snakemake.input['alt']
# ref_parent_name = snakemake.config['ref_parent']
# alt_parent_name = snakemake.config['alt_parent']

# ref_qual_cutoff = snakemake.params['min_qual']
# ref_depth_cutoff = snakemake.params['min_depth']

# ref_out_file = snakemake.output[0]

Load the calls of both parents (two Parquet files) into a single view, `parents`.

In [45]:
%%sql
-- View parents. Stacks the rows of both parent Parquet files and normalises
-- them. Position is cast to INTEGER as int_pos, and the allele columns are
-- upper-cased so later comparisons are case-insensitive.
CREATE
OR REPLACE VIEW parents AS
SELECT
    sample,
    chromosome,
    CAST(position AS INTEGER) AS int_pos,
    quality,
    genotype,
    depth,
    allele_depth,
    UPPER(reference) AS up_reference,
    UPPER(variant) AS up_variant,
    -- A dot variant falls back to the reference base. The fallback is upper-cased
    -- like every other allele column, otherwise a lower-case reference base would
    -- differ from the same base reported in upper case by the other parent.
    (CASE WHEN variant='.' THEN UPPER(reference) ELSE UPPER(variant) END) AS mod_variant
FROM
    read_parquet(['{{ref_parent_file}}', '{{alt_parent_file}}']);


Count


Isolate sites where there are 2 variants (meaning one parent is different from the other).

In [46]:
%%sql
-- View check_unique_variants. One row per chromosome and position at which the
-- parents view holds at least two different upper-cased variant values.
-- COUNT DISTINCT is used so that two rows carrying the same variant do not
-- qualify a site. A plain COUNT would only count rows.
CREATE
OR REPLACE VIEW check_unique_variants AS
SELECT
    chromosome,
    int_pos,
    COUNT(DISTINCT up_variant) AS n_variants
FROM
    parents
GROUP BY
    chromosome,
    int_pos
HAVING
    n_variants >= 2;

Count


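Create the new reference from the parents: at the sites isolated above, keep only calls that are not heterozygous (`0/1`), whose reference and variant alleles are single bases, and whose quality and depth exceed the cutoffs.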

In [47]:
%%sql
-- View temp_ref. Parent calls restricted to the sites listed in
-- check_unique_variants, then filtered on genotype, allele length, quality and depth.
-- NOTE(review) the cutoffs are injected as quoted literals, which relies on the
-- engine coercing them to the type of quality and depth. Confirm both columns are numeric.
-- NOTE(review) only the exact genotype string 0/1 is excluded. Confirm no other
-- heterozygous encodings occur in the input.
CREATE OR REPLACE VIEW temp_ref AS
SELECT
    p.sample,
    p.chromosome,
    p.int_pos,
    p.up_reference AS reference,
    p.mod_variant,
    p.quality,
    p.genotype,
    p.depth,
    p.allele_depth
FROM parents AS p
JOIN check_unique_variants AS sites
    ON p.chromosome = sites.chromosome
    AND p.int_pos = sites.int_pos
WHERE
    p.genotype <> '0/1'
    AND p.quality > '{{ref_qual_cutoff}}'
    AND LENGTH(p.mod_variant) <= 1
    AND LENGTH(p.up_reference) <= 1
    AND p.depth > '{{ref_depth_cutoff}}';

Count


In [48]:
%%sql
-- View ref_parent. Filtered calls of the reference parent sample, with the
-- normalised allele exposed as ref_allele.
CREATE OR REPLACE VIEW ref_parent AS
SELECT
    t.sample,
    t.chromosome,
    t.int_pos,
    t.mod_variant AS ref_allele
FROM temp_ref AS t
WHERE t.sample = '{{ref_parent_name}}';

-- View alt_parent. Same shape for the alternate parent sample, with the
-- normalised allele exposed as alt_allele.
CREATE OR REPLACE VIEW alt_parent AS
SELECT
    t.sample,
    t.chromosome,
    t.int_pos,
    t.mod_variant AS alt_allele
FROM temp_ref AS t
WHERE t.sample = '{{alt_parent_name}}';

Count


In [49]:
%%sql
CREATE
OR REPLACE TABLE ref AS
SELECT
    *
FROM
    ref_parent
    INNER JOIN alt_parent ON (
        ref_parent.chromosome = alt_parent.chromosome
        AND ref_parent.int_pos = alt_parent.int_pos
    );

Count


In [50]:
%%sql
-- Table refs_unique. The paired sites at which the two parents carry different alleles.
CREATE OR REPLACE TABLE refs_unique AS
SELECT *
FROM ref
WHERE ref_allele <> alt_allele;

Count


In [58]:
%%sql
-- Export the differing sites to the output Parquet file, renaming the columns
-- to position, reference and variant.
COPY (
    SELECT
        chromosome,
        int_pos AS position,
        ref_allele AS reference,
        alt_allele AS variant
    FROM refs_unique
)
TO '{{ref_out_file}}'
(FORMAT 'parquet');


Count
