In [None]:

import argparse
import os

parser = argparse.ArgumentParser(description='Import args from snakemake into iPython')

parser.add_argument('sample_name')
parser.add_argument('ref_file')
parser.add_argument('sample_file')
parser.add_argument('sample_out_file')

args = parser.parse_args()

sample_name = args.sample_name
ref_file = args.ref_file
sample_file = args.sample_file
sample_out_file = args.sample_out_file
os.environ['DATABASE_URL'] = f"duckdb://default.db"

import duckdb
import pandas as pd
import ipywidgets

conn = duckdb.connect(f'{sample_name}.db')

%load_ext sql
%sql conn --alias duckdb

In [None]:
%%sql

-- Allow DuckDB to return rows in any order for queries that have no ORDER BY.
-- Presumably done to cut memory use while the intermediate views are evaluated (confirm).
SET preserve_insertion_order = false;

In [None]:
%%sql

-- Expose the sample Parquet file (path templated in from the Python variable
-- sample_file) as a view, so later cells can query it by name.
CREATE OR REPLACE VIEW temp_vcfs AS
SELECT *
FROM read_parquet('{{sample_file}}');

In [None]:
%%sql

-- Tidy projection of the raw calls:
--   * position is cast to INTEGER and exposed as int_pos
--   * the original reference / variant columns are kept as old_ref / old_variant
--   * new_variant is the upper-cased variant allele, or the upper-cased
--     reference allele when the variant column holds a single dot
CREATE OR REPLACE VIEW vcfs AS
SELECT
    calls.sample,
    calls.chromosome,
    CAST(calls.position AS INTEGER) AS int_pos,
    calls.reference AS old_ref,
    calls.variant AS old_variant,
    calls.quality,
    calls.genotype,
    calls.depth,
    calls.allele_depth,
    UPPER(
        CASE
            WHEN calls.variant = '.' THEN calls.reference
            ELSE calls.variant
        END
    ) AS new_variant
FROM temp_vcfs AS calls;

In [None]:
%%sql
-- Materialise the reference Parquet file (path templated in from the Python
-- variable ref_file) as a table, replacing any copy left by an earlier run.
CREATE OR REPLACE TABLE refs AS
SELECT *
FROM read_parquet('{{ref_file}}');

In [None]:
%%sql

-- Pair every call with the refs row at the same chromosome and position, and
-- keep only calls whose allele equals one of the two alleles listed in refs.
-- The unqualified reference / variant columns must come from refs, because the
-- vcfs view only exposes them as old_ref / old_variant.
-- var_adjusted is a single dot when the call equals the refs reference allele,
-- otherwise it is the called allele itself.
-- NOTE(review): new_variant is upper-cased upstream while reference / variant
-- are compared as stored in refs -- confirm refs is already upper case.
CREATE OR REPLACE VIEW samples_rearranged AS
SELECT
    *,
    (CASE WHEN reference = new_variant THEN '.' ELSE new_variant END) AS var_adjusted
FROM vcfs
INNER JOIN refs
    ON vcfs.chromosome = refs.chromosome
    AND vcfs.int_pos = refs.position
WHERE new_variant = reference
   OR new_variant = variant;

In [None]:
%%sql
-- Build the final per-sample table in a stable, sorted order.
-- Fix: the setting name was misspelled as reserve_insertion_order, which DuckDB
-- rejects as an unknown configuration parameter. The intended option is
-- preserve_insertion_order, switched back on here so the ORDER BY below is
-- carried through to the stored table.
SET preserve_insertion_order = true;
CREATE
OR REPLACE TABLE final_samples AS
SELECT
    -- The sample name is split on hyphens; its first three fields become
    -- condition, sample_type and sample_num (DuckDB list indexes start at 1).
    string_split(sample, '-')[1] AS condition,
    string_split(sample, '-')[2] AS sample_type,
    string_split(sample, '-')[3] AS sample_num,
    UPPER(reference) AS reference,
    UPPER(var_adjusted) AS variant,
    chromosome,
    int_pos AS position,
    -- allele_depth is a comma-separated list: first entry, then second entry.
    -- NOTE(review): the two read counts are not swapped when GT is flipped
    -- below -- confirm that this is intended.
    CAST(string_split(allele_depth, ',')[1] AS INTEGER) AS ref_reads,
    CAST(string_split(allele_depth, ',')[2] AS INTEGER) AS variant_reads,
    quality AS QUAL,
    -- Flip a homozygous genotype when it disagrees with the adjusted allele:
    -- a dot with 1/1 becomes 0/0, and a non-dot allele with 0/0 becomes 1/1.
    (
        CASE
            WHEN var_adjusted = '.' AND genotype = '1/1' THEN '0/0'
            WHEN var_adjusted != '.' AND genotype = '0/0' THEN '1/1'
            ELSE genotype
        END
    ) AS GT,
    depth AS DP
FROM
    samples_rearranged
-- sample_num is not cast, so it sorts as text.
ORDER BY
    sample_num, chromosome, position;

In [None]:
%%sql

-- Write the finished table to the Parquet path templated in from the Python
-- variable sample_out_file.
COPY (
    SELECT *
    FROM final_samples
) TO '{{sample_out_file}}' (FORMAT 'parquet');