Connect to the SQL database.

In [20]:
import duckdb
import pandas as pd
import ipywidgets

%reload_ext sql

%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

%sql duckdb:///vcf_dfs2.db

In [21]:
%%sql
SET enable_progress_bar = true;
SET preserve_insertion_order = false;

Unnamed: 0,Success


Define the locations of the references and samples, and quality/depth cutoffs.

In [22]:
ref_parent_file = "../../data/parquets/w1118.parquet"
alt_parent_file = "../../data/parquets/oregonr.parquet"
sample_file_glob = "../../data/parquets/WT-G0-0*.parquet"
ref_parent_name = "w1118"
alt_parent_name = "oregonr"

ref_qual_cutoff = 200
ref_depth_cutoff = 15

ref_out_file = "../../data/parquets/reference.parquet2"
sample_out_file = "../../data/parquets/progeny.parquet2"

Import the reference files as a table. 

In [23]:
%%sql
CREATE OR REPLACE VIEW parents AS
SELECT sample, chromosome, CAST(position AS INTEGER) AS int_pos, quality, genotype, depth, allele_depth, UPPER(reference) AS reference, UPPER(variant) AS variant, (CASE WHEN variant='.' THEN reference ELSE UPPER(variant) END) AS mod_variant FROM read_parquet(['{{ref_parent_file}}', '{{alt_parent_file}}']);


Unnamed: 0,Success


Isolate sites where there are 2 variants (meaning one parent is different from the other).

In [24]:
%%sql
CREATE OR REPLACE VIEW check_unique_variants AS
SELECT chromosome, int_pos, COUNT(variant) AS n_variants FROM parents
    GROUP BY chromosome, int_pos
    HAVING n_variants >= 2;

Unnamed: 0,Success


Create new reference from the reference parent.

In [25]:
%%sql
CREATE OR REPLACE VIEW temp_ref AS
SELECT
    sample,
    parents.chromosome,
    parents.int_pos,
    reference,
    mod_variant,
    quality,
    genotype,
    depth,
    allele_depth
    FROM parents
    INNER JOIN check_unique_variants ON parents.chromosome = check_unique_variants.chromosome AND parents.int_pos = check_unique_variants.int_pos
    WHERE genotype!='0/1'
        AND quality > '{{ref_qual_cutoff}}'
        AND LENGTH(mod_variant) <= 1
        AND LENGTH(reference) <= 1
        AND depth > '{{ref_depth_cutoff}}';

Unnamed: 0,Success


In [26]:
%%sql

CREATE OR REPLACE VIEW ref_parent AS
SELECT sample, chromosome, int_pos, mod_variant AS ref_allele FROM temp_ref
WHERE sample='{{ref_parent_name}}';

CREATE OR REPLACE VIEW alt_parent AS
SELECT sample, chromosome, int_pos, mod_variant AS alt_allele FROM temp_ref
WHERE sample='{{alt_parent_name}}';

CREATE OR REPLACE TABLE ref AS
SELECT * FROM ref_parent
INNER JOIN alt_parent ON (ref_parent.chromosome = alt_parent.chromosome AND ref_parent.int_pos = alt_parent.int_pos)
ORDER BY ref_parent.chromosome, ref_parent.int_pos;

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,Success


In [27]:
%%sql
CREATE OR REPLACE TABLE refs_unique AS
SELECT * FROM ref
WHERE ref_allele != alt_allele;

Unnamed: 0,Success


In [28]:
%%sql
SELECT * FROM refs_unique;

Unnamed: 0,sample,chromosome,int_pos,ref_allele,sample_1,chromosome_1,int_pos_1,alt_allele
0,w1118,chr2R,13843951,G,oregonr,chr2R,13843951,A
1,w1118,chr2R,13843981,A,oregonr,chr2R,13843981,T
2,w1118,chr2R,13845603,A,oregonr,chr2R,13845603,T
3,w1118,chr2R,13845663,C,oregonr,chr2R,13845663,T
4,w1118,chr2R,13845674,A,oregonr,chr2R,13845674,G
...,...,...,...,...,...,...,...,...
83367,w1118,chr2R,10002638,G,oregonr,chr2R,10002638,C
83368,w1118,chr2R,10002726,G,oregonr,chr2R,10002726,C
83369,w1118,chr2R,10002727,C,oregonr,chr2R,10002727,T
83370,w1118,chr2R,10003359,A,oregonr,chr2R,10003359,C


In [29]:
%%sql

CREATE OR REPLACE VIEW temp_vcfs AS
SELECT * FROM read_parquet('{{sample_file_glob}}');

Unnamed: 0,Success


In [30]:
%%sql

CREATE OR REPLACE VIEW vcfs AS
SELECT
    sample,
    chromosome,
    CAST(position AS INTEGER) AS int_pos,
    reference,
    variant,
    quality,
    genotype,
    depth,
    allele_depth,
    (CASE WHEN temp_vcfs.variant='.' THEN temp_vcfs.reference ELSE temp_vcfs.variant END) AS new_variant
    FROM temp_vcfs;

Unnamed: 0,Success


In [31]:
%%sql

CREATE OR REPLACE VIEW samples_rearranged AS
SELECT *, (CASE WHEN ref_allele=new_variant THEN '.' ELSE new_variant END) AS var_adjusted FROM vcfs
    INNER JOIN refs_unique ON vcfs.chromosome = refs_unique.chromosome AND vcfs.int_pos = refs_unique.int_pos
    WHERE new_variant=ref_allele OR new_variant=alt_allele;

Unnamed: 0,Success


In [32]:
%%sql

SELECT * FROM samples_rearranged
WHERE var_adjusted != '.'
LIMIT 10;

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,sample,chromosome,int_pos,reference,variant,quality,genotype,depth,allele_depth,new_variant,sample_1,chromosome_1,int_pos_1,ref_allele,sample_1_1,chromosome_1_1,int_pos_1_1,alt_allele,var_adjusted
0,WT-G0-001,chr3L,655648,G,T,225.009003,0/1,86,3848,T,w1118,chr3L,655648,G,oregonr,chr3L,655648,T,T
1,WT-G0-001,chr3L,661618,T,.,240.996002,0/0,70,70,T,w1118,chr3L,661618,t,oregonr,chr3L,661618,T,T
2,WT-G0-001,chr3L,660851,G,A,225.009003,0/1,69,3435,A,w1118,chr3L,660851,G,oregonr,chr3L,660851,A,A
3,WT-G0-001,chr3L,668738,C,A,93.007698,0/1,51,4110,A,w1118,chr3L,668738,C,oregonr,chr3L,668738,A,A
4,WT-G0-001,chr3L,694885,A,G,225.009003,0/1,73,4132,G,w1118,chr3L,694885,A,oregonr,chr3L,694885,G,G
5,WT-G0-001,chr3L,699318,T,C,225.009003,0/1,84,4539,C,w1118,chr3L,699318,T,oregonr,chr3L,699318,C,C
6,WT-G0-001,chr3L,714112,G,A,225.009003,0/1,82,3646,A,w1118,chr3L,714112,G,oregonr,chr3L,714112,A,A
7,WT-G0-001,chr3L,714319,T,G,225.009003,0/1,72,2745,G,w1118,chr3L,714319,T,oregonr,chr3L,714319,G,G
8,WT-G0-001,chr3L,749771,C,T,225.009003,0/1,76,3442,T,w1118,chr3L,749771,C,oregonr,chr3L,749771,T,T
9,WT-G0-001,chr3L,749936,A,T,225.009003,0/1,62,3329,T,w1118,chr3L,749936,A,oregonr,chr3L,749936,T,T


In [33]:
%%sql

CREATE OR REPLACE TABLE final_samples AS
SELECT 
    string_split(sample, '-')[1] AS condition,
    string_split(sample, '-')[2] AS sample_type,
    string_split(sample, '-')[3] AS sample_num,
    reference,
    variant,
    chromosome,
    int_pos AS position,
    string_split(allele_depth, ',')[1] AS ref_reads,
    string_split(allele_depth, ',')[2] AS variant_reads,
    quality AS QUAL,
    genotype AS GT,
    depth AS DP
    FROM samples_rearranged
ORDER BY sample_num, chromosome, position;

CHECKPOINT;

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,Success


In [34]:
%%sql
SELECT * FROM vcfs
WHERE chromosome = 'chr2L' AND int_pos >= 7902
LIMIT 10;

Unnamed: 0,sample,chromosome,int_pos,reference,variant,quality,genotype,depth,allele_depth,new_variant
0,WT-G0-001,chr2L,7902,A,.,234.996002,0/0,68,68,A
1,WT-G0-001,chr2L,7903,A,.,188.996002,0/0,64,62,A
2,WT-G0-001,chr2L,7904,C,.,222.996002,0/0,64,64,C
3,WT-G0-001,chr2L,7905,T,.,222.996002,0/0,64,64,T
4,WT-G0-001,chr2L,7906,C,.,222.996002,0/0,64,64,C
5,WT-G0-001,chr2L,7907,C,.,222.996002,0/0,64,64,C
6,WT-G0-001,chr2L,7908,C,.,222.996002,0/0,64,64,C
7,WT-G0-001,chr2L,7909,G,.,228.996002,0/0,66,66,G
8,WT-G0-001,chr2L,7910,C,.,200.996002,0/0,68,66,C
9,WT-G0-001,chr2L,7911,G,.,234.996002,0/0,68,68,G


In [35]:
%%sql
SELECT * FROM final_samples
LIMIT 10;

Unnamed: 0,condition,sample_type,sample_num,reference,variant,chromosome,position,ref_reads,variant_reads,QUAL,GT,DP
0,WT,G0,3,T,C,chrX,6530334,0,72,221.998993,1/1,72
1,WT,G0,3,A,G,chrX,6530561,0,79,221.998993,1/1,79
2,WT,G0,3,G,A,chrX,6531387,0,88,221.998993,1/1,88
3,WT,G0,3,C,G,chrX,6531408,0,81,221.998993,1/1,81
4,WT,G0,3,A,G,chrX,6532212,0,79,221.998993,1/1,79
5,WT,G0,3,G,A,chrX,6532213,0,79,221.998993,1/1,79
6,WT,G0,3,G,T,chrX,6532219,0,68,221.998993,1/1,68
7,WT,G0,3,G,T,chrX,6532277,0,48,221.998993,1/1,48
8,WT,G0,3,G,A,chrX,6534904,0,105,221.998993,1/1,105
9,WT,G0,3,C,T,chrX,6534909,0,97,221.998993,1/1,97


In [36]:
%%sql

COPY (SELECT * FROM final_samples)
TO '{{ref_out_file}}'
(FORMAT 'parquet');

COPY (SELECT chromosome, int_pos AS position, ref_allele AS reference, alt_allele AS variant FROM ref)
TO '{{sample_out_file}}'
(FORMAT 'parquet');


Unnamed: 0,Success
