# Variants

In [1]:
from typing import List
import vcf
import pandas as pd
import re
import os

class VariantsReader:
    
    def __init__(self):
        pass

    def read_vcf(self, file: str) -> pd.DataFrame:
        reader = vcf.Reader(open(file, 'r'))
        df = pd.DataFrame([vars(r) for r in reader])
        out = df.merge(pd.DataFrame(df.INFO.tolist()),
                       left_index=True, right_index=True)
        out = out[['CHROM', 'POS', 'REF', 'ALT', 'DP', 'QUAL', 'RO', 'AO', 'INFO']]
        out['TYPE'] = out['INFO'].map(lambda x: x['TYPE'][0])
        out = out.drop('INFO', axis='columns')
        out['ALT'] = out['ALT'].map(lambda x: str(x[0]))
        out['REF'] = out['REF'].map(lambda x: str(x[0]))
        out['AO'] = out['AO'].map(lambda x: x[0])
        cols = out.columns.tolist()
        out['FILE'] = os.path.basename(file)
        out = out.reindex(columns=['FILE'] + cols)
        return out

    def read_vcfs(self, files: List[str]) -> pd.DataFrame:
        frames = [self.read_vcf(f) for f in files]
        return pd.concat(frames)

directory = 'data/snps-vcf'
files = [os.path.join(directory, f) for f in os.listdir('data/snps-vcf')]
files = [f for f in files if f.endswith('.vcf')]

vr = VariantsReader()
df = vr.read_vcfs(files)
df

Unnamed: 0,FILE,CHROM,POS,REF,ALT,DP,QUAL,RO,AO,TYPE
0,2014C-3857.filt.vcf,JASV01000001.1,16854,T,C,38,1175.580,0,38,snp
1,2014C-3857.filt.vcf,JASV01000001.1,16871,C,C,29,832.403,0,28,del
2,2014C-3857.filt.vcf,JASV01000001.1,16897,G,C,25,735.848,0,25,snp
3,2014C-3857.filt.vcf,JASV01000001.1,17327,T,TA,11,258.368,0,11,ins
4,2014C-3857.filt.vcf,JASV01000001.1,17335,G,G,13,292.065,0,13,del
...,...,...,...,...,...,...,...,...,...,...
125,2014C-3600.filt.vcf,JASV01000007.1,767386,A,G,87,2684.540,0,87,snp
126,2014C-3600.filt.vcf,JASV01000007.1,890046,T,C,96,3115.580,0,96,snp
127,2014C-3600.filt.vcf,JASV01000007.1,958303,T,G,93,3115.200,0,93,snp
128,2014C-3600.filt.vcf,JASV01000007.1,963356,A,G,23,692.768,0,23,snp


# DB model

In [2]:
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship

Base = declarative_base()
Base

from sqlalchemy import Column, Integer, String, Sequence, BigInteger, ForeignKey, Table
    
association_table = Table('sample_variation_allele', Base.metadata,
    Column('sample_id', Integer, ForeignKey('sample.id')),
    Column('variantion_allele_id', String, ForeignKey('variation_allele.id')),
)

class VariationAllele(Base):
    __tablename__ = 'variation_allele'
    id = Column(String, primary_key=True)
    sequence_name = Column(String, ForeignKey('reference_sequence.id'))
    position = Column(Integer, primary_key=True)
    ref = Column(String(255), primary_key=True)
    alt = Column(String(255), primary_key=True)
    var_type = Column(String(255))
    
    samples = relationship('Sample', secondary=association_table)
    
    def __init__(self, sequence_name: str, position: int, ref: str, alt: str, var_type: str):
        self.sequence_name = sequence_name
        self.position = position
        self.ref = ref
        self.alt = alt
        self.var_type = var_type
        
        self.id = self.to_spdi()
    
    def to_spdi(self):
        return f'{self.sequence_name}:{self.position}:{self.ref}:{self.alt}'
    
    def __repr__(self):
        return (f'<VariationAllele(sequence_name={self.sequence_name}'
                f', position={self.position}, ref={self.ref}, alt={self.alt}, var_type={self.var_type})>')

class Reference(Base):
    __tablename__ = 'reference'
    id = Column(Integer, primary_key=True)
    name = Column(String(255))
    length = Column(Integer)
    sequences = relationship('ReferenceSequence')
    
    def __repr__(self):
        return f'<Reference(id={self.id}, name={self.name}, length={self.length})>'
    
    
class ReferenceSequence(Base):
    __tablename__ = 'reference_sequence'
    id = Column(Integer, primary_key=True)
    reference_id = Column(Integer, ForeignKey('reference.id'))
    sequence_name = Column(String(255))
#     variants = relationship('VariationAllele', back_populates='sequence_name')
    
    def __repr__(self):
        return f'<ReferenceSequence(id={self.id}, sequence_name={self.sequence_name}, reference_id={self.reference_id})>'
    
class Sample(Base):
    __tablename__ = 'sample'
    id = Column(Integer, primary_key=True)
    name = Column(String(255))
    variants = relationship('VariationAllele', secondary=association_table)
    
    def __repr__(self):
        return f'<Sample(id={self.id}, name={self.name})>'

# Create some data

In [3]:
from Bio import SeqIO

ref_name = '2011C-3609.fasta'
ref_length = 0
ref_contigs = {}
for record in SeqIO.parse(f"reference/{ref_name}", "fasta"):
    ref_contigs[record.id] = ReferenceSequence(sequence_name=record.id)
    ref_length += len(record.seq)

reference = Reference(name = ref_name, length = ref_length, sequences=list(ref_contigs.values()))
ref_contigs

{'JASV01000001.1': <ReferenceSequence(id=None, sequence_name=JASV01000001.1, reference_id=None)>,
 'JASV01000002.1': <ReferenceSequence(id=None, sequence_name=JASV01000002.1, reference_id=None)>,
 'JASV01000003.1': <ReferenceSequence(id=None, sequence_name=JASV01000003.1, reference_id=None)>,
 'JASV01000004.1': <ReferenceSequence(id=None, sequence_name=JASV01000004.1, reference_id=None)>,
 'JASV01000005.1': <ReferenceSequence(id=None, sequence_name=JASV01000005.1, reference_id=None)>,
 'JASV01000006.1': <ReferenceSequence(id=None, sequence_name=JASV01000006.1, reference_id=None)>,
 'JASV01000007.1': <ReferenceSequence(id=None, sequence_name=JASV01000007.1, reference_id=None)>}

# Insert into database

In [4]:
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

engine = create_engine('sqlite:///:memory:', echo=True)

Session = sessionmaker(bind=engine)
session = Session()
session

Base.metadata.create_all(engine)

2020-12-22 02:13:16,631 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS VARCHAR(60)) AS anon_1
2020-12-22 02:13:16,632 INFO sqlalchemy.engine.base.Engine ()
2020-12-22 02:13:16,633 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS VARCHAR(60)) AS anon_1
2020-12-22 02:13:16,634 INFO sqlalchemy.engine.base.Engine ()
2020-12-22 02:13:16,637 INFO sqlalchemy.engine.base.Engine PRAGMA main.table_info("sample_variation_allele")
2020-12-22 02:13:16,638 INFO sqlalchemy.engine.base.Engine ()
2020-12-22 02:13:16,640 INFO sqlalchemy.engine.base.Engine PRAGMA temp.table_info("sample_variation_allele")
2020-12-22 02:13:16,642 INFO sqlalchemy.engine.base.Engine ()
2020-12-22 02:13:16,644 INFO sqlalchemy.engine.base.Engine PRAGMA main.table_info("variation_allele")
2020-12-22 02:13:16,645 INFO sqlalchemy.engine.base.Engine ()
2020-12-22 02:13:16,648 INFO sqlalchemy.engine.base.Engine PRAGMA temp.table_info("variation_allele")
2020-12-22 02:13:16,649 INFO s

In [5]:
session.add(reference)
session.commit()

2020-12-22 02:13:17,214 INFO sqlalchemy.engine.base.Engine BEGIN (implicit)
2020-12-22 02:13:17,216 INFO sqlalchemy.engine.base.Engine INSERT INTO reference (name, length) VALUES (?, ?)
2020-12-22 02:13:17,219 INFO sqlalchemy.engine.base.Engine ('2011C-3609.fasta', 5412686)
2020-12-22 02:13:17,224 INFO sqlalchemy.engine.base.Engine INSERT INTO reference_sequence (reference_id, sequence_name) VALUES (?, ?)
2020-12-22 02:13:17,226 INFO sqlalchemy.engine.base.Engine (1, 'JASV01000001.1')
2020-12-22 02:13:17,229 INFO sqlalchemy.engine.base.Engine INSERT INTO reference_sequence (reference_id, sequence_name) VALUES (?, ?)
2020-12-22 02:13:17,232 INFO sqlalchemy.engine.base.Engine (1, 'JASV01000002.1')
2020-12-22 02:13:17,234 INFO sqlalchemy.engine.base.Engine INSERT INTO reference_sequence (reference_id, sequence_name) VALUES (?, ?)
2020-12-22 02:13:17,238 INFO sqlalchemy.engine.base.Engine (1, 'JASV01000003.1')
2020-12-22 02:13:17,242 INFO sqlalchemy.engine.base.Engine INSERT INTO reference

In [6]:
ref = session.query(Reference).filter_by(id = 1).first()
ref_contigs = {c.sequence_name: c for c in ref.sequences}
ref_contigs

2020-12-22 02:13:17,751 INFO sqlalchemy.engine.base.Engine BEGIN (implicit)
2020-12-22 02:13:17,756 INFO sqlalchemy.engine.base.Engine SELECT reference.id AS reference_id, reference.name AS reference_name, reference.length AS reference_length 
FROM reference 
WHERE reference.id = ?
 LIMIT ? OFFSET ?
2020-12-22 02:13:17,759 INFO sqlalchemy.engine.base.Engine (1, 1, 0)
2020-12-22 02:13:17,770 INFO sqlalchemy.engine.base.Engine SELECT reference_sequence.id AS reference_sequence_id, reference_sequence.reference_id AS reference_sequence_reference_id, reference_sequence.sequence_name AS reference_sequence_sequence_name 
FROM reference_sequence 
WHERE ? = reference_sequence.reference_id
2020-12-22 02:13:17,773 INFO sqlalchemy.engine.base.Engine (1,)


{'JASV01000001.1': <ReferenceSequence(id=1, sequence_name=JASV01000001.1, reference_id=1)>,
 'JASV01000002.1': <ReferenceSequence(id=2, sequence_name=JASV01000002.1, reference_id=1)>,
 'JASV01000003.1': <ReferenceSequence(id=3, sequence_name=JASV01000003.1, reference_id=1)>,
 'JASV01000004.1': <ReferenceSequence(id=4, sequence_name=JASV01000004.1, reference_id=1)>,
 'JASV01000005.1': <ReferenceSequence(id=5, sequence_name=JASV01000005.1, reference_id=1)>,
 'JASV01000006.1': <ReferenceSequence(id=6, sequence_name=JASV01000006.1, reference_id=1)>,
 'JASV01000007.1': <ReferenceSequence(id=7, sequence_name=JASV01000007.1, reference_id=1)>}

In [7]:
import re

df2 = df.copy()
variant_table = {}
file_variants = {}
for row in df2.iterrows():
    sample_name = re.sub('\.filt\.vcf$', '', row[1]['FILE'])
    
    variant = VariationAllele(sequence_name=row[1]['CHROM'], position=row[1]['POS'],
                             ref=row[1]['REF'], alt=row[1]['ALT'], var_type=row[1]['TYPE'])
    if variant.id not in variant_table:
        variant_table[variant.id] = variant
    else:    
        variant = variant_table[variant.id]
        
    if sample_name not in file_variants:
        file_variants[sample_name] = []
        
    file_variants[sample_name].append(variant)
    
file_variants.keys()

dict_keys(['2014C-3857', '2014C-3655', '2014C-3598', '2014C-3840', '2014C-3599', '2014C-3907', '2014C-3656', '2014C-3850', '2014C-3600'])

In [8]:
for s in file_variants:
    sample = Sample(name=s, variants=file_variants[s])
    session.add(sample)
session.commit()

2020-12-22 02:13:18,802 INFO sqlalchemy.engine.base.Engine INSERT INTO variation_allele (id, sequence_name, position, ref, alt, var_type) VALUES (?, ?, ?, ?, ?, ?)
2020-12-22 02:13:18,804 INFO sqlalchemy.engine.base.Engine (('JASV01000001.1:16854:T:C', 'JASV01000001.1', 16854, 'T', 'C', 'snp'), ('JASV01000001.1:16871:C:C', 'JASV01000001.1', 16871, 'C', 'C', 'del'), ('JASV01000001.1:16897:G:C', 'JASV01000001.1', 16897, 'G', 'C', 'snp'), ('JASV01000001.1:17327:T:TA', 'JASV01000001.1', 17327, 'T', 'TA', 'ins'), ('JASV01000001.1:17335:G:G', 'JASV01000001.1', 17335, 'G', 'G', 'del'), ('JASV01000001.1:17347:C:C', 'JASV01000001.1', 17347, 'C', 'C', 'del'), ('JASV01000001.1:17360:T:A', 'JASV01000001.1', 17360, 'T', 'A', 'snp'), ('JASV01000001.1:17468:T:C', 'JASV01000001.1', 17468, 'T', 'C', 'snp')  ... displaying 10 of 601 total bound parameter sets ...  ('JASV01000007.1:13994:C:A', 'JASV01000007.1', 13994, 'C', 'A', 'snp'), ('JASV01000007.1:963356:A:G', 'JASV01000007.1', 963356, 'A', 'G', 'sn

In [9]:
v = session.query(VariationAllele).filter_by(position = 16854).all()
v

2020-12-22 02:13:19,248 INFO sqlalchemy.engine.base.Engine BEGIN (implicit)
2020-12-22 02:13:19,253 INFO sqlalchemy.engine.base.Engine SELECT variation_allele.id AS variation_allele_id, variation_allele.sequence_name AS variation_allele_sequence_name, variation_allele.position AS variation_allele_position, variation_allele.ref AS variation_allele_ref, variation_allele.alt AS variation_allele_alt, variation_allele.var_type AS variation_allele_var_type 
FROM variation_allele 
WHERE variation_allele.position = ?
2020-12-22 02:13:19,257 INFO sqlalchemy.engine.base.Engine (16854,)


[<VariationAllele(sequence_name=JASV01000001.1, position=16854, ref=T, alt=C, var_type=snp)>]

In [10]:
r = session.query(ReferenceSequence).filter_by(sequence_name = 'JASV01000004.1').first()
r

2020-12-22 02:13:19,795 INFO sqlalchemy.engine.base.Engine SELECT reference_sequence.id AS reference_sequence_id, reference_sequence.reference_id AS reference_sequence_reference_id, reference_sequence.sequence_name AS reference_sequence_sequence_name 
FROM reference_sequence 
WHERE reference_sequence.sequence_name = ?
 LIMIT ? OFFSET ?
2020-12-22 02:13:19,801 INFO sqlalchemy.engine.base.Engine ('JASV01000004.1', 1, 0)


<ReferenceSequence(id=4, sequence_name=JASV01000004.1, reference_id=1)>

In [11]:
s = session.query(Sample).filter_by(name = '2014C-3857').first()
s

2020-12-22 02:13:20,385 INFO sqlalchemy.engine.base.Engine SELECT sample.id AS sample_id, sample.name AS sample_name 
FROM sample 
WHERE sample.name = ?
 LIMIT ? OFFSET ?
2020-12-22 02:13:20,388 INFO sqlalchemy.engine.base.Engine ('2014C-3857', 1, 0)


<Sample(id=1, name=2014C-3857)>

In [12]:
s.variants[0].id

2020-12-22 02:13:20,997 INFO sqlalchemy.engine.base.Engine SELECT variation_allele.id AS variation_allele_id, variation_allele.sequence_name AS variation_allele_sequence_name, variation_allele.position AS variation_allele_position, variation_allele.ref AS variation_allele_ref, variation_allele.alt AS variation_allele_alt, variation_allele.var_type AS variation_allele_var_type 
FROM variation_allele, sample_variation_allele 
WHERE ? = sample_variation_allele.sample_id AND variation_allele.id = sample_variation_allele.variantion_allele_id
2020-12-22 02:13:20,999 INFO sqlalchemy.engine.base.Engine (1,)


'JASV01000001.1:16854:T:C'

In [13]:
v = session.query(VariationAllele).filter_by(id='JASV01000001.1:16854:T:C').first()
v.samples

2020-12-22 02:13:21,615 INFO sqlalchemy.engine.base.Engine SELECT variation_allele.id AS variation_allele_id, variation_allele.sequence_name AS variation_allele_sequence_name, variation_allele.position AS variation_allele_position, variation_allele.ref AS variation_allele_ref, variation_allele.alt AS variation_allele_alt, variation_allele.var_type AS variation_allele_var_type 
FROM variation_allele 
WHERE variation_allele.id = ?
 LIMIT ? OFFSET ?
2020-12-22 02:13:21,616 INFO sqlalchemy.engine.base.Engine ('JASV01000001.1:16854:T:C', 1, 0)
2020-12-22 02:13:21,628 INFO sqlalchemy.engine.base.Engine SELECT sample.id AS sample_id, sample.name AS sample_name 
FROM sample, sample_variation_allele 
WHERE ? = sample_variation_allele.variantion_allele_id AND sample.id = sample_variation_allele.sample_id
2020-12-22 02:13:21,630 INFO sqlalchemy.engine.base.Engine ('JASV01000001.1:16854:T:C',)


[<Sample(id=7, name=2014C-3656)>,
 <Sample(id=3, name=2014C-3598)>,
 <Sample(id=4, name=2014C-3840)>,
 <Sample(id=6, name=2014C-3907)>,
 <Sample(id=5, name=2014C-3599)>,
 <Sample(id=8, name=2014C-3850)>,
 <Sample(id=2, name=2014C-3655)>,
 <Sample(id=1, name=2014C-3857)>,
 <Sample(id=9, name=2014C-3600)>]