# Variants

In [1]:
from typing import List
import vcf
import pandas as pd
import re
import os

class VariantsReader:
    
    def __init__(self):
        pass

    def read_vcf(self, file: str) -> pd.DataFrame:
        reader = vcf.Reader(open(file, 'r'))
        df = pd.DataFrame([vars(r) for r in reader])
        out = df.merge(pd.DataFrame(df.INFO.tolist()),
                       left_index=True, right_index=True)
        out = out[['CHROM', 'POS', 'REF', 'ALT', 'DP', 'QUAL', 'RO', 'AO', 'INFO']]
        out['TYPE'] = out['INFO'].map(lambda x: x['TYPE'][0])
        out = out.drop('INFO', axis='columns')
        out['ALT'] = out['ALT'].map(lambda x: str(x[0]))
        out['REF'] = out['REF'].map(lambda x: str(x[0]))
        out['AO'] = out['AO'].map(lambda x: x[0])
        cols = out.columns.tolist()
        out['FILE'] = os.path.basename(file)
        out = out.reindex(columns=['FILE'] + cols)
        return out

    def read_vcfs(self, files: List[str]) -> pd.DataFrame:
        frames = [self.read_vcf(f) for f in files]
        return pd.concat(frames)

directory = 'data/snps-vcf'
files = [os.path.join(directory, f) for f in os.listdir('data/snps-vcf')]
files = [f for f in files if f.endswith('.vcf')]

vr = VariantsReader()
df = vr.read_vcfs(files)
df

Unnamed: 0,FILE,CHROM,POS,REF,ALT,DP,QUAL,RO,AO,TYPE
0,2014C-3857.filt.vcf,JASV01000001.1,16854,T,C,38,1175.580,0,38,snp
1,2014C-3857.filt.vcf,JASV01000001.1,16871,C,C,29,832.403,0,28,del
2,2014C-3857.filt.vcf,JASV01000001.1,16897,G,C,25,735.848,0,25,snp
3,2014C-3857.filt.vcf,JASV01000001.1,17327,T,TA,11,258.368,0,11,ins
4,2014C-3857.filt.vcf,JASV01000001.1,17335,G,G,13,292.065,0,13,del
...,...,...,...,...,...,...,...,...,...,...
125,2014C-3600.filt.vcf,JASV01000007.1,767386,A,G,87,2684.540,0,87,snp
126,2014C-3600.filt.vcf,JASV01000007.1,890046,T,C,96,3115.580,0,96,snp
127,2014C-3600.filt.vcf,JASV01000007.1,958303,T,G,93,3115.200,0,93,snp
128,2014C-3600.filt.vcf,JASV01000007.1,963356,A,G,23,692.768,0,23,snp


In [2]:
import re

sample_names = df.FILE.value_counts().index.tolist()
sample_names = [re.sub('\.filt\.vcf$', '', n) for n in sample_names]
sample_names

['2014C-3907',
 '2014C-3840',
 '2014C-3656',
 '2014C-3655',
 '2014C-3857',
 '2014C-3600',
 '2014C-3850',
 '2014C-3599',
 '2014C-3598']

# DB model

In [3]:
from bitarray import bitarray
import Bio.Seq

class CoreBitMask:
    
    def __init__(self, sequence: Bio.Seq.Seq = None, existing_bitmask: bitarray = None):
        if existing_bitmask is not None and sequence is not None:
            raise Exception(f'Cannot set both existing_bitmask={existing_bitmask} and sequence={sequence}')
            
        if existing_bitmask:
            self._core_bitmask = existing_bitmask
        elif sequence:
            self._core_bitmask = bitarray(len(sequence))
            self._core_bitmask.setall(True)
            self._add_sequence(sequence)
        else:
            raise Exception('If no existing_bitmask set then sequence must be defined')
            
    def _add_sequence(self, sequence: Bio.Seq.Seq) -> float:
        for idx, char in enumerate(sequence):
            if char.upper() == 'N' or char == '-':
                self._core_bitmask[idx] = False
                
    def get_bytes(self):
        return self._core_bitmask.tobytes()
    
    def core_length(self) -> int:
        return self._core_bitmask.count()
    
    def core_proportion(self) -> float:
        return self.core_length()/len(self)
    
    def __len__(self) -> int:
        return len(self._core_bitmask)
    
    def __getitem__(self, index: int) -> bool:
        return self._core_bitmask[index]
    
s = Bio.Seq.Seq('ATCG-NN')
mask = CoreBitMask(sequence=s)
mask.core_proportion()

0.5714285714285714

In [4]:
from typing import List, Any

from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship

from bitarray import bitarray

Base = declarative_base()
Base

from sqlalchemy import Column, Integer, String, Sequence, BigInteger, ForeignKey, Table, LargeBinary
    
association_table = Table('sample_variation_allele', Base.metadata,
    Column('sample_id', Integer, ForeignKey('sample.id')),
    Column('variantion_allele_id', String, ForeignKey('variation_allele.id')),
)

class VariationAllele(Base):
    __tablename__ = 'variation_allele'
    id = Column(String, primary_key=True)
    sequence_name = Column(String, ForeignKey('reference_sequence.id'))
    position = Column(Integer, primary_key=True)
    ref = Column(String(255), primary_key=True)
    alt = Column(String(255), primary_key=True)
    var_type = Column(String(255))
    
    samples = relationship('Sample', secondary=association_table)
    sequence = relationship('ReferenceSequence', back_populates='variants')
    
    def __init__(self, sequence_name: str = None, position: int = -1, ref: str = None, alt: str = None,
                 var_type: str = None):
        self.sequence_name = sequence_name
        self.position = position
        self.ref = ref
        self.alt = alt
        self.var_type = var_type
        
        self.id = self.to_spdi()
    
    def to_spdi(self):
        return f'{self.sequence_name}:{self.position}:{self.ref}:{self.alt}'
    
    def __repr__(self):
        return (f'<VariationAllele(sequence_name={self.sequence_name}'
                f', position={self.position}, ref={self.ref}, alt={self.alt}, var_type={self.var_type})>')

class Reference(Base):
    __tablename__ = 'reference'
    id = Column(Integer, primary_key=True)
    name = Column(String(255))
    length = Column(Integer)
    sequences = relationship('ReferenceSequence')
    
    def __repr__(self):
        return f'<Reference(id={self.id}, name={self.name}, length={self.length})>'
    
    
class SampleSequence(Base):
    __tablename__ = 'sample_sequence'
    sample_id = Column(Integer, ForeignKey('sample.id'), primary_key=True)
    sequence_id = Column(Integer, ForeignKey('reference_sequence.id'), primary_key=True)
    core_mask = Column(LargeBinary)
    flag = Column(String(255))
    
    sequence = relationship('ReferenceSequence')
    sample = relationship('Sample', back_populates='sample_sequences')
    
    def get_core_mask(self):
        if self.core_mask is None:
            raise Exception('core_mask is not set')
        else:
            barray = bitarray(self.sequence.sequence_length)
            barray.frombytes(self.core_mask)
            return CoreBitMask(existing_bitmask=barray)
        
    def set_core_mask(self, core_mask: CoreBitMask) -> None:
        if core_mask is None:
            raise Exception('Cannot set core_mask to None')
        else:
            self.core_mask = core_mask.get_bytes()
    
    def __repr__(self):
        return f'<SampleSequence(sample_id={self.sample_id}, sequence_id={self.sequence_id}, flag={self.flag})>'
    
    
class ReferenceSequence(Base):
    __tablename__ = 'reference_sequence'
    id = Column(Integer, primary_key=True)
    reference_id = Column(Integer, ForeignKey('reference.id'))
    sequence_name = Column(String(255))
    sequence_length = Column(Integer)
    variants = relationship('VariationAllele', back_populates='sequence')
    
    def __repr__(self):
        return (f'<ReferenceSequence(id={self.id}, sequence_name={self.sequence_name},'
                f'sequence_length={self.sequence_length}, reference_id={self.reference_id})>')
    
    
class Sample(Base):
    __tablename__ = 'sample'
    id = Column(Integer, primary_key=True)
    name = Column(String(255))
    
    variants = relationship('VariationAllele', secondary=association_table)
    sample_sequences = relationship('SampleSequence', back_populates='sample')
    
    def __repr__(self):
        return f'<Sample(id={self.id}, name={self.name})>'

# Create some data

In [5]:
from Bio import SeqIO

ref_name = '2011C-3609.fasta'
ref_length = 0
ref_contigs = {}
for record in SeqIO.parse(f"reference/{ref_name}", "fasta"):
    ref_contigs[record.id] = ReferenceSequence(sequence_name=record.id, sequence_length=len(record.seq))
    ref_length += len(record.seq)

reference = Reference(name = ref_name, length = ref_length, sequences=list(ref_contigs.values()))
ref_contigs

{'JASV01000001.1': <ReferenceSequence(id=None, sequence_name=JASV01000001.1,sequence_length=521264, reference_id=None)>,
 'JASV01000002.1': <ReferenceSequence(id=None, sequence_name=JASV01000002.1,sequence_length=6598, reference_id=None)>,
 'JASV01000003.1': <ReferenceSequence(id=None, sequence_name=JASV01000003.1,sequence_length=225424, reference_id=None)>,
 'JASV01000004.1': <ReferenceSequence(id=None, sequence_name=JASV01000004.1,sequence_length=3051854, reference_id=None)>,
 'JASV01000005.1': <ReferenceSequence(id=None, sequence_name=JASV01000005.1,sequence_length=49092, reference_id=None)>,
 'JASV01000006.1': <ReferenceSequence(id=None, sequence_name=JASV01000006.1,sequence_length=506004, reference_id=None)>,
 'JASV01000007.1': <ReferenceSequence(id=None, sequence_name=JASV01000007.1,sequence_length=1052450, reference_id=None)>}

In [6]:
from typing import Dict, List
import logging
import pandas as pd
import re

logger = logging.getLogger('VariationService')
logger.setLevel(logging.DEBUG)
#logging.basicConfig(level=logging.DEBUG)

class VariationService:
    
    def __init__(self, session):
        self._session = session
        
    def _create_file_variants(self, var_df: pd.DataFrame) -> Dict[str, List[VariationAllele]]:
        variant_table = {}
        file_variants = {}
        sample_sequences = {}
        for row in var_df.iterrows():
            sample_name = re.sub('\.filt\.vcf$', '', row[1]['FILE'])

            variant = VariationAllele(sequence_name=row[1]['CHROM'], position=row[1]['POS'],
                                     ref=row[1]['REF'], alt=row[1]['ALT'], var_type=row[1]['TYPE'])
            if variant.id not in variant_table:
                variant_table[variant.id] = variant
            else:    
                variant = variant_table[variant.id]

            if sample_name not in file_variants:
                file_variants[sample_name] = []

            file_variants[sample_name].append(variant)

        return file_variants
    
    def insert_variants(self, var_df: pd.DataFrame, ref_contigs: Dict[str, ReferenceSequence]) -> None:
        file_variants = self._create_file_variants(var_df)
        
        for s in file_variants:
            ref_objects = {ref_contigs[v.sequence_name] for v in file_variants[s]}
            sample_sequences = [SampleSequence(sequence=r) for r in ref_objects]
            sample = Sample(name=s, variants=file_variants[s], sample_sequences=sample_sequences)
            self._session.add(sample)
            
        self._session.commit()
        
    def pairwise_distance(self, samples: List[str], var_type = 'all', distance_type = 'jaccard') -> pd.DataFrame:
        sample_objs = self._session.query(Sample).filter(Sample.name.in_(samples)).all()
        
        if var_type == 'all':
            sample_variants = {s.name: {v.to_spdi() for v in s.variants} for s in sample_objs}
        else:
            sample_variants = {s.name: {v.to_spdi() for v in s.variants if v.var_type == var_type} for s in sample_objs}
        
        names = sample_variants.keys()
        distances = []
        for name1 in names:
            row = []
            for name2 in names:
                if name1 == name2:
                    row.append(0)
                else:
                    if distance_type == 'jaccard':
                        logger.debug(f'variants1=[{sample_variants[name1]}]')
                        logger.debug(f'variants2=[{sample_variants[name2]}]')
                        intersection = sample_variants[name1].intersection(sample_variants[name2])
                        union = sample_variants[name1].union(sample_variants[name2])
                        
#                         print(f'unique_variants1=[{sample_variants[name1] - sample_variants[name2]}]')
#                         print(f'unique_variants2=[{sample_variants[name2] - sample_variants[name1]}]')
                        
                        row.append(1 - (len(intersection)/len(union)))
                    else:
                        raise Exception(f'Unsupported distance_type=[{distance_type}]')
            distances.append(row)
            
        return pd.DataFrame(distances, columns=names, index=names)

# Insert into database

In [7]:
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

engine = create_engine('sqlite:///:memory:', echo=True)

Session = sessionmaker(bind=engine)
session = Session()
session

Base.metadata.create_all(engine)

2020-12-30 17:04:26,353 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS VARCHAR(60)) AS anon_1
2020-12-30 17:04:26,355 INFO sqlalchemy.engine.base.Engine ()
2020-12-30 17:04:26,357 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS VARCHAR(60)) AS anon_1
2020-12-30 17:04:26,358 INFO sqlalchemy.engine.base.Engine ()
2020-12-30 17:04:26,363 INFO sqlalchemy.engine.base.Engine PRAGMA main.table_info("sample_variation_allele")
2020-12-30 17:04:26,365 INFO sqlalchemy.engine.base.Engine ()
2020-12-30 17:04:26,368 INFO sqlalchemy.engine.base.Engine PRAGMA temp.table_info("sample_variation_allele")
2020-12-30 17:04:26,369 INFO sqlalchemy.engine.base.Engine ()
2020-12-30 17:04:26,373 INFO sqlalchemy.engine.base.Engine PRAGMA main.table_info("variation_allele")
2020-12-30 17:04:26,375 INFO sqlalchemy.engine.base.Engine ()
2020-12-30 17:04:26,377 INFO sqlalchemy.engine.base.Engine PRAGMA temp.table_info("variation_allele")
2020-12-30 17:04:26,379 INFO s

In [8]:
session.add(reference)
session.commit()

2020-12-30 17:04:26,498 INFO sqlalchemy.engine.base.Engine BEGIN (implicit)
2020-12-30 17:04:26,500 INFO sqlalchemy.engine.base.Engine INSERT INTO reference (name, length) VALUES (?, ?)
2020-12-30 17:04:26,501 INFO sqlalchemy.engine.base.Engine ('2011C-3609.fasta', 5412686)
2020-12-30 17:04:26,503 INFO sqlalchemy.engine.base.Engine INSERT INTO reference_sequence (reference_id, sequence_name, sequence_length) VALUES (?, ?, ?)
2020-12-30 17:04:26,504 INFO sqlalchemy.engine.base.Engine (1, 'JASV01000001.1', 521264)
2020-12-30 17:04:26,505 INFO sqlalchemy.engine.base.Engine INSERT INTO reference_sequence (reference_id, sequence_name, sequence_length) VALUES (?, ?, ?)
2020-12-30 17:04:26,506 INFO sqlalchemy.engine.base.Engine (1, 'JASV01000002.1', 6598)
2020-12-30 17:04:26,507 INFO sqlalchemy.engine.base.Engine INSERT INTO reference_sequence (reference_id, sequence_name, sequence_length) VALUES (?, ?, ?)
2020-12-30 17:04:26,508 INFO sqlalchemy.engine.base.Engine (1, 'JASV01000003.1', 225424

In [9]:
ref = session.query(Reference).filter_by(id = 1).first()
ref_contigs = {c.sequence_name: c for c in ref.sequences}
ref_contigs

2020-12-30 17:04:26,670 INFO sqlalchemy.engine.base.Engine BEGIN (implicit)
2020-12-30 17:04:26,674 INFO sqlalchemy.engine.base.Engine SELECT reference.id AS reference_id, reference.name AS reference_name, reference.length AS reference_length 
FROM reference 
WHERE reference.id = ?
 LIMIT ? OFFSET ?
2020-12-30 17:04:26,676 INFO sqlalchemy.engine.base.Engine (1, 1, 0)
2020-12-30 17:04:26,683 INFO sqlalchemy.engine.base.Engine SELECT reference_sequence.id AS reference_sequence_id, reference_sequence.reference_id AS reference_sequence_reference_id, reference_sequence.sequence_name AS reference_sequence_sequence_name, reference_sequence.sequence_length AS reference_sequence_sequence_length 
FROM reference_sequence 
WHERE ? = reference_sequence.reference_id
2020-12-30 17:04:26,689 INFO sqlalchemy.engine.base.Engine (1,)


{'JASV01000001.1': <ReferenceSequence(id=1, sequence_name=JASV01000001.1,sequence_length=521264, reference_id=1)>,
 'JASV01000002.1': <ReferenceSequence(id=2, sequence_name=JASV01000002.1,sequence_length=6598, reference_id=1)>,
 'JASV01000003.1': <ReferenceSequence(id=3, sequence_name=JASV01000003.1,sequence_length=225424, reference_id=1)>,
 'JASV01000004.1': <ReferenceSequence(id=4, sequence_name=JASV01000004.1,sequence_length=3051854, reference_id=1)>,
 'JASV01000005.1': <ReferenceSequence(id=5, sequence_name=JASV01000005.1,sequence_length=49092, reference_id=1)>,
 'JASV01000006.1': <ReferenceSequence(id=6, sequence_name=JASV01000006.1,sequence_length=506004, reference_id=1)>,
 'JASV01000007.1': <ReferenceSequence(id=7, sequence_name=JASV01000007.1,sequence_length=1052450, reference_id=1)>}

In [10]:
variation_service = VariationService(session)

variation_service.insert_variants(df, ref_contigs)

2020-12-30 17:04:27,071 INFO sqlalchemy.engine.base.Engine INSERT INTO variation_allele (id, sequence_name, position, ref, alt, var_type) VALUES (?, ?, ?, ?, ?, ?)
2020-12-30 17:04:27,072 INFO sqlalchemy.engine.base.Engine (('JASV01000001.1:16854:T:C', 'JASV01000001.1', 16854, 'T', 'C', 'snp'), ('JASV01000001.1:16871:C:C', 'JASV01000001.1', 16871, 'C', 'C', 'del'), ('JASV01000001.1:16897:G:C', 'JASV01000001.1', 16897, 'G', 'C', 'snp'), ('JASV01000001.1:17327:T:TA', 'JASV01000001.1', 17327, 'T', 'TA', 'ins'), ('JASV01000001.1:17335:G:G', 'JASV01000001.1', 17335, 'G', 'G', 'del'), ('JASV01000001.1:17347:C:C', 'JASV01000001.1', 17347, 'C', 'C', 'del'), ('JASV01000001.1:17360:T:A', 'JASV01000001.1', 17360, 'T', 'A', 'snp'), ('JASV01000001.1:17468:T:C', 'JASV01000001.1', 17468, 'T', 'C', 'snp')  ... displaying 10 of 601 total bound parameter sets ...  ('JASV01000007.1:13994:C:A', 'JASV01000007.1', 13994, 'C', 'A', 'snp'), ('JASV01000007.1:963356:A:G', 'JASV01000007.1', 963356, 'A', 'G', 'sn

In [11]:
variation_service.pairwise_distance(['2014C-3857', '2014C-3600'], var_type='snp')
#variation_service.pairwise_distance(sample_names, var_type='snp')

2020-12-30 17:04:27,136 INFO sqlalchemy.engine.base.Engine BEGIN (implicit)
2020-12-30 17:04:27,139 INFO sqlalchemy.engine.base.Engine SELECT sample.id AS sample_id, sample.name AS sample_name 
FROM sample 
WHERE sample.name IN (?, ?)
2020-12-30 17:04:27,140 INFO sqlalchemy.engine.base.Engine ('2014C-3857', '2014C-3600')
2020-12-30 17:04:27,147 INFO sqlalchemy.engine.base.Engine SELECT variation_allele.id AS variation_allele_id, variation_allele.sequence_name AS variation_allele_sequence_name, variation_allele.position AS variation_allele_position, variation_allele.ref AS variation_allele_ref, variation_allele.alt AS variation_allele_alt, variation_allele.var_type AS variation_allele_var_type 
FROM variation_allele, sample_variation_allele 
WHERE ? = sample_variation_allele.sample_id AND variation_allele.id = sample_variation_allele.variantion_allele_id
2020-12-30 17:04:27,149 INFO sqlalchemy.engine.base.Engine (1,)
2020-12-30 17:04:27,154 INFO sqlalchemy.engine.base.Engine SELECT vari

Unnamed: 0,2014C-3857,2014C-3600
2014C-3857,0.0,0.722543
2014C-3600,0.722543,0.0


In [12]:
v = session.query(VariationAllele).filter_by(position = 16854).all()
v

2020-12-30 17:04:27,203 INFO sqlalchemy.engine.base.Engine SELECT variation_allele.id AS variation_allele_id, variation_allele.sequence_name AS variation_allele_sequence_name, variation_allele.position AS variation_allele_position, variation_allele.ref AS variation_allele_ref, variation_allele.alt AS variation_allele_alt, variation_allele.var_type AS variation_allele_var_type 
FROM variation_allele 
WHERE variation_allele.position = ?
2020-12-30 17:04:27,204 INFO sqlalchemy.engine.base.Engine (16854,)


[<VariationAllele(sequence_name=JASV01000001.1, position=16854, ref=T, alt=C, var_type=snp)>]

In [13]:
v[0].sequence

2020-12-30 17:04:27,369 INFO sqlalchemy.engine.base.Engine SELECT reference_sequence.id AS reference_sequence_id, reference_sequence.reference_id AS reference_sequence_reference_id, reference_sequence.sequence_name AS reference_sequence_sequence_name, reference_sequence.sequence_length AS reference_sequence_sequence_length 
FROM reference_sequence 
WHERE reference_sequence.id = ?
2020-12-30 17:04:27,370 INFO sqlalchemy.engine.base.Engine ('JASV01000001.1',)


In [14]:
r = session.query(ReferenceSequence).filter_by(sequence_name = 'JASV01000003.1').first()
r

2020-12-30 17:04:27,553 INFO sqlalchemy.engine.base.Engine SELECT reference_sequence.id AS reference_sequence_id, reference_sequence.reference_id AS reference_sequence_reference_id, reference_sequence.sequence_name AS reference_sequence_sequence_name, reference_sequence.sequence_length AS reference_sequence_sequence_length 
FROM reference_sequence 
WHERE reference_sequence.sequence_name = ?
 LIMIT ? OFFSET ?
2020-12-30 17:04:27,554 INFO sqlalchemy.engine.base.Engine ('JASV01000003.1', 1, 0)


<ReferenceSequence(id=3, sequence_name=JASV01000003.1,sequence_length=225424, reference_id=1)>

In [15]:
r.variants

2020-12-30 17:04:27,730 INFO sqlalchemy.engine.base.Engine SELECT variation_allele.id AS variation_allele_id, variation_allele.sequence_name AS variation_allele_sequence_name, variation_allele.position AS variation_allele_position, variation_allele.ref AS variation_allele_ref, variation_allele.alt AS variation_allele_alt, variation_allele.var_type AS variation_allele_var_type 
FROM variation_allele 
WHERE ? = variation_allele.sequence_name
2020-12-30 17:04:27,731 INFO sqlalchemy.engine.base.Engine (3,)


[]

In [16]:
s = session.query(Sample).filter_by(name='2014C-3857').first()
s

2020-12-30 17:04:27,913 INFO sqlalchemy.engine.base.Engine SELECT sample.id AS sample_id, sample.name AS sample_name 
FROM sample 
WHERE sample.name = ?
 LIMIT ? OFFSET ?
2020-12-30 17:04:27,914 INFO sqlalchemy.engine.base.Engine ('2014C-3857', 1, 0)


<Sample(id=1, name=2014C-3857)>

In [17]:
s.sample_sequences

2020-12-30 17:04:28,082 INFO sqlalchemy.engine.base.Engine SELECT sample_sequence.sample_id AS sample_sequence_sample_id, sample_sequence.sequence_id AS sample_sequence_sequence_id, sample_sequence.core_mask AS sample_sequence_core_mask, sample_sequence.flag AS sample_sequence_flag 
FROM sample_sequence 
WHERE ? = sample_sequence.sample_id
2020-12-30 17:04:28,084 INFO sqlalchemy.engine.base.Engine (1,)


[<SampleSequence(sample_id=1, sequence_id=1, flag=None)>,
 <SampleSequence(sample_id=1, sequence_id=3, flag=None)>,
 <SampleSequence(sample_id=1, sequence_id=4, flag=None)>,
 <SampleSequence(sample_id=1, sequence_id=5, flag=None)>,
 <SampleSequence(sample_id=1, sequence_id=6, flag=None)>,
 <SampleSequence(sample_id=1, sequence_id=7, flag=None)>]

In [18]:
s.variants[0].id

2020-12-30 17:04:28,259 INFO sqlalchemy.engine.base.Engine SELECT variation_allele.id AS variation_allele_id, variation_allele.sequence_name AS variation_allele_sequence_name, variation_allele.position AS variation_allele_position, variation_allele.ref AS variation_allele_ref, variation_allele.alt AS variation_allele_alt, variation_allele.var_type AS variation_allele_var_type 
FROM variation_allele, sample_variation_allele 
WHERE ? = sample_variation_allele.sample_id AND variation_allele.id = sample_variation_allele.variantion_allele_id
2020-12-30 17:04:28,261 INFO sqlalchemy.engine.base.Engine (1,)


'JASV01000001.1:16854:T:C'

In [19]:
v = session.query(VariationAllele).filter_by(id='JASV01000001.1:16854:T:C').first()
v.samples

2020-12-30 17:04:28,450 INFO sqlalchemy.engine.base.Engine SELECT variation_allele.id AS variation_allele_id, variation_allele.sequence_name AS variation_allele_sequence_name, variation_allele.position AS variation_allele_position, variation_allele.ref AS variation_allele_ref, variation_allele.alt AS variation_allele_alt, variation_allele.var_type AS variation_allele_var_type 
FROM variation_allele 
WHERE variation_allele.id = ?
 LIMIT ? OFFSET ?
2020-12-30 17:04:28,452 INFO sqlalchemy.engine.base.Engine ('JASV01000001.1:16854:T:C', 1, 0)
2020-12-30 17:04:28,455 INFO sqlalchemy.engine.base.Engine SELECT sample.id AS sample_id, sample.name AS sample_name 
FROM sample, sample_variation_allele 
WHERE ? = sample_variation_allele.variantion_allele_id AND sample.id = sample_variation_allele.sample_id
2020-12-30 17:04:28,456 INFO sqlalchemy.engine.base.Engine ('JASV01000001.1:16854:T:C',)


[<Sample(id=8, name=2014C-3850)>,
 <Sample(id=9, name=2014C-3600)>,
 <Sample(id=6, name=2014C-3907)>,
 <Sample(id=1, name=2014C-3857)>,
 <Sample(id=4, name=2014C-3840)>,
 <Sample(id=3, name=2014C-3598)>,
 <Sample(id=2, name=2014C-3655)>,
 <Sample(id=5, name=2014C-3599)>,
 <Sample(id=7, name=2014C-3656)>]

In [20]:
ss = session.query(SampleSequence).first()
ss

2020-12-30 17:04:28,649 INFO sqlalchemy.engine.base.Engine SELECT sample_sequence.sample_id AS sample_sequence_sample_id, sample_sequence.sequence_id AS sample_sequence_sequence_id, sample_sequence.core_mask AS sample_sequence_core_mask, sample_sequence.flag AS sample_sequence_flag 
FROM sample_sequence
 LIMIT ? OFFSET ?
2020-12-30 17:04:28,650 INFO sqlalchemy.engine.base.Engine (1, 0)


<SampleSequence(sample_id=1, sequence_id=3, flag=None)>

In [21]:
s = Bio.Seq.Seq('ATCG-NN')
mask = CoreBitMask(sequence=s)
mask.core_proportion()

0.5714285714285714

In [22]:
ss.set_core_mask(mask)
session.commit()

2020-12-30 17:04:29,134 INFO sqlalchemy.engine.base.Engine UPDATE sample_sequence SET core_mask=? WHERE sample_sequence.sample_id = ? AND sample_sequence.sequence_id = ?
2020-12-30 17:04:29,137 INFO sqlalchemy.engine.base.Engine (<memory at 0x7f53e72aa040>, 1, 3)
2020-12-30 17:04:29,144 INFO sqlalchemy.engine.base.Engine COMMIT


In [23]:
ss = session.query(SampleSequence).first()
ss

2020-12-30 17:04:29,610 INFO sqlalchemy.engine.base.Engine BEGIN (implicit)
2020-12-30 17:04:29,614 INFO sqlalchemy.engine.base.Engine SELECT sample_sequence.sample_id AS sample_sequence_sample_id, sample_sequence.sequence_id AS sample_sequence_sequence_id, sample_sequence.core_mask AS sample_sequence_core_mask, sample_sequence.flag AS sample_sequence_flag 
FROM sample_sequence
 LIMIT ? OFFSET ?
2020-12-30 17:04:29,619 INFO sqlalchemy.engine.base.Engine (1, 0)


<SampleSequence(sample_id=1, sequence_id=3, flag=None)>

In [31]:
ss.get_core_mask().core_length()

95631

In [25]:
ss.sequence

<ReferenceSequence(id=3, sequence_name=JASV01000003.1,sequence_length=225424, reference_id=1)>