In [2]:
import pandas as pd
import numpy as np
from fastparquet import write
from Bio import SeqIO

In [3]:
ifile = "../../data/reference/by-chrom/chr14.fa"
snp_file = "../../data/variant-calls/253.snps.vcf"

In [6]:
seq_data = SeqIO.to_dict(SeqIO.parse(open(ifile), 'fasta'))

AttributeError: module 'Bio.SeqIO' has no attribute 'to_list'

In [5]:
seq_data

<Bio.SeqIO.FastaIO.FastaIterator at 0x174c3cdd0>

In [3]:
def read_vcf(ifile, chrom):
    '''
    Reads and extracts relevant information from VCF file
    '''
    # ifile = "../../data/variant-calls/253.snps.vcf"
    df = pd.read_csv(ifile, 
                     sep='\t', 
                     comment='#', 
                     header=None, 
                     usecols=[0, 1, 3, 4, 9],
                     names=['chrom', 'pos', 'reference', 'alternate', 'extra_info']
                     )
    
    # Filter to shrink the number of comparisons that need to be made
    df = df.loc[df['chrom'] == chrom]
    
    # Pull out the SNP call
    df['variant_call'] = df['extra_info'].str[:3]

    # Get rid of Ns, indicate that ref homozygous
    df = df[df['reference'].isin(['A', 'C', 'G', 'T'])]

    # Makes iteration work
    df = df.drop(columns=['extra_info'])
    df = df.reset_index(drop=True)

    return df


In [4]:
snp_df = read_vcf("../../data/variant-calls/253.snps.vcf", 'chr14')

In [5]:

def encode_one_letter(x):

    out = np.zeros((4,),dtype = 'float32')

    if x == "A":
        out[0] = 1
    elif x == "C":
        out[1] = 1
    elif x == "G":
        out[2] = 1
    elif x == "T":
        out[3] = 1
    
    return(out)

def encode_snp(row):
    '''
    Given chrom	pos	reference	alternate	variant_call
    encode with fractionals
    '''
    ref, alt, vc = row['reference'], row['alternate'], row['variant_call']

    if vc == "0/1":
        output = (encode_one_letter(ref) + encode_one_letter(alt)) / 2
    elif vc == "1/1":
        output = encode_one_letter(alt)
    elif vc == "1/2":
        alt_split = alt.split(alt)
        non_alt = np.setdiff1d(['A', 'C', 'G', 'T'], [ref])
        # output = (encode_one_letter(minor2[0]) + encode_one_letter(minor2[1])) / 4 + encode_one_letter(alt) / 2
        output = (encode_one_letter(non_alt[0]) + encode_one_letter(non_alt[1]) + encode_one_letter(non_alt[2])) / 3
    return output

In [6]:
def run_one_hot_encoder(sequence, snp_df):

    l = len(sequence)
    x = np.zeros((l, 4),dtype = 'float32')

    # Remember that i starts at zero, 
    # whereas the positions in the VCF start at 1
    for i, nt in enumerate(sequence):
        # For position
        p = i + 1

        if p in snp_df.index:
            x[i, :] = encode_snp(snp_df.loc[p])
        else:
            x[i, :] = encode_one_letter(nt)
    return x

In [8]:
out = run_one_hot_encoder(seq_data['chr14'], snp_df)

In [18]:
out[np.random.randint(out.shape[0], size=10), :]

array([[1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]], dtype=float32)

In [19]:
start = [x for x in range(out.shape[0])]

In [20]:
out = pd.DataFrame(out, columns=['A','C', 'G', 'T'])


In [22]:
out['chrom'] = 'chr14'
out['start'] = start
out['end'] = out['start'] + 1

In [26]:
out = out[['chrom', 'start', 'end', 'A', 'C', 'G', 'T']]

In [27]:
out.head()

Unnamed: 0,chrom,start,end,A,C,G,T
0,chr14,0,1,0.0,0.5,0.5,0.0
1,chr14,1,2,0.0,0.0,0.5,0.5
2,chr14,2,3,0.5,0.5,0.0,0.0
3,chr14,3,4,0.0,0.5,0.5,0.0
4,chr14,4,5,0.0,0.0,1.0,0.0


In [30]:
out.to_csv("test.csv")

In [3]:
a = "../../data/reference/by-chrom/chr19.fa"

a.split("/")[-1].split(".")[0]

'chr19'