In [1]:
import pandas as pd
import numpy as np
import vcf
import time
import re
import collections
import datetime
from scipy.stats import binom

In [2]:
class Pileup_line:
    def __init__(self, line): 
        split_line = line.split("\t")
        self.sequence = split_line[0]
        self.position = split_line[1]
        self.ref_base = split_line[2]
        self.read_count = split_line[3]
        self.read_results = split_line[4]
        self.base_quality = split_line[5]
        self.mapping_quality = split_line[6]
        
    # Pomocna funkcija za racunanje kvaliteta baze iz odgovarajuceg ascii karaktera
    # Phred quality score: https://en.wikipedia.org/wiki/Phred_quality_score
    def calc_base_quality(ascii_char):
        return 1-10**(-ord(ascii_char)/10)
    
    # Ista funkcija kao gore, samo sto se prosledi redni broj baze/read-a
    def calc_base_quality(self, base_num):
        # base_num < self.read_count !!!!
        return 1-10**(-ord(self.base_quality[base_num])/10)
    
    def ideal_rr_count(self, read_result):
    # read_result je string pa mogu odmah da promijenim , u .
    # i mala slova u velika
        read_result = read_result.replace(',','.').upper()
        char_counter = collections.Counter(read_result).most_common() # Prebroji karaktere i poreda ih od najcescih do najrjedje  
        var_list = char_counter 
        return var_list

    # Funkcija za izdvajanjanje insercija ili delecija ili oboje istovremeno, iz originalnog stringa read_result
    # Povratna vrijednost su dvije lista, prva lista je lista tacaka, zareza i mozda drugih karaktera
    # Druga lista je lista insercija i delecija
    def segregate_indels(self, read_result):
        # Iskoristila sam re, da na laksi nacin pronadjem brojeve, jer su oni zajednicki za insercije i delecije
        # Mozda je ovo moguce napraviti i tako sto se prvo read result iz stringa prebaci u array, ali da se pripazi na visecifrene brojeve
        # Pa da se iskoristi np.findall ili slicno da se nadju ti brojevi, i njihovi indeksi i na slican nacin kao u ovom kodu, izbace odgovarajuci el iz tog arraya
        numbers = np.array([int(m) for m in re.findall('[\d]+', read_result)])
        # Indeksi brojeva
        indices = np.array([m.span() for m in re.finditer('[\d]+', read_result)])
        # String -> np.array
        read_result = np.array([char for char in read_result])
        # Formiram listu uredjenih dvojki kojih ima koliko i indelsa, a prvi broj u toj dvojki odgovara pocetnom indeksu insercije, tj tacki ili zarezu, 
        # A drugi broj odgovara indeksu na kom se nalazi zadnji element te insercije
        ranges = [(index[0]-2,index[1]+numbers[i]) for i,index in enumerate(indices)]
        # Formiranje liste insercija na koju je moguce primjeniti collection.counter
        indels_list = []
        for item in ranges:
            indels_list.append(''.join(read_result[item[0]:item[1]]))

        # Brisanje insercija iz read_results
        indices_for_removing = []
        for item in ranges:
            indices_for_removing.extend([*range(item[0],item[1])])
        read_result = np.delete(read_result,indices_for_removing)
        # read_result to string
        read_result = ''.join(read_result[:])
        
        return read_result, indels_list

    def delete_carets(self, read_result):
        read_result = re.sub('\^.','',read_result)
        return read_result

    def determine_variant(self):
        read_result = self.read_results
        read_count = self.read_count
        # Ovo je prva opcija, znaci da koristim find koja ce samo da posluzi za flag
        ins = read_result.find("+") # Flag za inserciju
        dele = read_result.find("-") # Flag za deleciju
        car = read_result.find("^") # Flag za caret
        # Druga opcija je da koristim re bibl da nadjem odmah i indekse svih insercija, delecija i kapica, kako bih onda mogla da ih izbacim
        # Zasad ostavljam prvu opciju
  
        if ins == -1 and dele == -1 and car == -1:
            #print("No insertion, deletions or carets")
            read_result = re.sub('\$','',read_result) # PROBAAAA
            list_of_vars = self.ideal_rr_count(read_result)
            # TO DO
            #Varijanta1 = list_of_vars[0]
            #Varijanta2 = list_of_vars[1]
            Varijanta1 = Variation(list_of_vars[0][0],list_of_vars[0][1], self.read_count)
            if len(list_of_vars) > 1:
                Varijanta2 = Variation(list_of_vars[1][0],list_of_vars[1][1], self.read_count)
            else:
                Varijanta2 = None

        # Sljedeci slucaj je da nemamo ni insercija ni delecija samo kapicu imamo, izbrisemo kapicu i znak pored, i primjenimo ovo gore
        elif (car != -1) and (ins == -1 and dele == -1):
            #print("Carets yes, no indels")
            #print('Prije brisanja:', read_result)
            #print(read_count)
            read_result = self.delete_carets(read_result)
            read_result = re.sub('\$','',read_result) # PROBAAAA
            # Prakticno smo dobili idealan read result i mozemo pozvati funkciju ideal_rr_count
            list_of_vars = self.ideal_rr_count(read_result)
            #print('Poslije brisanja:', read_result)
            #Varijanta1 = list_of_vars[0]
            #Varijanta2 = list_of_vars[1]
            Varijanta1 = Variation(list_of_vars[0][0],list_of_vars[0][1], self.read_count)
            if len(list_of_vars) > 1:
                Varijanta2 = Variation(list_of_vars[1][0],list_of_vars[1][1], self.read_count)
            else:
                Varijanta2 = None

        # Ukoliko imam insercije ili delecije, ili oboje istovremeno
        elif (ins != -1) or (dele != -1):
            # Provjeri ima li kapica
            if car != -1:
                #print("Carets! Oh, no!!!")
                read_result = self.delete_carets(read_result)
            #print("No carets")
            read_result = re.sub('\$','',read_result) # PROBAAAA
            read_result, indels_list = self.segregate_indels(read_result)

            # zarezi -> tacke, mala slova -> VELIKA SLOVA
            indels_list = [s.replace(',','.') for s in indels_list]
            number_of_insertions = collections.Counter(map(str.upper, indels_list)).most_common()
            # Pozvati ideal_rr_count na read_result
            list_of_vars = self.ideal_rr_count(read_result)
            list_of_vars.extend(number_of_insertions)
            list_of_vars = sorted(list_of_vars, key=lambda tup: tup[1], reverse = True)
            #print(self.position, ' ', list_of_vars)

            #Varijanta1 = list_of_vars[0]
            #Varijanta2 = list_of_vars[1]
            Varijanta1 = Variation(list_of_vars[0][0],list_of_vars[0][1], self.read_count)
            if len(list_of_vars) > 1:
                Varijanta2 = Variation(list_of_vars[1][0],list_of_vars[1][1], self.read_count)
            else:
                Varijanta2 = None
        #print("Varijanta1:", Varijanta1)
        #print("Varijanta2:", Varijanta2)

        return Varijanta1, Varijanta2
            
    def determine_genotype(self,variation1, variation2):
        genotype_list = []
        if variation2 is None:
            if self.position == '18027817':
                print(1)
            genotype_list.append(variation1)
            if variation1.variation_type == 'REF':
                genotype = '0/0'
            else:
                genotype = '0/1'
            return genotype, genotype_list
            
        k1 = variation1.number_of_supporting_reads
        k2 = variation2.number_of_supporting_reads
        n = k1+k2
        p = 0.9 # ovaj broj moze da se modifikuje i da predstavlja kvalitet baza
        P_k1k1 = binom.pmf(k1,n,p)  # OVDE BILA GRESKA PLAKY
        P_k1k2 = binom.pmf(n,n,p)
        P_k2k2 = binom.pmf(k2,n,p)
        # napomena: brojevi genotipa 1, 2 .. zavise od toga kojim se redosledom pojavljuju u ALT polju u vcf fajlu
        # ali cu ja ovde da radim ovako posto mi nista drugacije ne pada na pamet:
        # na osnovu onoga sto dobijem ovde za genotip cu da formiram ALT polje
        # 1) ako je genotip 0/0 nemamo alt polje uopste
        # 2) ako je genotip 1/1, ili 0/1 imam samo jedan slucaj u ALT polju
        # 3) ako je genotip 1/2 imacu dva slucaja u ALT polju
        # 4) pretpostavljam da necemo imati vise od 3 slucaja u alt polju
        if P_k1k1 >= P_k1k2 and P_k1k1 >= P_k2k2: # genotype = 'k1/k1'
            if self.position == '18027817':
                print('prvi')
            if variation1.variation_type == 'REF':
                genotype = '0/0'
            else:
                genotype = '1/1'
                genotype_list.append(variation1)
        elif P_k1k2 >= P_k1k1 and P_k1k2 >= P_k2k2: # genotype = 'k1/k2'
            if self.position == '18027817':
                print('drugi')
            if variation1.variation_type == 'REF' or variation2.variation_type == 'REF':
                genotype = '0/1'
                if variation1.variation_type == 'REF':
                    genotype_list.append(variation2)
                else:
                    genotype_list.append(variation1)
            else:
                genotype = '1/2'
                genotype_list.append(variation1)
                genotype_list.append(variation2)
        #elif P_k1k1 == P_k1k2 or P_k1k1 == P_k2k2 or P_k1k2 == P_k2k2:
            
        else: # genotype = 'k2/k2'
            if self.position == '18027817':
                print('treci')
            if variation2.variation_type == 'REF':
                genotype = '0/0'
            else:
                genotype = '1/1'
                genotype_list.append(variation2)

        '''print(self.position, 'variation1:', variation1.variation_type, ' , ', variation1.variation_sequence, ', ', variation1.number_of_supporting_reads, '.......variation2:', variation2.variation_type, ' , ', variation2.variation_sequence, ', ', variation2.number_of_supporting_reads)
        print('Pk1k1', P_k1k1, 'Pk1k2', P_k1k2, 'Pk2k2', P_k2k2)
        print(genotype, genotype_list[0].variation_sequence)'''
        if self.position == '18027817': #29819004
            print('var1', variation1.variation_sequence)
            print('var2', variation2.variation_sequence)
            print('Pk1k1', P_k1k1, 'Pk1k2', P_k1k2, 'Pk2k2', P_k2k2)
        return genotype, genotype_list
        

In [3]:
class Variation:
    # variation_type = {'SNV', 'insertion', 'deletion', 'inversion ?'}
    # variation_sequence = string od velikih slova, npr 'A' za SNV, 'AG' za inserciju/deleciju
    # quality ??
    def __init__(self, sequence, number_of_supporting_reads, total_num_of_reads): 
        self.variation_type, self.variation_sequence, self.variation_len = self.convert_pileup_string(sequence)
        self.number_of_supporting_reads = number_of_supporting_reads
        self.VAF = float(number_of_supporting_reads)/float(total_num_of_reads)

    # Metoda prima string koji je sekvenca preuzeta iz pileup fajla, a vraca dva izlaza na osnovu te sekvence. 
    # Prvi je variation_type = {'REF', 'SNV', 'insertion', 'deletion'}, a drugi je variation_sequence. Variation_sequence za REF je '.',
    # za SNV odgovarajuce veliko slovo npr 'A', a za inserciju i deleciju je string od insertovanih/deletovanih baza npr 'AGC'
    def convert_pileup_string(self, sequence):
        if len(sequence) == 1:
            variation_sequence = sequence
            variation_len = 1
            if sequence == '.': 
                variation_type = 'REF'
            else:
                variation_type = 'SNV'
        else:
            if sequence[1] == '+':
                variation_type = 'insertion'
            else:
                variation_type = 'deletion'
            # kod ispod konvertuje string oblika ,+3AGC u AGC
            sequence = sequence[2:]
            variation_sequence = ''.join(filter(str.isalpha, sequence))
            variation_len = int(''.join(filter(str.isdigit, sequence)))
        return variation_type, variation_sequence, variation_len
        

In [12]:
with open('/sbgenomics/project-files/merged-normal_pileup.pileup', 'r') as pileup_file, open('vcf4-2.txt', 'w') as vcf:
    vcf = create_vcf_header(vcf)
    start = time.time()
    for line in iter(pileup_file.readline, ''):
        position = pileup_file.tell() # uzimamo poziciju dokle smo stigli
        pileup = Pileup_line(line)
        if pileup.read_count == '0':
            genotype = '0/0'
        else:
            variation1, variation2 = pileup.determine_variant()
            genotype, genotype_list = pileup.determine_genotype(variation1, variation2)
            
        if genotype == '0/0':
            continue
        elif genotype == '1/1' or genotype == '0/1': # proveri jel / ili \
            variation = genotype_list[0]
            VAF = variation.VAF
            if variation.variation_type == 'insertion':
                REF = pileup.ref_base
                ALT = pileup.ref_base + variation.variation_sequence
            elif variation.variation_type == 'deletion':
                REF = pileup.ref_base
                ALT = pileup.ref_base
            else: # SNV
                REF = pileup.ref_base
                ALT = variation.variation_sequence
        
            if variation.variation_type == 'insertion' or variation.variation_type == 'deletion':
                condition = True
                del_temp = False # del_temp nam treba samo u slucaju delecije
                if variation.variation_type == 'deletion':
                    del_temp = True
                while(condition):
                    for i in range(variation.variation_len):
                        line = pileup_file.readline()
                        pileup_temp = Pileup_line(line)
                        if variation.variation_sequence[i] == pileup_temp.ref_base:
                            REF += pileup_temp.ref_base
                            del_position = pileup_file.tell()
                            if not del_temp:
                                ALT += pileup_temp.ref_base
                        else:
                            condition = False
                            if variation.variation_type == 'insertion': 
                                pileup_file.seek(position)
                            #else:
                                #pileup_file.seek(del_position)
                            break
                    del_temp = False
        else: # genotype == '1/2' # OVAJ ELSE NIJE ZAVRSEN!!!!
            REF = ''
            ALT = ''
            VAF = ''
            for j in range(2):
                variation = genotype_list[j]
                VAF += '' + str(variation.VAF)
                if variation.variation_type == 'insertion':
                    REF += pileup.ref_base  # kako se formira uopste ref kad imamo 2 alta?? sta ako je na jednom insercija, a na drugom delecija, ??? promenljiva REF treba da se modifikuje kad skontamo ovo... ALT bi trebalo da je okej
                    ALT += pileup.ref_base + variation.variation_sequence
                elif variation.variation_type == 'deletion': # OVO TREBA PREPRAVITI
                    REF += pileup.ref_base #BEZ VARIAION SEQUENCE
                    ALT += pileup.ref_base
                else: # SNV
                    REF += pileup.ref_base
                    ALT += variation.variation_sequence
        
                if variation.variation_type == 'insertion' or variation.variation_type == 'deletion':
                    condition = True
                    del_temp = False # del_temp nam treba samo u slucaju delecije
                    if variation.variation_type == 'deletion':
                        del_temp = True
                    while(condition):
                        for i in range(variation.variation_len):
                            line = pileup_file.readline()
                            pileup_temp = Pileup_line(line)
                            if variation.variation_sequence[i] == pileup_temp.ref_base:
                                REF += pileup_temp.ref_base
                                del_position = pileup_file.tell()
                                if not del_temp:
                                    ALT += pileup_temp.ref_base
                            else:
                                condition = False
                                # ovaj kod ispod je dodat zbog delecije, da ne bismo dodavali suvisne *
                                # proveri da li je ovo ok za mikrosatelite! 
                                if variation.variation_type == 'insertion' or j == 0: 
                                    pileup_file.seek(position)
                                #else:
                                    #pileup_file.seek(del_position)
                                break
                        del_temp = False # ova promenljiva je flag za prvi pass while petlje
                if j == 0:
                    ALT += ',' 
                    REF += ','
                    VAF += ','
        
        vcf.write(pileup.sequence + '\t' + pileup.position + '\t' + '.' + '\t' + REF + '\t' + ALT + '\t' + 'QUAL' + '\t' + '.' + '\t' + '.' + '\t' + 'GT:VAF' + '\t' + genotype + ':' + str(VAF) + '\n')
                
    end = time.time()
    print(end-start)

drugi
var1 TT
var2 T
Pk1k1 0.18 Pk1k2 0.81 Pk2k2 0.18
186.5842728614807


In [4]:
def create_vcf_header(vcf):
    x = datetime.datetime.now()
    fileformat = '##fileformat=VCFv4.2'
    date = x.strftime('%Y%m%d')
    source = '##source=Jadranka&Suki'
    filedate = '##fileDate='+date
    reference = '##reference=file:///sbgenomics/Projects/4dc3fffd-51c3-42f3-885b-8b18384be984/human_g1k_v37_decoy.fasta'
    vcf.write(fileformat+'\n' + filedate + '\n' + source + '\n' + reference + '\n')
    with open('/sbgenomics/project-files/human_g1k_v37_decoy.fasta.fai','r') as fai:
        for line in fai:
            split_line = line.split("\t")
            ID = split_line[0]
            length = split_line[1]
            contig = '##contig=<ID=' + str(ID) + ', length=' + str(length) + '>'
            vcf.write(contig + '\n')
    fai.close()
    filter = '##FILTER=<ID=PASS,Description="All filters passed">'
    vcf.write(filter + '\n')
    vcf.write('#CHROM' + '\t' + 'POS' + '\t' + 'ID' + '\t' + 'REF' + '\t' + 'ALT' + '\t' + 'QUAL' + '\t' + 'FILTER' + '\t' + 'INFO' + '\t' + 'FORMAT' + '\t' + 'HCC1143BL' + '\n')
    return vcf # mozda i ne treba ovo return???

In [10]:
int('225')+5

230