In [1]:
import pysam
from pysam import AlignmentFile
from collections import defaultdict

## Indicizzare e aprire il file

In [2]:
# Build the index for the BAM file first, then open the same file in
# binary-read mode; the handle is used by the pileup cells further down.
bam_path = './reads.aln.bam'
pysam.index(bam_path)
bam_file = AlignmentFile(bam_path, mode='rb')

## Aprire il reference

In [3]:
# Load the reference sequence from ref.fa into one uppercase string.
# Header lines (starting with ">") are skipped and every remaining line is
# stripped and concatenated, so all records end up in a single sequence.
ref_chunks = []
# The context manager guarantees the file handle is closed after reading.
with open('ref.fa', 'r') as fasta:
    for line in fasta:
        if not line.startswith(">"):
            ref_chunks.append(line.strip().upper())
# Join once at the end instead of growing the string line by line.
ref = ''.join(ref_chunks)

# Quick sanity check: first 100 bases and total length of the reference.
print(ref[:100])
print(len(ref))

GATCCACCCGCCTTGGCCTCCTAAAGTGCTGGGATTACAGGTGTTAGCCACCACGTCCAGCTGTTAATTTTTATTTAATAAGAATGACAGAGTGAGGGCC
4999950


## Usare `pileup` per trovare le posizioni che non concordano tra REFERENCE e READS

Attenzione a quante/quali posizioni alternative sono presenti nelle reads

In [None]:
# Scan every pileup column and print the positions where the reads disagree
# with the reference, as tab-separated: position, REF base, ALT base(s).
# The printed position is the 0-based coordinate reported by pysam.
# NOTE(review): `ref` is indexed directly by the column position, which
# assumes the BAM contains a single contig matching ref.fa -- confirm.
for pileupcolumn in bam_file.pileup(min_base_quality = 0):
    position = pileupcolumn.reference_pos
    refpos = ref[position]
    # Count how many reads support each base observed at this column.
    bampos = defaultdict(int)
    for pileupread in pileupcolumn.pileups:
        # Deletions and reference skips have no read base at this column.
        if not pileupread.is_del and not pileupread.is_refskip:
            bampos[pileupread.alignment.query_sequence[pileupread.query_position]] += 1
    if len(bampos) > 0:
        # Most supported base first, so that when more than two bases are
        # observed the two best supported ones are the ones considered.
        support = sorted(bampos, key=bampos.get, reverse=True)
        if len(support) == 1:
            # Only one base observed: report it if it differs from REF.
            if support[0] != refpos:
                print(position, refpos, support[0], sep="\t")
        else:
            if support[0] == refpos:
                print(position, refpos, support[1], sep="\t")
            elif support[1] == refpos:
                print(position, refpos, support[0], sep="\t")
            else:
                # Neither of the two best supported bases is the REF base.
                print(position, refpos, f"{support[0]},{support[1]}", sep="\t")

## Aggiungere genotipo

- `0/0`: le reads supportano solo il REFERENCE (nel nostro caso non ci interessa)
- `0/1`: le reads supportano sia il REFERENCE che l'ALTERNATIVE
- `1/1`: le reads supportano solo l'ALTERNATIVE
- `1/2`: sono presenti 2 ALTERNATIVE e le reads supportano solo i due ALTERNATIVE

In [None]:
# Scan every pileup column and print the variant positions together with a
# genotype, as tab-separated: position, REF base, ALT base(s), genotype.
#   1/1 -> only one base observed and it differs from REF
#   0/1 -> REF is one of the two best supported bases
#   1/2 -> the two best supported bases are both different from REF
# The printed position is the 0-based coordinate reported by pysam.
# NOTE(review): `ref` is indexed directly by the column position, which
# assumes the BAM contains a single contig matching ref.fa -- confirm.
for pileupcolumn in bam_file.pileup(min_base_quality = 0):
    position = pileupcolumn.reference_pos
    refpos = ref[position]
    # Count how many reads support each base observed at this column.
    bampos = defaultdict(int)
    for pileupread in pileupcolumn.pileups:
        # Deletions and reference skips have no read base at this column.
        if not pileupread.is_del and not pileupread.is_refskip:
            bampos[pileupread.alignment.query_sequence[pileupread.query_position]] += 1
    if len(bampos) > 0:
        # Most supported base first, so that when more than two bases are
        # observed the genotype is built from the two best supported ones.
        support = sorted(bampos, key=bampos.get, reverse=True)
        if len(support) == 1:
            if support[0] != refpos:
                print(position, refpos, support[0], "1/1", sep="\t")
        else:
            if support[0] == refpos:
                print(position, refpos, support[1], "0/1", sep="\t")
            elif support[1] == refpos:
                print(position, refpos, support[0], "0/1", sep="\t")
            else:
                print(position, refpos, f"{support[0]},{support[1]}", "1/2", sep="\t")

## Aggiungere campi `AD:DP` sul VCF

- `AD`: Allelic Depth, per ogni allele rappresenta il totale di reads che lo supportano, separati da `,`
- `DP`: Read Depth, rappresenta il numero totale di reads che mappano in quella posizione

In [None]:
# Scan every pileup column and print the variant positions with genotype and
# depth information, as tab-separated:
#   position, REF base, ALT base(s), genotype, "AD:DP", <AD values>:<DP value>
# AD lists the supporting read count per allele, REF first and then the ALT
# allele(s) in the order they are printed; DP is the total number of read
# bases counted at the column (deletions and reference skips excluded).
# The printed position is the 0-based coordinate reported by pysam.
# NOTE(review): `ref` is indexed directly by the column position, which
# assumes the BAM contains a single contig matching ref.fa -- confirm.
for pileupcolumn in bam_file.pileup(min_base_quality = 0):
    position = pileupcolumn.reference_pos
    refpos = ref[position]
    # Count how many reads support each base observed at this column.
    bampos = defaultdict(int)
    for pileupread in pileupcolumn.pileups:
        # Deletions and reference skips have no read base at this column.
        if not pileupread.is_del and not pileupread.is_refskip:
            bampos[pileupread.alignment.query_sequence[pileupread.query_position]] += 1
    if len(bampos) > 0:
        # Most supported base first, so that when more than two bases are
        # observed the genotype is built from the two best supported ones.
        support = sorted(bampos, key=bampos.get, reverse=True)
        depth = sum(bampos.values())
        if len(support) == 1:
            if support[0] != refpos:
                # Only the ALT base was observed, so the REF depth is 0.
                print(position, refpos, support[0], "1/1", "AD:DP", f"{0},{bampos[support[0]]}:{bampos[support[0]]}", sep="\t")
        else:
            if support[0] == refpos:
                print(position, refpos, support[1], "0/1", "AD:DP", f"{bampos[support[0]]},{bampos[support[1]]}:{depth}", sep="\t")
            elif support[1] == refpos:
                print(position, refpos, support[0], "0/1", "AD:DP", f"{bampos[support[1]]},{bampos[support[0]]}:{depth}", sep="\t")
            else:
                # Keep AD REF-first like the other branches; .get() avoids
                # inserting a REF key into the defaultdict when it is absent.
                ref_depth = bampos.get(refpos, 0)
                print(position, refpos, f"{support[0]},{support[1]}", "1/2", "AD:DP", f"{ref_depth},{bampos[support[0]]},{bampos[support[1]]}:{depth}", sep="\t")