In [2]:
import subprocess

def query_variant_bcftools(vcf_path, variant_key):
    """
    Look up a single variant in a VCF with ``bcftools view`` and summarise it.

    Args:
        vcf_path: Path passed as the input file to ``bcftools view``.
        variant_key: String of the form "CHROM:POS:REF:ALT".

    Returns:
        Tuple ``(rsid, gene_symbols, filter_status, allele_freq)``:
            rsid: ID column of the matching record, or None if nothing matched.
            gene_symbols: set of non-empty gene symbols (4th ``|`` field)
                collected from the ANN entries of matching records.
            filter_status: FILTER column of the matching record, or None.
            allele_freq: list of strings taken from the INFO ``AF=`` value
                (one element per comma-separated value), or None if absent.
        If bcftools exits with a non-zero status, its stderr is printed and
        ``(None, None, None, None)`` is returned.

    Raises:
        ValueError: if ``variant_key`` does not split into exactly four
            ``:``-separated parts.
    """
    try:
        chrom, pos, ref, alt = variant_key.strip().split(":")
        region = f"{chrom}:{pos}-{pos}"
    except ValueError:
        raise ValueError("variant_key格式应为 CHROM:POS:REF:ALT")

    # Ask bcftools for the records at the single-base region.
    cmd = ["bcftools", "view", "-r", region, vcf_path]

    try:
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        print(f"[bcftools ERROR]\n{e.stderr}")
        return None, None, None, None

    rsid = None
    gene_symbols = set()
    filter_status = None
    allele_freq = None

    for line in result.stdout.splitlines():
        if line.startswith("#"):
            continue
        fields = line.strip().split("\t")
        if len(fields) < 8:
            # Blank or truncated row: no INFO column to read.
            continue

        # A region query returns records that merely overlap the position
        # (e.g. a deletion starting upstream), so POS must match as well as
        # REF/ALT before the record is accepted.
        if fields[1] != pos:
            continue
        if fields[3] != ref or alt not in fields[4].split(","):
            continue

        rsid = fields[2]
        filter_status = fields[6]

        af_seen = False
        for entry in fields[7].split(";"):
            if entry.startswith("AF="):
                # Only the first AF entry of a record is used; it may hold
                # several comma-separated values (one per ALT).
                if not af_seen:
                    allele_freq = entry[len("AF="):].split(",")
                    af_seen = True
            elif entry.startswith("ANN="):
                for ann in entry[len("ANN="):].split(","):
                    parts = ann.split("|")
                    if len(parts) > 3 and parts[3]:
                        gene_symbols.add(parts[3])

    return rsid, gene_symbols, filter_status, allele_freq

# Example call: look up one insertion and print each returned field.
# NOTE(review): absolute, site-specific path — consider a configurable data directory.
vcf_file = "/LARGE0/gr10478/b37974/Pulmonary_Hypertension/ToMMo_60KJPN/tommo-60kjpn-20240904-GRCh38-snvindel-af-autosome.norm.vcf.gz"
rsid, genes, filter_status, allele_freq = query_variant_bcftools(vcf_file, "chr3:75692673:C:CA")
print("rsID:", rsid)
print("Genes:", genes)
print("Filter Status:", filter_status)
print("Allele Frequency:", allele_freq)

rsID: rs35632265
Genes: {'ZNF717', 'LINC00960-ZNF717'}
Filter Status: ExcessHet
Allele Frequency: ['0.42827']


In [None]:
import subprocess
import gzip

def vcf_uses_chr_prefix(vcf_path):
    """
    Report whether the first data record of a VCF names its contig with "chr".

    The file is read as text (through gzip when the path ends in ".gz").
    Lines starting with "#" are skipped; the first remaining line decides the
    answer from its first tab-separated column. Returns False when the file
    has no such line.
    """
    open_text = open if not vcf_path.endswith(".gz") else gzip.open
    with open_text(vcf_path, "rt") as handle:
        first_record = next((row for row in handle if not row.startswith("#")), None)
    if first_record is None:
        return False
    contig = first_record.split("\t")[0]
    return contig.startswith("chr")

def parse_ann_to_dicts(info_field, alt_filter=None):
    """
    Parse the ANN entry of a VCF INFO string into a list of dicts.

    Args:
        info_field: Full INFO column text (``;``-separated ``KEY=VALUE`` items).
        alt_filter: When truthy, keep only annotations whose "Allele" field
            equals this value.

    Returns:
        One dict per comma-separated annotation in the first ``ANN=`` entry.
        Each dict has the 16 keys listed in ``ann_fields``; values are the
        ``|``-separated sub-fields in order. Annotations with fewer
        sub-fields are padded with "" and extra sub-fields are ignored.
        Returns [] when there is no ``ANN=`` entry or it is empty.
    """
    ann_key = "ANN="
    ann_string = None
    for entry in info_field.split(";"):
        if entry.startswith(ann_key):
            ann_string = entry[len(ann_key):]
            break
    if not ann_string:
        return []

    ann_fields = [
        "Allele", "Consequence", "Impact", "Gene_symbol", "Gene_ID",
        "Feature_type", "Transcript_ID", "Biotype", "Rank",
        "HGVS_c", "HGVS_p", "cDNA_pos", "CDS_pos", "AA_pos",
        "Distance", "Errors"
    ]

    annotations = []
    for ann_entry in ann_string.split(","):
        values = ann_entry.split("|")
        # Pad short annotations so every key is present; the original guard
        # compared the index with len(ann_fields) and so never padded,
        # raising IndexError on truncated entries.
        values += [""] * (len(ann_fields) - len(values))
        ann_dict = dict(zip(ann_fields, values))
        if alt_filter and ann_dict["Allele"] != alt_filter:
            continue
        annotations.append(ann_dict)

    return annotations

def get_ann_dicts(vcf_path, chrom, pos, ref, alt):
    """
    Return the ANN annotations of one variant as a list of dicts.

    The contig name is first adapted to the file's convention as reported by
    ``vcf_uses_chr_prefix`` (a "chr" prefix is added or removed), then
    ``bcftools view -r`` is run on the single-base region.

    Args:
        vcf_path: Path passed to ``vcf_uses_chr_prefix`` and ``bcftools view``.
        chrom: Contig name, with or without a "chr" prefix.
        pos: Position of the variant (int or str); compared with the POS
            column as text.
        ref: REF allele to match.
        alt: ALT allele to match; also used to filter the annotations.

    Returns:
        ``parse_ann_to_dicts(info, alt_filter=alt)`` for the first record
        whose POS, REF and ALT match; [] if no record matches or bcftools
        exits with a non-zero status (its stderr is printed).
    """
    use_chr = vcf_uses_chr_prefix(vcf_path)
    if chrom.startswith("chr") and not use_chr:
        chrom = chrom[3:]
    if not chrom.startswith("chr") and use_chr:
        chrom = "chr" + chrom

    region = f"{chrom}:{pos}-{pos}"
    cmd = ["bcftools", "view", "-r", region, vcf_path]

    try:
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        print(f"bcftools error: {e.stderr}")
        return []

    pos_text = str(pos)
    for line in result.stdout.splitlines():
        if line.startswith("#"):
            continue
        fields = line.strip().split("\t")
        if len(fields) < 8:
            # Blank or truncated row: no INFO column to read.
            continue

        # A region query also returns records that only overlap the position
        # (e.g. an upstream deletion), so require POS to match too.
        if fields[1] != pos_text:
            continue

        vcf_ref = fields[3]
        vcf_alts = fields[4].split(",")
        if vcf_ref == ref and alt in vcf_alts:
            return parse_ann_to_dicts(fields[7], alt_filter=alt)

    return []

def get_ann_by_variant_string(vcf_path, variant_str):
    """
    Convenience wrapper around ``get_ann_dicts`` taking a packed variant key.

    ``variant_str`` looks like 'chr4:126341683:T:C' (CHROM:POS:REF:ALT).
    Surrounding whitespace is stripped, POS is converted to int, and the list
    of ANN annotation dicts from ``get_ann_dicts`` is returned. A malformed
    key (wrong number of parts or non-integer POS) raises ValueError.
    """
    pieces = variant_str.strip().split(":")
    chrom, pos_text, ref, alt = pieces
    position = int(pos_text)
    return get_ann_dicts(vcf_path, chrom, position, ref, alt)

# Example usage: fetch the annotations for one variant and print a one-line summary of each.
# NOTE(review): absolute, site-specific path — consider a configurable data directory.
vcf_file = "/LARGE0/gr10478/b37974/Pulmonary_Hypertension/ToMMo_60KJPN/tommo-60kjpn-20240904-GRCh38-snvindel-af-autosome.norm.vcf.gz"
variant = "chr17:43092418:T:C"
records = get_ann_by_variant_string(vcf_file, variant)

# One line per annotation: gene symbol, transcript ID, consequence and biotype.
for rec in records:
    print(f"{rec['Gene_symbol']} ({rec['Transcript_ID']}): {rec['Consequence']} [{rec['Biotype']}]")

BRCA1 (NM_007300.4): missense_variant [protein_coding]
BRCA1 (NM_007294.4): missense_variant [protein_coding]
BRCA1 (NM_007297.4): missense_variant [protein_coding]
BRCA1 (NM_007298.3): intron_variant [protein_coding]
BRCA1 (NM_007299.4): intron_variant [protein_coding]
BRCA1 (NR_027676.2): non_coding_transcript_exon_variant [pseudogene]


In [37]:
# Display the full list of annotation dicts returned by get_ann_by_variant_string.
records

[{'Allele': 'C',
  'Consequence': 'missense_variant',
  'Impact': 'MODERATE',
  'Gene_symbol': 'BRCA1',
  'Gene_ID': 'BRCA1',
  'Feature_type': 'transcript',
  'Transcript_ID': 'NM_007300.4',
  'Biotype': 'protein_coding',
  'Rank': '10/24',
  'HGVS_c': 'c.3113A>G',
  'HGVS_p': 'p.Glu1038Gly',
  'cDNA_pos': '3226/7151',
  'CDS_pos': '3113/5655',
  'AA_pos': '1038/1884',
  'Distance': '',
  'Errors': ''},
 {'Allele': 'C',
  'Consequence': 'missense_variant',
  'Impact': 'MODERATE',
  'Gene_symbol': 'BRCA1',
  'Gene_ID': 'BRCA1',
  'Feature_type': 'transcript',
  'Transcript_ID': 'NM_007294.4',
  'Biotype': 'protein_coding',
  'Rank': '10/23',
  'HGVS_c': 'c.3113A>G',
  'HGVS_p': 'p.Glu1038Gly',
  'cDNA_pos': '3226/7088',
  'CDS_pos': '3113/5592',
  'AA_pos': '1038/1863',
  'Distance': '',
  'Errors': ''},
 {'Allele': 'C',
  'Consequence': 'missense_variant',
  'Impact': 'MODERATE',
  'Gene_symbol': 'BRCA1',
  'Gene_ID': 'BRCA1',
  'Feature_type': 'transcript',
  'Transcript_ID': 'NM_007