Preview the VCF files to understand what each file contains.

Import pandas for data manipulation and gzip to read gzipped files

In [1]:
import pandas as pd
import gzip

vcf.gz file paths

In [2]:
# Gzipped VCF inputs, given as paths relative to the working directory.
file_path1, file_path2 = (
    "NA12878.chr21.slice.vcf.gz",  # loaded in the "NA12878 VCF file" section
    "gnomad.chr21.slice.vcf.gz",   # loaded in the "gnomAD population frequencies" section
)

Both files are expected to follow the standard VCF format; see the VCF v4.2 specification:
https://samtools.github.io/hts-specs/VCFv4.2.pdf

Define functions

In [3]:
# Locate the '#CHROM' line of a gzipped VCF file and turn it into a list of column names.
def read_vcf_headers(file_path):
    """Return the column names found on the '#CHROM' line of a gzipped VCF file.

    Parameters
    ----------
    file_path : str or path-like
        Path to the gzip-compressed VCF file; it is opened in text mode.

    Returns
    -------
    list of str
        The tab-separated fields of the first line that starts with '#CHROM',
        with surrounding whitespace and the leading '#' removed.

    Raises
    ------
    ValueError
        If no line in the file starts with '#CHROM'.
    """
    with gzip.open(file_path, 'rt') as handle:
        # next() stops reading at the first match; None signals "not found".
        column_line = next((row for row in handle if row.startswith('#CHROM')), None)

    if column_line is None:
        raise ValueError(f"The file {file_path} does not contain a header line starting with '#CHROM'.")

    return column_line.strip().lstrip('#').split('\t')

# Read the VCF file and use the extracted headers from the specific file
def read_vcf(file_path, headers):
    """Load the data records of a gzipped VCF file into a DataFrame.

    Only the leading block of lines starting with '#' (the meta-information
    lines and the '#CHROM' row) is skipped. pandas' ``comment='#'`` option is
    deliberately not used: it discards everything after a '#' anywhere in a
    line, which would silently truncate records whose fields contain '#'.

    Parameters
    ----------
    file_path : str or path-like
        Path to the gzip-compressed VCF file; it is opened in text mode.
    headers : list of str
        Column names to assign to the parsed columns, e.g. the list returned
        by ``read_vcf_headers`` for the same file.

    Returns
    -------
    pandas.DataFrame
        The tab-separated records that follow the leading '#' lines, with
        columns named after ``headers``.
    """
    # First pass: count the leading '#' lines. Stops at the first record, so
    # only the header portion of the file is decompressed here.
    n_header_lines = 0
    with gzip.open(file_path, 'rt') as file:
        for line in file:
            if not line.startswith('#'):
                break
            n_header_lines += 1

    # Second pass: consume exactly those lines from the handle, then let
    # pandas parse everything that remains from the current position.
    with gzip.open(file_path, 'rt') as file:
        for _ in range(n_header_lines):
            file.readline()
        return pd.read_csv(file, sep='\t', header=None, names=headers)

NA12878 VCF file

In [4]:
# Column names taken from the '#CHROM' line of the first file
headers1 = read_vcf_headers(file_path=file_path1)

# Parse the records of the same file into a DataFrame labelled with those names
vcf_data1 = read_vcf(file_path=file_path1, headers=headers1)

# Print a plain-text preview of the first five rows (head() defaults to n=5)
print(vcf_data1.head(n=5))

   CHROM       POS ID REF ALT   QUAL         FILTER  \
0  chr21  10400117  .   C   T  35.50  DRAGENHardSNP   
1  chr21  10400155  .   A   G  32.78  DRAGENHardSNP   
2  chr21  10413157  .   G   A  29.82           PASS   
3  chr21  10413190  .   G   T  36.80           PASS   
4  chr21  10413202  .   G   A  29.21           PASS   

                                                INFO  \
0  AC=2;AF=1;AN=2;DP=3;FS=0;MQ=28;QD=11.83;SOR=2....   
1  AC=2;AF=1;AN=2;DP=2;FS=0;MQ=28;QD=16.39;SOR=2....   
2  AC=1;AF=0.5;AN=2;DP=3;FS=0;MQ=54.41;MQRankSum=...   
3  AC=1;AF=0.5;AN=2;DP=4;FS=0;MQ=48.88;MQRankSum=...   
4  AC=1;AF=0.5;AN=2;DP=6;FS=0;MQ=84.25;MQRankSum=...   

                                        FORMAT  \
0     GT:AD:AF:DP:F1R2:F2R1:GQ:PL:GP:PRI:SB:MB   
1     GT:AD:AF:DP:F1R2:F2R1:GQ:PL:GP:PRI:SB:MB   
2     GT:AD:AF:DP:F1R2:F2R1:GQ:PL:GP:PRI:SB:MB   
3  GT:AD:AF:DP:F1R2:F2R1:GQ:PL:GP:PRI:SB:MB:PS   
4  GT:AD:AF:DP:F1R2:F2R1:GQ:PL:GP:PRI:SB:MB:PS   

                               

gnomAD population frequencies

In [5]:
# Column names taken from the '#CHROM' line of the second file
headers2 = read_vcf_headers(file_path=file_path2)

# Parse the records of the same file into a DataFrame labelled with those names
vcf_data2 = read_vcf(file_path=file_path2, headers=headers2)

# Print a plain-text preview of the first five rows (head() defaults to n=5)
print(vcf_data2.head(n=5))

   CHROM       POS            ID   REF     ALT QUAL       FILTER  \
0  chr21  10399997             .  TTTG       T    .  AC0;AS_VQSR   
1  chr21  10400001             .     T  TTTTTG    .          AC0   
2  chr21  10400005  rs1987456251     T       C    .      AS_VQSR   
3  chr21  10400006             .     G  GTTTTT    .  AC0;AS_VQSR   
4  chr21  10400010             .     T       C    .          AC0   

                                                INFO  
0  AC=0;AN=152302;AF=0;AC_XX=0;AF_XX=0;AN_XX=7788...  
1  AC=0;AN=152300;AF=0;AC_XX=0;AF_XX=0;AN_XX=7788...  
2  AC=1;AN=152336;AF=6.56444e-06;grpmax=afr;AC_XX...  
3  AC=0;AN=152306;AF=0;AC_XX=0;AF_XX=0;AN_XX=7789...  
4  AC=0;AN=152302;AF=0;AC_XX=0;AF_XX=0;AN_XX=7788...  
