### Import

In [1]:
import fnmatch
import os

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from cyvcf2 import VCF

### Path to Files

In [2]:
# Directory scanned for the input VCF files.
vcf_path = './public/1000Genomes/'
# Directory that receives one CSV per converted VCF file.
csv_output_dir = './Data/csv/'
# Directory that receives the Parquet copies of the CSV files.
parquet_output_dir = './Data/parquet/'

### Read VCF File and Convert to CSV

In [4]:
def list_vcf_files(directory):
    """Return the paths of the chromosome 1-22 VCF files in *directory*.

    A file is selected when its name starts with ``1000G_chr``, ends with
    ``_pruned.vcf.gz`` and the text in between (with any further ``chr``
    removed) is an integer from 1 to 22. Sub-directories are not searched.

    Args:
        directory: Path of the directory to scan.

    Returns:
        A list of ``os.path.join(directory, file_name)`` paths, ordered by
        chromosome number so the result does not depend on the arbitrary
        order in which ``os.listdir`` reports entries.

    Raises:
        OSError: If *directory* cannot be listed.
    """
    prefix = "1000G_chr"
    suffix = "_pruned.vcf.gz"

    matches = []
    for file_name in os.listdir(directory):
        if not (file_name.startswith(prefix) and file_name.endswith(suffix)):
            continue
        chr_num = file_name[len(prefix):-len(suffix)].replace("chr", "")
        # isdecimal() only accepts characters that int() can parse;
        # isdigit() also accepts e.g. superscript digits, which would make
        # int() raise ValueError.
        if chr_num.isdecimal() and 1 <= int(chr_num) <= 22:
            matches.append((int(chr_num), file_name))

    # Sort numerically (chr2 before chr10), then by name for stability.
    matches.sort()
    return [os.path.join(directory, file_name) for _, file_name in matches]

In [5]:
# Collect the paths of the VCF files found under vcf_path.
list_of_vcf_files = list_vcf_files(vcf_path)

In [6]:
def read_vcf_write_csv(vcf_path, output_dir):
    """Convert one VCF file into a CSV of per-sample allele sums.

    The CSV has a ``Person_ID`` column holding the VCF sample IDs and one
    column per variant record that carries an ID. Each cell is the sum of
    the first two allele indices of that sample's genotype, or NaN when
    either of them is -1 (the value cyvcf2 uses for a missing allele).
    Records without an ID are skipped; if an ID occurs more than once, the
    last record with that ID wins.

    The output file is named after the second ``_``-separated field of the
    VCF file name (e.g. ``1000G_chr1_pruned.vcf.gz`` -> ``chr1.csv``).

    Args:
        vcf_path: Path of the VCF file to read.
        output_dir: Directory to write the CSV into; created if missing.
    """
    # Create the destination up front so a fresh checkout does not fail at
    # the very end, after the whole VCF has been parsed.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    vcf_reader = VCF(vcf_path)
    try:
        sample_ids = vcf_reader.samples
        snp_data = {'Person_ID': sample_ids}

        for record in vcf_reader:
            if not record.ID:
                continue
            genotypes = [
                np.nan if gt[0] == -1 or gt[1] == -1 else gt[0] + gt[1]
                for gt in record.genotypes
            ]
            # Defensive: only keep columns that line up with the samples.
            if len(genotypes) == len(sample_ids):
                snp_data[record.ID] = genotypes
    finally:
        # Release the underlying file handle even if parsing fails.
        vcf_reader.close()

    df = pd.DataFrame(snp_data)

    chromosome_number = os.path.basename(vcf_path).split('_')[1]
    output_file = os.path.join(output_dir, f"{chromosome_number}.csv")

    df.to_csv(output_file, index=False)

In [7]:
# Convert every discovered VCF file to a CSV in csv_output_dir.
for vcf_file in list_of_vcf_files:
    read_vcf_write_csv(vcf_file, csv_output_dir)

### Read CSV File and Convert to Parquet

In [None]:
def csv_to_parquet(source_dir, target_dir, file_pattern='chr*.csv'):
    """Convert every matching CSV file in *source_dir* to a Parquet file.

    Each file in *source_dir* whose name matches *file_pattern* is read
    with pandas and written to *target_dir* under the same base name with
    a ``.parquet`` extension. Sub-directories are not searched.

    Args:
        source_dir: Directory containing the CSV files.
        target_dir: Directory for the Parquet files; created if missing.
        file_pattern: ``fnmatch``-style pattern applied to each file name.
    """
    # exist_ok avoids the race between checking for and creating the dir.
    os.makedirs(target_dir, exist_ok=True)

    # Sorted so files are processed in a reproducible order.
    for file_name in sorted(os.listdir(source_dir)):
        if not fnmatch.fnmatch(file_name, file_pattern):
            continue

        csv_file_path = os.path.join(source_dir, file_name)
        # Swap only the final extension; str.replace would rewrite every
        # '.csv' occurring anywhere in the name.
        base_name, _ = os.path.splitext(file_name)
        parquet_file_path = os.path.join(target_dir, base_name + '.parquet')

        df = pd.read_csv(csv_file_path)
        table = pa.Table.from_pandas(df)
        pq.write_table(table, parquet_file_path)

In [None]:
# Convert the CSV files matching the default 'chr*.csv' pattern to Parquet.
csv_to_parquet(csv_output_dir, parquet_output_dir)