In [2]:
import pandas as pd
import glob
import os

def read_vcf_header(file_path):
    """Return the column names from the ``#CHROM`` header line of a VCF file.

    The file is scanned from the top and the first line beginning with
    ``#CHROM`` is stripped of surrounding whitespace and split on tabs.

    Args:
        file_path: Path to the VCF file to inspect.

    Returns:
        The list of column names, or an empty list when the file has no
        line starting with ``#CHROM``.
    """
    with open(file_path, 'r') as handle:
        # Stop at the first match; None marks "no header line present".
        header_line = next(
            (row for row in handle if row.startswith('#CHROM')),
            None,
        )

    if header_line is None:
        return []
    return header_line.strip().split('\t')

def extract_snp_data(file_path, header):
    """Extract the per-sample genotype columns for every record in a VCF file.

    Lines starting with ``#`` (meta-information and the column header) are
    ignored, as are blank or whitespace-only lines. Each remaining line is
    split on tabs; the first two fields are taken as chromosome and position.

    Args:
        file_path: Path to the VCF file to read.
        header: Not used by this function; kept so existing callers that
            pass the sample IDs keep working.

    Returns:
        A list of ``(snp_id, genotype_data)`` tuples in file order, where
        ``snp_id`` is ``"chr<CHROM>_<POS>"`` (the ``chr`` prefix is always
        added, whatever the CHROM value is) and ``genotype_data`` is the list
        of fields from the 10th column onwards.

    Raises:
        IndexError: If a non-blank data line has fewer than two
            tab-separated fields.
    """
    snp_data = []
    with open(file_path, 'r') as file:
        for line in file:
            if line.startswith('#'):
                continue
            record = line.strip()
            # A blank line (commonly a trailing one at end of file) is not a
            # record; without this guard fields[1] would raise IndexError.
            if not record:
                continue
            fields = record.split('\t')
            chrom, pos = fields[0], fields[1]
            snp_id = f"chr{chrom}_{pos}"
            genotype_data = fields[9:]  # Genotype data starts from the 10th column
            snp_data.append((snp_id, genotype_data))
    return snp_data

def process_vcf_files(directory):
    """Combine every ``*.vcf`` file in a directory into one genotype table.

    Sample (patient) IDs are read from the header of the first file only,
    as the columns after the first nine. Records from all files are then
    pooled, keyed by SNP ID; when the same SNP ID occurs more than once,
    the last occurrence wins.

    NOTE(review): every file is assumed to carry the same sample columns in
    the same order as the first file -- this is not checked here.

    Args:
        directory: Directory searched (non-recursively) for ``*.vcf`` files.

    Returns:
        A DataFrame with one row per SNP ID and one column per patient ID.
        It is empty when the directory contains no ``*.vcf`` files.
    """
    # glob.glob returns matches in arbitrary order; sorting makes the choice
    # of "first file", the output row order and the winner among duplicate
    # SNP IDs reproducible across runs and machines.
    vcf_files = sorted(glob.glob(os.path.join(directory, '*.vcf')))
    all_snp_data = []
    patient_ids = None

    for file_path in vcf_files:
        if patient_ids is None:
            # Assuming the first 9 columns are the standard VCF columns.
            patient_ids = read_vcf_header(file_path)[9:]

        all_snp_data.extend(extract_snp_data(file_path, patient_ids))

    # Each SNP becomes a column indexed by patient; transpose so that SNPs
    # are rows and patients are columns.
    data_dict = dict(all_snp_data)
    df = pd.DataFrame(data_dict, index=patient_ids).transpose()
    return df

# Directory searched for *.vcf files; change this path to point at your own data.
directory_path = '../data/'
df = process_vcf_files(directory_path)

# Display the first few rows of the combined DataFrame.
print(df.head())

# Save the DataFrame as 'concatenated_snps.csv' (relative path) for further analysis.
df.to_csv('concatenated_snps.csv')


CSV file has been created.


In [None]:
import pandas as pd