In [1]:
import pandas as pd
import glob
import os

def read_vcf_header(file_path):
    """Return the column names from a VCF file's '#CHROM' header line.

    The file is scanned from the top; the first line that starts with
    '#CHROM' is stripped of surrounding whitespace and split on tabs.

    Args:
        file_path: Path of the VCF file to read.

    Returns:
        list[str]: The tab-separated header fields (the first one keeps its
        leading '#'), or an empty list when no '#CHROM' line is present.
    """
    with open(file_path, 'r') as handle:
        # next() stops reading at the first match, just like an early return.
        header_line = next(
            (row for row in handle if row.startswith('#CHROM')),
            None,
        )

    if header_line is None:
        return []
    return header_line.strip().split('\t')

def extract_snp_data(file_path, header):
    """Extract one (SNP id, genotype list) pair per data line of a VCF file.

    Lines starting with '#' (meta and header lines) and blank lines are
    skipped. For every remaining line the SNP id is built as
    ``chr{CHROM}_{POS}_{REF}_{ALT}`` from columns 1, 2, 4 and 5, and the
    genotypes are every field from the 10th column onward, kept as raw strings.

    Args:
        file_path: Path of the VCF file to read.
        header: Not used by this function; kept so existing callers that pass
            the sample IDs keep working.

    Returns:
        list[tuple[str, list[str]]]: ``(snp_id, genotype_fields)`` in file order.

    Raises:
        IndexError: If a non-blank data line has fewer than five
            tab-separated fields.
    """
    snp_data = []
    with open(file_path, 'r') as file:
        for line in file:
            # A blank line (e.g. a trailing newline at end of file) carries no
            # record; without this guard it would raise IndexError below.
            if line.startswith('#') or not line.strip():
                continue

            fields = line.strip().split('\t')
            chrom, pos, ref, alt = fields[0], fields[1], fields[3], fields[4]

            # Some VCFs already write CHROM as 'chr1'. Drop that prefix so the
            # id is always 'chr1_...' and never 'chrchr1_...'.
            if chrom.startswith('chr'):
                chrom = chrom[3:]

            snp_id = f"chr{chrom}_{pos}_{ref}_{alt}"
            genotype_data = fields[9:]  # Genotype data starts from the 10th column
            snp_data.append((snp_id, genotype_data))
    return snp_data

def process_vcf_files(directory):
    """Build a samples x SNPs genotype DataFrame from every '*.vcf' in a directory.

    Each file is read with read_vcf_header (for the sample IDs, i.e. the header
    columns after the first nine) and extract_snp_data (for the per-SNP
    genotype lists). The result has one row per sample ID and one column per
    SNP id; cell values are the raw genotype strings from the files.

    Args:
        directory: Directory that is searched (non-recursively) for '*.vcf'.

    Returns:
        pandas.DataFrame: Indexed by sample ID, one column per SNP id. Empty
        when the directory contains no '*.vcf' files.

    Raises:
        ValueError: If a file's sample IDs differ from those of the first
            file, since its genotypes would otherwise be attached to the
            wrong rows.
    """
    # glob returns matches in arbitrary order; sorting makes the column order,
    # and which file wins for a duplicated SNP id, reproducible across runs.
    vcf_files = sorted(glob.glob(os.path.join(directory, '*.vcf')))
    all_snp_data = []
    patient_ids = None

    for file_path in vcf_files:
        # Assuming the first 9 columns are standard VCF columns
        file_patient_ids = read_vcf_header(file_path)[9:]
        if patient_ids is None:
            patient_ids = file_patient_ids
        elif file_patient_ids != patient_ids:
            # Genotype lists are positional, so a different sample order or
            # sample set would silently mislabel rows.
            raise ValueError(
                f"Sample IDs in {file_path} do not match those of {vcf_files[0]}"
            )
        all_snp_data.extend(extract_snp_data(file_path, patient_ids))

    # Convert the collected SNP data to a DataFrame. If the same SNP id occurs
    # more than once, the last occurrence wins.
    data_dict = dict(all_snp_data)
    df = pd.DataFrame(data_dict, index=patient_ids)

    return df

In [2]:
import pandas as pd
import numpy as np
import sys

ExampleData = "../genome_Chad_Wrye_v5_Full_20220921063742.txt"

# Read the tab-separated raw-data export into a DataFrame ('#' lines are
# skipped as comments). The chromosome column is forced to str so that every
# value has the same type; with inferred dtypes a column that mixes numeric
# and non-numeric labels can come back as a mix of int and str values, and
# the str ones would never compare equal to an integer chromosome number.
ttAndMe = pd.read_csv(
    ExampleData,
    sep='\t',
    comment='#',
    header=None,
    names=['rsid', 'chromosome', 'position', 'genotype'],
    dtype={'chromosome': str},
)

# Read the processed PRS313 genotype table (CSV; first column is the index)
PRS313_df = pd.read_csv('./concatenated_snps_processed.csv', index_col=0)

# Column names of PRS313_df; the parsing below relies on the form
# 'chr<chromosome>_<position>_<ref>_<alt>'
PRS313_columns = PRS313_df.columns

# Position (as a string) of each PRS313 column
PRS313_positions = [col.split('_')[1] for col in PRS313_columns]

# Chromosome number of each PRS313 column, e.g. 'chr2_...' -> 2.
# int() raises ValueError for a non-numeric label such as 'chrX'.
PRS313_chromosomes = [int(col.split('chr')[1].split('_')[0]) for col in PRS313_columns]

# Add a row that records, per SNP column, whether its locus is in ttAndMe
PRS313_df.loc['in_23andMe'] = False

# Match on the exact (chromosome, position) pair. Checking the two values
# independently, or by substring containment in the column name, produces
# false positives: '1' is contained in 'chr15_...', and a short position is
# contained in a longer one. A set makes each lookup O(1) instead of
# scanning every ttAndMe row in Python.
ttAndMe_loci = set(
    zip(ttAndMe['chromosome'].astype(str), ttAndMe['position'].astype(str))
)

for col, chrom, pos in zip(PRS313_columns, PRS313_chromosomes, PRS313_positions):
    if (str(chrom), pos) in ttAndMe_loci:
        PRS313_df.loc['in_23andMe', col] = True

# Print the updated PRS313_df DataFrame
print("Updated PRS313_df:")
print(PRS313_df)

  ttAndMe = pd.read_csv(ExampleData, sep='\t', comment='#', header=None, names=['rsid', 'chromosome', 'position', 'genotype'])


Updated PRS313_df:
           chr2_192381934_C_T chr8_129199566_G_A chr8_29509616_A_C  \
HG00096                   1|1                0|1               1|1   
HG00097                   1|1                1|0               1|1   
HG00099                   1|1                0|0               0|0   
HG00100                   0|1                0|0               0|1   
HG00101                   1|1                0|0               1|1   
...                       ...                ...               ...   
NA21141                   1|1                0|0               0|0   
NA21142                   1|0                1|0               0|1   
NA21143                   1|1                0|0               1|1   
NA21144                   1|1                0|0               0|1   
in_23andMe               True              False              True   

           chr2_121089731_T_C chr15_75750383_T_C chr18_24337424_C_G  \
HG00096                   1|0                1|0                1|1  

In [81]:

for i in range(1, 23):
    chromosome_number = i

    # Directory containing this chromosome's VCF files
    directory_path = f'../23AndMePositions/chr{chromosome_number}/'
    df_23AndMe = process_vcf_files(directory_path)

    # Keep only the PRS313 columns for the current chromosome. The trailing
    # '_' in the prefix stops 'chr1_' from also matching 'chr10_...'.
    chromosome_prefix = f'chr{chromosome_number}_'
    chromosome_N_snps = PRS313_df.loc[
        :, [col for col in PRS313_df.columns if col.startswith(chromosome_prefix)]
    ]

    # Per-column flag row, and the genotype rows without it. Copying values
    # from genotypes_only guarantees the flag row can never end up as a fake
    # sample row in the output (which could happen when df_23AndMe is empty).
    in_23andMe_flags = chromosome_N_snps.loc['in_23andMe']
    genotypes_only = chromosome_N_snps.drop(index='in_23andMe')

    # VCF columns that are also PRS313 SNPs get the '_PRS313_Known' suffix.
    # All renames are collected first and applied in a single call.
    prs313_names = set(chromosome_N_snps.columns)
    cols_renamed = [column for column in df_23AndMe.columns if column in prs313_names]
    df_23AndMe = df_23AndMe.rename(
        columns={column: f'{column}_PRS313_Known' for column in cols_renamed}
    )
    for column in cols_renamed:
        print(f"Column {column} renamed to {column}_PRS313_Known")
    print(len(cols_renamed))

    # PRS313 SNPs whose 'in_23andMe' flag is False are appended with the
    # '_PRS313_Unknown' suffix. Assignment aligns on the sample index.
    added_columns = []
    for column in chromosome_N_snps.columns:
        if not in_23andMe_flags[column]:
            df_23AndMe[column + '_PRS313_Unknown'] = genotypes_only.loc[:, column]
            added_columns.append(column)
            print(f"Column {column} added to df_23AndMe with suffix '_PRS313_Unknown'")

    print(len(added_columns))

    # A PRS313 SNP flagged in_23andMe but missing from the VCF columns is
    # neither renamed nor added, so it is absent from the saved file. The
    # selection rule is left unchanged, but the loss is reported.
    # NOTE(review): decide whether these should be Known or Unknown.
    renamed_names = set(cols_renamed)
    missing_from_output = [
        column for column in chromosome_N_snps.columns
        if in_23andMe_flags[column] and column not in renamed_names
    ]
    if missing_from_output:
        print(
            f"WARNING: {len(missing_from_output)} PRS313 SNP(s) flagged in_23andMe "
            f"have no matching VCF column and are absent from the output: {missing_from_output}"
        )

    df_23AndMe.to_parquet(f'../Final_training_data/23AndMe_PRS313_merged_chr{chromosome_number}.parquet')


Unnamed: 0,chr15_75750383_T_C,chr15_100905819_A_C,chr15_50694306_A_G,chr15_91512267_G_T,chr15_46680811_C_A,chr15_67457698_A_G,chr15_66630569_G_A
HG00096,1|0,0|0,0|0,0|0,0|0,0|1,1|0
HG00097,0|0,0|1,0|0,1|0,0|0,1|0,1|0
HG00099,0|0,0|0,0|0,0|0,0|0,0|0,1|0
HG00100,0|0,1|0,0|1,1|0,0|0,0|0,1|1
HG00101,1|1,0|0,0|1,0|0,0|0,0|0,1|1
...,...,...,...,...,...,...,...
NA21141,0|1,0|0,1|0,0|0,0|0,0|0,1|0
NA21142,1|0,0|0,0|0,0|0,0|0,0|0,1|1
NA21143,1|1,0|0,0|1,0|0,0|0,0|0,1|1
NA21144,0|1,0|0,1|0,0|0,0|0,0|0,1|1


0


Column chr15_75750383_T_C added to df_23AndMe_New with suffix '_PRS313_Unknown'
Column chr15_100905819_A_C added to df_23AndMe_New with suffix '_PRS313_Unknown'
Column chr15_91512267_G_T added to df_23AndMe_New with suffix '_PRS313_Unknown'
Column chr15_46680811_C_A added to df_23AndMe_New with suffix '_PRS313_Unknown'
Column chr15_66630569_G_A added to df_23AndMe_New with suffix '_PRS313_Unknown'
5
