# Process Chromosome

In [4]:
import pandas as pd
import glob
import os

def read_vcf_header(file_path):
    """Return the column names found on the '#CHROM' line of a VCF file.

    The file is scanned from the top and the first line that begins with
    '#CHROM' is stripped of surrounding whitespace and split on tabs.

    Args:
        file_path: Path of the VCF file to read.

    Returns:
        A list with the tab-separated fields of the '#CHROM' line, or an
        empty list when the file contains no such line.
    """
    with open(file_path, 'r') as handle:
        # next() stops reading as soon as the header line is found.
        column_line = next(
            (row for row in handle if row.startswith('#CHROM')),
            None,
        )
    if column_line is None:
        return []
    return column_line.strip().split('\t')

def extract_snp_data(file_path, header):
    """Extract one (snp_id, genotypes) pair per data line of a VCF file.

    Lines starting with '#' (meta-information and the column header) are
    skipped, as are blank or whitespace-only lines, which would otherwise
    raise an IndexError when the fields are unpacked.

    Args:
        file_path: Path of the VCF file to read.
        header: Unused by this function; kept so existing callers that pass
            it continue to work.

    Returns:
        A list of ``(snp_id, genotype_data)`` tuples in file order, where
        ``snp_id`` is ``"chr{CHROM}_{POS}_{REF}_{ALT}"`` and
        ``genotype_data`` is the list of fields from the 10th column onward.
    """
    snp_data = []
    with open(file_path, 'r') as file:
        for line in file:
            if line.startswith('#'):
                continue
            record = line.strip()
            if not record:
                # Blank line (e.g. a trailing newline at end of file): nothing to parse.
                continue
            fields = record.split('\t')
            chrom, pos, ref, alt = fields[0], fields[1], fields[3], fields[4]
            snp_id = f"chr{chrom}_{pos}_{ref}_{alt}"
            genotype_data = fields[9:]  # Genotype data starts from the 10th column
            snp_data.append((snp_id, genotype_data))
    return snp_data

def process_vcf_files(directory):
    """Build a samples-by-SNPs DataFrame from the '*.vcf' files in a directory.

    Each file name is expected to contain ``_pos<position>`` immediately
    before the ``.vcf`` extension. Only records whose POS equals that
    position are kept; other records in the same file are ignored (the
    original code describes them as copy number variations).

    Sample IDs are taken from the '#CHROM' line of the first file processed
    (columns after the first nine).
    NOTE(review): every file is assumed to list the same samples in the same
    order; this is not checked here.

    Args:
        directory: Directory that is searched (non-recursively) for '*.vcf'.

    Returns:
        A DataFrame indexed by sample ID with one column per kept SNP ID.
        Its shape is printed before it is returned.

    Raises:
        IndexError: If a file containing records has no '_pos' in its name.
    """
    # glob returns paths in arbitrary order; sort so column order is reproducible.
    vcf_files = sorted(glob.glob(os.path.join(directory, '*.vcf')))
    all_snp_data = []
    patient_ids = None

    for file_path in vcf_files:
        if patient_ids is None:
            patient_ids = read_vcf_header(file_path)[9:]  # Assuming the first 9 columns are standard VCF columns
        snp_data = extract_snp_data(file_path, patient_ids)

        if not snp_data:
            continue

        # Parse the position from the file name only, so a '_pos' inside a
        # directory name cannot interfere. It is constant for the file, so it
        # is computed once instead of once per record.
        file_stem = os.path.splitext(os.path.basename(file_path))[0]
        file_path_position = file_stem.split("_pos")[1]

        for SNP in snp_data:
            position_in_vcf = SNP[0].split("_")[1]
            if position_in_vcf == file_path_position:
                # Keep only the matching record. Extending with the whole
                # file here would also pull in the records that are meant to
                # be ignored.
                all_snp_data.append(SNP)

    # Convert the collected SNP data to a DataFrame (one column per SNP ID)
    data_dict = {snp_id: genotypes for snp_id, genotypes in all_snp_data}
    df = pd.DataFrame(data_dict, index=patient_ids)
    print(df.shape)
    return df

# Find the intersection between 23andMe and PRS313

In [5]:
import pandas as pd
import numpy as np

# Define file paths
ttAndMeFullPanel = "../../Data/23andMe_metadata_files/23andMeGenePanel.csv"
prs313_file = './concatenated_snps_processed.csv'

# Read the CSV files into DataFrames
ttAndMe = pd.read_csv(ttAndMeFullPanel)
PRS313_df = pd.read_csv(prs313_file, index_col=0)

# Extract chromosome and position from each PRS313_df column label.
# Labels that do not contain 'chr<digits>_<digits>' yield NaN in both fields.
PRS313_info = PRS313_df.columns.str.extract(r'chr(\d+)_(\d+)', expand=True)
PRS313_info.columns = ['chromosome', 'position']

# Create a new row in PRS313_df to store the boolean values
PRS313_df.loc['in_23andMe'] = False

# Compare as strings, because the regex extraction above produces strings
ttAndMe['chromosome'] = ttAndMe['chromosome'].astype(str)
ttAndMe['position'] = ttAndMe['position'].astype(str)

# Panel rows whose (chromosome, position) also occurs among the PRS313 columns
merged_df = pd.merge(ttAndMe, PRS313_info, how='inner', left_on=['chromosome', 'position'], right_on=['chromosome', 'position'])

# Select the *actual* PRS313_df column labels whose (chromosome, position) is
# on the 23andMe panel. Rebuilding labels as 'chr<chrom>_<pos>' would drop any
# suffix the real labels carry (e.g. '_<ref>_<alt>') and so would not address
# the existing columns.
panel_sites = set(zip(ttAndMe['chromosome'], ttAndMe['position']))
in_panel = [
    site in panel_sites
    for site in zip(PRS313_info['chromosome'], PRS313_info['position'])
]
matching_columns = PRS313_df.columns[pd.Series(in_panel, dtype=bool).to_numpy()]

# Set the 'in_23andMe' row to True for matching columns
PRS313_df.loc['in_23andMe', matching_columns] = True

# Print the updated PRS313_df DataFrame
print("Updated PRS313_df:")
print(PRS313_df)


Updated PRS313_df:
                    Chromosome  Positionb Reference Allele Effect Allele  \
SNPa                                                                       
1_100880328_A_T              1  100880328                A             T   
1_10566215_A_G               1   10566215                A             G   
1_110198129_CAAA_C           1  110198129             CAAA             C   
1_114445880_G_A              1  114445880                G             A   
1_118141492_A_C              1  118141492                A             C   
...                        ...        ...              ...           ...   
22_40904707_CT_C            22   40904707               CT             C   
22_43433100_C_T             22   43433100                C             T   
22_45319953_G_A             22   45319953                G             A   
22_46283297_G_A             22   46283297                G             A   
in_23andMe                   0          0            False         Fa

In [16]:
# Add up the values in the 'in_23andMe' row; each True contributes 1, so the
# total is the number of columns flagged as present on the 23andMe panel.
sum(PRS313_df.loc['in_23andMe'])

77.0

In [6]:

import os
import pandas as pd
ttandMePositions = "../../../Data/23andMe_metadata_files/23andMeGenePanel.csv"

output_dir   = "../../../Data/Raw_training_data_23andMe_union/"

os.makedirs(output_dir, exist_ok=True)
# Read the 23andMe panel CSV into a DataFrame (lines starting with '#' are skipped).
# NOTE(review): this frame is not used further in this cell, and the path has one
# more '../' than the cell that builds PRS313_df - confirm which is correct.
ttAndMe = pd.read_csv(ttandMePositions, comment='#', header = 0)

# This cell relies on `process_vcf_files` and `PRS313_df` (including its
# 'in_23andMe' row) having been defined by earlier cells.
for chromosome_number in range(1, 23):

    # Directory holding this chromosome's per-position VCF files
    directory_path = f'../../../Data/23AndMePositionsUnion/chr{chromosome_number}/'
    df_23AndMe = process_vcf_files(directory_path)

    # Keep only the PRS313 columns for the current chromosome (labels starting with 'chr<N>_')
    chromosome_N_snps = PRS313_df.loc[:, [col for col in PRS313_df.columns if col.startswith(f'chr{chromosome_number}_')]]

    # 23andMe columns that also exist in PRS313 get the suffix '_PRS313_Known'.
    # Collect them first and rename once: renaming inside the loop rebuilds the
    # whole column index for every match, which is very slow on wide frames.
    cols_renamed = [column for column in df_23AndMe.columns if column in chromosome_N_snps.columns]
    for column in cols_renamed:
        print(f"Column {column} renamed to {column}_PRS313_Known")
    df_23AndMe = df_23AndMe.rename(columns={column: f'{column}_PRS313_Known' for column in cols_renamed})

    print(len(cols_renamed))

    # PRS313 columns not flagged in the 'in_23andMe' row are appended with the
    # suffix '_PRS313_Unknown'.
    added_columns = [
        column for column in chromosome_N_snps.columns
        if not chromosome_N_snps.loc['in_23andMe', column]
    ]
    for column in added_columns:
        print(f"Column {column} added to df_23AndMe with suffix '_PRS313_Unknown'")

    # Align the PRS313 values to the 23andMe sample index (rows absent from it,
    # such as 'in_23andMe', are dropped) and attach all new columns in a single
    # concat instead of inserting them one at a time.
    unknown_block = chromosome_N_snps.loc[:, added_columns].reindex(df_23AndMe.index)
    unknown_block.columns = [column + '_PRS313_Unknown' for column in added_columns]
    df_23AndMe = pd.concat([df_23AndMe, unknown_block], axis=1)
    print(len(added_columns))

    df_23AndMe.to_parquet(f'{output_dir}/23AndMe_PRS313_merged_chr{chromosome_number}.parquet')

(2504, 53891)


KeyboardInterrupt: 

In [4]:
import os
import pandas as pd

# Directory containing the parquet files
directory = "../Final_training_data/"

# Rewrite every per-chromosome merged parquet file in place, keeping only the
# columns whose label starts with that file's chromosome tag.
for filename in os.listdir(directory):
    is_merged_file = filename.startswith("23AndMe_PRS313_merged_chr") and filename.endswith(".parquet")
    if not is_merged_file:
        continue

    file_path = os.path.join(directory, filename)

    # Last '_'-separated token without its extension, e.g. 'chr7' for '..._chr7.parquet'
    chromosome = filename.split("_")[-1].split(".")[0]

    # Load the file's current contents
    df = pd.read_parquet(file_path)

    # Keep only the columns labelled '<chromosome>_...'
    column_prefix = f"{chromosome}_"
    filtered_columns = [col for col in df.columns if col.startswith(column_prefix)]
    filtered_df = df[filtered_columns]

    # Overwrite the source file with the filtered frame.
    # NOTE(review): index=False means the DataFrame index is not written back - confirm this is intended.
    filtered_df.to_parquet(file_path, index=False)

KeyboardInterrupt: 