# Process Chromosome

In [85]:
import pandas as pd
import glob
import os
import re

def extract_snp_data(file_path):
    """Extract the VCF record whose POS equals the position in the file name.

    The target position is taken from the file name: the text following
    ``_pos`` in the last ``chr``-delimited segment of ``file_path`` (extension
    removed), e.g. ``chr1_pos4060196.vcf`` -> ``4060196``.

    Args:
        file_path: Path to a tab-separated VCF file with a ``#CHROM`` header
            line.

    Returns:
        A ``(snp_data, error_log)`` tuple. ``error_log`` is a list of
        ``[message, file_path]`` pairs (empty on success). ``snp_data`` is:

        * a ``pandas.Series`` (the matching row) when exactly one row matches;
        * a ``pandas.DataFrame`` with every matching row when several match
          (an error is logged as well);
        * ``None`` when the header, the ``POS`` column or the position cannot
          be determined, the file has no data, or no row matches.
    """
    error_log = []

    # Column names come from the '#CHROM' header line, which read_csv skips
    # below because it starts with the comment character.
    headers = None
    with open(file_path, 'r') as file:
        for line in file:
            if line.startswith('#CHROM'):
                headers = line.strip().split('\t')
                break

    if headers is None:
        # Without this guard `headers` would be unbound further down.
        error_log.append(['Error: No #CHROM header found.', file_path])
        return None, error_log

    try:
        vcf_df = pd.read_csv(file_path, sep='\t', comment='#', names=headers)
    except pd.errors.EmptyDataError:
        error_log.append([f'Error: No data found.', file_path])
        return None, error_log

    if 'POS' not in vcf_df.columns:
        error_log.append(['Error: No POS column found.', file_path])
        return None, error_log

    # Parse the target position out of the file name.
    stem, _ = os.path.splitext(file_path)
    name_parts = stem.split('chr')[-1].split('_pos')
    try:
        position = int(name_parts[1])
    except (IndexError, ValueError):
        error_log.append(['Error: Could not parse position from file name.', file_path])
        return None, error_log

    # A boolean mask avoids splicing file-name text into a query string and
    # does not depend on the frame having a default integer index.
    matches = vcf_df.loc[vcf_df['POS'] == position]

    if len(matches) > 1:
        error_log.append([f"Error: More than one match found", file_path])
        return matches, error_log
    if matches.empty:
        error_log.append([f"Error: No matches found", file_path])
        return None, error_log
    return matches.iloc[0], error_log

def process_vcf_files(directory, error_logs_dir, max_files=10):
    """Extract the target SNP row from the ``*.vcf`` files in a directory.

    Each file is passed to ``extract_snp_data``; the rows it returns are
    stacked into one DataFrame and every logged error is written to
    ``<error_logs_dir>/error_log.csv`` (overwriting any existing file).

    Args:
        directory: Directory that is searched (non-recursively) for ``*.vcf``.
        error_logs_dir: Directory that receives ``error_log.csv``; created if
            it does not exist.
        max_files: Maximum number of files to process, taken from the sorted
            file list. Defaults to 10, the limit that was previously
            hard-coded; pass ``None`` to process every file.

    Returns:
        A DataFrame with one row per extracted SNP record, or ``None`` when no
        file yielded a record.
    """
    # Sort so the processed subset does not depend on directory listing order.
    vcf_files = sorted(glob.glob(os.path.join(directory, '*.vcf')))
    if max_files is not None:
        vcf_files = vcf_files[:max_files]

    snp_rows = []
    all_error_logs = []

    for file_path in vcf_files:
        snp_data, error_log = extract_snp_data(file_path)
        all_error_logs.extend(error_log)
        if snp_data is None:
            continue
        if isinstance(snp_data, pd.DataFrame):
            # Several rows matched in one file: flatten to individual rows so
            # the final frame is built from Series only.
            snp_rows.extend(row for _, row in snp_data.iterrows())
        else:
            snp_rows.append(snp_data)

    # Save the error log as a CSV file
    if error_logs_dir:
        os.makedirs(error_logs_dir, exist_ok=True)
    error_log_df = pd.DataFrame(all_error_logs, columns=['Error', "File"])
    error_log_df['Error'] = error_log_df['Error'].astype(str)
    error_log_df['File'] = error_log_df['File'].astype(str)
    error_log_df.to_csv(os.path.join(error_logs_dir, 'error_log.csv'), index=False)

    print("Finished processing directory: ", directory)

    # Stack all SNP rows into a single DataFrame
    if not snp_rows:
        return None
    return pd.DataFrame(snp_rows)

# Find intersection between 23andMe and PRS313

In [68]:
import pandas as pd
import numpy as np
import sys

ttAndMeFullPanel = "../../Data/23andMe_metadata_files/23andMeGenePanel.csv"
# Read the 23andMe panel CSV into a DataFrame
ttAndMe = pd.read_csv(ttAndMeFullPanel)

# Read the PRS313 workbook into a DataFrame
PRS313_df = pd.read_excel('../../Data/PRS313_with_23andMe_chad.xlsx')

# Collect every (chromosome, position) pair of the 23andMe panel once, so each
# PRS313 row is checked with a set lookup instead of a scan of the whole panel.
# Rows with a missing key are dropped because NaN never compares equal.
panel_keys = ttAndMe[['chromosome', 'position']].dropna()
panel_sites = set(zip(panel_keys['chromosome'], panel_keys['position']))

# Flag each PRS313 row as present in / absent from the 23andMe panel.
# 'in_23andMe' is a boolean COLUMN: it always exists (even when nothing
# matches) and no extra all-False row is appended to the table.
PRS313_df['in_23andMe'] = [
    site in panel_sites
    for site in zip(PRS313_df['Chromosome'], PRS313_df['Position'])
]

# Report the number of matches and save the annotated table

print("Total number of SNPs in 23andMe data:", PRS313_df['in_23andMe'].sum())
PRS313_df.to_excel('../../Data/PRS313_with_23andMe_Full.xlsx')

KeyboardInterrupt: 

In [44]:
# Reload the PRS313 table from the Excel workbook into PRS313_df.
# NOTE(review): no index_col is given, so a default integer index is used; if
# the workbook was written by DataFrame.to_excel with its index, that index
# comes back as an ordinary first column — confirm this is intended.
PRS313_df = pd.read_excel('../../Data/PRS313_with_23andMe_Full.xlsx')

In [86]:

ttandMePositions = "../../Data/23andMe_metadata_files/23andMeGenePanel.csv"

output_dir   = "../../Data/Raw_training_data_23andMe_union/"

os.makedirs(output_dir, exist_ok=True)
# Read the 23andMe panel CSV; text after a '#' is ignored as a comment
ttAndMe = pd.read_csv(ttandMePositions, comment='#', header = 0)

# Build one merged parquet file per chromosome (1 to 22)
for chromosome_number in range(1, 23):
    # Directory holding the VCF files of this chromosome
    directory_path = f'../../Data/23AndMePositionsUnion/chr{chromosome_number}/'

    # NOTE(review): every chromosome passes the same error-log directory and
    # process_vcf_files writes a fixed 'error_log.csv' there, so each
    # iteration overwrites the log of the previous one.
    df_23AndMe = process_vcf_files(directory_path, "../../Data/23AndMePositionsUnion/")

    # process_vcf_files returns None when no record could be extracted
    if df_23AndMe is None:
        print(f"No SNP data extracted for chr{chromosome_number}; skipping")
        continue

    # Keep only the PRS313 columns of the current chromosome (names starting with 'chrN_')
    snp_prefix = f'chr{chromosome_number}_'
    chromosome_N_snps = PRS313_df.loc[:, [col for col in PRS313_df.columns if str(col).startswith(snp_prefix)]]

    # Columns present in both frames get the suffix '_PRS313_Known'
    known_columns = set(chromosome_N_snps.columns)
    cols_renamed = [column for column in df_23AndMe.columns if column in known_columns]
    df_23AndMe.rename(columns={column: f'{column}_PRS313_Known' for column in cols_renamed}, inplace=True)
    for column in cols_renamed:
        print(f"Column {column} renamed to {column}_PRS313_Known")
    print(len(cols_renamed))

    # Add the PRS313 columns whose 'in_23andMe' entry is falsy, with the suffix '_PRS313_Unknown'.
    # NOTE(review): this expects PRS313_df to have a ROW labelled 'in_23andMe'; confirm the table layout.
    # NOTE(review): the assignment aligns on index labels — confirm both frames share an index.
    added_columns = []
    for column in chromosome_N_snps.columns:
        if not chromosome_N_snps.loc['in_23andMe', column]:
            df_23AndMe[column + '_PRS313_Unknown'] = chromosome_N_snps.loc[:, column]
            added_columns.append(column)
            print(f"Column {column} added to df_23AndMe with suffix '_PRS313_Unknown'")
    print(len(added_columns))

    df_23AndMe.to_parquet(os.path.join(output_dir, f'23AndMe_PRS313_merged_chr{chromosome_number}.parquet'))


Successfully processed:  ../../Data/23AndMePositionsUnion/chr1/chr1_pos4060196.vcf
Successfully processed:  ../../Data/23AndMePositionsUnion/chr1/chr1_pos207509287.vcf
Successfully processed:  ../../Data/23AndMePositionsUnion/chr1/chr1_pos193469300.vcf
Successfully processed:  ../../Data/23AndMePositionsUnion/chr1/chr1_pos90477520.vcf
Successfully processed:  ../../Data/23AndMePositionsUnion/chr1/chr1_pos162832406.vcf
Successfully processed:  ../../Data/23AndMePositionsUnion/chr1/chr1_pos21999437.vcf
Successfully processed:  ../../Data/23AndMePositionsUnion/chr1/chr1_pos233057152.vcf
Successfully processed:  ../../Data/23AndMePositionsUnion/chr1/chr1_pos98186118.vcf
Successfully processed:  ../../Data/23AndMePositionsUnion/chr1/chr1_pos103765339.vcf
Successfully processed:  ../../Data/23AndMePositionsUnion/chr1/chr1_pos91504195.vcf
Finished processing directory:  ../../Data/23AndMePositionsUnion/chr1/
0
0
Successfully processed:  ../../Data/23AndMePositionsUnion/chr2/chr2_pos184653608.

KeyboardInterrupt: 

In [4]:
import os
import pandas as pd

# Directory containing the parquet files
directory = "../Final_training_data/"

# Reduce every merged per-chromosome parquet file to its 'chrN_' columns
for filename in sorted(os.listdir(directory)):
    if not (filename.startswith("23AndMe_PRS313_merged_chr") and filename.endswith(".parquet")):
        continue
    file_path = os.path.join(directory, filename)

    # Chromosome tag from the file name, e.g. '..._chr7.parquet' -> 'chr7'
    chromosome = filename.split("_")[-1].split(".")[0]

    # Read the parquet file into a DataFrame
    df = pd.read_parquet(file_path)

    # Keep only the columns starting with "chrN_"
    filtered_columns = [col for col in df.columns if col.startswith(f"{chromosome}_")]
    if not filtered_columns:
        # Overwriting here would replace the file with an empty table.
        print(f"No columns starting with '{chromosome}_' in {file_path}; file left unchanged")
        continue
    filtered_df = df[filtered_columns]

    # Write to a temporary file first and swap it in afterwards, so an
    # interrupted run cannot leave a partially written parquet file behind.
    tmp_path = file_path + ".tmp"
    try:
        filtered_df.to_parquet(tmp_path, index=False)
        os.replace(tmp_path, file_path)
    finally:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

KeyboardInterrupt: 