# Process Chromosome

In [137]:
import pandas as pd
import glob
import os

def read_vcf_header(file_path):
    """Return the column names from a VCF file's '#CHROM' header line.

    Args:
        file_path: Path of the VCF file to scan.

    Returns:
        The first line starting with '#CHROM', stripped of surrounding
        whitespace and split on tabs; an empty list when the file has no
        such line.
    """
    with open(file_path, 'r') as handle:
        # Lazily scan the file; only the first matching line is consumed.
        header_candidates = (row for row in handle if row.startswith('#CHROM'))
        header_line = next(header_candidates, None)

    if header_line is None:
        return []
    return header_line.strip().split('\t')

def extract_snp_data(file_path, header):
    """Extract per-variant genotype data from a VCF file.

    Every line that does not start with '#' is treated as a variant record.
    Blank lines and records too short to hold the CHROM, POS, REF and ALT
    columns are skipped instead of raising ``IndexError``.

    Args:
        file_path: Path of the VCF file to read.
        header: Unused; kept so existing callers keep working.

    Returns:
        A list of ``(snp_id, genotype_data)`` tuples, where ``snp_id`` is
        ``"chr<CHROM>_<POS>_<REF>_<ALT>"`` and ``genotype_data`` is the list
        of tab-separated fields from the 10th column onward.
    """
    snp_data = []
    with open(file_path, 'r') as file:
        for line in file:
            if line.startswith('#'):
                continue
            stripped = line.strip()
            if not stripped:
                # e.g. a trailing empty line at the end of the file
                continue
            fields = stripped.split('\t')
            if len(fields) < 5:
                # Not enough columns to read CHROM, POS, REF and ALT.
                continue
            chrom, pos, ref, alt = fields[0], fields[1], fields[3], fields[4]
            # Avoid producing "chrchr1_..." when CHROM is already prefixed.
            chrom_label = chrom if chrom.startswith('chr') else f"chr{chrom}"
            snp_id = f"{chrom_label}_{pos}_{ref}_{alt}"
            genotype_data = fields[9:]  # Genotype data starts from the 10th column
            snp_data.append((snp_id, genotype_data))
    return snp_data

def process_vcf_files(directory):
    """Build a genotype DataFrame from all '*.vcf' files in a directory.

    Each file name is expected to contain ``_pos<position>`` immediately
    before the extension. Only records whose POS equals that position are
    kept; other records in the same file are ignored.

    Args:
        directory: Directory that is searched (non-recursively) for
            '*.vcf' files.

    Returns:
        A DataFrame with one column per kept SNP id and one row per sample.
        The row labels are the sample columns (10th onward) of the first
        processed file's '#CHROM' header line.
        NOTE(review): later files are assumed to list samples in the same
        order as the first file; this is not checked here.
    """
    # Sorted so column order (and the file supplying the sample ids) is
    # deterministic; glob returns files in arbitrary order.
    vcf_files = sorted(glob.glob(os.path.join(directory, '*.vcf')))
    all_snp_data = []
    patient_ids = None

    for file_path in vcf_files:
        if patient_ids is None:
            patient_ids = read_vcf_header(file_path)[9:]  # Assuming the first 9 columns are standard VCF columns
        snp_data = extract_snp_data(file_path, patient_ids)

        if len(snp_data) == 0:
            print("No matches found at ", file_path)
            continue

        # The expected position depends only on the file name, so compute it
        # once per file and from the base name rather than the full path.
        file_stem = os.path.splitext(os.path.basename(file_path))[0]
        _, marker, file_path_position = file_stem.rpartition("_pos")
        if not marker:
            print("No '_pos' marker in file name, skipping ", file_path)
            continue

        for SNP in snp_data:
            position_in_vcf = SNP[0].split("_")[1]
            if position_in_vcf == file_path_position:
                # Keep only the matching record. Extending with the whole
                # file here would also add the records meant to be ignored.
                all_snp_data.append(SNP)
                print("Adding")
            else:
                # Ignoring copy number variations
                print("Ignoring")

    # Convert the collected SNP data to a DataFrame
    data_dict = dict(all_snp_data)
    df = pd.DataFrame(data_dict, index=patient_ids)
    print(df.shape)
    return df

# Find the intersection between the 23andMe panel and PRS313

In [133]:
import pandas as pd
import numpy as np
import sys

# 23andMe panel metadata; the 'chromosome' and 'position' columns are used below.
ttAndMeFullPanel = "../../Data/23andMe_metadata_files/23andMeGenePanel.csv"
ttAndMe = pd.read_csv(ttAndMeFullPanel)

# PRS313 genotype table (CSV). The first CSV column becomes the row index and
# the remaining column labels are parsed below as 'chr<N>_<position>_...'.
PRS313_df = pd.read_csv('./concatenated_snps.csv', index_col=0)

# Column labels of PRS313_df
PRS313_columns = PRS313_df.columns

# Position of each PRS313 column (second '_'-separated field), as strings
PRS313_positions = [col.split('_')[1] for col in PRS313_columns]

# Chromosome number of each PRS313 column, e.g. 'chr2_...' -> 2
PRS313_chromosomes = [int(col.split('chr')[1].split('_')[0]) for col in PRS313_columns]

# Exact (chromosome, position) pairs present in the 23andMe panel. A set makes
# each lookup O(1) instead of scanning every PRS313 column for every panel row,
# and comparing whole values avoids substring false positives (position '123'
# inside 'chr1_12345_...', chromosome '1' inside 'chr11_...').
ttAndMe_sites = set(
    zip(ttAndMe['chromosome'].astype(str), ttAndMe['position'].astype(str))
)

# One flag per PRS313 column. Every column at a panel site is flagged, so
# columns that share a chromosome and position all get True.
in_23andMe_flags = [
    (str(chrom), pos) in ttAndMe_sites
    for chrom, pos in zip(PRS313_chromosomes, PRS313_positions)
]

# Store the flags as an extra row labelled 'in_23andMe'
PRS313_df.loc['in_23andMe'] = in_23andMe_flags
print(f"{sum(in_23andMe_flags)} of {len(in_23andMe_flags)} PRS313 columns are in the 23andMe panel")

# Print the updated PRS313_df DataFrame
print("Updated PRS313_df:")
print(PRS313_df)

KeyboardInterrupt: 

In [128]:
# This cell uses the PRS313_df built by the intersection cell: SNP ids as
# column labels plus the 'in_23andMe' row. It must not be re-read here.
# read_csv without index_col followed by transpose() yields integer column
# labels (so col.startswith below fails) and drops the 'in_23andMe' row.
if 'in_23andMe' not in PRS313_df.index:
    raise KeyError(
        "PRS313_df has no 'in_23andMe' row; run the 23andMe/PRS313 intersection cell first"
    )

ttandMePositions = "../../Data/23andMe_metadata_files/23andMeGenePanel.csv"

output_dir   = "../../Data/Raw_training_data_23andMe_union/"

os.makedirs(output_dir, exist_ok=True)
# Read the 23andMe panel file into a DataFrame (not used further in this cell)
ttAndMe = pd.read_csv(ttandMePositions, comment='#', header = 0)

for chromosome_number in range(1, 23):
    # Directory holding this chromosome's VCF files
    directory_path = f'../../Data/23AndMePositionsUnion/chr{chromosome_number}/'
    df_23AndMe = process_vcf_files(directory_path)

    # Keep only the PRS313 columns for this chromosome. The trailing '_' keeps
    # 'chr1_' from also matching 'chr10_', 'chr11_', ...
    chromosome_prefix = f'chr{chromosome_number}_'
    chromosome_N_snps = PRS313_df.loc[:, [col for col in PRS313_df.columns if col.startswith(chromosome_prefix)]]

    # 23andMe columns that also exist in PRS313 get the '_PRS313_Known' suffix.
    # The mapping is built first and applied once, so the frame is not
    # mutated while its columns are being iterated.
    cols_renamed = [column for column in df_23AndMe.columns if column in chromosome_N_snps.columns]
    df_23AndMe = df_23AndMe.rename(
        columns={column: f'{column}_PRS313_Known' for column in cols_renamed}
    )
    for column in cols_renamed:
        print(f"Column {column} renamed to {column}_PRS313_Known")

    print(len(cols_renamed))

    # Add the PRS313 columns whose 'in_23andMe' flag is false, with a
    # '_PRS313_Unknown' suffix. Assignment aligns on the row index, so only
    # rows whose labels exist in df_23AndMe are filled.
    # NOTE(review): assumes both frames use the same sample ids as row
    # labels - confirm.
    added_columns = []
    for column in chromosome_N_snps.columns:
        if not chromosome_N_snps.loc['in_23andMe', column]:
            df_23AndMe[column + '_PRS313_Unknown'] = chromosome_N_snps.loc[:, column]
            added_columns.append(column)
            print(f"Column {column} added to df_23AndMe with suffix '_PRS313_Unknown'")
    print(len(added_columns))

    # os.path.join avoids the doubled '/' produced by the trailing slash in output_dir
    df_23AndMe.to_parquet(
        os.path.join(output_dir, f'23AndMe_PRS313_merged_chr{chromosome_number}.parquet')
    )

3502139 4060196
4000527 4060196
4005849 4060196
4060196 4060196
207509287 207509287
193469300 193469300
90477520 90477520
162832406 162832406
21999437 21999437
233057055 233057152
233057152 233057152
98177077 98186118
98186118 98186118
103693675 103765339
103741196 103765339
103765339 103765339
91500251 91504195
91504195 91504195
67531216 67531216
65015838 65018578
65018578 65018578
209720496 209736577
209736577 209736577
47196468 47441156
47441038 47441156
47441156 47441156
236252880 236252880
150504821 150504821
243448786 243448786
175744322 175745664
175745664 175745664
83783521 83783521
156877110 156877110
0


KeyboardInterrupt: 

In [4]:
import os
import pandas as pd

# Directory containing the parquet files
# Folder whose merged per-chromosome parquet files are rewritten in place
directory = "../Final_training_data/"

# Visit every entry in the folder and skip anything that is not a merged file
for filename in os.listdir(directory):
    is_merged_file = filename.startswith("23AndMe_PRS313_merged_chr") and filename.endswith(".parquet")
    if not is_merged_file:
        continue

    file_path = os.path.join(directory, filename)

    # Last '_'-separated token without its extension, e.g. 'chr7'
    chromosome = filename.rsplit("_", 1)[-1].partition(".")[0]

    # Load the stored table
    df = pd.read_parquet(file_path)

    # Keep only the columns whose label begins with '<chromosome>_'
    column_prefix = f"{chromosome}_"
    filtered_columns = [col for col in df.columns if col.startswith(column_prefix)]
    filtered_df = df[filtered_columns]

    # Overwrite the source file with the reduced table.
    # NOTE(review): index=False means the frame's index is not written -
    # confirm the row labels are not needed downstream.
    filtered_df.to_parquet(file_path, index=False)

KeyboardInterrupt: 