# Process Chromosome

In [2]:
import pandas as pd
import glob
import os

def read_vcf_header(file_path):
    """Reads the VCF file header (column names) and returns it."""
    with open(file_path, 'r') as file:
        for line in file:
            if line.startswith('#CHROM'):
                return line.strip().split('\t')
    return []

def extract_snp_data(file_path, header):
    """Extracts SNP data from a VCF file."""
    snp_data = []
    with open(file_path, 'r') as file:
        for line in file:
            if not line.startswith('#'):
                fields = line.strip().split('\t')
                chrom, pos, ref, alt = fields[0], fields[1], fields[3], fields[4]
                snp_id = f"chr{chrom}_{pos}_{ref}_{alt}"
                genotype_data = fields[9:]  # Genotype data starts from the 10th column
                snp_data.append((snp_id, genotype_data))
    return snp_data

def process_vcf_files(directory):
    """Processes all VCF files in the specified directory."""
    vcf_files = glob.glob(os.path.join(directory, '*.vcf'))
    all_snp_data = []
    patient_ids = None
    
    for file_path in vcf_files:
        if patient_ids is None:
            patient_ids = read_vcf_header(file_path)[9:]  # Assuming the first 9 columns are standard VCF columns
        snp_data = extract_snp_data(file_path, patient_ids)

        if (len(snp_data) == 0):
            # print("No matches found at ", file_path)
            continue
        else:
            for SNP in snp_data:
                position_in_vcf = SNP[0].split("_")[1]
                file_path_position = file_path.split("_pos")[1][:-4]
                if (position_in_vcf == file_path_position):
                    all_snp_data.extend(snp_data)
                    # print("Adding")
                else:
                    # Ignoring copy number variations
                    # print("Ignoring")
                    continue

    # Convert the collected SNP data to a DataFrame
    data_dict = {snp_id: genotypes for snp_id, genotypes in all_snp_data}
    df = pd.DataFrame(data_dict, index=patient_ids)
    print(df.shape)
    return df

# Find intersection b/w 23AndMe and PRS313

In [3]:
import pandas as pd

# Define file paths
ttAndMeFullPanel = "../../../Data/23andMe_metadata_files/23andMeGenePanel.csv"
prs313_file = './concatenated_snps_processed.csv'

# Read the CSV files into DataFrames
ttAndMe = pd.read_csv(ttAndMeFullPanel)
PRS313_df = pd.read_csv(prs313_file, index_col=0)

# Extract chromosome and position information from PRS313_df columns
PRS313_info = PRS313_df.columns.str.extract(r'chr(\d+)_(\d+)', expand=True)
PRS313_info.columns = ['chromosome', 'position']
PRS313_info['full_col'] = PRS313_df.columns

# Convert chromosome and position columns to string type for comparison
ttAndMe['chromosome'] = ttAndMe['chromosome'].astype(str)
ttAndMe['position'] = ttAndMe['position'].astype(str)
PRS313_info['chromosome'] = PRS313_info['chromosome'].astype(str)
PRS313_info['position'] = PRS313_info['position'].astype(str)

# Create a new row in PRS313_df to store the boolean values
PRS313_df.loc['in_23andMe'] = False

# Merge ttAndMe with PRS313_info to identify matching rows
merged_df = pd.merge(ttAndMe, PRS313_info, how='inner', on=['chromosome', 'position'])

# Set the 'in_23andMe' row to True for matching columns
PRS313_df.loc['in_23andMe', merged_df['full_col']] = True

# Print the updated PRS313_df DataFrame
print("Updated PRS313_df:")
print(PRS313_df)


Updated PRS313_df:
           chr2_192381934_C_T chr8_129199566_G_A chr8_29509616_A_C  \
HG00096                   1|1                0|1               1|1   
HG00097                   1|1                1|0               1|1   
HG00099                   1|1                0|0               0|0   
HG00100                   0|1                0|0               0|1   
HG00101                   1|1                0|0               1|1   
...                       ...                ...               ...   
NA21141                   1|1                0|0               0|0   
NA21142                   1|0                1|0               0|1   
NA21143                   1|1                0|0               1|1   
NA21144                   1|1                0|0               0|1   
in_23andMe               True              False              True   

           chr2_121089731_T_C chr15_75750383_T_C chr18_24337424_C_G  \
HG00096                   1|0                1|0                1|1  

In [8]:
# Find the max value in PRS313_df

PRS313_df.iloc[:,23]

HG00096         1|1
HG00097         0|0
HG00099         1|1
HG00100         1|0
HG00101         1|1
              ...  
NA21141         0|0
NA21142         1|0
NA21143         0|0
NA21144         0|0
in_23andMe    False
Name: chr22_45319953_G_A, Length: 2505, dtype: object

In [9]:

import os
import pandas as pd
ttandMePositions = "../../../Data/23andMe_metadata_files/23andMeGenePanel.csv"

output_dir   = "../../../Data/Raw_training_data_23andMe_union/"

os.makedirs(output_dir, exist_ok=True)
# Read the text file into a DataFrame
ttAndMe = pd.read_csv(ttandMePositions, comment='#', header = 0)
for chromosome_number in range(1, 23):
    # Define directory path for the current chromosome
    directory_path = f'../../../Data/23AndMePositionsUnion/chr{chromosome_number}/'
    
    # Process VCF files to get the DataFrame
    df_23AndMe = process_vcf_files(directory_path)
    
    # Filter the PRS313 data to include only SNPs on the current chromosome
    chromosome_N_snps = PRS313_df.loc[:, [col for col in PRS313_df.columns if col.startswith(f'chr{chromosome_number}_')]]
    
    # Identify columns present in both DataFrames
    common_columns = df_23AndMe.columns.intersection(chromosome_N_snps.columns)
    
    # Rename the common columns in df_23AndMe
    df_23AndMe.rename(columns={col: f'{col}_PRS313_Known' for col in common_columns}, inplace=True)
    print(f"{len(common_columns)} columns renamed with suffix '_PRS313_Known'")
    
    # Identify columns in PRS313 not present in df_23AndMe using 'in_23andMe' row
    in_23andMe = chromosome_N_snps.loc['in_23andMe'].astype(bool)
    missing_columns = chromosome_N_snps.columns[~in_23andMe]
    
    # Add the missing columns to df_23AndMe with suffix '_PRS313_Unknown'
    for column in missing_columns:
        df_23AndMe[f'{column}_PRS313_Unknown'] = chromosome_N_snps[column]
    print(f"{len(missing_columns)} columns added with suffix '_PRS313_Unknown'")
    
    # Save the resulting DataFrame to a parquet file
    output_file = os.path.join(output_dir, f'23AndMe_PRS313_merged_chr{chromosome_number}.parquet')

    df_23AndMe.to_parquet(output_file)

(2504, 53891)
10 columns renamed with suffix '_PRS313_Known'
20 columns added with suffix '_PRS313_Unknown'
(2504, 53134)
4 columns renamed with suffix '_PRS313_Known'
17 columns added with suffix '_PRS313_Unknown'
(2504, 44862)
3 columns renamed with suffix '_PRS313_Known'
13 columns added with suffix '_PRS313_Unknown'
(2504, 41712)
0 columns renamed with suffix '_PRS313_Known'
11 columns added with suffix '_PRS313_Unknown'
(2504, 38762)
9 columns renamed with suffix '_PRS313_Known'
25 columns added with suffix '_PRS313_Unknown'
(2504, 45799)
2 columns renamed with suffix '_PRS313_Known'
18 columns added with suffix '_PRS313_Unknown'
(2504, 36163)
4 columns renamed with suffix '_PRS313_Known'
10 columns added with suffix '_PRS313_Unknown'
(2504, 33374)
8 columns renamed with suffix '_PRS313_Known'
13 columns added with suffix '_PRS313_Unknown'
(2504, 27799)
5 columns renamed with suffix '_PRS313_Known'
9 columns added with suffix '_PRS313_Unknown'
(2504, 33688)
4 columns renamed with 