In [38]:
import pandas as pd
import numpy as np

def process_23andme_data(position_info_path, andme_data_path, output_path):
    def read_data(position_info_path, andme_data_path):
        position_info = pd.read_csv(position_info_path)
        andme_data = pd.read_csv(andme_data_path, sep='\t', comment='#', header=None, names=['rsid', 'chromosome', 'position', 'genotype'])
        return position_info, andme_data

    def merge_data(position_info, andme_data):
        return pd.merge(position_info, andme_data, left_on='RS_Number', right_on='rsid', how='left')

    def create_allele_columns(data):
        data['maternal_allele'] = data['genotype'].str[0]
        data['paternal_allele'] = data['genotype'].str[1]
        return data

    def calc_dosage(allele, alt_allele):
        return np.where(pd.isna(allele) | pd.isna(alt_allele), np.nan, np.where(allele == alt_allele, 1, 0))

    def apply_dosage_calculation(data):
        data['maternal_dosage'] = calc_dosage(data['maternal_allele'], data['Alt'])
        data['paternal_dosage'] = calc_dosage(data['paternal_allele'], data['Alt'])
        data['unphased_dosage'] = data['maternal_dosage'] + data['paternal_dosage']
        return data

    def create_phased_columns(data):
        coord_prefix = data['Coord'].str.split(':').str[0]
        position = data['Coord'].str.split(':').str[1]
        ref = data['Alleles'].apply(lambda x: x.split('/')[0][1])
        alt = data['Alleles'].apply(lambda x: x.split('/')[1][0])
        data['phased_column_maternal'] = coord_prefix + '_' + position + '_' + ref + '_' + alt + '_maternal'
        data['phased_column_paternal'] = coord_prefix + '_' + position + '_' + ref + '_' + alt + '_paternal'
        data['unphased_column'] = coord_prefix + '_' + position + '_' + ref + '_' + alt + '_combined'
        return data

    def create_output_dataframe(data):
        new_data = {
            **dict(zip(data["phased_column_maternal"], data["maternal_dosage"])),
            **dict(zip(data["phased_column_paternal"], data["paternal_dosage"])),
            **dict(zip(data["unphased_column"], data["unphased_dosage"]))
        }
        df = pd.DataFrame([new_data])
        return df.sort_index(axis=1)

    def save_to_csv(df, output_path):
        df.to_csv(output_path, index=False)

    # Pipeline execution
    position_info, andme_data = read_data(position_info_path, andme_data_path)
    merged_data = merge_data(position_info, andme_data)
    merged_data = create_allele_columns(merged_data)
    merged_data = apply_dosage_calculation(merged_data)
    merged_data = create_phased_columns(merged_data)
    output_df = create_output_dataframe(merged_data)
    save_to_csv(output_df, output_path)
    return output_df

# Example usage:
dat = process_23andme_data('../../Data/Filtered_unphased_training_data_union_final/23andMe_matching_variants_updated.csv', 
                     '../../Data/23andMe_files/11703.23andme.9619.txt', 
                     'output.csv')


  andme_data = pd.read_csv(andme_data_path, sep='\t', comment='#', header=None, names=['rsid', 'chromosome', 'position', 'genotype'])


In [39]:
import os
import pandas as pd
# Load and preprocess the data
data_directory = '../../Data/Filtered_unphased_training_data_union_final/'

# Initialize lists to store the performance metrics for each chromosome
r2_scores = []
iqs_scores = []
accuracy_scores = []


chromosome_number = 1

file_name = data_directory + f"23AndMe_PRS313_merged_chr{chromosome_number}_matching_combined.parquet"
data = pd.read_parquet(file_name)

# Get all the chromosome 1 variants
chr = dat.filter(regex=f'^chr{chromosome_number}_', axis=1)


In [40]:
chr.shape[1]/3

915.0

In [41]:
# Get the columns in data but not in chr
missing_columns = data.columns.difference(chr.columns)
len(missing_columns)

37

In [42]:
missing_columns

Index(['chr1_100880328_A_T_PRS313_Unknown_combined',
       'chr1_10566215_A_G_PRS313_Known_combined',
       'chr1_110198129_CAAA_C_PRS313_Unknown_combined',
       'chr1_114445880_G_A_PRS313_Unknown_combined',
       'chr1_118141492_A_C_PRS313_Unknown_combined',
       'chr1_120257110_T_C_PRS313_Known_combined',
       'chr1_121280613_A_G_PRS313_Known_combined',
       'chr1_121287994_A_G_PRS313_Unknown_combined',
       'chr1_145604302_C_CT_PRS313_Unknown_combined',
       'chr1_149906413_T_C_PRS313_Known_combined',
       'chr1_155556971_G_A_PRS313_Unknown_combined',
       'chr1_168133974_AC_A_combined',
       'chr1_168171052_CA_C_PRS313_Unknown_combined',
       'chr1_172328767_T_TA_PRS313_Unknown_combined',
       'chr1_172730160_T_TGTGC_combined',
       'chr1_18807339_T_C_PRS313_Known_combined',
       'chr1_201437832_C_T_PRS313_Known_combined',
       'chr1_202184600_C_T_PRS313_Unknown_combined',
       'chr1_203770448_T_A_PRS313_Unknown_combined',
       'chr1_204388877_GT_