In [22]:
import pandas as pd
import numpy as np

def process_23andme_data(position_info_path, andme_data_path, output_path):

    # Function to extract "chrN:pos"
    def extract_chr_pos(value):
        parts = value.split('_')
        return f"{parts[0]}:{parts[1]}"
        
    def extract_alt_allele(value):
        parts = value.split('_')
        return f"{parts[3]}"
    def extract_ref_allele(value):
        parts = value.split('_')
        return f"{parts[2]}"
    
        
    def read_data(position_info_path, andme_data_path):
        position_info = pd.read_csv(position_info_path)
        # Create new column with "chrN:pos"
        position_info['chr_pos'] = position_info['matching_columns'].apply(extract_chr_pos)
        position_info['Alt'] = position_info['matching_columns'].apply(extract_alt_allele)
        position_info['Ref'] = position_info['matching_columns'].apply(extract_ref_allele)


        andme_data = pd.read_csv(andme_data_path, sep='\t', comment='#', header=None, names=['rsid', 'chromosome', 'position', 'genotype'])
        andme_data['chr_pos'] = andme_data.apply(lambda row: f"chr{row['chromosome']}:{row['position']}", axis=1)
        return position_info, andme_data

    def merge_data(position_info, andme_data):
        return pd.merge(position_info, andme_data, left_on='chr_pos', right_on='chr_pos', how='left')

    def create_allele_columns(data):
        data['maternal_allele'] = data['genotype'].str[0]
        data['paternal_allele'] = data['genotype'].str[1]
        return data

    def calc_dosage(allele, alt_allele):
        return np.where(pd.isna(allele) | pd.isna(alt_allele), np.nan, np.where(allele == alt_allele, 1, 0))

    def apply_dosage_calculation(data):
        data['maternal_dosage'] = calc_dosage(data['maternal_allele'], data['Alt'])
        data['paternal_dosage'] = calc_dosage(data['paternal_allele'], data['Alt'])
        data['unphased_dosage'] = data['maternal_dosage'] + data['paternal_dosage']
        return data

    def create_phased_columns(data):
        coord_prefix = data["chr_pos"].str.split(":").str[0]
        position = data["chr_pos"].str.split(":").str[1]
        ref = data['Ref'].astype(str)
        alt = data['Alt'].astype(str)
        data['phased_column_maternal'] = coord_prefix + '_' + position + '_' + ref + '_' + alt + '_maternal'
        data['phased_column_paternal'] = coord_prefix + '_' + position + '_' + ref + '_' + alt + '_paternal'
        data['unphased_column'] = coord_prefix + '_' + position + '_' + ref + '_' + alt + '_combined'

        return data

    def create_output_dataframe(data):
        
        new_data = {
            **dict(zip(data["phased_column_maternal"], data["maternal_dosage"])),
            **dict(zip(data["phased_column_paternal"], data["paternal_dosage"])),
            **dict(zip(data["unphased_column"], data["unphased_dosage"]))
        }
        df = pd.DataFrame([new_data])
        return df.sort_index(axis=1)

    def save_to_csv(df, output_path):
        df.to_csv(output_path, index=False)

    # Pipeline execution
    position_info, andme_data = read_data(position_info_path, andme_data_path)
    merged_data = merge_data(position_info, andme_data)
    merged_data = create_allele_columns(merged_data)
    merged_data = apply_dosage_calculation(merged_data)
    merged_data = create_phased_columns(merged_data)
    output_df = create_output_dataframe(merged_data)
    save_to_csv(output_df, output_path)
    return output_df

# Example usage:
dat = process_23andme_data('../../Data/Filtered_raw_training_data_union/matching_columns_all.csv', 
                     '../../Data/23andMe_files/11703.23andme.9619.txt', 
                     'output.csv')


  andme_data = pd.read_csv(andme_data_path, sep='\t', comment='#', header=None, names=['rsid', 'chromosome', 'position', 'genotype'])


In [32]:
import os
import pandas as pd
# Load and preprocess the data
data_directory = '../../Data/Filtered_unphased_training_data_union_final/'



for i in range(1, 23):
    
    file_name = data_directory + f"23AndMe_PRS313_merged_chr{i}_matching_combined.parquet"
    data = pd.read_parquet(file_name)
    
    # Get all the chromosome 1 variants
    chr = dat.filter(regex=f'^chr{i}_', axis=1)
    print(len(data.columns) - chr.shape[1]/3)




0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0


In [64]:
all_variants = pd.read_csv('../../Data/Filtered_raw_training_data_union/23andMe_matching_variants_updated.csv')
all_variants.loc[all_variants["Coord"].str.contains("chr1:")]

Unnamed: 0,RS_Number,Coord,Alleles,MAF,Distance,Dprime,R2,Correlated_Alleles,FORGEdb,RegulomeDB,Function,Ref,Alt
15643,rs10493800,chr1:88177403,(G/A),0.3295,20480.0,0.6157,0.0712,,4.0,5,,G,A
15644,rs10922697,chr1:88127152,(T/C),0.2250,-29771.0,0.8004,0.0172,,3.0,7,,T,C
15645,rs305445,chr1:88208135,(G/A),0.2646,51212.0,0.8838,0.0259,,4.0,7,,G,A
15646,rs12137116,chr1:88109828,(G/A),0.3560,-47095.0,0.5007,0.0418,,6.0,3a,,G,A
15647,rs4655854,chr1:88086894,(C/A),0.4131,-70029.0,0.4678,0.0142,,2.0,5,,C,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17677,rs115461551,chr1:120690218,(C/T),0.1078,-597776.0,0.3968,0.1165,"A=T,G=C",6.0,7,,C,T
17678,rs115706173,chr1:121280485,(A/G),0.0048,-7509.0,1.0000,0.0295,,8.0,4,,A,"<CN2>,<CN3>"
17679,rs79368528,chr1:120689085,(C/T),0.0871,-598909.0,0.5304,0.1643,"A=T,G=C",4.0,5,,C,T
17680,rs587774691,chr1:121137155,(A/G),0.0028,-150839.0,1.0000,0.0172,,,.,,A,G


In [65]:
position_column = dat.columns[dat.columns.str.contains("10044339")]
position_column

Index(['chr1_100443397_0        T\n1        G\n2        T\n3        T\n4        C\n        ..\n17677    C\n17678    A\n17679    C\n17680    A\n17681    T\nName: Ref, Length: 17682, dtype: object_0                  C\n1                  A\n2                  G\n3                  G\n4                  A\n            ...     \n17677              T\n17678    <CN2>,<CN3>\n17679              T\n17680              G\n17681              C\nName: Alt, Length: 17682, dtype: object_combined',
       'chr1_100443397_0        T\n1        G\n2        T\n3        T\n4        C\n        ..\n17677    C\n17678    A\n17679    C\n17680    A\n17681    T\nName: Ref, Length: 17682, dtype: object_0                  C\n1                  A\n2                  G\n3                  G\n4                  A\n            ...     \n17677              T\n17678    <CN2>,<CN3>\n17679              T\n17680              G\n17681              C\nName: Alt, Length: 17682, dtype: object_maternal',
       'chr1_100443397_0