# In [1]:
import pandas as pd
import numpy as np

def process_23andme_data(position_info_path, andme_data_path, output_path) -> pd.DataFrame:
    """Build a one-row table of alt-allele dosages from a 23andMe export.

    Each variant ID in the ``matching_columns`` column of the position-info
    CSV is expected to look like ``<chrom>_<pos>_<ref>_<alt>`` (for example
    ``chr1_12345_A_G``).  Variants are matched against the 23andMe file on
    ``<chrom>:<pos>`` (the 23andMe chromosome is prefixed with ``chr``), and
    three columns are emitted per variant:

    * ``<chrom>_<pos>_<ref>_<alt>_maternal`` -- 1 if the first genotype
      character equals the alt allele, otherwise 0;
    * ``<chrom>_<pos>_<ref>_<alt>_paternal`` -- the same for the second
      genotype character;
    * ``<chrom>_<pos>_<ref>_<alt>_combined`` -- the sum of the two.

    A dosage is NaN when the position is absent from the 23andMe file, when
    the genotype has no character at that index, or when the allele is a
    no-call (``-``).

    NOTE(review): the code does no phasing; "maternal"/"paternal" are simply
    the first and second characters of the genotype string -- confirm that
    this is what downstream consumers expect.

    Args:
        position_info_path: CSV file with a ``matching_columns`` column.
        andme_data_path: Tab-separated 23andMe file with the columns rsid,
            chromosome, position, genotype and ``#``-prefixed comment lines.
        output_path: Where the resulting one-row CSV is written.

    Returns:
        The one-row DataFrame that was written to ``output_path``, with its
        columns sorted by name.

    Raises:
        ValueError: If a variant ID has fewer than four ``_``-separated parts.
    """
    # Character 23andMe writes in place of an allele that could not be called.
    no_call = '-'

    def read_data(position_info_path, andme_data_path):
        """Load both inputs and give each a ``chr_pos`` join key."""
        position_info = pd.read_csv(position_info_path)

        # Split every ID once instead of once per derived column.
        id_parts = position_info['matching_columns'].str.split('_')
        chrom = id_parts.str[0]
        pos = id_parts.str[1]
        ref = id_parts.str[2]
        alt = id_parts.str[3]

        # ``.str[3]`` is NaN when an ID is missing or has too few parts.
        malformed = alt.isna()
        if malformed.any():
            examples = position_info.loc[malformed, 'matching_columns'].head().tolist()
            raise ValueError(
                "matching_columns entries must look like "
                f"'<chrom>_<pos>_<ref>_<alt>'; offending values: {examples}"
            )

        position_info['chr_pos'] = chrom + ':' + pos
        position_info['Alt'] = alt
        position_info['Ref'] = ref

        # Read every column as text: the chromosome column mixes numbers with
        # X/Y/MT, and letting pandas infer types per chunk yields mixed dtypes
        # (and a float position such as "123.0" if any value is missing).
        andme_data = pd.read_csv(
            andme_data_path,
            sep='\t',
            comment='#',
            header=None,
            names=['rsid', 'chromosome', 'position', 'genotype'],
            dtype=str,
        )
        andme_data['chr_pos'] = 'chr' + andme_data['chromosome'] + ':' + andme_data['position']
        return position_info, andme_data

    def merge_data(position_info, andme_data):
        """Left-join so every requested variant is kept, matched or not."""
        return pd.merge(position_info, andme_data, on='chr_pos', how='left')

    def create_allele_columns(data):
        """Split the genotype string into its first and second allele."""
        first = data['genotype'].str[0]
        second = data['genotype'].str[1]
        # A no-call must stay missing; otherwise it would be scored as
        # dosage 0 and look identical to a real non-alt call.
        data['maternal_allele'] = first.mask(first == no_call)
        data['paternal_allele'] = second.mask(second == no_call)
        return data

    def calc_dosage(allele, alt_allele):
        """Return 1.0 where allele == alt, 0.0 where it differs, NaN if missing."""
        missing = pd.isna(allele) | pd.isna(alt_allele)
        return np.where(missing, np.nan, np.where(allele == alt_allele, 1, 0))

    def apply_dosage_calculation(data):
        """Add per-allele dosages and their sum (NaN if either is NaN)."""
        data['maternal_dosage'] = calc_dosage(data['maternal_allele'], data['Alt'])
        data['paternal_dosage'] = calc_dosage(data['paternal_allele'], data['Alt'])
        data['unphased_dosage'] = data['maternal_dosage'] + data['paternal_dosage']
        return data

    def create_phased_columns(data):
        """Build the three output column names for every variant."""
        chrom_and_pos = data['chr_pos'].str.split(':')
        variant_id = (
            chrom_and_pos.str[0] + '_' + chrom_and_pos.str[1]
            + '_' + data['Ref'].astype(str)
            + '_' + data['Alt'].astype(str)
        )
        data['phased_column_maternal'] = variant_id + '_maternal'
        data['phased_column_paternal'] = variant_id + '_paternal'
        data['unphased_column'] = variant_id + '_combined'
        return data

    def create_output_dataframe(data):
        """Pivot the per-variant rows into one wide row, columns sorted."""
        # If a column name occurs more than once, the last row's value wins.
        new_data = {
            **dict(zip(data['phased_column_maternal'], data['maternal_dosage'])),
            **dict(zip(data['phased_column_paternal'], data['paternal_dosage'])),
            **dict(zip(data['unphased_column'], data['unphased_dosage'])),
        }
        df = pd.DataFrame([new_data])
        return df.sort_index(axis=1)

    def save_to_csv(df, output_path):
        """Write the result without the (meaningless) row index."""
        df.to_csv(output_path, index=False)

    # Pipeline execution
    position_info, andme_data = read_data(position_info_path, andme_data_path)
    merged_data = merge_data(position_info, andme_data)
    merged_data = create_allele_columns(merged_data)
    merged_data = apply_dosage_calculation(merged_data)
    merged_data = create_phased_columns(merged_data)
    output_df = create_output_dataframe(merged_data)
    save_to_csv(output_df, output_path)
    return output_df

# Example: convert one 23andMe export and keep the resulting row in `dat`.
dat = process_23andme_data(
    position_info_path='../../Data/Filtered_raw_training_data_union/matching_columns_all.csv',
    andme_data_path='../../Data/23andMe_files/11703.23andme.9619.txt',
    output_path='output.csv',
)


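# Stray notebook cell output (looks like the source line echoed by a pandas warning -- TODO confirm):
#   andme_data = pd.read_csv(andme_data_path, sep='\t', comment='#', header=None, names=['rsid', 'chromosome', 'position', 'genotype'])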
