In [1]:
import pandas as pd
import numpy as np

# Read the CSV file with position information
position_info = pd.read_csv('../../Data/Filtered_unphased_training_data_union_final/23andMe_matching_variants_updated.csv')

# Read the 23andMe data file
andme_data = pd.read_csv('../../Data/23andMe_files/6800.23andme.10042.txt', sep='\t', comment='#', header=None, names=['rsid', 'chromosome', 'position', 'genotype'])

# Merge the position information with the 23andMe data based on rsid
merged_data = pd.merge(position_info, andme_data, left_on='RS_Number', right_on='rsid', how='left')

# Create columns for maternal and paternal dosages based on genotype
merged_data['maternal_allele'] = merged_data['genotype'].str[0]
merged_data['paternal_allele'] = merged_data['genotype'].str[1]

# Function to calculate dosage
def calc_dosage(allele, alt_allele):
    return np.where(allele == alt_allele, 1, 0)

# Apply the dosage calculation
merged_data['maternal_dosage'] = calc_dosage(merged_data['maternal_allele'], merged_data['Alt'])
merged_data['paternal_dosage'] = calc_dosage(merged_data['paternal_allele'], merged_data['Alt'])
merged_data['unphased_dosage'] = merged_data['maternal_dosage'] + merged_data['paternal_dosage']

# Create phased columns
merged_data['phased_column_maternal'] = 'chr' + merged_data['chromosome'].astype(str) + '_' + merged_data['position'].astype(str) + '_' + merged_data['Ref'] + '_' + merged_data['Alt'] + '_maternal'
merged_data['phased_column_paternal'] = 'chr' + merged_data['chromosome'].astype(str) + '_' + merged_data['position'].astype(str) + '_' + merged_data['Ref'] + '_' + merged_data['Alt'] + '_paternal'
merged_data['unphased_column'] = 'chr' + merged_data['chromosome'].astype(str) + '_' + merged_data['position'].astype(str) + '_combined'

# Filter rows with missing genotype data
missing_genotype = merged_data['genotype'].isna()
missing_rows = merged_data[missing_genotype]

# Prepare phased dosage data by grouping and pivoting
phased_dosage_data_maternal = merged_data.pivot_table(index='RS_Number', columns='phased_column_maternal', values='maternal_dosage', aggfunc='first').reset_index()
phased_dosage_data_paternal = merged_data.pivot_table(index='RS_Number', columns='phased_column_paternal', values='paternal_dosage', aggfunc='first').reset_index()

# Merge the maternal and paternal phased dosage data
phased_dosage_data = pd.merge(phased_dosage_data_maternal, phased_dosage_data_paternal, on='RS_Number', how='outer')

# Prepare unphased dosage data by grouping and pivoting
unphased_dosage_data = merged_data.pivot_table(index='RS_Number', columns='unphased_column', values='unphased_dosage', aggfunc='first').reset_index()

# Save the phased and unphased dosage data to separate CSV files
phased_dosage_data.to_csv('phased_dosage_data.csv', index=False)
unphased_dosage_data.to_csv('unphased_dosage_data.csv', index=False)


  andme_data = pd.read_csv('../../Data/23andMe_files/6800.23andme.10042.txt', sep='\t', comment='#', header=None, names=['rsid', 'chromosome', 'position', 'genotype'])


KeyboardInterrupt: 

In [49]:
import pandas as pd
import numpy as np

# Read the CSV file with position information
position_info = pd.read_csv('../../Data/Filtered_unphased_training_data_union_final/23andMe_matching_variants_updated.csv')

# Read the 23andMe data file
andme_data = pd.read_csv('../../Data/23andMe_files/6800.23andme.10042.txt', sep='\t', comment='#', header=None, names=['rsid', 'chromosome', 'position', 'genotype'])

# Merge the position information with the 23andMe data based on rsid
merged_data = pd.merge(position_info, andme_data, left_on='RS_Number', right_on='rsid', how='left')

# Create columns for maternal and paternal dosages based on genotype
merged_data['maternal_allele'] = merged_data['genotype'].str[0]
merged_data['paternal_allele'] = merged_data['genotype'].str[1]

# Function to calculate dosage
def calc_dosage(allele, alt_allele):
    return np.where(pd.isna(allele) | pd.isna(alt_allele), np.nan, np.where(allele == alt_allele, 1, 0))

# Apply the dosage calculation
merged_data['maternal_dosage'] = calc_dosage(merged_data['maternal_allele'], merged_data['Alt'])
merged_data['paternal_dosage'] = calc_dosage(merged_data['paternal_allele'], merged_data['Alt'])
merged_data['unphased_dosage'] = merged_data['maternal_dosage'] + merged_data['paternal_dosage']

# Create phased columns
merged_data['phased_column_maternal'] = 'chr' + merged_data['chromosome'].astype(str) + '_' + merged_data['position'].astype(str) + '_' + merged_data['Ref'] + '_' + merged_data['Alt'] + '_maternal'
merged_data['phased_column_paternal'] = 'chr' + merged_data['chromosome'].astype(str) + '_' + merged_data['position'].astype(str) + '_' + merged_data['Ref'] + '_' + merged_data['Alt'] + '_paternal'
merged_data['unphased_column'] = 'chr' + merged_data['chromosome'].astype(str) + '_' + merged_data['position'].astype(str) + '_combined'

# Get all the NA values from the genotype column
missing_genotype = merged_data['genotype'].isna()

# Create a new dataframe with the desired columns and values
new_data = {
    **dict(zip(merged_data["phased_column_maternal"], merged_data["maternal_dosage"])),
    **dict(zip(merged_data["phased_column_paternal"], merged_data["paternal_dosage"])),
    **dict(zip(merged_data["unphased_column"], merged_data["unphased_dosage"]))
}
df = pd.DataFrame([new_data])
df = df.sort_index(axis=1)


  andme_data = pd.read_csv('../../Data/23andMe_files/6800.23andme.10042.txt', sep='\t', comment='#', header=None, names=['rsid', 'chromosome', 'position', 'genotype'])


In [64]:
import pandas as pd
import numpy as np

def process_23andme_data(position_info_path, andme_data_path, output_path):
    def read_data(position_info_path, andme_data_path):
        position_info = pd.read_csv(position_info_path)
        andme_data = pd.read_csv(andme_data_path, sep='\t', comment='#', header=None, names=['rsid', 'chromosome', 'position', 'genotype'])
        return position_info, andme_data

    def merge_data(position_info, andme_data):
        return pd.merge(position_info, andme_data, left_on='RS_Number', right_on='rsid', how='left')

    def create_allele_columns(data):
        data['maternal_allele'] = data['genotype'].str[0]
        data['paternal_allele'] = data['genotype'].str[1]
        return data

    def calc_dosage(allele, alt_allele):
        return np.where(pd.isna(allele) | pd.isna(alt_allele), np.nan, np.where(allele == alt_allele, 1, 0))

    def apply_dosage_calculation(data):
        data['maternal_dosage'] = calc_dosage(data['maternal_allele'], data['Alt'])
        data['paternal_dosage'] = calc_dosage(data['paternal_allele'], data['Alt'])
        data['unphased_dosage'] = data['maternal_dosage'] + data['paternal_dosage']
        return data

    def create_phased_columns(data):
        coord_prefix = data['Coord'].str.split(':').str[0]
        position = data['Coord'].str.split(':').str[1]
        ref = data['Alleles'].apply(lambda x: x.split('/')[0][1])
        alt = data['Alleles'].apply(lambda x: x.split('/')[1][0])
        data['phased_column_maternal'] = coord_prefix + '_' + position + '_' + ref + '_' + alt + '_maternal'
        data['phased_column_paternal'] = coord_prefix + '_' + position + '_' + ref + '_' + alt + '_paternal'
        data['unphased_column'] = coord_prefix + '_' + position + '_' + ref + '_' + alt + '_combined'
        return data

    def create_output_dataframe(data):
        new_data = {
            **dict(zip(data["phased_column_maternal"], data["maternal_dosage"])),
            **dict(zip(data["phased_column_paternal"], data["paternal_dosage"])),
            **dict(zip(data["unphased_column"], data["unphased_dosage"]))
        }
        df = pd.DataFrame([new_data])
        return df.sort_index(axis=1)

    def save_to_csv(df, output_path):
        df.to_csv(output_path, index=False)

    # Pipeline execution
    position_info, andme_data = read_data(position_info_path, andme_data_path)
    merged_data = merge_data(position_info, andme_data)
    merged_data = create_allele_columns(merged_data)
    merged_data = apply_dosage_calculation(merged_data)
    merged_data = create_phased_columns(merged_data)
    output_df = create_output_dataframe(merged_data)
    save_to_csv(output_df, output_path)

# Example usage:
process_23andme_data('../../Data/Filtered_unphased_training_data_union_final/23andMe_matching_variants_updated.csv', 
                     '../../Data/23andMe_files/6800.23andme.10042.txt', 
                     'output.csv')


  andme_data = pd.read_csv(andme_data_path, sep='\t', comment='#', header=None, names=['rsid', 'chromosome', 'position', 'genotype'])


KeyboardInterrupt: 

In [None]:
# Create the new dictionary with the desired structure
