In [1]:
import numpy as np
import pandas as pd
import torch
import os
from sklearn.model_selection import train_test_split

# Load the chromosome 2 dataset, split it into train/validation and test
# partitions, and persist the test partition's labels and sample IDs.
data_directory = '../../../Data/Filtered_unphased_training_data_union_final/'
output_folder = "../../../Data/model_results_unphased_all_PRS/linear_regression/"
csv_folder = output_folder + "csv_files/"
os.makedirs(csv_folder, exist_ok=True)

file_name = data_directory + "23AndMe_PRS313_merged_chr2_matching_combined.parquet"
data = pd.read_parquet(file_name)

# Columns whose name contains "PRS313_" are the targets; every other column
# is a feature.
X = data.filter(regex='^(?!.*PRS313_)')
y = data.filter(regex='PRS313_')
sample_ids = data.index.values  # Assuming sample IDs are in the index

# Split the data into train-validation and test sets. Passing sample_ids
# through the same call keeps the IDs aligned with the rows of each partition.
X_train_val, X_test, y_train_val, y_test, sample_train_val, sample_test = train_test_split(
    X, y, sample_ids, test_size=0.2, random_state=42)

# Save the held-out labels. This used to write the full `y` frame (train and
# test rows together) under a "y_test" file name; only the test partition
# belongs in this file.
y_test.to_csv(data_directory + "true_labels_y_test.csv")

# Save sample IDs of X_test to a file, one ID per line
x_test_sample_ids_file = csv_folder + "X_test_sample_ids.txt"
np.savetxt(x_test_sample_ids_file, sample_test, fmt='%s')

print(f"X_test sample IDs saved to {x_test_sample_ids_file}")


X_test sample IDs saved to ../../../Data/model_results_unphased_all_PRS/linear_regression/csv_files/X_test_sample_ids.txt


# Saving the Test Set Labels

In [2]:
import os
from sklearn.model_selection import train_test_split
import pandas as pd

# For every autosome (chr1..chr22): reproduce the train/test split, then write
# the test-set labels in three variants:
#   * true labels,
#   * "missing values"     - every PRS313_Unknown column set to 0,
#   * "simple imputation"  - every PRS313_Unknown column set to 2 * MAF.
# Per-chromosome true labels go to `output_dir`; the three all-chromosome
# tables go to `output_dir_benchmark`.
data_directory = '../../../Data/Filtered_unphased_training_data_union_final/'
output_dir = "../../../Data/y_test_labels_unphased/"
output_dir_benchmark = "../../../scripts/Python/benchmarking/"
os.makedirs(output_dir, exist_ok=True)
# The combined CSVs at the end of this cell are written here, so make sure the
# folder exists instead of failing after all 22 chromosomes were processed.
os.makedirs(output_dir_benchmark, exist_ok=True)

# Initialize empty dataframes to store all y_test values
all_y_test = pd.DataFrame()
all_missing_values_y_test = pd.DataFrame()
all_simple_imputation_values_y_test = pd.DataFrame()

# Load the MAF values as a {SNP name -> MAF} lookup
maf_df = pd.read_csv("../../../Data/MAF_calculations/23AndMe_PRS313_all_chromosomes_MAF.csv")
maf_dict = maf_df.set_index('SNP')['MAF'].to_dict()

for i in range(1, 23):
    file_name = data_directory + f"23AndMe_PRS313_merged_chr{i}_matching_combined.parquet"
    data = pd.read_parquet(file_name)

    # Split the data into features and target
    X = data.filter(regex='^(?!.*PRS313_)')
    y = data.filter(regex='PRS313_')
    sample_ids = data.index.values  # Assuming sample IDs are in the index

    # Split the data into train-validation and test sets
    X_train_val, X_test, y_train_val, y_test, sample_train_val, sample_test = train_test_split(
        X, y, sample_ids, test_size=0.2, random_state=42)

    # Index y_test by sample ID under the name 'sample_id'. `assign` builds a
    # new frame, so the slice returned by train_test_split is not mutated in
    # place (which raised SettingWithCopyWarning).
    y_test = y_test.assign(sample_id=sample_test).set_index('sample_id')

    # Create missing_values_y_test: unknown SNPs are zeroed out
    missing_values_y_test = y_test.copy()
    missing_values_y_test.loc[:, missing_values_y_test.columns.str.contains('PRS313_Unknown')] = 0

    # Create simple_imputation_values_y_test: unknown SNPs get the constant
    # 2 * MAF. A scalar assignment broadcasts to the whole column, replacing
    # the former per-element apply() whose lambda condition was always true.
    simple_imputation_values_y_test = y_test.copy()
    for col in simple_imputation_values_y_test.columns:
        if "PRS313_Unknown" in col:
            maf_value = maf_dict.get(col, 0)  # Default to 0 if SNP not found in MAF dictionary
            simple_imputation_values_y_test[col] = 2 * maf_value

    # Merge this chromosome's columns into the running all-chromosome tables,
    # aligning rows on sample_id
    if all_y_test.empty:
        all_y_test = y_test
        all_missing_values_y_test = missing_values_y_test
        all_simple_imputation_values_y_test = simple_imputation_values_y_test
    else:
        all_y_test = all_y_test.join(y_test, how='outer')
        all_missing_values_y_test = all_missing_values_y_test.join(missing_values_y_test, how='outer')
        all_simple_imputation_values_y_test = all_simple_imputation_values_y_test.join(simple_imputation_values_y_test, how='outer')

    # Save the individual y dataframe
    y_test.to_csv(output_dir + f"chr{i}_true_labels_y_test.csv")
    # missing_values_y_test.to_csv(output_dir + f"chr{i}_missing_values_y_test.csv")
    # simple_imputation_values_y_test.to_csv(output_dir + f"chr{i}_simple_imputation_values_y_test.csv")

# Save all y_test values into single CSV files
all_y_test.to_csv(output_dir_benchmark + "all_chr_true_labels_y_test.csv")
all_missing_values_y_test.to_csv(output_dir_benchmark + "all_chr_missing_values_y_test.csv")
all_simple_imputation_values_y_test.to_csv(output_dir_benchmark + "all_chr_simple_imputation_values_y_test.csv")


# Save the X_train_val SNPs from 23andMe

In [8]:
import os
from sklearn.model_selection import train_test_split
import pandas as pd
import re

# Folder the per-chromosome merged parquet files are read from
data_directory = '../../../Data/Filtered_unphased_training_data_union_final/'

# Output folder for this section; created up front if it does not exist yet
output_dir = "../../../Data/y_test_labels_unphased/"
os.makedirs(output_dir, exist_ok=True)
# Extract SNP IDs from column names
def extract_snp_ids(column_names,
                    pattern=r"(?<![A-Za-z])(?:rs\d+|chr(?:\d+|X|Y|MT?)[_:]\d+)"):
    """Return the column names that look like SNP identifiers.

    Parameters
    ----------
    column_names : iterable
        Column labels to scan (e.g. ``DataFrame.columns``).
    pattern : str or compiled regex, optional
        Regular expression applied with ``re.search`` to each label.
        NOTE(review): the matching expression was missing from the original
        cell (``match = `` had no right-hand side, a syntax error). The
        default accepts rsIDs ("rs123") and chromosome-position style names
        ("chr2_1000_A_T", "chr2:1000") - TODO confirm against the real
        column naming, or pass the intended pattern explicitly.

    Returns
    -------
    list
        The matching labels, unchanged and in their original order.
    """
    matcher = re.compile(pattern)
    snp_ids = []
    for col in column_names:
        # str() keeps non-string labels from raising inside the regex engine;
        # the original label (not the match text) is what gets collected.
        match = matcher.search(str(col))
        if match:
            snp_ids.append(col)
    return snp_ids

# SNP IDs found among the feature columns, keyed by chromosome number.
# Previously `snp_ids` was overwritten on every iteration and never stored.
snp_ids_by_chr = {}

for i in range(1, 23):
    file_name = data_directory + \
        f"23AndMe_PRS313_merged_chr{i}_matching_combined.parquet"
    data = pd.read_parquet(file_name)

    # Split the data into features and target
    X = data.filter(regex='^(?!.*PRS313_)')
    y = data.filter(regex='PRS313_')
    sample_ids = data.index.values  # Assuming sample IDs are in the index

    # Split the data into train-validation and test sets. The split is over
    # samples, so X_train_val carries the same columns as X.
    X_train_val, X_test, y_train_val, y_test, sample_train_val, sample_test = train_test_split(
        X, y, sample_ids, test_size=0.2, random_state=42)
    snp_ids = extract_snp_ids(X_train_val.columns)
    snp_ids_by_chr[i] = snp_ids

    # Save the SNP IDs, one per line. A dedicated file name is used: the
    # earlier commented-out save pointed at chr{i}_true_labels_y_test.csv,
    # which is the name of the per-chromosome test-label file.
    snp_ids_file = output_dir + f"chr{i}_X_train_val_snp_ids.txt"
    with open(snp_ids_file, "w") as handle:
        handle.writelines(f"{snp_id}\n" for snp_id in snp_ids)

KeyboardInterrupt: 