In [1]:
# Import dependencies
import pandas as pd
import numpy as np

## Construct original training set

In [None]:
# Load and merge data for training
## Modified from original pipeline
files = [
    '../data/metaSVM_train.anno.rare.HIS.reformat.csv',
    '../data/CADD_neg_train.anno.rare.HIS.reformat.csv',
    '../data/clinvar_pathogenic_1-4star.rare.HIS.reformat.csv',
    '../data/DiscovEHR_missense_sel.rare.HIS.reformat.csv'
]


def _read_variant_table(path):
    """Announce and read one input CSV; lines pandas cannot parse are skipped."""
    print(f"Loading: {path}")
    return pd.read_csv(path, on_bad_lines='skip')


# One DataFrame per input file, in the order listed above.
df_list = [_read_variant_table(path) for path in files]

# Stack all tables row-wise and renumber the index from 0.
df_combined = pd.concat(df_list, axis=0, ignore_index=True)

print(f"Shape of combined dataset: {df_combined.shape}")

In [None]:
# Data cleansing and deduplication
## Remove conflicts
# Conflicts must be detected BEFORE deduplication. Once duplicates are dropped,
# every var_id has a single row, so a variant labelled both 0 and 1 can no
# longer be seen and would silently keep whichever label was loaded first.
var_pos = set(df_combined.loc[df_combined.target == 1, 'var_id'])
var_neg = set(df_combined.loc[df_combined.target == 0, 'var_id'])
var_conflict = var_pos & var_neg
print(f"Variants with conflicting labels removed: {len(var_conflict)}")
df_combined = df_combined[~df_combined.var_id.isin(var_conflict)]

## Deduplication
# The remaining duplicates of a var_id carry no 0-vs-1 conflict; keep the first row.
df_combined = df_combined.drop_duplicates(subset='var_id', keep='first')

## Set positive and negative samples
# Rows whose target is neither 1 nor 0 (e.g. missing) are left out of both sets.
pos = df_combined[df_combined.target == 1]
neg = df_combined[df_combined.target == 0]

## Merge positive and negative samples
# Positives first, then negatives, with a fresh 0..n-1 index.
df_clean = pd.concat([pos, neg], axis=0, ignore_index=True)

print(f"Shape after cleansing: {df_clean.shape}")

In [None]:
# Feature selection
# Columns that must not be part of the feature matrix.
exclude_cols = {
    'var_id', 'aaref', 'aaalt', 'target', 'Ensembl_transcriptid',
    'ref', 'alt', 'category',
    'source', 'INFO', 'disease', 'genename',
    '#chr', 'pos(1-based)', 'hg19_chr', 'hg19_pos(1-based)',
    'CADD_phred', '1000Gp3_AF', 'ExAC_AF', 'gnomad',
    'RVIS', 'mis_badness', 'MPC', 'REVEL', 'domino',
}

## Keep the chosen columns
# Boolean mask over df_clean's columns; original column order is preserved.
is_excluded = df_clean.columns.isin(exclude_cols)
feature_cols = df_clean.columns[~is_excluded].tolist()

print(f"Number of features used for training: {len(feature_cols)}")

In [None]:
## Impute NaN
# Missing values in these columns are assigned 0; every other feature gets -1.
protein_complex_scores = ['complex_CORUM', 'preppi_counts', 'BioPlex']

X_filled = df_clean[feature_cols].copy()

for col in X_filled.columns:
    fill_value = 0.0 if col in protein_complex_scores else -1.0
    # Values that cannot be parsed as numbers become NaN and are imputed as well.
    as_numeric = pd.to_numeric(X_filled[col], errors='coerce')
    X_filled[col] = as_numeric.fillna(fill_value)

# NOTE: for float input nan_to_num also replaces +/-inf with the largest finite
# floats, so the NaN/Inf checks below are expected to print False.
X = np.nan_to_num(X_filled.values, nan=0.0)

# Check condition of X
print("Any NaN:", np.isnan(X).any())
print("Any Inf:", np.isinf(X).any())
print("Shape:", X.shape)

In [None]:
# Reorganize DataFrame
# Put the imputed matrix back under its feature names and append the labels
# from df_clean (same row order) as the last column.
df_final = pd.DataFrame(X, columns=feature_cols).assign(
    target=df_clean['target'].values
)

# Save
df_final.to_csv('../data/mvp_input_data_cleaned.HIS.csv', index=False)

## Prepare benchmark dataset for training

In [2]:
from sklearn.preprocessing import MinMaxScaler

In [3]:
# Load benchmark dataset for training
# The 'label' column is renamed to 'target' to match the training pipeline,
# and the renamed table is written back out for later use.
df_training = (
    pd.read_csv('../../../data/training_set.csv')
    .rename(columns={'label': 'target'})
)
df_training.to_csv('../data/training_set_with_target.csv', index=False)

In [4]:
# Select features for training
# Columns that must not be part of the benchmark feature matrix.
exclude_cols_training = {
    'Uploaded_variation', 'level_1', 'Location', 'Allele', 'Gene',
    'Feature', 'Feature_type', 'Consequence',
    'cDNA_position', 'CDS_position', 'Protein_position', 'Amino_acids',
    'Codons', 'Existing_variation', 'IMPACT', 'DISTANCE',
    'STRAND', 'FLAGS', 'ClinicalSignificance', 'target', 'MVP_score', 'CANONICAL',
}

# Boolean mask over df_training's columns; original column order is preserved.
is_excluded_training = df_training.columns.isin(exclude_cols_training)
feature_cols_training = df_training.columns[~is_excluded_training].tolist()
print(f"Number of features used for training (benchmark): {len(feature_cols_training)}")

Number of features used for training (benchmark): 42


In [5]:
# Normalization
# Learn each feature's min/max from the training features, then apply the
# scaling to those same rows.
X_training = df_training[feature_cols_training].copy()
scaler = MinMaxScaler().fit(X_training)
X_training_scaled = scaler.transform(X_training)

In [6]:
# Reconstruct DataFrame
# Put the scaled matrix back under its feature names and append the labels
# from df_training (same row order) as the last column.
X_scaled_df = pd.DataFrame(
    X_training_scaled, columns=feature_cols_training
).assign(target=df_training['target'].values)

# Save to file
X_scaled_df.to_csv("../data/training_set_scaled.csv", index=False)
print("Normalized benchmark data saved to: training_set_scaled.csv")

Normalized benchmark data saved to: training_set_scaled.csv


## Prepare benchmark dataset for test

In [7]:
# Load benchmark dataset for test
# The 'label' column is renamed to 'target' to match the pipeline, and the
# renamed table is written back out for later use.
df_test = (
    pd.read_csv('../../../data/test_set.csv')
    .rename(columns={'label': 'target'})
)
df_test.to_csv('../data/test_set_with_target.csv', index=False)

In [8]:
# Select features for test
# Columns that must not be part of the benchmark feature matrix.
exclude_cols_test = {
    'Uploaded_variation', 'level_1', 'Location', 'Allele', 'Gene',
    'Feature', 'Feature_type', 'Consequence',
    'cDNA_position', 'CDS_position', 'Protein_position', 'Amino_acids',
    'Codons', 'Existing_variation', 'IMPACT', 'DISTANCE',
    'STRAND', 'FLAGS', 'ClinicalSignificance', 'target', 'MVP_score', 'CANONICAL',
}

# Boolean mask over df_test's columns; original column order is preserved.
is_excluded_test = df_test.columns.isin(exclude_cols_test)
feature_cols_test = df_test.columns[~is_excluded_test].tolist()
print(f"Number of features used for test (benchmark): {len(feature_cols_test)}")

Number of features used for test (benchmark): 42


In [9]:
# Normalization
# Scale the test set with the scaler that was fitted on the TRAINING features
# (the `scaler` created in the training normalization cell). Fitting a second
# MinMaxScaler on the test set would map train and test features with different
# min/max values and let test-set statistics leak into preprocessing.
# Scaled test values may fall outside [0, 1] when a test value lies beyond the
# training range; that is expected.
if list(feature_cols_test) != list(feature_cols_training):
    raise ValueError(
        "Test feature columns do not match the training feature columns; "
        "the training scaler cannot be applied to the test set."
    )

X_test = df_test[feature_cols_test].copy()
X_test_scaled = scaler.transform(X_test)

In [10]:
# Reconstruct DataFrame
# Put the scaled matrix back under its feature names and append the labels
# from df_test (same row order) as the last column.
X_scaled_df = pd.DataFrame(
    X_test_scaled, columns=feature_cols_test
).assign(target=df_test['target'].values)

# Save to file
X_scaled_df.to_csv("../data/test_set_scaled.csv", index=False)
print("Normalized benchmark data saved to: test_set_scaled.csv")

Normalized benchmark data saved to: test_set_scaled.csv
