In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import resample, shuffle
import pickle
import numpy as np

# Split Data 

In [None]:
def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Full feature table, followed by the two collections used to split it by
# 'Masked_PersonID' into train and test rows.
final_combined = pd.read_pickle('observation_10_prediction_10_endo.pkl')
train_pts = _read_pickle('train_pts.pkl')
test_pts = _read_pickle('test_pts.pkl')

In [None]:
# Rows with a missing outcome are labelled 0; the label column is then stored as int.
outcome_filled = final_combined['ovarian_ca'].fillna(0)
final_combined['ovarian_ca'] = outcome_filled.astype(int)

In [None]:
# Assign each row to a split according to whether its 'Masked_PersonID'
# appears in the pre-computed train / test ID collections.
person_ids = final_combined['Masked_PersonID']
in_train = person_ids.isin(train_pts)
in_test = person_ids.isin(test_pts)

train_data = final_combined.loc[in_train]
test_data = final_combined.loc[in_test]

## Downsample negative class to get original ovarian cancer prevalence of 0.003 (0.3%)

In [None]:
# Target share of positive cases after downsampling (0.3%, the original prevalence).
target_prevalence = 0.003
# Fixed seed so the sampled negatives and the final row order are identical on
# every Restart & Run All; without it each run produced a different training set
# (and therefore different medians and scaler parameters downstream).
random_seed = 42

pos = train_data.loc[train_data['ovarian_ca'] == 1].copy()
neg = train_data.loc[train_data['ovarian_ca'] == 0].copy()

# Find number of negative cases needed to make the ovarian cancer ratio = 0.003 (original prevalence)
# Solves n_pos / (n_pos + n_neg) = target_prevalence for n_neg, truncated to an integer.
neg_samples = int((len(pos.index) - target_prevalence*len(pos.index))/target_prevalence)
# Sampling is without replacement, so resample raises ValueError when fewer
# than neg_samples negative rows are available.
neg = resample(neg, replace=False, n_samples=neg_samples, random_state=random_seed)

# Recombine the classes and shuffle so positives are not grouped at the top.
train_data = pd.concat([pos, neg]).reset_index(drop=True)
train_data = shuffle(train_data, random_state=random_seed).reset_index(drop=True)

In [None]:
# Target share of positive cases after downsampling (0.3%, the original prevalence).
target_prevalence = 0.003
# Fixed seed so the sampled negatives and the final row order are identical on
# every Restart & Run All; without it the saved test set changed between runs.
random_seed = 42

pos = test_data.loc[test_data['ovarian_ca'] == 1].copy()
neg = test_data.loc[test_data['ovarian_ca'] == 0].copy()

# Find number of negative cases needed to make the ovarian cancer ratio = 0.003 (original prevalence)
# Solves n_pos / (n_pos + n_neg) = target_prevalence for n_neg, truncated to an integer.
neg_samples = int((len(pos.index) - target_prevalence*len(pos.index))/target_prevalence)
# Sampling is without replacement, so resample raises ValueError when fewer
# than neg_samples negative rows are available.
neg = resample(neg, replace=False, n_samples=neg_samples, random_state=random_seed)

# Recombine the classes and shuffle so positives are not grouped at the top.
test_data = pd.concat([pos, neg]).reset_index(drop=True)
test_data = shuffle(test_data, random_state=random_seed).reset_index(drop=True)

# Impute Data
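Impute missing values with the training-set median for continuous variables and with zero for categorical variables. The same training-set medians are reused when imputing the test set.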

In [None]:
# Continuous features are imputed with their median; every other column is
# treated as categorical, zero-filled and cast to int.
continuous_cols = ['BWgtGrams', 'MaAge', 'MotherBMI', 'age', 'BMI']
# Note: "every other column" also covers 'ovarian_ca', 'new_ID' and
# 'Masked_PersonID', so those are zero-filled and cast to int as well.
categorical_cols = [name for name in train_data.columns if name not in continuous_cols]

# Medians are computed on the training rows only.
medians = train_data[continuous_cols].median().to_dict()
categorical_types = {name: int for name in categorical_cols}

train_data = (
    train_data
    .fillna(medians)            # continuous columns -> training median
    .fillna(0)                  # whatever is still missing -> 0
    .astype(categorical_types)  # non-continuous columns -> int
)

In [None]:
# Same imputation as for the training rows, using the medians and the
# column -> int mapping derived from train_data.
test_data = (
    test_data
    .fillna(medians)
    .fillna(0)
    .astype(categorical_types)
)

# Scale Data

In [None]:
# Fit the min-max scaler on the training features only; the label and the two
# ID columns are excluded from the feature matrix.
fit_inputs = train_data.drop(columns=['ovarian_ca', 'new_ID', 'Masked_PersonID'])
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(fit_inputs)

In [None]:
# Labels as a NumPy array, features rescaled with the already-fitted scaler.
train_y = train_data['ovarian_ca'].values
train_features = train_data.drop(columns=['ovarian_ca', 'new_ID', 'Masked_PersonID'])
train_X = scaler.transform(train_features)

In [None]:
# Labels as a NumPy array, features rescaled with the already-fitted scaler
# (the scaler is only applied here, never re-fitted on test rows).
test_y = test_data['ovarian_ca'].values
test_features = test_data.drop(columns=['ovarian_ca', 'new_ID', 'Masked_PersonID'])
test_X = scaler.transform(test_features)

# Save Data

In [None]:
# Feature names in the column order of the scaled matrices: the same three
# non-feature columns are dropped as when the scaler was fitted.
feature_names = train_data.drop(columns=['ovarian_ca', 'new_ID', 'Masked_PersonID']).columns.tolist()

# (file name, object) pairs, written in this order.
artifacts = [
    ('X_columns.pkl', feature_names),
    ('medians.pkl', medians),
    ('scaler.pkl', scaler),
    ('train_X.pkl', train_X),
    ('train_y.pkl', train_y),
    ('test_X.pkl', test_X),
    ('test_y.pkl', test_y),
]

for filename, payload in artifacts:
    with open(filename, 'wb') as out_file:
        pickle.dump(payload, out_file)