# Split Data into Train - Val - Eval

Train set -> England 90%

Val set -> England 10% (internal validation)

Eval set -> Wales & Scotland (External validation)

In [None]:
#uncomment this below code to install imblearn package
# !pip install imbalanced-learn

In [2]:
# !pip install scikit-optimize

In [3]:
import pandas as pd
import numpy as np
import sklearn

#statistics
from scipy.stats import chi2_contingency, ttest_ind

import cudf #gpu-powered DataFrame (Pandas alternative)

#imbalance handling
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, RepeatedEditedNearestNeighbours
from imblearn.pipeline import Pipeline

#preprocessing
from sklearn import preprocessing
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split

#hyperparameter search
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args
from skopt import gp_minimize

#internal validation
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedStratifiedKFold, cross_val_score, GridSearchCV, PredefinedSplit, RandomizedSearchCV


#performance metrices
from sklearn.metrics import make_scorer, confusion_matrix, classification_report, f1_score, balanced_accuracy_score, r2_score, auc, average_precision_score, roc_auc_score, recall_score, roc_curve, accuracy_score

#Models selection
from sklearn.naive_bayes import GaussianNB, ComplementNB
from sklearn.linear_model import LogisticRegression, ElasticNet
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from cuml.svm import SVC #gpu-powered SVM


#save and load trained model
import pickle

#visualisation
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter

import os

In [5]:
# Data loader
# features = pd.read_csv("../FinalData/cleaned_features_2vs1_15112023.csv")
features = pickle.load(open('../FinalData/cleaned_features_01122023.sav', 'rb'))
# outcomes = pd.read_csv("../FinalData/cleaned_outcomes_2vs1_15112023.cs")
outcomes = pickle.load(open('../FinalData/cleaned_outcomes_01122023.sav', 'rb'))
features = features.drop_duplicates(subset=["patid"])
features.reset_index(inplace=True, drop=True)
outcomes.reset_index(inplace=True, drop=True)
# features = features[features.columns[1:]]
# outcomes = outcomes[outcomes.columns[1:]]

In [6]:
print(features.shape)
print(outcomes.shape)

(696659, 100)
(696659, 15)


In [7]:
features.columns

Index(['patid', 'practice_id', 'sex', 'age', 'BMI', 'weight', 'height',
       'ethnicity', 'ethnic_group', 'smokingStatus', 'CharlsonScore',
       'count_rhinitis', 'count_cardiovascular', 'count_heartfailure',
       'count_psoriasis', 'count_anaphylaxis', 'count_diabetes', 'count_ihd',
       'count_anxiety', 'count_eczema', 'count_nasalpolyps',
       'count_paracetamol', 'count_nsaids', 'count_betablocker', 'id',
       'event_date', 'recorded_date', 'visit_id', 'code_id', 'snomed_id',
       'numeric_1', 'numeric_2', 'created_datetime', 'updated_datetime',
       'PEFStatus', 'EosinophilLevel', 'BTS_step', 'average_daily_dose_ICS',
       'prescribed_daily_dose_ICS', 'ICS_medication_possesion_ratio',
       'DeviceType', 'Spacer', 'numOCS', 'PriorEducation', 'numPCS',
       'numPCSAsthma', 'numAntibioticsEvents', 'numAntibioticswithLRTI',
       'numOCSEvents', 'numOCSwithLRTI', 'numAsthmaAttacks',
       'numAcuteRespEvents', 'numHospEvents', 'month_12', 'month_4', 'month_5',


In [8]:
features.head()

Unnamed: 0,patid,practice_id,sex,age,BMI,weight,height,ethnicity,ethnic_group,smokingStatus,...,psoriasis,anaphylaxis,diabetes,ihd,anxiety,eczema,nasalpolyps,paracetamol,nsaids,betablocker
0,43231452,39,0.0,48,26.609713,76.0,1.69,not_recorded,not_recorded,Active Smoker,...,0,0,0,0,0,0,0,0,0,0
1,43206365,39,1.0,58,23.94636,72.5,1.74,not_recorded,not_recorded,Former Smoker,...,0,0,0,0,1,0,0,0,0,0
2,43203606,559,0.0,51,17.104513,39.0,1.51,not_recorded,not_recorded,Active Smoker,...,0,0,0,0,0,1,0,0,0,0
3,43117348,502,0.0,69,35.303241,74.0,1.4478,not_recorded,not_recorded,Former Smoker,...,1,0,0,0,0,1,0,0,0,0
4,43105858,536,0.0,16,28.731747,85.0,1.72,not_recorded,not_recorded,Non Smoker,...,0,0,0,0,0,0,0,0,0,0


In [9]:
outcomes.head()

Unnamed: 0,patid,outcome_3months,outcome_6months,outcome_9months,outcome_12months,outcome_15months,outcome_18months,outcome_21months,outcome_24months,outcome_combined_6months,outcome_combined_9months,outcome_combined_12months,outcome_combined_15months,outcome_combined_18months,outcome_combined_24months
0,43231452,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,43206365,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,43203606,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,43117348,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,43105858,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [10]:
outcomes.patid.unique().shape

(696659,)

In [11]:
masterData = features.merge(outcomes, how = 'inner', left_on='patid', right_on='patid') #join table
# masterData = masterData.dropna() #NAs from Country
masterData = masterData.reset_index(drop=True)
exclude_columns = ['weight', 'height', 'id', 'event_date', 'recorded_date', 'visit_id', 'code_id', 'snomed_id',
       'numeric_1', 'numeric_2', 'created_datetime', 'updated_datetime',]
masterData = masterData.loc[:,~masterData.columns.isin(exclude_columns)]
print('original data shape: ', masterData.shape)

original data shape:  (696659, 102)


In [12]:
# pd.options.display.max_rows = 287
masterData.isna().sum()

patid                        0
practice_id                  0
sex                          0
age                          0
BMI                          0
                            ..
outcome_combined_9months     0
outcome_combined_12months    0
outcome_combined_15months    0
outcome_combined_18months    0
outcome_combined_24months    0
Length: 102, dtype: int64

In [13]:
#Positive vs negative class ratio

# print('3 months -> 1 : ', round(masterData['3MonthsOutcome'].value_counts()[0]/masterData['3MonthsOutcome'].value_counts()[1],2))
# print('6 months -> 1 : ', round(masterData['6MonthsOutcome'].value_counts()[0]/masterData['6MonthsOutcome'].value_counts()[1],2))
# print('9 months -> 1 : ', round(masterData['9MonthsOutcome'].value_counts()[0]/masterData['9MonthsOutcome'].value_counts()[1],2))
# print('12 months -> 1 : ', round(masterData['12MonthsOutcome'].value_counts()[0]/masterData['12MonthsOutcome'].value_counts()[1],2))
print('3 months -> 1 : ', round(masterData.outcome_3months.value_counts()[0]/masterData.outcome_3months.value_counts()[1],2))
print('6 months -> 1 : ', round(masterData.outcome_combined_6months.value_counts()[0]/masterData.outcome_combined_6months.value_counts()[1],2))
print('9 months -> 1 : ', round(masterData.outcome_combined_9months.value_counts()[0]/masterData.outcome_combined_9months.value_counts()[1],2))
print('12 months -> 1 : ', round(masterData.outcome_combined_12months.value_counts()[0]/masterData.outcome_combined_12months.value_counts()[1],2))
print('15 months -> 1 : ', round(masterData.outcome_combined_15months.value_counts()[0]/masterData.outcome_combined_15months.value_counts()[1],2))
print('18 months -> 1 : ', round(masterData.outcome_combined_18months.value_counts()[0]/masterData.outcome_combined_18months.value_counts()[1],2))
print('24 months -> 1 : ', round(masterData.outcome_combined_24months.value_counts()[0]/masterData.outcome_combined_24months.value_counts()[1],2))


3 months -> 1 :  20.08
6 months -> 1 :  12.86
9 months -> 1 :  9.63
12 months -> 1 :  6.95
15 months -> 1 :  5.52
18 months -> 1 :  4.93
24 months -> 1 :  4.28


In [15]:
#Proportion of asthma attack in each outcome

# print('3 months -> ', round(masterData['3MonthsOutcome'].value_counts()[1]/len(masterData)*100,2), '%')
# print('6 months -> ', round(masterData['6MonthsOutcome'].value_counts()[1]/len(masterData)*100,2), '%')
# print('9 months -> ', round(masterData['9MonthsOutcome'].value_counts()[1]/len(masterData)*100,2), '%')
# print('12 months -> ', round(masterData['12MonthsOutcome'].value_counts()[1]/len(masterData)*100,2), '%')
print('3 months -> ', round(masterData['outcome_3months'].value_counts()[1]/len(masterData)*100,2), '%')
print('6 months -> ', round(masterData['outcome_combined_6months'].value_counts()[1]/len(masterData)*100,2), '%')
print('9 months -> ', round(masterData['outcome_combined_9months'].value_counts()[1]/len(masterData)*100,2), '%')
print('12 months -> ', round(masterData['outcome_combined_12months'].value_counts()[1]/len(masterData)*100,2), '%')
print('24 months -> ', round(masterData['outcome_combined_24months'].value_counts()[1]/len(masterData)*100,2), '%')

3 months ->  4.74 %
6 months ->  7.22 %
9 months ->  9.41 %
12 months ->  12.58 %
24 months ->  18.95 %


In [16]:
#Data scenario
# 1: all data without ethnicity variable
# 2: all data with ethnicity variable (include all missing values in ethnicity as separate group)
# 3: filter data based on ethnicity (exclude missing values)

scenario = 1 #change it based on the scenario

if scenario == 1:
    #include all data
    allData = masterData
    
elif scenario == 2:
    #Exclude ethnic column
    allData = masterData.drop('ethnic_group', axis=1)
    
elif scenario == 3:
    #exclude missing values for ethnic variable
    allData = masterData[masterData.ethnic_group!='not_recorded']
    
allData = allData.reset_index(drop=True)
print('Data shape for scenario', str(scenario), allData.shape)



Data shape for scenario 1 (696659, 102)


In [18]:
#onehot encoding for categorical variables
onehot_vars = ['ethnic_group','smokingStatus', 'DeviceType', 'cat_BMI', 'imd_decile', 'PEFStatus','EosinophilLevel', 'BTS_step', 'system']
allData = pd.get_dummies(allData, columns=onehot_vars)
allData.columns

  allData = pd.get_dummies(allData, columns=onehot_vars)


Index(['patid', 'practice_id', 'sex', 'age', 'BMI', 'ethnicity',
       'CharlsonScore', 'count_rhinitis', 'count_cardiovascular',
       'count_heartfailure',
       ...
       'EosinophilLevel_unknown', 'BTS_step_0.0', 'BTS_step_1.0',
       'BTS_step_2.0', 'BTS_step_3.0', 'BTS_step_4.0', 'BTS_step_5.0',
       'system_EMIS', 'system_SystemOne', 'system_Vision'],
      dtype='object', length=139)

In [28]:
#Split data into training, validation, and evaluation set based on the country. Include only 18+ patients.

trainingData = allData[(allData.Country == 'England')] #& (allData.age>=18)
trainingData, validationData = train_test_split(trainingData, test_size=0.2, stratify=trainingData['outcome_combined_12months'])
trainingData, internalEvaluationData = train_test_split(trainingData, test_size=0.2, stratify=trainingData['outcome_combined_12months'])

In [29]:
#Proportion of asthma attack in each outcome - training

# print('3 months -> ', round(trainingData['3MonthsOutcome'].value_counts()[1]/len(trainingData)*100,2), '%')
# print('6 months -> ', round(trainingData['6MonthsOutcome'].value_counts()[1]/len(trainingData)*100,2), '%')
# print('9 months -> ', round(trainingData['9MonthsOutcome'].value_counts()[1]/len(trainingData)*100,2), '%')
# print('12 months -> ', round(trainingData['12MonthsOutcome'].value_counts()[1]/len(trainingData)*100,2), '%')
print('3 months -> ', round(trainingData['outcome_3months'].value_counts()[1]/len(trainingData)*100,2), '%')
print('6 months -> ', round(trainingData['outcome_combined_6months'].value_counts()[1]/len(trainingData)*100,2), '%')
print('9 months -> ', round(trainingData['outcome_combined_9months'].value_counts()[1]/len(trainingData)*100,2), '%')
print('12 months -> ', round(trainingData['outcome_combined_12months'].value_counts()[1]/len(trainingData)*100,2), '%')
print('24 months -> ', round(trainingData['outcome_combined_24months'].value_counts()[1]/len(trainingData)*100,2), '%')

3 months ->  4.69 %
6 months ->  7.12 %
9 months ->  9.3 %
12 months ->  12.41 %
24 months ->  18.71 %


In [30]:
#Proportion of asthma attack in each outcome - validation

# print('3 months -> ', round(validationData['3MonthsOutcome'].value_counts()[1]/len(validationData)*100,2), '%')
# print('6 months -> ', round(validationData['6MonthsOutcome'].value_counts()[1]/len(validationData)*100,2), '%')
# print('9 months -> ', round(validationData['9MonthsOutcome'].value_counts()[1]/len(validationData)*100,2), '%')
# print('12 months -> ', round(validationData['12MonthsOutcome'].value_counts()[1]/len(validationData)*100,2), '%')
print('3 months -> ', round(validationData['outcome_3months'].value_counts()[1]/len(validationData)*100,2), '%')
print('6 months -> ', round(validationData['outcome_combined_6months'].value_counts()[1]/len(validationData)*100,2), '%')
print('9 months -> ', round(validationData['outcome_combined_9months'].value_counts()[1]/len(validationData)*100,2), '%')
print('12 months -> ', round(validationData['outcome_combined_12months'].value_counts()[1]/len(validationData)*100,2), '%')
print('24 months -> ', round(validationData['outcome_combined_24months'].value_counts()[1]/len(validationData)*100,2), '%')

3 months ->  4.65 %
6 months ->  7.03 %
9 months ->  9.19 %
12 months ->  12.41 %
24 months ->  18.66 %


In [31]:
#Split data into training and evaluation set based on the country. Include only 18+ patients.

evaluationData = allData[((allData.Country == 'Scotland') | (allData.Country == 'Wales')) ] #used for validation & (allData.age>=18)
evaluationDataWales = allData[(allData.Country == 'Wales') ] #used for validation & (allData.age>=18)
evaluationDataScotland = allData[(allData.Country == 'Scotland') ] #used for validation & (allData.age>=18)

#remove country variable
trainingData = trainingData.drop('Country', axis=1)
validationData = validationData.drop('Country', axis=1)
internalEvaluationData = internalEvaluationData.drop('Country', axis=1)
evaluationData = evaluationData.drop('Country', axis=1)
evaluationDataWales = evaluationDataWales.drop('Country', axis=1)
evaluationDataScotland = evaluationDataScotland.drop('Country', axis=1)

trainingData = trainingData.reset_index(drop=True)
validationData = validationData.reset_index(drop=True)
internalEvaluationData = internalEvaluationData.reset_index(drop=True)
evaluationData = evaluationData.reset_index(drop=True)
evaluationDataWales = evaluationDataWales.reset_index(drop=True)
evaluationDataScotland = evaluationDataScotland.reset_index(drop=True)

print('Training data shape:', trainingData.shape)
print('Validation data shape:', validationData.shape)
print('Internal Evaluation data shape:', internalEvaluationData.shape)
print('Evaluation data shape: ', evaluationData.shape)
print('Evaluation data Wales shape: ', evaluationDataWales.shape)
print('Evaluation data Scotland shape: ', evaluationDataScotland.shape)

Training data shape: (429177, 138)
Validation data shape: (134119, 138)
Internal Evaluation data shape: (107295, 138)
Evaluation data shape:  (26068, 138)
Evaluation data Wales shape:  (16739, 138)
Evaluation data Scotland shape:  (9329, 138)


In [32]:
#Save original data
# trainingData.to_csv('../FinalData/trainingData_2vs1_16112023.csv', index_label=False, index=False)
# validationData.to_csv('../FinalData/validationData_2vs1_16112023.csv', index_label=False, index=False)
# internalEvaluationData.to_csv('../FinalData/internalEvalulationData_2vs1_16112023.csv', index_label=False, index=False)
# evaluationData.to_csv('../FinalData/evaluationData_2vs1_16112023.csv', index_label=False, index=False)
# evaluationDataWales.to_csv('../FinalData/evaluationDataWales_2vs1_16112023.csv', index_label=False, index=False)
# evaluationDataScotland.to_csv('../FinalData/evaluationDataScotland_2vs1_16112023.csv', index_label=False, index=False)
sets = [trainingData, validationData, internalEvaluationData, evaluationData, evaluationDataWales, evaluationDataScotland]
pickle.dump(sets, open('../FinalData/dataset_01122023.sav', 'wb'))

In [None]:
# #encode categorical data
# # 
# # cat_vars = [, 'PEFStatus','EosinophilLevel']
# # data_categorical = trainingData[cat_vars]
# onehot_vars = ['ethnic_group','smokingStatus', 'DeviceType', 'cat_BMI', 'imd_decile', 'PEFStatus','EosinophilLevel', 'BTS_step']
# data_onehot = trainingData[onehot_vars]

# #ordinal encoder
# # encoder = OrdinalEncoder(categories=[['not_recorded','less than 60', '60-80', 'more than 80'], ['unknown', 'normal', 'high']]).set_output(transform="pandas")
# # data_encoded = encoder.fit_transform(data_categorical)
# # pickle.dump(encoder, open('../Models/cat_encoder.pkl', 'wb'))
    
# #one hot encoder
# onehot_encoder = OneHotEncoder(sparse_output=False).set_output(transform="pandas")
# onehot_encoded = onehot_encoder.fit_transform(data_onehot)
# pickle.dump(onehot_encoder, open('../Models/onehot_encoder.pkl', 'wb'))

# # trainingData = pd.concat([trainingData.drop(cat_vars, axis=1), data_encoded], axis=1)
# trainingData = pd.concat([trainingData.drop(onehot_vars, axis=1), onehot_encoded], axis=1)

# print('Data shape after encoding: ', trainingData.shape)

In [38]:
# #encode cat vars for validation and evaluation set
# # data_val_categorical = validationData[cat_vars]
# data_val_onehot = validationData[onehot_vars]
# data_internaleval_onehot = internalEvaluationData[onehot_vars]
# # data_eval_categorical = evaluationData[cat_vars]
# data_eval_onehot = evaluationData[onehot_vars]
# # data_eval_Wales_categorical = evaluationDataWales[cat_vars]
# data_eval_Wales_onehot = evaluationDataWales[onehot_vars]
# # data_eval_Scotland_categorical = evaluationDataScotland[cat_vars]
# data_eval_Scotland_onehot = evaluationDataScotland[onehot_vars]

# # encoder = pickle.load(open('../Models/cat_encoder.pkl', 'rb'))
# # data_val_encoded = encoder.transform(data_val_categorical)
# # data_eval_encoded = encoder.transform(data_eval_categorical)
# # data_eval_Wales_encoded = encoder.transform(data_eval_Wales_categorical)
# # data_eval_Scotland_encoded = encoder.transform(data_eval_Scotland_categorical)

# onehot_encoder = pickle.load(open('../Models/onehot_encoder.pkl', 'rb'))
# onehot_val_encoded = onehot_encoder.transform(data_val_onehot)
# onehot_internaleval_encoded = onehot_encoder.transform(data_internaleval_onehot)
# onehot_eval_encoded = onehot_encoder.transform(data_eval_onehot)
# onehot_eval_Wales_encoded = onehot_encoder.transform(data_eval_Wales_onehot)
# onehot_eval_Scotland_encoded = onehot_encoder.transform(data_eval_Scotland_onehot)

# # validationData = pd.concat([validationData.drop(cat_vars, axis=1), data_val_encoded], axis=1)
# validationData = pd.concat([validationData.drop(onehot_vars, axis=1), onehot_val_encoded], axis=1)

# # evaluationData = pd.concat([evaluationData.drop(cat_vars, axis=1), data_eval_encoded], axis=1)
# internalEvaluationData = pd.concat([internalEvaluationData.drop(onehot_vars, axis=1), onehot_internaleval_encoded], axis=1)

# # evaluationData = pd.concat([evaluationData.drop(cat_vars, axis=1), data_eval_encoded], axis=1)
# evaluationData = pd.concat([evaluationData.drop(onehot_vars, axis=1), onehot_eval_encoded], axis=1)

# # evaluationDataWales = pd.concat([evaluationDataWales.drop(cat_vars, axis=1), data_eval_Wales_encoded], axis=1)
# evaluationDataWales = pd.concat([evaluationDataWales.drop(onehot_vars, axis=1), onehot_eval_Wales_encoded], axis=1)

# # evaluationDataScotland = pd.concat([evaluationDataScotland.drop(cat_vars, axis=1), data_eval_Scotland_encoded], axis=1)
# evaluationDataScotland = pd.concat([evaluationDataScotland.drop(onehot_vars, axis=1), onehot_eval_Scotland_encoded], axis=1)

print('Val data shape after encoding: ', validationData.shape)
print('Internal Eval data shape after encoding: ', internalEvaluationData.shape)
print('Eval data shape after encoding: ', evaluationData.shape)
print('Evaluation data Wales shape: ', evaluationDataWales.shape)
print('Evaluation data Scotland shape: ', evaluationDataScotland.shape)

Val data shape after encoding:  (134119, 138)
Internal Eval data shape after encoding:  (107295, 138)
Eval data shape after encoding:  (26068, 138)
Evaluation data Wales shape:  (16739, 138)
Evaluation data Scotland shape:  (9329, 138)


In [39]:
#Data normalisation for continous variable into 0-1 range


continuous_vars = ['age', 'CharlsonScore', 'average_daily_dose_ICS', 'prescribed_daily_dose_ICS', 'ICS_medication_possesion_ratio', 
                   'numOCS', 'numPCS', 'numPCSAsthma', 'numAntibioticsEvents', 'numAntibioticswithLRTI', 'numOCSEvents', 'numOCSwithLRTI', 
                   'numAsthmaAttacks', 'numAcuteRespEvents', 'numHospEvents']

# define scaler
scaler = StandardScaler()
data_scaled = scaler.fit_transform(trainingData[continuous_vars])
pickle.dump(scaler, open('../Models/cont_scaler.pkl', 'wb'))


data_scaled = pd.DataFrame(data_scaled, columns=scaler.get_feature_names_out())
trainingData = pd.concat([trainingData.drop(continuous_vars, axis=1), data_scaled], axis=1)

print('Data shape after scaling: ', trainingData.shape)

Data shape after scaling:  (429177, 138)


In [40]:
#Normalisation for evaluation data

scaler = pickle.load(open('../Models/cont_scaler.pkl', 'rb'))
data_val_scaled = scaler.transform(validationData[continuous_vars])
data_eval_scaled = scaler.transform(evaluationData[continuous_vars])
data_eval_Wales_scaled = scaler.transform(evaluationDataWales[continuous_vars])
data_eval_Scotland_scaled = scaler.transform(evaluationDataScotland[continuous_vars])

data_val_scaled = pd.DataFrame(data_val_scaled, columns=scaler.get_feature_names_out())
validationData = pd.concat([validationData.drop(continuous_vars, axis=1), data_val_scaled], axis=1)

data_eval_scaled = pd.DataFrame(data_eval_scaled, columns=scaler.get_feature_names_out())
evaluationData = pd.concat([evaluationData.drop(continuous_vars, axis=1), data_eval_scaled], axis=1)

data_eval_Wales_scaled = pd.DataFrame(data_eval_Wales_scaled, columns=scaler.get_feature_names_out())
evaluationDataWales = pd.concat([evaluationDataWales.drop(continuous_vars, axis=1), data_eval_Wales_scaled], axis=1)

data_eval_Scotland_scaled = pd.DataFrame(data_eval_Scotland_scaled, columns=scaler.get_feature_names_out())
evaluationDataScotland = pd.concat([evaluationDataScotland.drop(continuous_vars, axis=1), data_eval_Scotland_scaled], axis=1)

print('Val data shape after scaling: ', validationData.shape)
print('Eval data shape after scaling: ', evaluationData.shape)
print('Evaluation data Wales shape: ', evaluationDataWales.shape)
print('Evaluation data Scotland shape: ', evaluationDataScotland.shape)

Val data shape after scaling:  (134119, 138)
Eval data shape after scaling:  (26068, 138)
Evaluation data Wales shape:  (16739, 138)
Evaluation data Scotland shape:  (9329, 138)


In [41]:
#Define feature candidates

features_columns = trainingData.columns.to_list()
exclude_columns = ['patid', 'practice_id', #identifier
                   'BMI', #use the categorical instead
                   'ethnicity', #use ethnic_group instead
                   'Spacer',  #all zero
                   
                   'outcome_3months', 'outcome_6months', 'outcome_9months', 'outcome_12months', 'outcome_15months', 'outcome_18months', 
                   'outcome_21months', 'outcome_24months', 'outcome_combined_6months', 'outcome_combined_9months', 'outcome_combined_12months', 
                   'outcome_combined_15months', 'outcome_combined_18months', 'outcome_combined_24months', '3months', '6months', '12months', '24months', #outcomes variable
                   
                   'postcode_district', 'County', 'LocalAuthority', 'OutputAreaClassification', #location related variables, use IMD decile only
                   
                   'cat_age', 'cat_average_daily_dose_ICS', 'cat_prescribed_daily_dose_ICS', 'cat_ICS_medication_possesion_ratio', 'cat_numOCS', 'cat_numOCSEvents', 
                   'cat_numOCSwithLRTI', 'cat_numAcuteRespEvents', 'cat_numAntibioticsEvents', 'cat_numAntibioticswithLRTI', 'cat_numAsthmaAttacks', 'cat_numHospEvents', 
                   'cat_numPCS', 'cat_numPCSAsthma', #use continous vars instead
                   
                   'count_rhinitis', 'count_cardiovascular', 'count_heartfailure',
                   'count_psoriasis', 'count_anaphylaxis', 'count_diabetes', 'count_ihd',
                   'count_anxiety', 'count_eczema', 'count_nasalpolyps',
                   'count_paracetamol', 'count_nsaids', 'count_betablocker', #use binary ones
                   
                   'paracetamol', 'nsaids', 'betablocker', #no data in evaluation
                   
                   'numOCSEvents', #duplicate with numOCS
                  ]
exclude_columns = exclude_columns + [x for x in features_columns if '_count' in x] #filter out commorbid count variables
features_columns = [x for x in features_columns if x not in exclude_columns]
print('Features size: ', len(features_columns))
print(features_columns)

Features size:  84
['sex', 'PriorEducation', 'month_12', 'month_4', 'month_5', 'month_10', 'month_1', 'month_6', 'month_3', 'month_11', 'month_8', 'month_9', 'month_7', 'month_2', 'rhinitis', 'cardiovascular', 'heartfailure', 'psoriasis', 'anaphylaxis', 'diabetes', 'ihd', 'anxiety', 'eczema', 'nasalpolyps', 'ethnic_group_Asian - ethnic group', 'ethnic_group_Black - ethnic group', 'ethnic_group_Mixed ethnic census group', 'ethnic_group_Other ethnic group', 'ethnic_group_White - ethnic group', 'ethnic_group_not_recorded', 'smokingStatus_Active Smoker', 'smokingStatus_Former Smoker', 'smokingStatus_Non Smoker', 'DeviceType_BAI', 'DeviceType_DPI', 'DeviceType_NEB', 'DeviceType_pMDI', 'DeviceType_unknown', 'cat_BMI_normal', 'cat_BMI_not recorded', 'cat_BMI_obese', 'cat_BMI_overweight', 'cat_BMI_underweight', 'imd_decile_0', 'imd_decile_1', 'imd_decile_2', 'imd_decile_3', 'imd_decile_4', 'imd_decile_5', 'imd_decile_6', 'imd_decile_7', 'imd_decile_8', 'imd_decile_9', 'imd_decile_10', 'PEFStat

In [43]:
#make sure no data leak between sets
print(list(set(trainingData.patid.values).intersection(set(validationData.patid.values))))
print(list(set(trainingData.patid.values).intersection(set(internalEvaluationData.patid.values))))
print(list(set(validationData.patid.values).intersection(set(internalEvaluationData.patid.values))))
print(list(set(validationData.patid.values).intersection(set(evaluationData.patid.values))))
print(list(set(trainingData.patid.values).intersection(set(evaluationData.patid.values))))
print(len(list(set(evaluationData.patid.values).intersection(set(evaluationDataScotland.patid.values))))) # here data leak is expected)

[]
[]
[]
[]
[]
9329


In [45]:
#Save encoded data
# trainingData.to_csv('../FinalData/trainingDataEncoded_2vs1_16112023.csv', index_label=False, index=False)
# validationData.to_csv('../FinalData/validationDataEncoded_2vs1_16112023.csv', index_label=False, index=False)
# internalEvaluationData.to_csv('../FinalData/internalEvaluationDataEncoded_2vs1_16112023.csv', index_label=False, index=False)
# evaluationData.to_csv('../FinalData/evaluationDataEncoded_2vs1_16112023.csv', index_label=False, index=False)
# evaluationDataWales.to_csv('../FinalData/evaluationDataWalesEncoded_2vs1_16112023.csv', index_label=False, index=False)
# evaluationDataScotland.to_csv('../FinalData/evaluationDataScotlandEncoded_2vs1_16112023.csv', index_label=False, index=False)
scaled_sets = [trainingData, validationData, internalEvaluationData, evaluationData, evaluationDataWales, evaluationDataScotland]
pickle.dump(sets, open('../FinalData/dataset_scaled_01122023.sav', 'wb'))