# Split Data into Train - Val - Eval; For Ordinal Variables (BTS, Charlson, etc)

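Grid search set -> England, 5% of the training data (hyperparameter search; `train_size=0.05`)

Train (cross-validation) set -> England, ~75% of the training data

Internal val set -> England, ~20% of the training data (internal validation; `test_size=0.215` of the remaining 95%)

External val set -> Wales & Scotland (external validation)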

In [None]:
# Uncomment the line below to install the imbalanced-learn (imblearn) package
# !pip install imbalanced-learn

In [None]:
# !pip install scikit-optimize

In [1]:
import pandas as pd
import numpy as np
import sklearn

#statistics
from scipy.stats import chi2_contingency, ttest_ind

# import cudf #gpu-powered DataFrame (Pandas alternative)

#imbalance handling
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, RepeatedEditedNearestNeighbours
from imblearn.pipeline import Pipeline

#preprocessing
from sklearn import preprocessing
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder, MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split

#hyperparameter search
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args
from skopt import gp_minimize

#internal validation
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedStratifiedKFold, cross_val_score, GridSearchCV, PredefinedSplit, RandomizedSearchCV


#performance metrices
from sklearn.metrics import make_scorer, confusion_matrix, classification_report, f1_score, balanced_accuracy_score, r2_score, auc, average_precision_score, roc_auc_score, recall_score, roc_curve, accuracy_score

#Models selection
from sklearn.naive_bayes import GaussianNB, ComplementNB
from sklearn.linear_model import LogisticRegression, ElasticNet
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
# from cuml.svm import SVC #gpu-powered SVM


#save and load trained model
import pickle

#visualisation
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter

import os

# Seed passed to the train_test_split calls below so the data splits are reproducible.
random_state = 42

In [5]:
# Data loader
# Read the cleaned feature and outcome tables, keep the first row seen for each
# patid, and renumber the rows from 0.
# (Earlier versions of this cell loaded pickled .sav files from ../../FinalData instead.)
features = (
    pd.read_csv("../../Clean_data/cleaned_features_24062025.csv")
    .drop_duplicates(subset=["patid"])
    .reset_index(drop=True)
)
outcomes = (
    pd.read_csv("../../Clean_data/cleaned_outcomes_24062025.csv")
    .drop_duplicates(subset=["patid"])
    .reset_index(drop=True)
)

  features = pd.read_csv("../../Clean_data/cleaned_features_24062025.csv")


In [6]:
# Report (rows, columns) for the two loaded tables: features first, then outcomes.
for table in (features, outcomes):
    print(table.shape)

(610470, 71)
(610470, 5)


In [7]:
# Binary flag: 0 where numAsthmaManagement equals 0, otherwise 1.
features['asthmaPlan'] = features['numAsthmaManagement'].ne(0).astype('int64')

In [8]:
# List the column labels of the features table.
features.columns

Index(['patid', 'set', 'sex', 'age', 'BMI', 'ethnic_group', 'smokingStatus',
       'imd_decile', 'CharlsonScore', 'count_rhinitis', 'count_cardiovascular',
       'count_heartfailure', 'count_psoriasis', 'count_anaphylaxis',
       'count_diabetes', 'count_ihd', 'count_anxiety', 'count_eczema',
       'count_nasalpolyps', 'count_paracetamol', 'count_nsaids',
       'count_betablocker', 'PEFStatus', 'EosinophilLevel', 'BTS_step',
       'DeviceType', 'PriorEducation', 'average_daily_dose_ICS',
       'prescribed_daily_dose_ICS', 'ICS_medication_possesion_ratio', 'numPCS',
       'numPCSAsthma', 'numAntibioticsEvents', 'numAntibioticswithLRTI',
       'numOCSEvents', 'numOCSwithLRTI', 'numAsthmaAttacks',
       'numAcuteRespEvents', 'numHospEvents', 'numAsthmaManagement',
       'numAsthmaReview', 'BMI_cat', 'age_cat', 'average_daily_dose_ICS_cat',
       'prescribed_daily_dose_ICS_cat', 'ICS_medication_possesion_ratio_cat',
       'numOCS_cat', 'numOCSEvents_cat', 'numOCSwithLRTI_cat',

In [9]:
# Preview the first rows of the features table.
features.head()

Unnamed: 0,patid,set,sex,age,BMI,ethnic_group,smokingStatus,imd_decile,CharlsonScore,count_rhinitis,...,anaphylaxis,diabetes,ihd,anxiety,eczema,nasalpolyps,paracetamol,nsaids,betablocker,asthmaPlan
0,43231452,training,0,48.0,0.0,not recorded,never,8,1.0,0.0,...,0,0,0,0,0,0,0,0,0,1
1,43206365,training,1,58.0,0.0,not recorded,never,8,1.0,0.0,...,0,0,0,0,0,0,0,0,0,1
2,43203606,training,0,51.0,0.0,not recorded,current,6,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,43117348,training,0,69.0,35.3,not recorded,never,9,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,43105858,training,0,16.0,27.0,not recorded,never,3,1.0,0.0,...,0,0,0,0,0,0,0,0,0,1


In [10]:
# Preview the first rows of the outcomes table.
outcomes.head()

Unnamed: 0,patid,outcome_3months,outcome_6months,outcome_9months,outcome_12months
0,43231452,0,0,0,0
1,43206365,0,0,0,0
2,43203606,0,0,0,0
3,43117348,0,0,0,0
4,43105858,0,0,0,0


In [11]:
# Join features to outcomes on patid, keeping only patients present in both tables.
# (A dropna() step for missing Country values used to follow the merge; it is disabled.)
masterData = pd.merge(features, outcomes, how='inner', on='patid').reset_index(drop=True)

# Columns removed from the merged table whenever they are present.
exclude_columns = [
    'weight', 'height', 'id', 'event_date', 'recorded_date', 'visit_id',
    'code_id', 'snomed_id', 'numeric_1', 'numeric_2',
    'created_datetime', 'updated_datetime',
]
keep_column = ~masterData.columns.isin(exclude_columns)
masterData = masterData.loc[:, keep_column]
print('original data shape: ', masterData.shape)

original data shape:  (610470, 76)


In [12]:
# Raise the row display cap, then count the missing values in every column.
pd.set_option('display.max_rows', 287)
masterData.isna().sum()

patid                                 0
set                                   0
sex                                   0
age                                   0
BMI                                   0
ethnic_group                          0
smokingStatus                         0
imd_decile                            0
CharlsonScore                         0
count_rhinitis                        0
count_cardiovascular                  0
count_heartfailure                    0
count_psoriasis                       0
count_anaphylaxis                     0
count_diabetes                        0
count_ihd                             0
count_anxiety                         0
count_eczema                          0
count_nasalpolyps                     0
count_paracetamol                     0
count_nsaids                          0
count_betablocker                     0
PEFStatus                             0
EosinophilLevel                       0
BTS_step                              0


In [13]:
#Positive vs negative class ratio
# For every outcome horizon, print how many rows with value 0 exist per row with value 1.
for horizon in (3, 6, 9, 12):
    outcome_counts = masterData[f'outcome_{horizon}months'].value_counts()
    print(f'{horizon} months -> 1 : ', round(outcome_counts[0] / outcome_counts[1], 2))


3 months -> 1 :  20.3
6 months -> 1 :  13.05
9 months -> 1 :  10.1
12 months -> 1 :  7.59


In [11]:
#Proportion of asthma attack in each outcome
# Share of rows (in percent) whose outcome value is 1, per horizon.
for horizon in (3, 6, 9, 12):
    positive_share = masterData[f'outcome_{horizon}months'].value_counts(normalize=True)[1]
    print(f'{horizon} months -> ', round(positive_share * 100, 2), '%')

3 months ->  4.69 %
6 months ->  7.11 %
9 months ->  9.0 %
12 months ->  11.64 %


# Start HERE

In [14]:
#Data scenario
# 1: all data with ethnicity variable (missing ethnicity is kept as its own group)
# 2: all data without ethnicity variable
# 3: only patients with a recorded ethnicity (missing values excluded)

scenario = 1 #change it based on the scenario

# Label stored in ethnic_group when no ethnicity was recorded. It is written with a
# space (see the features.head() preview), not an underscore; filtering on
# 'not_recorded' would match no row and leave scenario 3 identical to scenario 1.
missing_ethnicity_label = 'not recorded'

if scenario == 1:
    #include all data
    allData = masterData

elif scenario == 2:
    #Exclude ethnic column
    allData = masterData.drop('ethnic_group', axis=1)

elif scenario == 3:
    #exclude missing values for ethnic variable
    allData = masterData[masterData.ethnic_group != missing_ethnicity_label]

else:
    # Fail loudly instead of silently reusing an allData left over from an earlier run.
    raise ValueError(f'Unknown scenario {scenario!r}; expected 1, 2 or 3')

allData = allData.reset_index(drop=True)
print('Data shape for scenario', str(scenario), allData.shape)



Data shape for scenario 1 (610470, 76)


In [13]:
# List the column labels of the table selected for the chosen scenario.
allData.columns

Index(['patid', 'set', 'sex', 'age', 'BMI', 'ethnic_group', 'smokingStatus',
       'imd_decile', 'CharlsonScore', 'count_rhinitis', 'count_cardiovascular',
       'count_heartfailure', 'count_psoriasis', 'count_anaphylaxis',
       'count_diabetes', 'count_ihd', 'count_anxiety', 'count_eczema',
       'count_nasalpolyps', 'count_paracetamol', 'count_nsaids',
       'count_betablocker', 'PEFStatus', 'EosinophilLevel', 'BTS_step',
       'DeviceType', 'PriorEducation', 'average_daily_dose_ICS',
       'prescribed_daily_dose_ICS', 'ICS_medication_possesion_ratio', 'numPCS',
       'numPCSAsthma', 'numAntibioticsEvents', 'numAntibioticswithLRTI',
       'numOCSEvents', 'numOCSwithLRTI', 'numAsthmaAttacks',
       'numAcuteRespEvents', 'numHospEvents', 'numAsthmaManagement',
       'numAsthmaReview', 'numAsthmaMedReview', 'numAsthmaReviewRCP',
       'BMI_cat', 'age_cat', 'average_daily_dose_ICS_cat',
       'prescribed_daily_dose_ICS_cat', 'ICS_medication_possesion_ratio_cat',
       'num

In [15]:
# Candidate predictor columns, flattened from small groups so the list is easier to scan.
# The flattened order is the order in which the groups (and their members) are written.
features_candidate = [
    column
    for group in (
        ['sex', 'age', 'BMI', 'ethnic_group', 'smokingStatus', 'imd_decile', 'CharlsonScore'],
        # count_* columns
        ['count_rhinitis', 'count_cardiovascular', 'count_heartfailure', 'count_psoriasis',
         'count_anaphylaxis', 'count_diabetes', 'count_ihd', 'count_anxiety', 'count_eczema',
         'count_nasalpolyps', 'count_paracetamol', 'count_nsaids', 'count_betablocker'],
        ['PEFStatus', 'EosinophilLevel', 'BTS_step', 'DeviceType', 'PriorEducation'],
        # ICS columns
        ['average_daily_dose_ICS', 'prescribed_daily_dose_ICS', 'ICS_medication_possesion_ratio'],
        # num* columns
        ['numPCS', 'numPCSAsthma', 'numAntibioticsEvents', 'numAntibioticswithLRTI',
         'numOCSEvents', 'numOCSwithLRTI', 'numAsthmaAttacks', 'numAcuteRespEvents',
         'numHospEvents', 'numAsthmaManagement', 'numAsthmaReview', 'numAsthmaMedReview',
         'numAsthmaReviewRCP'],
    )
    for column in group
]
len(features_candidate)

41

In [16]:
#Split training set into grid search, cross val, and internal validation
# Both splits are stratified on the 12-month outcome and use the shared random_state.
# 5% of the training rows form the grid-search set; 21.5% of the remaining 95%
# (about 20.4% of the training rows) form the internal evaluation set, which leaves
# about 74.6% of the training rows for cross-validation.
is_training = allData['set'] == 'training'  #& (allData.age>=18)
trainingData = allData[is_training]

gridSearchData, remainingData = train_test_split(
    trainingData,
    train_size=0.05,
    stratify=trainingData['outcome_12months'],
    random_state=random_state,
)
crossValData, internalEvaluationData = train_test_split(
    remainingData,
    test_size=0.215,
    stratify=remainingData['outcome_12months'],
    random_state=random_state,
)

gridSearchData = gridSearchData.reset_index(drop=True)
crossValData = crossValData.reset_index(drop=True)
internalEvaluationData = internalEvaluationData.reset_index(drop=True)

# Rows flagged 'evaluation' make up the external set: one row per patid, renumbered from 0.
externalEvaluationData = (
    allData[allData['set'] == 'evaluation']
    .drop_duplicates('patid')
    .reset_index(drop=True)
)


In [18]:
#save data before onehot encoding and scaling
# Order in the pickled list: grid search, cross-validation, internal evaluation, external evaluation.
sets = [gridSearchData, crossValData, internalEvaluationData, externalEvaluationData]
# The context manager closes (and therefore flushes) the file even if pickling raises.
with open('../../Clean_data/dataset_2vs1_24062025.sav', 'wb') as sets_file:
    pickle.dump(sets, sets_file)

# One Hot Encoding

In [19]:
#onehot encoding for categorical variables
# Only columns still present in allData are encoded. Scenario 2 drops 'ethnic_group',
# and a re-run of this cell finds the columns already replaced by their dummies; in
# both situations passing the full list to pd.get_dummies would raise a KeyError.
onehot_vars = [
    column
    for column in ['ethnic_group', 'smokingStatus', 'DeviceType', 'PriorEducation']
    if column in allData.columns
]

# Skip the call entirely when nothing is left to encode.
if onehot_vars:
    allData = pd.get_dummies(allData, columns=onehot_vars, dtype='int')
allData.columns

Index(['patid', 'set', 'sex', 'age', 'BMI', 'imd_decile', 'CharlsonScore',
       'count_rhinitis', 'count_cardiovascular', 'count_heartfailure',
       'count_psoriasis', 'count_anaphylaxis', 'count_diabetes', 'count_ihd',
       'count_anxiety', 'count_eczema', 'count_nasalpolyps',
       'count_paracetamol', 'count_nsaids', 'count_betablocker', 'PEFStatus',
       'EosinophilLevel', 'BTS_step', 'average_daily_dose_ICS',
       'prescribed_daily_dose_ICS', 'ICS_medication_possesion_ratio', 'numPCS',
       'numPCSAsthma', 'numAntibioticsEvents', 'numAntibioticswithLRTI',
       'numOCSEvents', 'numOCSwithLRTI', 'numAsthmaAttacks',
       'numAcuteRespEvents', 'numHospEvents', 'numAsthmaManagement',
       'numAsthmaReview', 'BMI_cat', 'age_cat', 'average_daily_dose_ICS_cat',
       'prescribed_daily_dose_ICS_cat', 'ICS_medication_possesion_ratio_cat',
       'numOCS_cat', 'numOCSEvents_cat', 'numOCSwithLRTI_cat',
       'numAcuteRespEvents_cat', 'numAntibioticsEvents_cat',
       'nu

In [20]:
# Swap every split for the matching rows of the one-hot encoded allData: the split
# contributes only its patid column, the inner merge selects those patients from
# allData, and the row index is renumbered from 0.
gridSearchData, crossValData, internalEvaluationData, externalEvaluationData = [
    allData.merge(split[['patid']], on='patid', how='inner', sort=False).reset_index(drop=True)
    for split in (gridSearchData, crossValData, internalEvaluationData, externalEvaluationData)
]

In [21]:
#Proportion of asthma attack in each outcome - grid search set
# Percentage of grid-search rows whose outcome value is 1, per horizon.
for horizon in (3, 6, 9, 12):
    positives = gridSearchData[f'outcome_{horizon}months'].value_counts()[1]
    print(f'{horizon} months -> ', round(positives / len(gridSearchData) * 100, 2), '%')

3 months ->  4.75 %
6 months ->  7.02 %
9 months ->  8.85 %
12 months ->  11.52 %


In [22]:
#Proportion of asthma attack in each outcome - cross-validation set
# Percentage of cross-validation rows whose outcome value is 1, per horizon.
for horizon in (3, 6, 9, 12):
    positives = crossValData[f'outcome_{horizon}months'].value_counts()[1]
    print(f'{horizon} months -> ', round(positives / len(crossValData) * 100, 2), '%')

3 months ->  4.64 %
6 months ->  7.04 %
9 months ->  8.92 %
12 months ->  11.52 %


In [23]:
#Proportion of asthma attack in each outcome - internalEvaluation
# Percentage of internal-evaluation rows whose outcome value is 1, per horizon.
for horizon in (3, 6, 9, 12):
    positives = internalEvaluationData[f'outcome_{horizon}months'].value_counts()[1]
    print(f'{horizon} months -> ', round(positives / len(internalEvaluationData) * 100, 2), '%')

3 months ->  4.65 %
6 months ->  7.02 %
9 months ->  8.87 %
12 months ->  11.52 %


In [24]:

#Proportion of asthma attack in each outcome - externalEvaluation
# Percentage of external-evaluation rows whose outcome value is 1, per horizon.
for horizon in (3, 6, 9, 12):
    positives = externalEvaluationData[f'outcome_{horizon}months'].value_counts()[1]
    print(f'{horizon} months -> ', round(positives / len(externalEvaluationData) * 100, 2), '%')

3 months ->  6.12 %
6 months ->  9.57 %
9 months ->  12.07 %
12 months ->  15.51 %


In [24]:
# Print (rows, columns) of every split after one-hot encoding.
for label, split in (
    ('Grid search', gridSearchData),
    ('Cross val', crossValData),
    ('Internal val', internalEvaluationData),
    ('External val', externalEvaluationData),
):
    print(f'{label} size: {split.shape}')

Grid search size: (29795, 92)
Cross val size: (444393, 92)
Internal val size: (121714, 92)
External val size: (19860, 92)


In [25]:
#Save one-hot encoded data
# Order in the pickled list: grid search, cross-validation, internal evaluation, external evaluation.

sets = [gridSearchData, crossValData, internalEvaluationData, externalEvaluationData]
# The context manager closes (and therefore flushes) the file even if pickling raises.
with open('../../Clean_data/dataset_onehotencoded_ordinalversion_24062025.sav', 'wb') as sets_file:
    pickle.dump(sets, sets_file)

# Ordinal var encoding

In [26]:
# String-valued columns that are converted to ordered integer codes by the OrdinalEncoder below.
ordinal_vars = ['BMI_cat', 'PEFStatus', 'EosinophilLevel']
len(ordinal_vars)

3

In [27]:
# Category order per ordinal column, listed in the same order as ordinal_vars.
# A value is encoded as its position in the list, so 'not recorded' becomes 0.
ordinal_levels = [
    ['not recorded', 'underweight', 'normal', 'overweight', 'obese'],  # BMI_cat
    ['not recorded', 'less than 60', '60-80', 'more than 80'],  # PEFStatus
    ['not recorded', 'normal', 'high'],  # EosinophilLevel
]
ordinalenc = OrdinalEncoder(categories=ordinal_levels)

# Fit on the grid-search set, then replace its string columns with the numeric codes.
# The encoded columns are appended, so they end up as the last columns of the frame.
data_ordinal = ordinalenc.fit_transform(gridSearchData[ordinal_vars])
data_ordinal = pd.DataFrame(data_ordinal, columns=ordinalenc.get_feature_names_out())
gridSearchData = pd.concat(
    [gridSearchData.drop(columns=ordinal_vars), data_ordinal],
    axis=1,
)

In [28]:
#encode for other sets
# The encoder fitted on the grid-search set is applied unchanged to the remaining splits.
# (To reuse a previously saved encoder instead, unpickle '../../Models/ordinal_encoder.pkl' first.)
data_val_ordinal = ordinalenc.transform(crossValData[ordinal_vars])
data_internal_eval_ordinal = ordinalenc.transform(internalEvaluationData[ordinal_vars])
data_external_eval_ordinal = ordinalenc.transform(externalEvaluationData[ordinal_vars])

# pd.concat(axis=1) aligns on the row index: each split was renumbered from 0 earlier
# and the new frames get a default 0..n-1 index, so rows pair up one to one.
data_val_ordinal = pd.DataFrame(data_val_ordinal, columns=ordinalenc.get_feature_names_out())
crossValData = pd.concat([crossValData.drop(ordinal_vars, axis=1), data_val_ordinal], axis=1)

data_internal_eval_ordinal = pd.DataFrame(data_internal_eval_ordinal, columns=ordinalenc.get_feature_names_out())
internalEvaluationData = pd.concat([internalEvaluationData.drop(ordinal_vars, axis=1), data_internal_eval_ordinal], axis=1)

data_external_eval_ordinal = pd.DataFrame(data_external_eval_ordinal, columns=ordinalenc.get_feature_names_out())
externalEvaluationData = pd.concat([externalEvaluationData.drop(ordinal_vars, axis=1), data_external_eval_ordinal], axis=1)


# NOTE(review): the labels say "after scaling", but this cell only performs the ordinal
# encoding; the three lines report the cross-val, internal and external sets in that order.
print('Val data shape after scaling: ', crossValData.shape)
print('Eval data shape after scaling: ', internalEvaluationData.shape)
print('Evaluation data Wales shape: ', externalEvaluationData.shape)

# Persist the fitted encoder; the context manager closes the file even if pickling raises.
with open('../../Models/ordinal_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(ordinalenc, encoder_file)

Val data shape after scaling:  (440477, 88)
Eval data shape after scaling:  (120641, 88)
Evaluation data Wales shape:  (19820, 88)


In [31]:
# Preview the ordinal columns of the grid-search set (first five rows).
gridSearchData[ordinal_vars].head(5)

Unnamed: 0,BMI_cat,PEFStatus,EosinophilLevel
0,0.0,0.0,0.0
1,2.0,0.0,0.0
2,2.0,0.0,1.0
3,4.0,0.0,2.0
4,4.0,0.0,1.0


In [29]:
# Save the one-hot + ordinal encoded splits.
# Order in the pickled list: grid search, cross-validation, internal evaluation, external evaluation.
sets = [gridSearchData, crossValData, internalEvaluationData, externalEvaluationData]
# The context manager closes (and therefore flushes) the file even if pickling raises.
with open('../../Clean_data/dataset_onehotencoded_ordinalencoded_24062025.sav', 'wb') as sets_file:
    pickle.dump(sets, sets_file)

# Continuous Variable Scaling

In [32]:
# Columns handed to the scaler in the next step: 18 numeric columns followed by
# the entries of ordinal_vars.
continuous_vars = [
    'age',
    'average_daily_dose_ICS', 'prescribed_daily_dose_ICS', 'ICS_medication_possesion_ratio',
    'numPCS', 'numPCSAsthma',
    'numAntibioticsEvents', 'numAntibioticswithLRTI',
    'numOCSEvents', 'numOCSwithLRTI',
    'numAsthmaAttacks', 'numAcuteRespEvents', 'numHospEvents',
    'numAsthmaManagement', 'numAsthmaReview',
    'imd_decile', 'CharlsonScore', 'BTS_step',  #from ordinal vars
    *ordinal_vars,  #the ordinal vars are also scaled into 0-1
]
len(continuous_vars)

21

In [33]:
#Data normalisation for continuous variables into 0-1 range based on gridSearch set

# define scaler
# The scaler is fitted on the grid-search set only.
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(gridSearchData[continuous_vars])
# Persist the fitted scaler; the context manager closes the file even if pickling raises.
with open('../../Models/cont_scaler_ordinal.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)


# Replace the original columns with their scaled versions (appended as the last columns).
data_scaled = pd.DataFrame(data_scaled, columns=scaler.get_feature_names_out())
gridSearchData = pd.concat([gridSearchData.drop(continuous_vars, axis=1), data_scaled], axis=1)

print('Data shape after scaling: ', gridSearchData.shape)

Data shape after scaling:  (29532, 88)


In [35]:
#Normalisation for other sets

# Reload the scaler saved by the fitting cell. pickle.load must only be used on trusted
# files; this one is written by this notebook. The context manager closes the handle.
with open('../../Models/cont_scaler_ordinal.pkl', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)
data_val_scaled = scaler.transform(crossValData[continuous_vars])
data_internal_eval_scaled = scaler.transform(internalEvaluationData[continuous_vars])
data_external_eval_scaled = scaler.transform(externalEvaluationData[continuous_vars])

# In every split the original columns are dropped and the scaled ones appended at the end.
data_val_scaled = pd.DataFrame(data_val_scaled, columns=scaler.get_feature_names_out())
crossValData = pd.concat([crossValData.drop(continuous_vars, axis=1), data_val_scaled], axis=1)

data_internal_eval_scaled = pd.DataFrame(data_internal_eval_scaled, columns=scaler.get_feature_names_out())
internalEvaluationData = pd.concat([internalEvaluationData.drop(continuous_vars, axis=1), data_internal_eval_scaled], axis=1)

data_external_eval_scaled = pd.DataFrame(data_external_eval_scaled, columns=scaler.get_feature_names_out())
externalEvaluationData = pd.concat([externalEvaluationData.drop(continuous_vars, axis=1), data_external_eval_scaled], axis=1)


# The three lines report the cross-val, internal and external sets in that order.
print('Val data shape after scaling: ', crossValData.shape)
print('Eval data shape after scaling: ', internalEvaluationData.shape)
print('Evaluation data Wales shape: ', externalEvaluationData.shape)

Val data shape after scaling:  (444393, 92)
Eval data shape after scaling:  (121714, 92)
Evaluation data Wales shape:  (19860, 92)


In [34]:
#make sure no data leak between sets
# Each patid set is built once; every line printed lists the patids shared by one pair
# of splits, so an empty list means that pair has no patient in common.
grid_ids = set(gridSearchData.patid.values)
cross_ids = set(crossValData.patid.values)
internal_ids = set(internalEvaluationData.patid.values)
external_ids = set(externalEvaluationData.patid.values)

for first_ids, second_ids in (
    (grid_ids, cross_ids),
    (grid_ids, internal_ids),
    (cross_ids, internal_ids),
    (cross_ids, external_ids),
    (grid_ids, external_ids),
    (internal_ids, external_ids),  # internal vs external: completes the six possible pairs
):
    print(list(first_ids.intersection(second_ids)))


[]
[]
[]
[]
[]


In [35]:
#Save encoded data
# The final (one-hot encoded, ordinal encoded and scaled) splits are stored as one pickled
# list in this order: grid search, cross-validation, internal evaluation, external evaluation.
# (An earlier version wrote each split to its own CSV under ../FinalData.)
sets = [gridSearchData, crossValData, internalEvaluationData, externalEvaluationData]
# The context manager closes (and therefore flushes) the file even if pickling raises.
with open('../../Clean_data/dataset_scaled_ordinal_24062025.sav', 'wb') as sets_file:
    pickle.dump(sets, sets_file)