In [585]:
# import packages
import pandas as pd
import numpy as np
import pickle
from sklearn import metrics
import tensorflow as tf
from tensorflow import keras
from sklearn.pipeline import Pipeline
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [586]:
# load test data
input_path = 'H:\RediMinds\VCQI'
#train = pd.read_csv(input_path+"\VCQI_clean_train.csv")
test = pd.read_csv(input_path+"\VCQI_clean_test.csv")

In [587]:
# Generate prediction for the Random Forest Model
with open (input_path+'\cat_col', 'rb') as fp:
    cat_col = pickle.load(fp)

model_path = 'output/models/'
from joblib import load
encoder = load(model_path+'OHE.joblib')

In [588]:
x_test = test.drop(labels='INTRA_OP_COMPLICATIONS', axis = 'columns').copy()
y_test = test['INTRA_OP_COMPLICATIONS'].copy() 

In [589]:
# Create dummy variables
one_hot_encoded_array = encoder.transform(x_test[cat_col]).toarray()
column_name = encoder.get_feature_names(cat_col)
x_test_OHE =  pd.DataFrame(one_hot_encoded_array, columns= column_name)
x_test = x_test.merge(x_test_OHE, how = 'left', left_index = True, right_index =True) # create dummy variables
x_test = x_test.drop(labels = cat_col, axis = 'columns') # drop original variables

### Load Models

In [590]:
model_path = 'output/models/'

In [591]:
# import Random Forest Classifer
from joblib import load
RFR = load(model_path+'RFR.joblib')

In [592]:
# Generate prediction for the Random Forest Model
results_RFR = pd.DataFrame(RFR.predict(x_test), columns=['pred_label'])
results_RFR['pred_prob'] =  pd.DataFrame(RFR.predict_proba(x_test))[1]
results_RFR['true_label'] = np.array(y_test)

In [593]:
metrics.average_precision_score(results_RFR['true_label'], results_RFR['pred_prob'])

0.5878218789088239

In [594]:
results_RFR['pred_prob'].describe()

count    507.000000
mean       0.089702
std        0.106310
min        0.000000
25%        0.029015
50%        0.063834
75%        0.104336
max        0.749533
Name: pred_prob, dtype: float64

In [596]:
results_RFR['pred_label'].sum()

10

### Test Script

In [683]:
import pandas as pd
import numpy as np

In [684]:
list_columns = [
    'CENTERCODE',
    'GENDER',
    'AGEATSURGERY',
    'MARITALSTATUS',
    'RACE',
    'EDUCATION',
    'BMI',
    'CLINICALSIZEmm',
    'CHARLSONSCORE',
    'SYMPTOMS',
    'SOLITARYKIDNEY',
    'BILATERALITYOFTUMOR',
    'SIDEOFTUMOR',
    'SIDEOFSURGERY',
    'FACE',
    'TUMORlOCATION',
    'PREOPHB',
    'PREOPHT',
    'PREOPWBC',
    'PREOPCREAT',
    'PADUARISK',
    'POLARLOCATION',
    'RIMLOCATION',
    'RENALSINUS',
    'EXOPHYTICRATE',
    'CLINICALSIZEGROUP',
    'CT',
    'CN',
    'R.E.N.A.L.NEPHRORISKSTRATIFICATION',
    'RADIUSmaximaldiameterincm',
    'NEARNESSOFTUMOUR',
    'ANTERIORORPOSTERIOR',
    'LOCATIONTOPOLARLINE',
    'ASASCORE',
    'PARTIALNEPHROINDICATION',
    'MULTIFOCALITY',
    'NOOFLESIONS',
    'PATIENTNUMBER'
               ]

In [685]:
# Import prospective dataset
df= pd.read_excel('input/test_data.xlsx', 
                  sheet_name='test_data', 
                  usecols=list_columns)

In [686]:
df.head()

Unnamed: 0,PATIENTNUMBER,CENTERCODE,GENDER,AGEATSURGERY,MARITALSTATUS,RACE,EDUCATION,BMI,CLINICALSIZEmm,CHARLSONSCORE,...,CN,R.E.N.A.L.NEPHRORISKSTRATIFICATION,RADIUSmaximaldiameterincm,NEARNESSOFTUMOUR,ANTERIORORPOSTERIOR,LOCATIONTOPOLARLINE,ASASCORE,PARTIALNEPHROINDICATION,MULTIFOCALITY,NOOFLESIONS
0,22PAT000131,WGC7XYXZ,Male,58,Married,Mongoloid/Asian,Not Provided,23.18,44,0,...,N0,3(High Risk),4 but <7,=<4,Anterior (a),3,Category 1,Elective,No,1
1,15PAT000095,VNO3BPES,Female,23,Single,White/Caucasian,Not Provided,19.65,23,0,...,N0,3(High Risk),=<4,=<4,Posterior (p),3,Category 1,Elective,No,1


### Define numeric columns and replace encoded missing values with NaN

In [687]:
# List of columns tobre converted to numeric
numeric_col_list = ['AGEATSURGERY','BMI','CLINICALSIZEmm','CHARLSONSCORE','PREOPHB',
                    'PREOPHT','PREOPWBC','PREOPCREAT','NOOFLESIONS'
                   ] 

In [688]:
# Convert columns in numeric_col_list to numeric and invalid values are set NaN 
for col in numeric_col_list:
    df[col]= pd.to_numeric(df[col], errors='coerce', downcast = 'float')

In [690]:
#replace missing values such as 999 in the dataframe with NaN
df = df.replace([99,999,9999,99999,999999,-99,-999,-9999,-99999,-999999],np.nan)

In [691]:
# replacing negative numbers in the dataframe with nan as given variables cannot contain negative numbers
for col in list(df.select_dtypes('float64')):
    df[col] = df[col].apply(lambda x: np.nan if x<0 else x)

In [692]:
# Correcting the units for erroroneously entered data
def clean_WBC(x):
    if len(str(x))<6:
        x = x*1000
    return x
        

# if the value of PRE-OP WBC value contains is less the 4 digits then multiply it by 1000
df['PREOPWBC'] = df['PREOPWBC'].apply(lambda x: clean_WBC(x))


In [693]:
# Correcting the units for erroroneously entered data for PREOPHB
df['PREOPHB'] = df['PREOPHB'].apply(lambda x: x*100 if x<10 else x)

# Correcting the units for erroroneously entered data for PREOPHT
df['PREOPHT'] = df['PREOPHT'].apply(lambda x: x*100 if x<10 else x)

In [694]:
# define categorical columns
cat_col = []
for i in list_columns:
    if i not in numeric_col_list:
        cat_col.append(i)
cat_col.remove('PATIENTNUMBER')

### Data Cleaning

In [695]:
import json

In [696]:
with open(model_path + "outlier_dict.json", "r") as read_file:
    outlier_dict = json.load(read_file)

In [697]:
outlier_dict

{'BMI': {'LL': 13.899999999999997, 'UL': 42.38},
 'CLINICALSIZEmm': {'LL': -8.0, 'UL': 72.0},
 'PREOPHB': {'LL': 9.75, 'UL': 18.15},
 'PREOPHT': {'LL': 28.999999999999993, 'UL': 53.800000000000004},
 'PREOPWBC': {'LL': 1525.0, 'UL': 12845.0},
 'PREOPCREAT': {'LL': 0.29999999999999993, 'UL': 1.58},
 'PREOPEGFR': {'LL': 22.104999999999983, 'UL': 136.50500000000002}}

In [698]:
df['PREOPWBC']

0     5100.0
1    12100.0
Name: PREOPWBC, dtype: float64

In [699]:
# Remove outlier data from numeric columns
for i in numeric_col_list:
    if i in outlier_dict:
        LL = outlier_dict[i]['LL']
        UL = outlier_dict[i]['UL']
        df.drop(df.loc[(df[i]<=LL)|(df[i]>=UL),[i]].index, inplace=True)

In [700]:
df

Unnamed: 0,PATIENTNUMBER,CENTERCODE,GENDER,AGEATSURGERY,MARITALSTATUS,RACE,EDUCATION,BMI,CLINICALSIZEmm,CHARLSONSCORE,...,CN,R.E.N.A.L.NEPHRORISKSTRATIFICATION,RADIUSmaximaldiameterincm,NEARNESSOFTUMOUR,ANTERIORORPOSTERIOR,LOCATIONTOPOLARLINE,ASASCORE,PARTIALNEPHROINDICATION,MULTIFOCALITY,NOOFLESIONS
0,22PAT000131,WGC7XYXZ,Male,58.0,Married,Mongoloid/Asian,Not Provided,23.18,44.0,0.0,...,N0,3(High Risk),4 but <7,=<4,Anterior (a),3,Category 1,Elective,No,1.0
1,15PAT000095,VNO3BPES,Female,23.0,Single,White/Caucasian,Not Provided,19.65,23.0,0.0,...,N0,3(High Risk),=<4,=<4,Posterior (p),3,Category 1,Elective,No,1.0


In [702]:
# Calculate number missing values per row
df.reset_index(inplace=True, drop=True)
missing = {}

for i in range(len(df)):
    miss_cnt = 0
    for col in df.columns:
        if pd.isna(df[col][i]) == True:
            miss_cnt = miss_cnt+1
    df.loc[i,'Missing'] = miss_cnt

In [704]:
# calculate number of records with over 30% missing data
print("Total Records {}".format(len(df)))
print("Records with >=30% missing data {}".format(sum(df['Missing']<= round((len(df.columns))*.30))))
print("Records to be dropped {}".format(len(df) - sum(df['Missing']<= round((len(df.columns))*.30))))

Total Records 2
Records with >=30% missing data 2
Records to be dropped 0


In [705]:
#removing patients with more than 75% missing data
df = df[df['Missing']<= round((len(df.columns))*.30) ].copy()

In [706]:
# remove column 'Missing'
df.drop(labels=['Missing'], axis = 'columns',inplace=True)

In [707]:
# Replcaing missing values in categorical column with NA
for k in cat_col:
    if k in df.columns:
        df[k].fillna('NA',inplace = True)

In [712]:
# Import dict with mean value for numeric columns
with open(model_path + "numeric_col_mean_dict.json", "r") as read_file:
    numeric_col_mean_dict = json.load(read_file)

In [713]:
# Replacing missing values in numerical columns with their respective mean 
for k in numeric_col_list:
    if k in df.columns:
        df[k].fillna(numeric_col_mean_dict[k], inplace = True)

In [714]:
from joblib import load
le_dict = load(model_path+'Label_enc_dict.joblib')

In [715]:
# Convert all cateogir
for i in cat_col:
    df[i] = df[i].apply(str)

In [716]:
# check for unknow labels and map unknown labels to 'unknown_label'
for i in le_dict:
    if i in df.columns:
        df[i].map(lambda x: x if x in le_dict[i].classes_ else 'unknown_label')

In [717]:
df_codes = df.copy()

In [718]:
# mapping categorical column values to integer labels
for col in cat_col:
    df_codes[col] = le_dict[col].transform(df_codes[col]).copy()

ValueError: y contains previously unseen labels: '3'

In [719]:
from joblib import load
patient_list_train = load(model_path+'patient_list_test.joblib')

In [626]:
x_test = df_codes[df_codes['PATIENTNUMBER'].isin(patient_list_train)]

In [627]:
patient_number = x_test['PATIENTNUMBER'].to_list()
x_test.drop(labels= 'PATIENTNUMBER', axis = 'columns', inplace = True)
x_test.reset_index(inplace=True, drop = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


### Prediction

In [628]:
# Generate prediction for the Random Forest Model
with open (input_path+'\cat_col', 'rb') as fp:
    cat_col = pickle.load(fp)

model_path = 'output/models/'
from joblib import load
encoder = load(model_path+'OHE.joblib')

In [629]:
#x_test = test.drop(labels='INTRA_OP_COMPLICATIONS', axis = 'columns').copy()
#y_test = test['INTRA_OP_COMPLICATIONS'].copy() 

In [630]:
# Create dummy variables
one_hot_encoded_array = encoder.transform(x_test[cat_col]).toarray()
column_name = encoder.get_feature_names(cat_col)
x_test_OHE =  pd.DataFrame(one_hot_encoded_array, columns= column_name)
x_test = x_test.merge(x_test_OHE, how = 'left', left_index = True, right_index =True) # create dummy variables
x_test = x_test.drop(labels = cat_col, axis = 'columns') # drop original variables

### Load Models

In [631]:
model_path = 'output/models/'

In [632]:
# import Random Forest Classifer
from joblib import load
RFR = load(model_path+'RFR.joblib')

In [633]:
# Generate prediction for the Random Forest Model
results_RFR = pd.DataFrame(RFR.predict(x_test), columns=['pred_label'])
results_RFR['pred_prob'] =  pd.DataFrame(RFR.predict_proba(x_test))[1]
results_RFR['PATIENTNUMBER'] = patient_number
#results_RFR['true_label'] = np.array(y_test)

In [634]:
print(results_RFR[['PATIENTNUMBER','pred_prob','pred_label']])

    PATIENTNUMBER  pred_prob  pred_label
0     22PAT000131   0.067908           0
1     15PAT000095   0.164061           0
2     11PAT000161   0.046213           0
3     15PAT000077   0.035738           0
4     15PAT000089   0.018937           0
..            ...        ...         ...
502      6M000306   0.106120           0
503      6M000308   0.096344           0
504      6M000309   0.046461           0
505      6M000322   0.074516           0
506      6M000323   0.081580           0

[507 rows x 3 columns]
