In [None]:
import glob
import os
import warnings
from io import StringIO

import pandas as pd
import requests
import scipy
import sklearn
import torch
import torch.nn as nn
from keras.models import load_model
from scipy import stats
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
# Colab-only step: mount Google Drive at a fixed location inside the runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Notebook configuration. The three directory settings are empty placeholders
# here and are meant to be filled in before running the rest of the notebook.
INPUT_DIRECTORY = ''   # where the input *.csv files are searched for
OUTPUT_DIRECTORY = ''  # where predictions.csv is written
MODEL_DIRECTORY = ''   # where the saved model file is read from
MODEL_NAME = 'Dense_Sequential_Model'  # model file name without the ".hdf5" extension

In [None]:
# Collect every CSV file in the input folder.
# - os.path.join avoids the accidental "/*.csv" (filesystem root) pattern that
#   plain string concatenation produces when INPUT_DIRECTORY is empty.
# - sorted() makes the file order, and therefore the row order of the
#   predictions written later, reproducible (glob order is arbitrary).
folder_path = INPUT_DIRECTORY
filenames = sorted(glob.glob(os.path.join(folder_path, "*.csv")))
if not filenames:
    # pd.concat would raise a ValueError on an empty list anyway; fail here
    # with a message that names the actual problem.
    raise ValueError(f"No CSV files found in input directory {folder_path!r}")
dfs = [pd.read_csv(filename) for filename in filenames]

# concatenate all data into one DataFrame with a fresh 0..n-1 index
df = pd.concat(dfs, ignore_index=True)

In [None]:
# Columns that appear in less than half of the samples are removed.
sparse_columns = [
    'covid19_test_results',
    'rapid_flu_results',
    'rapid_strep_results',
    'rr',
    'rhonchi',
    'wheezes',
    'sys',
    'dia',
    'sats',
    'ctab',
    'days_since_symptom_onset',
    'cough_severity',
    'sob_severity',
    'cxr_findings',
    'cxr_impression',
    'cxr_label',
    'cxr_link',
    'er_referral',
    'test_name',
    'swab_type',
    'batch_date',
]
df = df.drop(columns=sparse_columns)

In [None]:
# All feature columns kept after the sparse columns are dropped.
columns = [
    'age',
    'high_risk_exposure_occupation',
    'high_risk_interactions',
    'diabetes',
    'chd',
    'htn',
    'cancer',
    'asthma',
    'copd',
    'autoimmune_dis',
    'smoker',
    'temperature',
    'pulse',
    'labored_respiration',
    'cough',
    'fever',
    'sob',
    'diarrhea',
    'fatigue',
    'headache',
    'loss_of_smell',
    'loss_of_taste',
    'runny_nose',
    'muscle_sore',
    'sore_throat',
]

# No multi-category features are in use: 'test_name' and 'swab_type' are
# among the dropped columns, so no categorical_features list is defined.

# Real-valued measurements that get min-max scaled.
continuous_features = ['temperature', 'pulse']

# Yes/no flags that get one-hot encoded.
binary_features = [
    'diabetes',
    'chd',
    'htn',
    'cancer',
    'asthma',
    'copd',
    'autoimmune_dis',
    'high_risk_exposure_occupation',
    'high_risk_interactions',
    'smoker',
    'labored_respiration',
    'cough',
    'fever',
    'sob',
    'diarrhea',
    'fatigue',
    'headache',
    'loss_of_smell',
    'loss_of_taste',
    'runny_nose',
    'muscle_sore',
    'sore_throat',
]

# One Hot Encoding
Our categorical data is mostly strings, so we need to convert these to numerical types.

 - Binary Data
    - We want these to be labelled as 0s and 1s
 - Multi-categorical
    - We'll label these in increments, i.e. 0, 1, 2, ...

In [None]:
# Multiplying by 1 turns boolean flags into integers (True -> 1, False -> 0).
for flag_name in binary_features:
    flag_values = df[flag_name]
    df[flag_name] = flag_values * 1


In [None]:
def one_hot(X, columns=None):
    """One-hot encode the given columns of ``X`` with ``pd.get_dummies``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing the columns to encode.
    columns : list of str, optional
        Columns to encode. Defaults to the notebook-level ``binary_features``.

    Returns
    -------
    pandas.DataFrame
        A new frame: the untouched columns first, followed by the indicator
        columns of each encoded feature, in the order given by ``columns``.

    Notes
    -----
    The encoded column order is deterministic. Building the list through a
    ``set`` (as was done previously) yields a hash-dependent order that can
    change between kernel sessions, and the model consumes the columns
    positionally.
    NOTE(review): the order must also match the one used when the model was
    trained -- confirm against the training notebook.
    """
    if columns is None:
        columns = binary_features
    # dict.fromkeys removes duplicates while preserving first-seen order.
    dummy_cols = list(dict.fromkeys(columns))
    return pd.get_dummies(X, columns=dummy_cols)

# Replace each binary feature column with its indicator columns (e.g. <name>_0 / <name>_1).
df = one_hot(df)

# Normalize Continuous Features

In [None]:
# Scale the continuous features into [0, 1] with a min-max scaler.
# NOTE(review): the scaler is fit on the data being scored, so its min/max come
# from this batch rather than from the training set -- confirm that this matches
# how the model's training inputs were scaled.
x = df[continuous_features].values  # returns a numpy array
min_max_scaler = MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
# Assign the array directly. Wrapping it in pd.DataFrame first would align on
# the index and silently write NaN if df's index were ever not exactly 0..n-1.
df[continuous_features] = x_scaled
# Age is rescaled with a fixed divisor instead of the fitted scaler.
df['age'] = df['age'] / 100

In [None]:
# Preview the prepared feature frame before it is passed to the model.
df.head()

Unnamed: 0,age,temperature,pulse,diarrhea_0,diarrhea_1,asthma_0,asthma_1,runny_nose_0,runny_nose_1,loss_of_smell_0,loss_of_smell_1,sob_0,sob_1,labored_respiration_0,labored_respiration_1,copd_0,copd_1,muscle_sore_0,muscle_sore_1,sore_throat_0,sore_throat_1,fatigue_0,fatigue_1,high_risk_exposure_occupation_0,high_risk_exposure_occupation_1,headache_0,headache_1,fever_0,fever_1,cancer_0,cancer_1,smoker_0,smoker_1,diabetes_0,diabetes_1,loss_of_taste_0,loss_of_taste_1,chd_0,chd_1,high_risk_interactions_0,high_risk_interactions_1,cough_0,cough_1,autoimmune_dis_0,autoimmune_dis_1,htn_0,htn_1
0,0.31,0.590164,0.224,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,0,0,1,0,1,0,1,0,1,0,1,0,0,0,1,0,1,0,1,0
1,0.54,0.532787,0.288,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,0,1,1,0,1,0,0,1
2,0.32,0.483607,0.216,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0
3,0.25,0.54918,0.48,1,0,1,0,1,0,1,0,0,1,0,0,1,0,1,0,1,0,1,0,1,0,0,1,0,1,1,0,1,0,1,0,1,0,1,0,1,0,0,1,1,0,1,0
4,0.33,0.532787,0.28,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,0,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0


In [None]:
def load_trained_model(filepath):
    """Load a saved Keras model from ``filepath`` and return it."""
    return load_model(filepath)

# os.path.join supplies the path separator when MODEL_DIRECTORY has no trailing
# slash; an empty MODEL_DIRECTORY still resolves to the bare file name.
model = load_trained_model(os.path.join(MODEL_DIRECTORY, MODEL_NAME + ".hdf5"))
# summary() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output.
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_20 (Dense)             (None, 47)                2256      
_________________________________________________________________
dense_21 (Dense)             (None, 25)                1200      
_________________________________________________________________
dense_22 (Dense)             (None, 25)                650       
_________________________________________________________________
dense_23 (Dense)             (None, 1)                 26        
Total params: 4,132
Trainable params: 4,132
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Cast to a single float dtype before predicting. A frame that mixes float
# columns with boolean indicator columns (the get_dummies default in recent
# pandas) converts to an object array, which cannot be turned into a tensor.
y_pred = model.predict(df.astype('float32'))

In [None]:
# Wrap the raw predictions in a DataFrame and preview the first five rows.
y_pred = pd.DataFrame(data=y_pred)
y_pred.head(n=5)

Unnamed: 0,0
0,0.1615692
1,2.96761e-12
2,0.2706215
3,3.253529e-24
4,0.3176436


In [None]:
# os.path.join supplies the path separator when OUTPUT_DIRECTORY has no trailing
# slash; an empty OUTPUT_DIRECTORY still writes to the working directory.
y_pred.to_csv(os.path.join(OUTPUT_DIRECTORY, 'predictions.csv'), index=False)