In [25]:
import pandas as pd
from pandas.plotting import scatter_matrix
# from matplotlib import pyplot

from keras.models import Sequential
from keras.layers import *

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error

from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [24]:
# Load training data set from CSV file
training_data_df = pd.read_csv('DataWrangling/ML_training.csv', index_col=0, na_values=[' ']).fillna(0)

# Load testing data set from CSV file
test_data_df = pd.read_csv('DataWrangling/ML_testing.csv', index_col=0, na_values=[' ']).fillna(0)


# Convert object to 0 and 1
cols = ['Cough', 'Fever', 'Active.Breathing.Shortness', 'Weight.Loss', 'Haemoptysis', 'TB.Medication.History']
training_data_df[cols] = training_data_df[cols].replace({'Yes':1, 'No': 0})
test_data_df[cols] = test_data_df[cols].replace({'Yes':1, 'No': 0})

# Convert Female to -, male to 1
training_data_df['Gender'] = training_data_df['Gender'].replace({'F': '0', 'M': '1'})
test_data_df['Gender'] = test_data_df['Gender'].replace({'F': '0', 'M': '1'})

training_data_df.info()
test_data_df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 18853 entries, 201141200106-2 to 203150600208-1
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Gender                      18853 non-null  object 
 1   Age                         18853 non-null  int64  
 2   Cough                       18853 non-null  int64  
 3   Fever                       18853 non-null  int64  
 4   Active.Breathing.Shortness  18853 non-null  int64  
 5   Weight.Loss                 18853 non-null  int64  
 6   Haemoptysis                 18853 non-null  int64  
 7   TB.Medication.History       18853 non-null  int64  
 8   qXRv2                       18853 non-null  float64
 9   CAD4TB6                     18853 non-null  int64  
 10  JF1                         18853 non-null  float64
 11  IF2                         18853 non-null  float64
 12  Xpert2Outcome_num           18853 non-null  int64  
dtypes: float64(3),

Unnamed: 0_level_0,Gender,Age,Cough,Fever,Active.Breathing.Shortness,Weight.Loss,Haemoptysis,TB.Medication.History,qXRv2,CAD4TB6,JF1,IF2,Xpert2Outcome_num
PID_OMRS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
201140600008-8,0,32,0,0,0,1,0,0,0.368823,59,0.99778,0.758563,0
201140600021-1,1,44,1,1,1,1,0,1,0.950277,77,0.99987,0.845887,1
201140600027-8,1,26,1,1,0,1,0,0,0.131479,44,0.00407,0.103325,0
201140600082-3,1,39,1,1,0,1,0,0,0.061645,35,0.0137,0.16608,0
201140600030-2,0,59,1,1,0,1,0,1,0.840019,75,0.99829,0.695038,0


# Data needs to be scaled to a small range like 0 to 1 for the neural network to work well.

In [26]:
scaler = MinMaxScaler(feature_range=(0, 1))

# Scale both the training inputs and outputs
scaled_training = scaler.fit_transform(training_data_df)
scaled_testing = scaler.transform(test_data_df)

# Print out the adjustment that the scaler applied to the total_earnings column of data
print("Note: total_earnings values were scaled by multiplying by {:.10f} and adding {:.6f}".format(scaler.scale_[8], scaler.min_[8]))

# Create new pandas DataFrame objects from the scaled data
scaled_training_df = pd.DataFrame(scaled_training, columns=training_data_df.columns.values)
scaled_testing_df = pd.DataFrame(scaled_testing, columns=test_data_df.columns.values)

# Save scaled data dataframes to new CSV files
scaled_training_df.to_csv("DataWrangling/training_scaled.csv", index=False)
scaled_testing_df.to_csv("DataWrangling/testing_scaled.csv", index=False)


Note: total_earnings values were scaled by multiplying by 1.0398469453 and adding -0.021729


In [31]:
training_data_df = pd.read_csv("DataWrangling/training_scaled.csv")

X = training_data_df.drop(['Xpert2Outcome_num', 'IF2', 'JF1', 'CAD4TB6'], axis=1).values
Y = training_data_df[['Xpert2Outcome_num']].values

[[0.]
 [0.]
 [0.]
 ...
 [1.]
 [0.]
 [0.]]


# Define the model

In [47]:
# Define the model
model = Sequential()
model.add(Dense(9, input_dim=9, activation='relu'))
model.add(Dense(100, activation='relu'))
model.add(Dense(100, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
# model.compile(loss='mean_squared_error', optimizer='adam')
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(
    X,
    Y,
    epochs=100,
    shuffle=True,
    verbose=2
)

Epoch 1/100
 - 1s - loss: 0.3351 - accuracy: 0.8587
Epoch 2/100
 - 1s - loss: 0.2940 - accuracy: 0.8704
Epoch 3/100
 - 1s - loss: 0.2912 - accuracy: 0.8707
Epoch 4/100
 - 1s - loss: 0.2885 - accuracy: 0.8716
Epoch 5/100
 - 1s - loss: 0.2881 - accuracy: 0.8725
Epoch 6/100
 - 1s - loss: 0.2873 - accuracy: 0.8733
Epoch 7/100
 - 1s - loss: 0.2859 - accuracy: 0.8741
Epoch 8/100
 - 1s - loss: 0.2854 - accuracy: 0.8755
Epoch 9/100
 - 1s - loss: 0.2850 - accuracy: 0.8738
Epoch 10/100
 - 1s - loss: 0.2838 - accuracy: 0.8735
Epoch 11/100
 - 1s - loss: 0.2853 - accuracy: 0.8735
Epoch 12/100
 - 1s - loss: 0.2825 - accuracy: 0.8767
Epoch 13/100
 - 1s - loss: 0.2827 - accuracy: 0.8760
Epoch 14/100
 - 1s - loss: 0.2826 - accuracy: 0.8757
Epoch 15/100
 - 1s - loss: 0.2810 - accuracy: 0.8774
Epoch 16/100
 - 1s - loss: 0.2816 - accuracy: 0.8766
Epoch 17/100
 - 1s - loss: 0.2818 - accuracy: 0.8762
Epoch 18/100
 - 1s - loss: 0.2815 - accuracy: 0.8775
Epoch 19/100
 - 1s - loss: 0.2803 - accuracy: 0.8779
Ep

<keras.callbacks.callbacks.History at 0x2bb87c35288>

# Load the separate test data set

In [48]:
test_data_df = pd.read_csv("DataWrangling/testing_scaled.csv")
X_test = test_data_df.drop(['Xpert2Outcome_num', 'IF2', 'JF1', 'CAD4TB6'], axis=1).values
Y_test = test_data_df[['Xpert2Outcome_num']].values

test_error_rate = model.evaluate(X_test, Y_test, verbose=0)
print("The mean squared error (MSE) for the test data set is: {}".format(test_error_rate))

# Save the model to disk. It will save both the structure of the neural network but also the trained weights that  determined how the NN works.
model.save("Results/trained_model.h5") #hdf5 it is designed to store python array data.
print("Model saved to disk.")

The mean squared error (MSE) for the test data set is: [0.31367626332842113, 0.8673880696296692]


ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [None]:
# summarize the first 5 cases
for i in range(5):
	print('%s => %d (expected %d)' % (X[i].tolist(), predictions[i], y[i]))

# Make Prediction

In [None]:
X = pd.read_csv().values
prediction = model.predict(X)
prediction = prediction[0][0]

#  total_earnings values were scaled by multiplying by 1.0398469453 and adding -0.021729
prediction = (prediction + 0.021729)/1.0398469453