KENNEDY BWIRE 21/02900

About the Assignment
The dataset was obtained from kaggle.com. The goal is to predict whether a patient who has been diagnosed with heart-related complications is likely to suffer a stroke. The work is organised into the following steps:
1. Loading the dataset and performing the preprocessing activities
2. Creating Training and Validation set.
3. Defining the Architecture of the Model
4. Training the Model
5. Evaluating the model performance on the validation set
6. Visualize the model. 
7. Conclusion. 

In [3]:
pip install sklearn

Note: you may need to restart the kernel to use updated packages.


# Loading the dataset and performing the preprocessing activities.

1. Selection
Importing the necessary dependencies.

In [4]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt 
%matplotlib inline
from matplotlib import style
style.use("ggplot")
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dropout
from keras.layers import Dense,Embedding,LSTM, GRU, Bidirectional
from keras.optimizers import SGD
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

Loading the CSV data into a pandas DataFrame for data inspection.

In [5]:
# Load the stroke dataset into a DataFrame and print it for a first look.
# A raw string keeps the Windows backslashes literal: "\p" and "\h" are not
# valid escape sequences, so the non-raw form only worked by accident and
# triggers an invalid-escape warning on current Python versions.
stroke = pd.read_csv(r"C:\pydatafiles\healthcare-dataset-stroke-data.csv")
print(stroke)

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\pydatafiles\\healthcare-dataset-stroke-data.csv'

In [None]:
#Understanding the dataset

Printing the first 5 rows of the dataset

In [None]:
# Preview the first five rows to confirm the file was parsed as expected.
stroke.head(n=5)

Getting some info about the data

In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame.
stroke.info()

In [None]:
# Statistical measures about the dataset: summary statistics of the numeric columns.
stroke.describe()

2. Data cleaning

#Checking for missing values in the dataset.

In [None]:
# Number of missing entries in each column.
stroke.isna().sum()

Filling missing values with the mean. The bmi column is a numerical (float) variable, so its missing values are filled with the column mean.

In [None]:
# Replace missing bmi values with the column mean.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected column: that form is chained
# assignment, which may operate on a temporary copy and leave `stroke`
# unchanged (pandas warns about it, and Copy-on-Write makes it a no-op).
stroke['bmi'] = stroke['bmi'].fillna(stroke['bmi'].mean())

In [None]:
# Re-count the missing values per column after the fill.
stroke.isna().sum()

#converting categories to Numbers

In [None]:
#Converting the categories into numbers using map function.
# Series.map turns every label that is missing from its dictionary into NaN,
# so each mapping must list all categories of its column. The existing codes
# are kept unchanged; the previously unmapped labels ('Other', 'Govt_job',
# 'children', 'Never_worked', 'Unknown') get the next free codes.
# NOTE(review): the extra labels are taken from the category lists used in the
# encoder/dropdown cell at the end of this notebook -- confirm them against
# stroke[column].unique() before relying on the codes.
category_codes = {
    'gender': {'Male': 0, 'Female': 1, 'Other': 2},
    'ever_married': {'Yes': 0, 'No': 1},
    'work_type': {
        'Private': 0,
        'Self-employed': 1,
        'Govt_job': 2,
        'children': 3,
        'Never_worked': 4,
    },
    'Residence_type': {'Rural': 0, 'Urban': 1},
    'smoking_status': {
        'smokes': 0,
        'formerly smoked': 1,
        'never smoked': 2,
        'Unknown': 3,
    },
}
for column_name, codes in category_codes.items():
    stroke[column_name] = stroke[column_name].map(codes)

In [None]:
#Bringing all the variables in range of 0 and 1
def _min_max_scale(column):
    """Return `column` rescaled linearly so its minimum is 0 and its maximum is 1.

    Implements (x - min) / (max - min). The parentheses matter: without them
    Python evaluates x - (min / max) - min, which only shifts the values and
    does not bound them to [0, 1].
    A constant column (max == min) is returned as all zeros instead of
    dividing by zero.
    """
    lowest = column.min()
    spread = column.max() - lowest
    if spread == 0:
        return column - lowest
    return (column - lowest) / spread


# Same columns, in the same order, as the original one-statement-per-column version.
for _column_name in [
    'gender',
    'age',
    'hypertension',
    'heart_disease',
    'ever_married',
    'work_type',
    'Residence_type',
    'avg_glucose_level',
    'bmi',
    'smoking_status',
    'stroke',
]:
    stroke[_column_name] = _min_max_scale(stroke[_column_name])

In [None]:
# Show the first five rows again to inspect the encoded and rescaled values.
stroke.head(n=5)

Printing the number of rows and columns in the dataset

In [None]:
# (rows, columns) of the frame at this point of the notebook.
stroke.shape

Checking the distribution of the target variable

In [None]:
# How many rows fall into each value of the target column.
stroke.loc[:, 'stroke'].value_counts()

Data Visualization. 

In [None]:
# Annotated heatmap of the pairwise column correlations, drawn on a wide figure.
plt.figure(figsize=(20, 10))
p = sns.heatmap(data=stroke.corr(), cmap='RdYlGn', annot=True)

In [None]:
# Reducing the number of Rows
# Keep only the first 4000 rows. The slice is open-ended so that everything
# after row 4000 is dropped whatever the size of the frame; the previous
# upper bound (538674) was an arbitrary number that would silently stop
# truncating on a larger dataset.
stroke.drop(stroke.index[4000:], inplace=True)
stroke.shape

In [None]:
# One green histogram per column, laid out on a 20x10 figure.
p = stroke.hist(color='green', figsize=(20, 10))

In [None]:
# Scatter plot of the gender column against the age column.
sns.scatterplot(data=stroke, x='age', y='gender')

3. Data Transformation
This refers to modifying the data so that it is ready for predictive analytics

Transform Target variables to Numeric array

In [None]:
# Label-encode every column of the frame and expose the result as a NumPy array.
# NOTE(review): fit_transform is applied to all columns, including the
# continuous ones that were rescaled earlier -- confirm this is intended.
Encoder = preprocessing.LabelEncoder()
Encoded_stroke = stroke.apply(
    lambda column: preprocessing.LabelEncoder().fit_transform(column)
)
print("Transformed Data:\n", Encoded_stroke)
Numeric_Array = Encoded_stroke.to_numpy()
print("Numeric Array\n", Numeric_Array)

# Create the Training and Validation set

In [None]:
# Hold out 20% of the rows as the test sample; the fixed random_state makes
# the split reproducible between runs.
Training_Sample, Test_Sample = train_test_split(
    Numeric_Array,
    random_state=2,
    test_size=0.2,
)
print("Training Sample:\n", Training_Sample)
print("Test Sample:\n", Test_Sample)

Select Input and output variable from training sample and test sample

In [None]:
# Split each sample into input attributes (every column except the last) and
# the output attribute (the last column).
# The training inputs previously used Training_Sample[:,1], which selects a
# single column; they now use the same [:, :-1] slice as the test inputs so
# both splits expose the same attributes.
XTrain_Sample=Training_Sample[:, :-1]
print("Input Attributes of Training Sample\n",XTrain_Sample)
YTrain_Sample=Training_Sample[:,-1]
print("Output Attributes of Training Sample\n\n",YTrain_Sample)
XTest_Sample=Test_Sample[:,:-1]
print("Input Attributes of Test Sample\n",XTest_Sample)
Actual_YTest_Sample=Test_Sample[:,-1]
print("Actual Test Sample Classes\n\n",Actual_YTest_Sample)
XTrain_Sample.shape

Using PCA, the input attributes of both the Training and Test samples are compressed into two attributes.

In [None]:
# Project the training inputs (all columns but the last) onto two principal components.
XTrain_Sample = Training_Sample[:, 0:-1]
pca = PCA(n_components=2)
# fit() returns the fitted estimator, so fitting and transforming can be chained.
Decomposed_XTrain_Sample = pca.fit(XTrain_Sample).transform(XTrain_Sample)
print("\nDecomposed Input Attributes\n", Decomposed_XTrain_Sample)

# Defining the Architecture of the Model
This involves creating the model and specifying the number of input neurons, the number of output neurons, the number of hidden layers and the number of hidden neurons. For instance, XTrain_Sample.shape[1] defines the number of input neurons, and since this is a binary classification problem, there is a single neuron in the output layer, hence the number of output neurons is 1. 

In [None]:
#The LSTM Architecture
# Four stacked LSTM layers of 50 units, each followed by 20% dropout, feeding
# a single-unit dense output. Every LSTM except the last returns its full
# sequence so the next recurrent layer receives sequence input.
regressor = Sequential([
    # First LSTM layer with Dropout regularisation; one timestep per input
    # attribute, one feature per timestep.
    LSTM(units=50, return_sequences=True, input_shape=(XTrain_Sample.shape[1], 1)),
    Dropout(0.2),
    # Second LSTM layer
    LSTM(units=50, return_sequences=True),
    Dropout(0.2),
    # Third LSTM layer
    LSTM(units=50, return_sequences=True),
    Dropout(0.2),
    # Fourth LSTM layer: returns only its final output
    LSTM(units=50),
    Dropout(0.2),
    # The output layer
    Dense(units=1),
])

In [None]:
# Print the layer-by-layer summary of the LSTM model.
regressor.summary()

Compiling the RNN

In [None]:
# Configure the LSTM model for training with RMSprop and a mean-squared-error loss.
# Keras documents `metrics` as a list of metrics; newer Keras releases reject
# a bare string, so the metric name is wrapped in a list.
regressor.compile(optimizer='rmsprop',loss='mean_squared_error',metrics=['accuracy'])

# Training the Model
This involves passing the independent and dependent variable features of the training set to the model for training. Validation data will be evaluated at the end of each epoch. The number of epochs is set to 50. The training history returned by the fit call will be used to visualize the training process. 

Fitting to the training set

In [None]:
# First training pass of the LSTM model: 50 epochs over the training split
# in mini-batches of 32 samples.
regressor.fit(
    x=XTrain_Sample,
    y=YTrain_Sample,
    batch_size=32,
    epochs=50,
)

In [None]:
# Train for 50 epochs while evaluating on the held-out test split after each
# epoch; the returned History object is kept on the model for the plots below.
regressor.history = regressor.fit(
    x=XTrain_Sample,
    y=YTrain_Sample,
    epochs=50,
    validation_data=(XTest_Sample, Actual_YTest_Sample),
)

In [None]:
# The GRU architecture
# Local import: the learning-rate schedule is only needed by this cell.
from keras.optimizers.schedules import InverseTimeDecay

regressorGRU = Sequential()
# First GRU layer with Dropout regularisation
regressorGRU.add(GRU(units=50, return_sequences=True, input_shape=(XTrain_Sample.shape[1],1), activation='tanh'))
regressorGRU.add(Dropout(0.2))
# Second GRU layer (input_shape is only meaningful on the first layer, so it is
# no longer repeated on the inner layers)
regressorGRU.add(GRU(units=50, return_sequences=True, activation='tanh'))
regressorGRU.add(Dropout(0.2))
# Third GRU layer
regressorGRU.add(GRU(units=50, return_sequences=True, activation='tanh'))
regressorGRU.add(Dropout(0.2))
# Fourth GRU layer
regressorGRU.add(GRU(units=50, activation='tanh'))
regressorGRU.add(Dropout(0.2))
# The output layer
regressorGRU.add(Dense(units=1))
# Compiling the RNN
# `lr` and `decay` are deprecated/removed SGD arguments. The learning rate is
# passed as `learning_rate`, and the former decay=1e-7 is expressed as an
# inverse-time schedule: rate = 0.01 / (1 + 1e-7 * step).
gru_learning_rate = InverseTimeDecay(initial_learning_rate=0.01, decay_steps=1, decay_rate=1e-7)
# The metric is given as a list and spelled 'accuracy', consistent with the
# compile call of the LSTM model.
regressorGRU.compile(optimizer=SGD(learning_rate=gru_learning_rate, momentum=0.9, nesterov=False),loss='mean_squared_error',metrics=['accuracy'])
# Fitting to the training set
regressorGRU.fit(XTrain_Sample,YTrain_Sample,epochs=50,batch_size=150)

In [None]:
# Fit the LSTM model (`regressor`) for 50 epochs with per-epoch validation on
# the test split and keep the returned History object on the model.
# NOTE(review): this repeats the earlier LSTM fit right after the GRU cell --
# confirm whether `regressorGRU` was meant here.
regressor.history = regressor.fit(
    x=XTrain_Sample,
    y=YTrain_Sample,
    epochs=50,
    validation_data=(XTest_Sample, Actual_YTest_Sample),
)

# Evaluation of the model Performance on Validation set.

The model achieved a training accuracy of 94.06% and a validation accuracy of 92.25%. This signifies that the model was trained well on the training data.

# Visualizing the model performance

Loss refers to the loss value over the training data after each epoch. This is what the optimization process tries to minimize during training, so the lower, the better. Accuracy refers to the ratio between correct predictions and the total number of predictions in the training data. The higher, the better. Accuracy is normally inversely correlated with the loss, but not always.

In [None]:
#Summarize history for Loss.
# Plot the training and validation loss recorded for each epoch.
plt.figure(figsize=(15,10))
plt.plot(regressor.history.history['loss'])
plt.plot(regressor.history.history['val_loss'])
plt.title("A GRAPH SHOWING LOSS AGAINST EPOCH")
plt.xlabel('Epoch')
plt.ylabel('Loss')
# The two curves are losses, so the legend labels them as such (they were
# previously labelled as accuracies).
plt.legend(['Train Loss','Validation Loss'],loc='upper left')
plt.show()

In [None]:
#Summarize the history for Accuracy
# Plot the training and validation accuracy recorded for each epoch.
plt.figure(figsize=(15,10))
plt.plot(regressor.history.history['accuracy'])
plt.plot(regressor.history.history['val_accuracy'])
# The figure shows accuracy, so the title says so (it previously read
# "LOSS VS EPOCH").
plt.title("A GRAPH SHOWING ACCURACY AGAINST EPOCH")
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(['Train Accuracy','Validation Accuracy'],loc='upper right')
plt.show()

# Conclusion. 

From the data, the training accuracy of 94.06% and the validation accuracy of 92.25% suggest a strong relationship between the independent variables and the dependent variable. The positive result on the validation data means that the model can be used to predict whether a patient diagnosed with heart-related disease is likely to suffer a stroke. 

In [None]:
pip install pycaret

In [None]:
# Compare PyCaret classifiers on the stroke data and expose the inputs through
# a Gradio interface.
import gradio as gr
import pycaret
from pycaret.classification import *
import pandas as pd
import category_encoders as ce
# A raw string keeps the Windows backslashes literal ("\p" and "\h" are not
# valid escape sequences).
stroke=pd.read_csv(r"C:\pydatafiles\healthcare-dataset-stroke-data.csv")

# The frame has just been reloaded from the CSV, so gender and work_type hold
# their original string labels again. The mappings are therefore keyed on
# those labels; the previous numeric keys (0, 1) matched nothing in the fresh
# data and left 'Male'/'Female' and 'Private'/'Self-employed' unmapped.
# The target codes (1, 2, 3, ...) are unchanged.
encoder= ce.OrdinalEncoder(cols=['gender'],return_df=True, mapping=[{'col':'gender', 'mapping':{'Male': 1, 'Female': 2,'Other': 3}}])
stroke['gender'] = encoder.fit_transform(stroke['gender'])
# NOTE(review): code 4 was keyed on the placeholder '2'; 'Govt_job' is the only
# work_type category left without a code, so it is assumed here -- confirm.
encoder= ce.OrdinalEncoder(cols=['work_type'],return_df=True, mapping=[{'col':'work_type', 'mapping':{'Private': 1, 'Self-employed': 2, 'children': 3, 'Govt_job': 4, 'Never_worked': 5}}])
stroke['work_type'] = encoder.fit_transform(stroke['work_type'])

s = setup(data =stroke, target = 'stroke', fix_imbalance = True, session_id=123)

best = compare_models()
compare_model_results = pull()

# Input widgets. The slider defaults are the column means of `stroke`; the
# previous code read them from `data`, a name that is never defined in this
# notebook.
model = gr.inputs.Dropdown(list(compare_model_results['Model']),label="Model")
gender = gr.inputs.Dropdown(choices=["Male", "Female"],label = 'gender')
age = gr.inputs.Slider(minimum=1, maximum=100, default=stroke['age'].mean(), label = 'age')
hypertension = gr.inputs.Dropdown(choices=["1", "0"],label = 'hypertension')
heart_disease = gr.inputs.Dropdown(choices=["1", "0"],label ='heart_disease')
ever_married = gr.inputs.Dropdown(choices=["Yes", "No"], label ='ever_married')
work_type = gr.inputs.Dropdown(choices=["children", "Govt_job","Never_worked","Private","Self-employed"],label = 'work_type')
Residence_type = gr.inputs.Dropdown(choices=["Urban", "Rural"],label = 'Residence_type')
avg_glucose_level = gr.inputs.Slider(minimum=-55, maximum=300, default=stroke['avg_glucose_level'].mean(), label = 'avg_glucose_level')
bmi = gr.inputs.Slider(minimum=-10, maximum=100, default=stroke['bmi'].mean(), label = 'bmi')
smoking_status = gr.inputs.Dropdown(choices=["Unknown", "smokes","never_smoked", "formerly_smoked"], label ='smoking_status')

# NOTE(review): `predict` is not defined anywhere in the visible notebook. It
# must be a callable taking the eleven inputs below, in this order, and
# returning the label to display -- define it before running this line.
gr.Interface(predict,[model, gender, age, hypertension, heart_disease, ever_married, work_type, Residence_type, avg_glucose_level, bmi, smoking_status], "label",live=True).launch()

