# Importing the dependencies 

In [429]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler # To standardize data to a common range
from sklearn.model_selection import train_test_split # To split data into training and testing
from sklearn import svm # support vector machine
from sklearn.metrics import accuracy_score # To check the accuracy

# Data collection and Analysis 

In [430]:
# Loadin the heart stroke dataset to a pandas DataFrame
original_dataset = pd.read_csv('../Stroke_prediction/healthcare-dataset-stroke-data.csv')
stroke_dataset = original_dataset.copy()

# Data Cleaning

In [436]:
# After testing for all the columns, we observed that "bmi" column has some NaN
# values, so we replace all the NaN's with mean value

bmi_mean_value = stroke_dataset["bmi"].mean()
stroke_dataset["bmi"].fillna(bmi_mean_value,inplace=True)


# Encoding all the categorical columns into numerical columns by manual labeling

In [437]:
def encode_categorical_columns(df):
    
    df["gender"] = df["gender"].map({"Male":0, "Female":1,"Other":2}) # Encode "gender" column
    df["ever_married"] = df["ever_married"].map({"Yes":1,"No":0}) # Encode "marital status" column
    df["work_type"] = df["work_type"].map({"Private":0,"Self-employed":1,"Govt_job":2,"children":3,"Never_worked":4})
    df["Residence_type"] = df["Residence_type"].map({"Urban":0,"Rural":1})
    df["smoking_status"] = df["smoking_status"].map({"formerly smoked":0,"never smoked":1,"smokes":2,"Unknown":3}) 


encode_categorical_columns(stroke_dataset)
stroke_dataset

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,0,67.0,0,1,1,0,0,228.69,36.600000,0,1
1,51676,1,61.0,0,0,1,1,1,202.21,28.893237,1,1
2,31112,0,80.0,0,1,1,0,1,105.92,32.500000,1,1
3,60182,1,49.0,0,0,1,0,0,171.23,34.400000,2,1
4,1665,1,79.0,1,0,1,1,1,174.12,24.000000,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,1,80.0,1,0,1,0,0,83.75,28.893237,1,0
5106,44873,1,81.0,0,0,1,1,0,125.20,40.000000,1,0
5107,19723,1,35.0,0,0,1,1,1,82.99,30.600000,1,0
5108,37544,0,51.0,0,0,1,0,1,166.29,25.600000,0,0


# features and target variable data separation

In [439]:
# Separting the data and labels
X = stroke_dataset.drop(columns = "stroke",axis = 1) # Contains all the columns except result(stroke) column
Y = stroke_dataset["stroke"] # Constains only resultant(stroke) column


# Scaling the data from an irregular range to proper common range b/w (0 to 1)

In [440]:
scaler = StandardScaler()
scaler.fit(X)
scaler.transform(X)

array([[-1.29831203e+00, -1.18951055e+00,  1.05143428e+00, ...,
         2.70637544e+00,  1.00123401e+00, -1.45138793e+00],
       [ 7.16371490e-01,  8.39327541e-01,  7.86070073e-01, ...,
         2.12155854e+00,  4.61555355e-16, -5.35985274e-01],
       [-2.55478192e-01, -1.18951055e+00,  1.62639008e+00, ...,
        -5.02830130e-03,  4.68577254e-01, -5.35985274e-01],
       ...,
       [-7.93719586e-01,  8.39327541e-01, -3.63841511e-01, ...,
        -5.11442636e-01,  2.21736316e-01, -5.35985274e-01],
       [ 4.84965773e-02, -1.18951055e+00,  3.43796387e-01, ...,
         1.32825706e+00, -4.27845098e-01, -1.45138793e+00],
       [ 3.85694957e-01,  8.39327541e-01,  3.42048064e-02, ...,
        -4.60867458e-01, -3.49895329e-01,  1.29482004e+00]])

# Splitting data into Training and Testing data

In [446]:
# X_train ==> feature training set
# X_test ==> feature testing set
# Y_train ==>target variable training set
# Y_test ==> target varibale testing set

X_train, X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.2,stratify = Y,random_state = 2)

print(X.shape,X_train.shape,X_test.shape)

(5110, 11) (4088, 11) (1022, 11)


# Training the model using support vector machine

In [447]:
classifier = svm.SVC(kernel = "linear")

In [448]:
# Training the support vector machine classifier by giving the training data
classifier.fit(X_train,Y_train)

# Model Evaluation

In [449]:
# Accuracy score on the training data
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction,Y_train)
print(training_data_accuracy)

0.946917808219178


In [450]:
# Accuracy score on the testing data
X_test_prediction = classifier.predict(X_test)
testing_data_accuracy = accuracy_score(X_test_prediction,Y_test)
print(testing_data_accuracy)

0.9412915851272016


# Making a user input predictive system

In [451]:
# Accuracy score on the user input data
input_data = (9046,"Male","67.0",0,1,"Yes","Private","Urban",228.69,36.6,"formerly smoked")

# Assigning columns to the new input record dataframe
input_df = pd.DataFrame.from_records([input_data], columns=X.columns)

encode_categorical_columns(input_df)

# Chaning the array into numpy array
input_array = np.array(input_df.values)

# Reshape the array as we are predicting for only one instance
input_data_reshaped = input_array.reshape(1,-1)

# Scaling the data
scaled_data = scaler.transform(input_data_reshaped)


prediction = classifier.predict(scaled_data)
print(prediction)

[0]


