In [1]:
import pandas as pd
import numpy as np
import json
import copy
from sklearn.metrics import precision_recall_fscore_support
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

from google.colab import drive
drive.mount("/content/drive")

# Replace with correct path
path = "/content/drive/MyDrive/CSC 466 Project/stroke.csv"

df = pd.read_csv(path)
df.dropna(inplace=True)
df.drop(columns=['id', 'work_type'], inplace=True)
df = df[df['smoking_status'] != 'Unknown']
df.reset_index(inplace=True, drop=True)
df.head()

Mounted at /content/drive


Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Urban,186.21,29.0,formerly smoked,1


In [2]:
df_numeric = df.copy()

for col in df_numeric.columns:
    if df_numeric[col].dtype != 'object':
        continue
    if col == "smoking_status":
      mapping = {"never smoked": 0, "formerly smoked": 1, "smokes": 2}
    else:
      mapping = {category: i for i, category in enumerate(df_numeric[col].unique())}
    df_numeric[col] = df_numeric[col].map(mapping)

df_numeric.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,67.0,0,1,0,0,228.69,36.6,1,1
1,0,80.0,0,1,0,1,105.92,32.5,0,1
2,1,49.0,0,0,0,0,171.23,34.4,2,1
3,1,79.0,1,0,0,1,174.12,24.0,0,1
4,0,81.0,0,0,0,0,186.21,29.0,1,1


In [3]:
X = df_numeric.drop(columns=['stroke'])
y = df_numeric['stroke']

scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,smoking_status
0,-1.247686,0.973768,-0.36768,3.953615,-0.564092,-0.981491,2.523621,0.864982,0.403745
1,-1.247686,1.663479,-0.36768,3.953615,-0.564092,1.018858,-0.050358,0.302945,-0.83682
2,0.799523,0.018784,-0.36768,-0.252933,-0.564092,-0.981491,1.318923,0.563401,1.644309
3,0.799523,1.610424,2.719753,-0.252933,-0.564092,1.018858,1.379514,-0.862253,-0.83682
4,-1.247686,1.716533,-0.36768,-0.252933,-0.564092,-0.981491,1.632992,-0.176842,0.403745


In [4]:
from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
def activation(net):
    return 1/(1+np.exp(-net))


def train(X,t,nepochs=200,n=0.5,test_size=0.3,val_size=0.3,seed=0):
    X_train, X_test, t_train, t_test = train_test_split(X, t, test_size=test_size,random_state=seed)
    X_train2, X_val, t_train2, t_val = train_test_split(X_train, t_train, test_size=val_size,random_state=seed)

    train_accuracy = []
    val_accuracy = []
    nfeatures = X.shape[1]
    np.random.seed(seed)
    w = 2*np.random.uniform(size=(nfeatures,)) - 1

    for epoch in range(nepochs):
        y_train2 = X_train2.apply(lambda x: activation(np.dot(w,x)),axis=1)
        y_val = X_val.apply(lambda x: activation(np.dot(w,x)),axis=1)

        train_accuracy.append(sum(t_train2 == np.round(y_train2))/len(t_train2))
        val_accuracy.append(sum(t_val == np.round(y_val))/len(t_val))

        for j in range(len(w)):
            w[j] -= n*np.dot((y_train2 - t_train2)*y_train2*(1-y_train2),X_train2.iloc[:,j])

    results = pd.DataFrame({"epoch": np.arange(nepochs)+1, 'train_accuracy':train_accuracy,'val_accuracy':val_accuracy,
                            "n":n,'test_size':test_size,'val_size':val_size,'seed':seed
                           }).set_index(['n','test_size','val_size','seed'])
    return w,X_test,t_test,results


# def evaluate_baseline(t_test, t_train2, t_val):
#     get_accuracy = lambda x: x.value_counts(normalize=True).sort_values(ascending=False).iloc[0]
#     accuracy_test, accuracy_train2, accuracy_val = get_accuracy(t_test), get_accuracy(t_train2), get_accuracy(t_val)
#     return accuracy_test, accuracy_train2, accuracy_val


def predict(w,X,threshold=0.5):
    y = activation(X.dot(w)).apply(lambda x: 0 if x <= threshold else 1)
    return y

In [6]:
w,X_test,t_test,results = train(X,y,nepochs=200,n=0.5,test_size=0.2,val_size=0.2,seed=0)

In [7]:
preds = predict(w,X_test)

In [8]:
cm = confusion_matrix(t_test, preds)
pd.DataFrame(cm)

Unnamed: 0,0,1
0,499,155
1,30,2


In [9]:
precision, recall, f1_score, support = precision_recall_fscore_support(t_test, preds)
metrics = pd.DataFrame({"precision": precision, "recall": recall, "f1_score": f1_score, "support": support})
metrics

Unnamed: 0,precision,recall,f1_score,support
0,0.943289,0.762997,0.843618,654
1,0.012739,0.0625,0.021164,32


## SciKitLearn Implementation

In [10]:
from sklearn.neural_network import MLPClassifier

X_train, X_test, t_train, t_test = train_test_split(X, y, test_size=0.2,random_state=0)

model = MLPClassifier(activation="logistic", solver="lbfgs", hidden_layer_sizes=(100,), learning_rate="invscaling", alpha=0.0001, random_state=1)
# model = MLPClassifier(hidden_layer_sizes=(1,), activation="logistic", solver="lbfgs", learning_rate="constant", learning_rate_init=0.5, batch_size=len(X_train), random_state=0)

model.fit(X_train, t_train)

preds = model.predict(X_test)

cm = confusion_matrix(t_test, preds)
pd.DataFrame(cm)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Unnamed: 0,0,1
0,631,23
1,30,2


In [11]:
precision, recall, f1_score, support = precision_recall_fscore_support(t_test, preds)
metrics = pd.DataFrame({"precision": precision, "recall": recall, "f1_score": f1_score, "support": support})
metrics

Unnamed: 0,precision,recall,f1_score,support
0,0.954614,0.964832,0.959696,654
1,0.08,0.0625,0.070175,32


In [12]:
model.score(X_test, t_test)

0.922740524781341