In [33]:
from urllib.parse import quote_plus

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sqlalchemy import create_engine

from config import db_password

In [2]:
# Read the stroke dataset from the project's Resources folder and preview the first rows
path = 'Resources/stroke_data.csv'
df = pd.read_csv(filepath_or_buffer=path)
df.head(n=5)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
1,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
2,Female,52.0,0,0,Yes,Private,Urban,77.59,17.7,formerly smoked,0
3,Female,75.0,0,1,Yes,Self-employed,Rural,243.53,27.0,never smoked,0
4,Female,32.0,0,0,Yes,Private,Rural,77.67,32.3,smokes,0


In [3]:
# Number of rows in the freshly loaded data
df.shape[0]

29065

In [4]:
# Check each column's dtype; the object-typed columns are the ones one-hot encoded later
df.dtypes

gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [5]:
# Report how many missing values each column contains
null_counts = df.isnull().sum()
for column, null_count in null_counts.items():
    print(f'{column}: {null_count} null values')

gender: 0 null values
age: 0 null values
hypertension: 0 null values
heart_disease: 0 null values
ever_married: 0 null values
work_type: 0 null values
residence_type: 0 null values
avg_glucose_level: 0 null values
bmi: 0 null values
smoking_status: 0 null values
stroke: 0 null values


In [6]:
# Discard every row that has at least one missing value, then re-check the row count
df = df.dropna(axis=0, how='any')
df.shape[0]

29065

In [7]:
# Count rows that are exact repeats of an earlier row
duplicate_count = df.duplicated().sum()
print(f'Duplicate entries: {duplicate_count}')

Duplicate entries: 0


In [9]:
# Remove the id column (disabled: the loaded CSV has no `id` column, see df.dtypes above)
#df.drop(columns = ['id'], inplace = True)
#df.head()

### Fixing gender column

In [10]:
# Tally the gender categories before any filtering
gender_counts = df['gender'].value_counts()
gender_counts

Female    17852
Male      11213
Name: gender, dtype: int64

In [11]:
# Keep only the rows whose gender is not 'Other', then re-tally the categories
not_other_gender = df['gender'].ne('Other')
df = df[not_other_gender]
df['gender'].value_counts()

Female    17852
Male      11213
Name: gender, dtype: int64

### Fixing work_type column

In [12]:
# Tally the work_type categories before cleaning
work_type_counts = df['work_type'].value_counts()
work_type_counts

Private          18950
Self-employed     5204
Govt_job          4195
Never_worked       716
Name: work_type, dtype: int64

In [13]:
# Fold the 'children' category into 'Never_worked'. The replacement is applied to
# work_type only, so an identical string in any other column is left untouched
# (a frame-wide df.replace() would rewrite it everywhere).
work_type_fixes = {'children': 'Never_worked'}
df = df.assign(work_type=df['work_type'].replace(work_type_fixes))
df['work_type'].value_counts()

Private          18950
Self-employed     5204
Govt_job          4195
Never_worked       716
Name: work_type, dtype: int64

### Fixing Residence_type column

In [15]:
# Tally the residence_type categories
residence_type_counts = df['residence_type'].value_counts()
residence_type_counts

Urban    14592
Rural    14473
Name: residence_type, dtype: int64

In [16]:
# Normalise the column name to lower case. rename() returns a new frame, so the
# result is assigned back instead of mutating df in place. Re-running the cell is
# harmless: a 'Residence_type' label that is not present is simply ignored.
df = df.rename(columns={'Residence_type': 'residence_type'})
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
1,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
2,Female,52.0,0,0,Yes,Private,Urban,77.59,17.7,formerly smoked,0
3,Female,75.0,0,1,Yes,Self-employed,Rural,243.53,27.0,never smoked,0
4,Female,32.0,0,0,Yes,Private,Rural,77.67,32.3,smokes,0


### Looking at other categorical columns

In [17]:
# Tally the ever_married categories
ever_married_counts = df['ever_married'].value_counts()
ever_married_counts

Yes    21687
No      7378
Name: ever_married, dtype: int64

In [18]:
# Tally the smoking_status categories
smoking_status_counts = df['smoking_status'].value_counts()
smoking_status_counts

never smoked       15746
formerly smoked     7093
smokes              6226
Name: smoking_status, dtype: int64

## Encoding categorical variables

In [19]:
# Collect the names of the object-typed (categorical) columns, in column order
df_cat = [name for name, dtype in df.dtypes.items() if dtype == "object"]

In [20]:
# Create a OneHotEncoder that returns a dense array.
# The keyword was renamed from `sparse` to `sparse_output` in scikit-learn 1.2 and
# the old spelling was later removed, so try the new name first and fall back to
# the old one on releases that do not know it.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)

# Fit the encoder and build the encoded DataFrame.
# - columns: the generated "<column>_<category>" feature names
# - index:   df's own index, so each encoded row keeps the label of its source
#            row and later index-based joins stay aligned even if df's index
#            is not a plain 0..n-1 range
encode_df = pd.DataFrame(
    enc.fit_transform(df[df_cat]),
    columns=enc.get_feature_names_out(df_cat),
    index=df.index,
)
encode_df.head()

Unnamed: 0,gender_Female,gender_Male,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,residence_type_Rural,residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
1,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
4,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0


In [21]:
# Merge one-hot encoded features and drop the originals.
# encode_df was built row-by-row from df[df_cat], so its rows are in the same
# order as df's rows; re-label them with df's index before joining so that an
# index with gaps cannot silently drop or mismatch rows.
# drop() is called with columns= because the positional axis argument is
# deprecated (FutureWarning) and rejected by pandas 2.x.
encoded_df = (
    df.merge(
        encode_df.set_axis(df.index, axis=0),
        left_index=True,
        right_index=True,
    )
    .drop(columns=df_cat)
)
encoded_df.head()

  encoded_df = df.merge(encode_df, left_index=True, right_index=True).drop(df_cat, 1)


Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,residence_type_Rural,residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,58.0,1,0,87.96,39.2,0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
1,70.0,0,0,69.04,35.9,0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
2,52.0,0,0,77.59,17.7,0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
3,75.0,0,1,243.53,27.0,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
4,32.0,0,0,77.67,32.3,0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0


# Connection to database

In [27]:
# Postgres host, port, username, password, and database name.
# The password is taken from the local config module (db_password, imported at
# the top of the notebook) so that no secret is stored in the notebook itself.
pg_ip_address = 'database.czuikfxo2wqk.us-west-1.rds.amazonaws.com'
pg_port = '5432'
pg_username = 'postgres'
pg_password = db_password
pg_db_name = 'heart_disease'

# Connection URL with the login information. The password is percent-encoded so
# that characters such as '@', ':' or '/' cannot break URL parsing.
db_string = (
    f'postgresql://{pg_username}:{quote_plus(pg_password)}'
    f'@{pg_ip_address}:{pg_port}/{pg_db_name}'
)

# Create the connection
engine = create_engine(db_string)

In [28]:
# Write the cleaned data to the 'stroke_data' table.
# if_exists='replace' drops and recreates the table, so the cell can be re-run
# (e.g. Restart & Run All); the default, 'fail', raises ValueError once the
# table already exists.
df.to_sql(name='stroke_data', con=engine, if_exists='replace')

65

In [29]:
# Write the one-hot encoded data to the 'encoded_data' table.
# if_exists='replace' drops and recreates the table, so the cell can be re-run
# (e.g. Restart & Run All); the default, 'fail', raises ValueError once the
# table already exists.
encoded_df.to_sql(name='encoded_data', con=engine, if_exists='replace')

65

# ML Model

In [30]:
# Separate the stroke outcome (target) from the feature columns
y = encoded_df["stroke"]
X = encoded_df.drop(columns=["stroke"])

# Hold out a test set; stratifying on y preserves the class proportions in both splits
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)

In [31]:
# Standardise the features.
# The scaler is fitted on the training split only and then applied to both
# splits, so no statistics from the test rows are used when fitting.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [32]:
# Logistic regression classifier (lbfgs solver, at most 200 iterations)
log_classifier = LogisticRegression(max_iter=200, solver="lbfgs")

# Fit on the scaled training features
log_classifier.fit(X_train_scaled, y_train)

# Score the predictions on the held-out test split.
# NOTE(review): if the stroke class is rare, accuracy alone can look high for a
# model that rarely predicts it; consider also checking recall or a confusion matrix.
y_pred = log_classifier.predict(X_test_scaled)
log_accuracy = accuracy_score(y_test, y_pred)
print(f" Logistic regression model accuracy: {log_accuracy:.3f}")

 Logistic regression model accuracy: 0.981


In [34]:
# Support Vector Machine classifier with scikit-learn's default settings
svm_classifier = SVC()

# Fit on the scaled training features
svm_classifier.fit(X_train_scaled, y_train)

# Score the predictions on the held-out test split
svm_pred = svm_classifier.predict(X_test_scaled)
svm_accuracy = accuracy_score(y_test, svm_pred)
print(f" SVM model accuracy: {svm_accuracy:.3f}")

 SVM model accuracy: 0.981
