In [None]:
import getpass
from urllib.parse import quote_plus

import hvplot.pandas
import pandas as pd
import plotly.express as px
import psycopg2
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sqlalchemy import create_engine

In [None]:
# Reading Data
# Ask for the Postgres username. The prompt advertises "postgres" as the
# default, so an empty (or whitespace-only) answer falls back to it.
username = input("What is your Postgres Username? (postgres by default)").strip() or 'postgres'

In [None]:
# Reading Data
# Prompt for the Postgres password via getpass so it is not stored in the
# notebook source.
password = getpass.getpass("What is your Postgres Password?")

In [None]:
# Reading Data
# Ask for the Postgres port. Surrounding whitespace is removed and a
# non-numeric answer is rejected here, with a clear message, instead of
# failing later while the connection URL is parsed. An empty answer is
# still allowed and passed through unchanged.
port = input("What is your Postgres Port number?").strip()
if port and not port.isdigit():
    raise ValueError(f"Postgres port must be numeric, got {port!r}")

In [None]:
# Reading Data
# Build the SQLAlchemy connection URL for the SanAntonio_Stroke_Pred database.
# Username and password are percent-encoded so characters such as '@', ':'
# or '/' inside the credentials cannot be mistaken for URL delimiters.
conn = (
    f'postgresql://{quote_plus(username)}:{quote_plus(password)}'
    f'@localhost:{port.strip()}/SanAntonio_Stroke_Pred'
)

In [None]:
# Create the SQLAlchemy engine from the connection URL built in `conn`.
engine = create_engine(conn)

In [None]:
# Get Data
# Inner-join the personal and medical tables on their shared "Identifier"
# column and pull the demographic, medical and Stroke columns in one query.
stroke_query = 'SELECT personal."Identifier", medical."Age", \
                           medical."Gender", personal."Work_Type", personal."Residence_Type", \
                           personal."Ever_Married", medical."Hypertension", medical."Heart_Disease",\
                           medical."Avg_Glucose_Lvl", medical."BMI", medical."Smoker", personal."Stroke"\
                                FROM personal\
                                INNER JOIN medical\
                                ON personal."Identifier" = medical."Identifier";'

# Run the query through the already-created engine rather than the raw URL
# string, so the engine built for this notebook is the one actually used.
medical_df = pd.read_sql_query(stroke_query, engine)

print(medical_df.shape)
medical_df.head(10)

In [None]:
# Drop ID# column
# Reassign instead of mutating in place, and tolerate the column already
# being gone, so this cell can be re-run without raising KeyError.
medical_df = medical_df.drop(columns=['Identifier'], errors='ignore')
print(medical_df.shape)
medical_df.head()

In [None]:
# Classify every row's diabetes status from its average glucose level:
#   glucose <= 99.0           -> 'Normal'
#   99.0 < glucose <= 125.0   -> 'Prediabetic'
#   glucose > 125.0           -> 'Diabetic'
# pd.cut produces exactly one entry per row (a missing glucose value stays
# NaN), so the result can never fall out of step with medical_df the way a
# loop that skips unmatched values would.
glucose_bins = [float('-inf'), 99.0, 125.0, float('inf')]
glucose_labels = ['Normal', 'Prediabetic', 'Diabetic']
glucose_status = (
    pd.cut(medical_df['Avg_Glucose_Lvl'], bins=glucose_bins, labels=glucose_labels)
    .astype(object)  # plain strings (object dtype) rather than a Categorical
    .tolist()
)

# Confirm the list has one entry per medical_df row
print(len(glucose_status))
# Show only a sample instead of dumping every entry into the notebook output
glucose_status[:10]

In [None]:
# Wrap the status list in a single-column DataFrame named 'Glucose_Status'
glucose_status_df = pd.DataFrame({'Glucose_Status': glucose_status})
glucose_status_df.head(10)

In [None]:
# Count missing entries per column; every count should be zero
glucose_status_df.isna().sum()

In [None]:
# Add glucose_status diabetes values to medical_df (joined on the row index).
# Any Glucose_Status column left over from a previous run of this cell is
# dropped first; otherwise the merge would produce Glucose_Status_x / _y.
medical_df = (
    medical_df
    .drop(columns=['Glucose_Status'], errors='ignore')
    .merge(glucose_status_df, left_index=True, right_index=True)
)
print(medical_df.shape)
medical_df.head(10)

In [None]:
# Collect the names of all object-dtype columns; these are the ones to encode
categorical_columns = [
    column for column, dtype in medical_df.dtypes.items() if dtype == 'object'
]
categorical_columns

In [None]:
# Create OneHotEncoder instance.
# The encoder is left at its default output format and the result is densified
# explicitly below: the `sparse=` keyword was renamed to `sparse_output=` in
# newer scikit-learn releases, so avoiding it keeps this cell version-agnostic.
enc = OneHotEncoder()

# Fit & transform OneHotEncoder using categorical columns
encoded_values = enc.fit_transform(medical_df[categorical_columns])
if hasattr(encoded_values, 'toarray'):
    encoded_values = encoded_values.toarray()

# Resolve column names; `get_feature_names` was replaced by
# `get_feature_names_out`, so use whichever this scikit-learn provides.
if hasattr(enc, 'get_feature_names_out'):
    encoded_names = enc.get_feature_names_out(categorical_columns)
else:
    encoded_names = enc.get_feature_names(categorical_columns)

# Reuse medical_df's index so later index-based merges line up row for row
encode_df = pd.DataFrame(encoded_values, columns=encoded_names, index=medical_df.index)
print(encode_df.shape)
encode_df.head(10)

In [None]:
# Print encode_df's column names, dtypes and non-null counts
encode_df.info()

In [None]:
# Drop redundant columns: for each two-valued feature one indicator column
# is enough, so the complementary one is removed.
# Reassigning (instead of inplace=True) and ignoring already-removed columns
# lets this cell be re-run without raising KeyError.
redundant_columns = ['Gender_Female', 'Ever_Married_No', 'Residence_Type_Rural']
encode_df = encode_df.drop(columns=redundant_columns, errors='ignore')
encode_df.info()

In [None]:
# Merge encoded df with medical df on the row index, then drop the original
# categorical columns so only numeric columns remain.
numerical_medical_df = (
    medical_df
    .merge(encode_df, left_index=True, right_index=True)
    .drop(columns=categorical_columns)
)
# Inspect the frame that was just built (not the untouched medical_df)
print(numerical_medical_df.shape)
numerical_medical_df.head(10)

In [None]:
# Create SimpleImputer instance that replaces missing values with the column mean.
# NOTE(review): this comment used to say "median BMI" while the code uses
# strategy='mean' — confirm which statistic is intended before changing the code.
# The imputer is fitted on every column of numerical_medical_df; presumably
# only BMI actually contains missing values — TODO confirm.
imputer = SimpleImputer(strategy='mean')

# Fit SimpleImputer & transform data
med_transformed = imputer.fit_transform(numerical_medical_df)

In [None]:
# Wrap the imputed values in a DataFrame, reusing numerical_medical_df's column labels
med_df_transformed = pd.DataFrame(
    data=med_transformed,
    columns=numerical_medical_df.columns,
)
print(med_df_transformed.shape)
med_df_transformed.head(10)

In [None]:
# Count remaining missing values per column after imputation
med_df_transformed.isna().sum()

In [None]:
# Feature matrix: every column except the Stroke column
X_df = med_df_transformed.drop('Stroke', axis='columns')
print(X_df.shape)
X_df.head(10)

In [None]:
# Standardize every feature column with StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_df)
print(X_scaled[:5])

In [None]:
# Project the scaled features onto three principal components.
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X=X_scaled)
print(X_pca)

In [None]:
# Label the three principal components 'PC 1' .. 'PC 3' and wrap them in a DataFrame.
pc_labels = [f'PC {number}' for number in range(1, 4)]
X_pca_df = pd.DataFrame(X_pca, columns=pc_labels)
X_pca_df.head()

In [None]:
# Attach the principal components to the feature frame (joined on row index).
# PC columns left over from a previous run of this cell are dropped first;
# otherwise the merge would create 'PC 1_x' / 'PC 1_y' and break later cells
# that look the columns up by their plain names.
X_df = (
    X_df
    .drop(columns=X_pca_df.columns, errors='ignore')
    .merge(X_pca_df, left_index=True, right_index=True)
)
print(X_df.shape)
X_df.head(10)

In [None]:
# Build a two-cluster K-Means model (random_state fixed at 0) and fit it on
# the principal components in one step; fit() returns the estimator itself.
model = KMeans(n_clusters=2, random_state=0).fit(X_pca_df)

# Assign each row to a cluster using the fitted model
predictions = model.predict(X_pca_df)
predictions

In [None]:
# Store each row's K-Means cluster label in a 'class' column
X_df = X_df.assign(**{'class': model.labels_})
print(X_df.shape)
X_df.head(10)

In [None]:
# Number of rows assigned to each cluster
X_df.loc[:, 'class'].value_counts()

In [None]:
# 3D scatter of the three principal components; both colour and marker
# symbol are driven by the cluster label in 'class'.
fig = px.scatter_3d(X_df, x='PC 1', y='PC 2', z='PC 3', color='class', symbol='class', width=800)

# Place the legend at x=0, y=1 of the figure
fig.update_layout(legend={'x': 0, 'y': 1})
fig.show()

In [None]:
# Attach the cluster label to the original (un-encoded) records and keep
# only the columns listed below, in this order.
report_columns = [
    'Age', 'Gender', 'Work_Type', 'Residence_Type', 'Ever_Married',
    'Hypertension', 'Heart_Disease', 'Avg_Glucose_Lvl', 'BMI',
    'Glucose_Status', 'Stroke', 'class',
]
medical_df = medical_df.assign(**{'class': model.labels_})[report_columns]
medical_df.head(10)

In [None]:
# Print the value distribution of the cluster label, then of the Stroke flag
for column_name in ('class', 'Stroke'):
    print(medical_df[column_name].value_counts())

In [None]:
# Rows with Stroke == 1 that landed in cluster 0.
# NOTE(review): calling these "false negatives" assumes cluster 0 means
# "no stroke"; K-Means cluster numbering is arbitrary — confirm the mapping.
false_negatives = medical_df[medical_df['Stroke'].eq(1) & medical_df['class'].eq(0)]
false_negatives

In [None]:
# Rows with Stroke == 1 that landed in cluster 1.
# NOTE(review): calling these "true positives" assumes cluster 1 means
# "stroke"; K-Means cluster numbering is arbitrary — confirm the mapping.
true_positives = medical_df[medical_df['Stroke'].eq(1) & medical_df['class'].eq(1)]
true_positives

In [None]:
# Rows with Stroke == 0 that landed in cluster 1.
# NOTE(review): calling these "false positives" assumes cluster 1 means
# "stroke"; K-Means cluster numbering is arbitrary — confirm the mapping.
false_positives = medical_df[medical_df['Stroke'].eq(0) & medical_df['class'].eq(1)]
false_positives

In [None]:
# Rows with Stroke == 0 that landed in cluster 0.
# NOTE(review): calling these "true negatives" assumes cluster 0 means
# "no stroke"; K-Means cluster numbering is arbitrary — confirm the mapping.
true_negatives = medical_df[medical_df['Stroke'].eq(0) & medical_df['class'].eq(0)]
true_negatives

In [None]:
# Recall for the positive class: TP / (TP + FN)
n_true_positives = len(true_positives)
n_false_negatives = len(false_negatives)
positive_recall = n_true_positives / (n_true_positives + n_false_negatives)
positive_recall

In [None]:
# Accuracy: (TP + TN) / total rows.
# The row counts are added as integers. Adding the two DataFrames themselves
# would perform an element-wise, index-aligned addition and only yield the
# right length as a side effect of their indexes being disjoint.
accuracy = (len(true_positives) + len(true_negatives)) / len(medical_df)
accuracy