# <center> Stroke Prediction </center>
#### <center> Zach Hanson </center>

## Import Libraries and Data

### Libraries

In [135]:
#Pandas, numpy, matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#Seaborn for better visual on graphs
import seaborn as sns
#Default theme as seaborn
sns.set_theme()

#Transformers
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

#Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

#Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticeRegression
from sklearn.neighbors import KNeighborsClassifier

#PCA
from sklearn.decomposition import PCA

#Evaluation Metrics
from sklearn.metrics import classification_report

#Setting global SciKit-Learn Configuration
#Easier to visualize pipelines
from sklearn import set_config
set_config(display='diagram')

ImportError: cannot import name 'LogisticeRegression' from 'sklearn.linear_model' (C:\Users\Zach\anaconda3\envs\dojo-env\lib\site-packages\sklearn\linear_model\__init__.py)

### Data

In [None]:
#Loading in data
filename = 'healthcare-dataset-stroke-data.csv'
df = pd.read_csv(filename)

In [None]:
#Checking to see if loaded properly
df.head()

- Looks to be loaded properly

## Data Cleaning

### Removing Unnecessary Columns

In [None]:
df = df.drop(columns = 'id')
df.head()

- ID column was dropped successfully
  - This column is not important to predicting stroke risk as each identifier will be different

### Removing Unnecessary Rows

In [None]:
duprow = df.duplicated().sum()
print(f"There are {duprow} duplicated rows.")

### Missing Values

In [None]:
df.isna().sum()

- The only missing values in this data set are in the "bmi" column.
- This will be addressed later using an imputer during our model creations.

### Inconsistencies in Data

#### Gender

In [None]:
df['gender'].value_counts()

- No inconsistencies in this column.

#### Ever Married

In [None]:
df['ever_married'].value_counts()

- No inconsistencies in this column.

#### Work Type

In [None]:
df['work_type'].value_counts()

- No inconsistencies in this column.

#### Residence Type

In [None]:
df['Residence_type'].value_counts()

- No inconsistencies in this column.

#### Smoking Status

In [None]:
df['smoking_status'].value_counts()

- No inconsistencies in this column.

## Exploratory Analysis

In [None]:
#Copying data frame to prevent data leakage
ea_df = df.copy()

#Checking to see if it copied correctly
ea_df.head()

- Looks to be copied correctly

### Categorical Features

In [None]:
#Creating Subplot for features
fig, ax = plt.subplots(nrows = 2,              #Number of Rows
                         ncols = 3,            #Number of Columns
                         figsize = (20, 15))   #Size of each subplot


#Adding Gender subplot
sp1 = sns.histplot(ax = ax[0][0], data = ea_df, x = 'gender');

#Adding married subplot
sp2 = sns.histplot(ax = ax[0][1], data = ea_df, x = 'ever_married')

#Adding work type subplot
sp3 = sns.histplot(ax = ax[0][2], data = ea_df, x = 'work_type')

#Adding residence type subplot
sp4 = sns.histplot(ax = ax[1][0], data = ea_df, x = 'Residence_type')

#adding smoking status subplot
sp5 = sns.histplot(ax = ax[1][1], data = ea_df, x = 'smoking_status')


#Adding space between subplots for readability
fig.subplots_adjust(hspace = 0.5)

#Rotating tick labels for readability
sp1.set_xticklabels(sp1.get_xticklabels(),
                    rotation = 45);
sp2.set_xticklabels(sp2.get_xticklabels(),
                    rotation = 45);
sp3.set_xticklabels(sp3.get_xticklabels(),
                    rotation = 45);
sp4.set_xticklabels(sp4.get_xticklabels(),
                    rotation = 45);
sp5.set_xticklabels(sp5.get_xticklabels(),
                    rotation = 45);


#Since we only need 5 subplots, hiding 6th one
ax[1][2].set_visible(False)
#Moving bottom two subplots to be more centered for better viewing
ax[1][0].set_position([0.24, 0.125, 0.228, 0.343])
ax[1][1].set_position([0.55, 0.125, 0.228, 0.343])

- Code for the positioning of subplots 4 and 5 was taken from: https://stackoverflow.com/questions/26767281/position-5-subplots-in-matplotlib

- Gender
    - We have a fairly even split of male/female, with slightly more females.
- Ever Married
    - We have about double the amount of people who were married vs. not married.
- Job Type
    - Many of the responses chose to not disclose their job type
    - People who did respond have a very even split between self employed, government job, and stay-at-home parents, with a very small amount of people who have not worked.
- Residence Type
    - Very even split between urban and rural residence
- Smoking Status
    - About half of the people who responded to this have never smoked
    - The other half is fairly evenly split between currently smokes and formerly smokes

### Numerical Features

In [None]:
#Creating Subplot for features
fig, ax = plt.subplots(nrows = 3,            #Number of Rows
                       ncols = 1,            #Number of Columns
                       figsize = (15, 10))   #Size of each subplot

#Adjusting padding between the 3 subplots for readability
fig.subplots_adjust(hspace=0.5)


#Adding Age subplot
sns.boxplot(ax=ax[0], data=ea_df, x='age', notch=True);

#Adding avg Glucose Level subplot
sns.boxplot(ax=ax[1], data=ea_df, x='avg_glucose_level', notch=True);

#Adding bmi subplot
sns.boxplot(ax=ax[2], data=ea_df, x='bmi', notch=True);

- Age
    - Fairly even distribution, some very low values that may need to be looked at more
    - Could just be thorough sampling and including a very young baby
- Average Glucose Level
    - Majority of data lies between about 50 and 170
    - Quite a few outliers reaching over 250
    - Mean of around 90, 50% of the data between about 75 and 115
- BMI
    - Majority of data between about 5 and 50
    - Outliers for the most part are not much higher than the max of our boxplot
    - There are a few outliers up around 90 which may need to be addressed, might not be realistic value
    - Most of the data appears to be normal, with the average BMI in the high 20s

In [None]:
#Changing hypertension and heart disease responses from 1/0 to yes/no
ea_df['hypertension'] = ea_df['hypertension'].replace({0: "No",
                                                       1: "Yes"})
ea_df['heart_disease'] = ea_df['heart_disease'].replace({0: "No",
                                                         1: "Yes"})
ea_df.head()

In [None]:
#Creating Subplot for features
fig, ax = plt.subplots(nrows = 1,            #Number of Rows
                       ncols = 2,            #Number of Columns
                       figsize = (15, 10))   #Size of each subplot

#Adding hypertension to subplots
sns.histplot(ax=ax[0], data=ea_df, x='hypertension');

#Adding heart disease to subplots
sns.histplot(ax=ax[1], data=ea_df, x='heart_disease');

- Hypertension
    - Majority of responses had no hypertension, may need to be considered during modeling later
- Heart Disease
    - Again, majority of responses had no history of heart disease. Also may need to be considered during modeling

### Target

In [None]:
#Changing 0's and 1's to "yes" or "no" for graphing
ea_df['stroke'] = ea_df['stroke'].replace({0: "No",
                                           1: "Yes"})

ea_df['stroke'].head()

In [None]:
#Plotting
sns.histplot(data=ea_df, x='stroke');

- We can see the large majority of the people did not have a stroke
    - This will need to be taken into account during machine learning models

In [None]:
ea_df['stroke'].value_counts()

### Multivariate Analysis


In [None]:
#Copying original dataframe
mv_df = df.copy()
mv_df.head()

In [None]:
#Changing married column to 1 for yes, 0 for no
mv_df['ever_married'] = mv_df['ever_married'].replace({'Yes': 1,
                                                 'No': 0})

mv_df.head()

#### Heatmap for Correlations

In [None]:
#Finding correlations between numerical variables
corr = mv_df[['age', 'hypertension', 'heart_disease', 'ever_married',
              'avg_glucose_level', 'bmi', 'stroke']].corr()

#Creating heatmap of correlations
sns.heatmap(corr, annot = True)

#rotating and shifting labels for readability
plt.xticks(rotation=45,ha = 'right');

- No large correlations between our target, stroke, and the other features of our data
    - Weak positive correlation between age and stroke
- Strong positive correlation between age and whether or not the person was married
- Weak positive correlation between age and bmi
- Weak positive correlation between bmi and if the person was married

#### Multivariate Visualizations

In [None]:
ea_df.groupby('stroke')['work_type'].value_counts().plot(kind='bar');
plt.xticks(rotation=45, ha='right');

In [None]:
ea_df.groupby('stroke')['Residence_type'].value_counts().plot(kind='bar');
plt.xticks(rotation=45, ha='right');

In [None]:
ea_df.groupby('stroke')['smoking_status'].value_counts().plot(kind='bar');
plt.xticks(rotation=45, ha='right');

- Work Type
    - People who recorded a "Private" work type tended to have the highest number of strokes, could be skewed because it is such a large range of jobs
    - Self-Employed people had a very small amount more strokes than people who worked a government job
    - People who worked with children recorded very little (2) strokes
- Residence Type
    - There seems to be extremely little if any correlation between residence type and whether or not the person had a stroke or not
    - Very even split between rural and urban
- Smoking Status
    - If we combine people who have or have previously smoked, there is a small amount more of these people who have had strokes versus non smokers

### Key Trends

#### Trend 1

In [None]:
#Initial graph
ea_df.groupby('stroke')['smoking_status'].value_counts().unstack().plot(
                                kind='barh', color=['#E80000',    #Red
                                                    '#1E73BE',    #Blue
                                                    '#C850B0',    #Pinkish
                                                    '#660A60']);  #Purple
                                                    

#Adding label to x axis
plt.xlabel('Count');

#Rotating y label
plt.ylabel('Stroke', rotation=0, labelpad=25);

#Adding title
plt.title('Stroke vs. Smoking Status Among Respondants');

- People that have had a stroke are slightly more likely to currently or formerly smoke than non smokers.
- More non-smokers have not had strokes compared to current or former smokers.
- Unknown entries may swing the data more drastically in one direction, especially with the small amount of data we have for people who have had a stroke.

#### Trend 2

In [None]:
#Getting median value of age for both stroke categories
ea_df.groupby('stroke')['age'].median().plot(kind='bar')

#Making x label capitalized
plt.xlabel('Stroke', labelpad=10)
#Rotatingi x tick labels for readability
plt.xticks(rotation=0)

#Adding y label
plt.ylabel('Age', rotation=0, labelpad=25)

#Adding title
plt.title('Median Age of People who Suffered a Stroke vs. No Stroke');

- People who did not have a stroke have a fairly average age of just over 40.
- People who did have a stroke have a median age of just over 70.
- We can see the median age of people who have had a stroke is much higher, about 30 years, than people who have not had a stroke. 

## Machine Learning

### Preparing for Preprocessing

In [None]:
#Copying data to prevent leakage
ml_df = df.copy()
ml_df.head()

In [None]:
ml_df.info()

- Looks to have copied successfully 

- None of our data requires ordinal encoding so we can OneHotEncode all of our categorical data.
- Want to update a few columns from integer datatype to object because they do not need to be scaled (Hypertension, etc)

In [None]:
#Updating datatypes
ml_df['hypertension'] = ml_df['hypertension'].astype(object)
ml_df['heart_disease'] = ml_df['heart_disease'].astype(object)
ml_df['stroke'] = ml_df['stroke'].astype(object)

ml_df.info()

- Converted successfully

### Preprocessing

#### Defining Target and Features

In [None]:
target = 'stroke'
y = ml_df[target].copy()
X = ml_df.drop(columns = [target]).copy()

#### Splitting Data

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

#### Column Selectors

In [None]:
#Numerical selector
num_selector = make_column_selector(dtype_include='number')

#Categorical selector
cat_selector = make_column_selector(dtype_include='object')

#### Transformers

In [None]:
#Instantiating Transformers, scaler for num, OHE for cat
scaler = StandardScaler()
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

#### Tuples

In [None]:
#Numerical tuple
num_tuple = (scaler, num_selector)

#Categorical tuple
cat_tuple = (ohe, cat_selector)

- Do not need to make pipelines here because we do not need to impute any data, can skip ahead to the tuples for column transformer

#### Column Transformer

In [None]:
preprocessor = make_column_transformer(num_tuple,
                                       cat_tuple,
                                       remainder='drop')
preprocessor

### Models

#### Decision Tree

In [None]:
#Instantiate Decision Tree
dec_tree = DecisionTreeClassifier(random_state=42)

#### Logistic Regression

In [None]:
#Instantiate Logistic Regression
logreg = LogisticRegression(random_state=42)

#### KNN

In [None]:
#Instantiate KNN
knn = KNeighborsClassifier()

#### Dec tree or log reg (based on performance) with PCA

In [None]:
#Instantiate PCA
pca = PCA(n_components = 0.95)

- Retaining 95% of the variance

#### KNN with PCA

- Instantiated PCA above as "pca", do not need to instantiate again