In [None]:
#Abhi Vinnakota and Gnandeep Chintala
#Quarter 3 Project
#3-24-21

# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Reading Dataset

In [None]:
# Load the stroke dataset from the notebook's working directory.
# A forward-slash path avoids the invalid "\h" escape sequence and resolves on
# Windows, macOS and Linux alike.
stroke_df = pd.read_csv("./healthcare-dataset-stroke-data.csv")
# 'Unknown' is a placeholder rather than a real smoking category, so mark it as
# missing. Assigning the result back (instead of inplace=True on a column
# selection) guarantees the DataFrame itself is updated.
stroke_df['smoking_status'] = stroke_df['smoking_status'].replace('Unknown', np.nan)
stroke_df.head()

# Data Cleaning

In [None]:
# Class balance of the target column: absolute counts, then relative shares,
# followed by the total number of rows in the dataset.
stroke_counts = stroke_df['stroke'].value_counts()
stroke_shares = stroke_df['stroke'].value_counts(normalize=True)
n_samples = len(stroke_df)

print(stroke_counts, '\n')
print(stroke_shares)
print("Samples:", n_samples)

### Therefore, there are 5110 total stroke data samples, with only about 5% or 249 being positive stroke results. These rows are valuable, and it would be detrimental to remove these rows when cleaning the data. 

In [None]:
# Number of missing (NaN) entries in each column.
stroke_df.isna().sum(axis=0)

### Since the smoking_status feature has 1544 missing values, which is more than 20 percent of the total samples, it would not be beneficial to remove just those missing rows. Instead, removing the entire smoking_status feature may result in a better analysis. Similarly, removing the bmi column entirely would also be useful. It would not help to remove all of these missing value rows, because they may contain many of the positive stroke results, which would negatively impact the classification. For this reason, we decided to drop both smoking_status and bmi, even though these are factors that could affect a stroke. 

# Preprocessing Steps

In [None]:
# Target: the binary stroke indicator.
y = stroke_df['stroke']
# Features: everything that remains once the identifier, the sparsely populated
# columns, the categorical columns and the target itself are removed.
X = stroke_df.drop(
    columns=[
        'id', 'ever_married', 'bmi', 'smoking_status',
        'work_type', 'gender', 'Residence_type', 'stroke',
    ]
)

In [None]:
# Preview the first five rows of the selected feature matrix.
X.head(n=5)

### We chose these features for classification after considering which columns would be most useful. After observing that the bmi and smoking_status columns had many missing values, we decided to drop both of them because we did not want to sacrifice samples that could be useful during the classification. The id column was dropped because it was simply not needed. The rest of the columns were dropped because they did not seem very relevant to the stroke data. For example, even though one's work type could influence their chance of getting a stroke, it is an external factor that isn't a measure of someone's health. Therefore, we chose to exclude such features and instead focus on features such as age that have the most impact on the chance of a stroke. 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20 percent of the samples for testing; the fixed seed keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

sc = StandardScaler()
# Learn the scaling parameters (mean / standard deviation) from the training set only...
X_train = sc.fit_transform(X_train)
# ...and re-use them for the test set. Re-fitting the scaler on the test data
# would put the two sets on different scales and leak test-set statistics
# into the evaluation.
X_test = sc.transform(X_test)

# Prediction of Stroke Occurrence with Classification:

## Method 1: KNeighbors Classifier

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

# Fit a 5-nearest-neighbours classifier on the scaled training data.
# fit() hands back the estimator itself, so `classifier` and `knn_clf`
# refer to the same fitted model.
knn_clf = KNeighborsClassifier(n_neighbors=5)
classifier = knn_clf.fit(X_train, y_train)

# Predict the held-out samples.
y_pred = classifier.predict(X_test)

# Report, in order: confusion matrix, per-class precision/recall/F1, accuracy.
for report in (confusion_matrix, classification_report, accuracy_score):
    print(report(y_test, y_pred))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2.
# Building a ConfusionMatrixDisplay from an explicitly computed matrix produces
# the same figure and works on both older and current releases.
cm = confusion_matrix(y_test, classifier.predict(X_test), labels=classifier.classes_)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classifier.classes_).plot()

### The KNeighbors Classifier correctly predicted 94% of the samples with a .94 accuracy value. This means that it correctly predicted 94 percent of the time that a patient would not get a stroke. Because the recall is .99, there is a high number of true positives compared to false negatives. The precision at .95 is also very high, meaning there were more true positives compared to false positives. A confusion matrix is also plotted above to visualize the overwhelming amount of true positives returned by the classifier.

## Method 2: Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Train a 500-tree random forest with a fixed seed for reproducibility.
rf_clf = RandomForestClassifier(n_estimators=500, random_state=42)
classifier = rf_clf.fit(X_train, y_train)

# Score the held-out samples.
y_pred = classifier.predict(X_test)

# One line each: confusion matrix, classification report, accuracy.
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    accuracy_score(y_test, y_pred),
    sep='\n',
)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2.
# Building a ConfusionMatrixDisplay from an explicitly computed matrix produces
# the same figure and works on both older and current releases.
cm = confusion_matrix(y_test, classifier.predict(X_test), labels=classifier.classes_)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classifier.classes_).plot()

### The Random Forest classifier had similar results to the KNeighbors classifier with a .98 recall score and .95 precision score. It ran almost as well as the KNeighbors but just a little bit worse. For this reason, we concluded that the KNeighbors classifier best predicted whether the patient had a stroke or not. As shown by the confusion matrix, the Random Forest classifier also did well at getting true positives. 

# Predicting Average Glucose Level with Regression:

## Preprocessing Steps

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Regression target: the average glucose level.
y = stroke_df['avg_glucose_level']
# Predictors: the remaining columns after removing the identifier, the sparse
# and categorical columns, and the target itself.
X = stroke_df.drop(
    columns=[
        'id', 'ever_married', 'bmi', 'smoking_status',
        'work_type', 'gender', 'Residence_type', 'avg_glucose_level',
    ]
)

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

# Fit the scaler on the training set, then apply the same transformation to
# both the training and the test set.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

## KNeighbors Regression

In [None]:
from sklearn import metrics
from sklearn.neighbors import KNeighborsRegressor

# Fit a 5-nearest-neighbours regressor and predict the held-out glucose levels.
knn_reg = KNeighborsRegressor(n_neighbors=5)
regressor = knn_reg.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

# Error metrics; the MSE is computed once and re-used for the RMSE.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

## Random Forest Regression

In [None]:
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor

# Fit a 500-tree random forest regressor (fixed seed) and predict the test set.
rf_reg = RandomForestRegressor(n_estimators=500, random_state=42)
regressor = rf_reg.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

# Collect the error metrics, then print them in a fixed order.
mse = metrics.mean_squared_error(y_test, y_pred)
error_report = {
    'Mean Absolute Error:': metrics.mean_absolute_error(y_test, y_pred),
    'Mean Squared Error:': mse,
    'Root Mean Squared Error:': np.sqrt(mse),
}
for label, value in error_report.items():
    print(label, value)

## SVM Regression

In [None]:
from sklearn import metrics
from sklearn import svm

# Support vector regressor with scikit-learn's default settings.
svm_reg = svm.SVR()
regressor = svm_reg.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

# Same three error metrics as for the other regressors.
abs_err = metrics.mean_absolute_error(y_test, y_pred)
sq_err = metrics.mean_squared_error(y_test, y_pred)
for label, value in (
    ('Mean Absolute Error:', abs_err),
    ('Mean Squared Error:', sq_err),
    ('Root Mean Squared Error:', np.sqrt(sq_err)),
):
    print(label, value)

### After running the three regression methods, we found that SVM regression returned the lowest Mean Absolute Error, meaning that it best predicted the average glucose level of a patient based on stroke occurrence, age, and history of hypertension and heart disease. With a mean absolute error of about 29.5, the SVM regressor was able to predict the average glucose level of a patient with an error of 29.5 mmol/L.

# Data Visualization of Stroke Patient Data:

## Data Clustering to find Insights/Correlations:

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# Numerical health factors to compare against the stroke outcome.
features = ['age', 'avg_glucose_level', 'bmi']

# Sort by the stroke flag before plotting; the flag is also used as the hue.
pairplot_data = stroke_df[['stroke', *features]].sort_values(by='stroke')
sns.pairplot(pairplot_data, hue='stroke', height=2)

### The pairplot returned various plots which relate occurrence of a stroke to either BMI, age, or average glucose level. We concluded from the pairplot that age and glucose level have a large impact on stroke occurrence while BMI has the least impact. This is also supported by the clustering done below. 

In [None]:
X = stroke_df.drop(['id', 'ever_married', 'bmi', 'smoking_status', 'work_type', 'gender', 'Residence_type', 'stroke'], axis = 1)
y = stroke_df['stroke']

#dividing data into features and labels
features = X.filter(['age','avg_glucose_level'],axis = 1)
labels = y
#training KMeans model
features = features.values
# Two clusters are requested, but KMeans is fitted on the features only (the
# stroke labels are never passed in), so the cluster ids are arbitrary group
# numbers rather than stroke outcomes.
# random_state makes the clustering reproducible between runs; n_init is pinned
# so the number of restarts does not depend on the installed scikit-learn version.
km_model = KMeans(n_clusters = 2, n_init = 10, random_state = 0)
km_model.fit(features)

In [None]:
# Cluster assignment of every sample and the two fitted centroids.
cluster_ids = km_model.labels_
centroids = km_model.cluster_centers_

plt.title('Relationship between Age, Glucose Levels, and the Event of Having a Stroke')
plt.xlabel('Age')
plt.ylabel('Glucose Level')

# Samples, coloured by the cluster they were assigned to.
plt.scatter(features[:, 0], features[:, 1], c=cluster_ids, cmap='rainbow')

# Centroids, drawn on top as larger black markers.
plt.scatter(centroids[:, 0], centroids[:, 1], s=100, c='black')

### The first data clustering we ran was to find the relationship between age, average glucose level, and the occurrence of a stroke. The points in red represent a patient who had a stroke, and the points in blue represent a patient who did not have a stroke. As you can see on the plot above, there seems to be a divide between the red and blue points around the 150 glucose level mark. This must mean there is a threshold where patients with glucose levels above 150 can have a stroke. We also noticed that the density of the red points increases as the age gets higher, which supports the conclusion that older people are more at risk for a stroke.

In [None]:
# Drop only the rows whose BMI is missing, then renumber the index.
# A bare dropna() would also discard every row with a NaN in any other column
# (for example the smoking_status values that were set to NaN when the data was
# loaded), removing far more samples than intended.
stroke_df = stroke_df.dropna(subset=['bmi']).reset_index(drop=True)

stroke_df.head()

In [None]:
# Target: the binary stroke indicator.
y = stroke_df['stroke']
# Features: same selection as before, except that BMI is kept this time.
X = stroke_df.drop(
    columns=[
        'id', 'ever_married', 'smoking_status',
        'work_type', 'gender', 'Residence_type', 'stroke',
    ]
)

In [None]:
#dividing data into features and labels
features = X.filter(['avg_glucose_level','bmi'],axis = 1)
labels = y
#training KMeans model
features = features.values
# random_state makes the clustering reproducible between runs; n_init is pinned
# so the number of restarts does not depend on the installed scikit-learn version.
# The stroke labels are not passed to KMeans, so the cluster ids are arbitrary
# group numbers rather than stroke outcomes.
km_model = KMeans(n_clusters = 2, n_init = 10, random_state = 0)
km_model.fit(features)

plt.xlabel('Glucose Level')
plt.ylabel('BMI')
plt.title('Relationship between BMI, Glucose Levels, and the Event of Having a Stroke')
#plot the data points, coloured by assigned cluster
plt.scatter(features[:,0], features[:,1], c = km_model.labels_, cmap = 'rainbow')

#plot the centroids
plt.scatter(km_model.cluster_centers_[:,0], km_model.cluster_centers_[:,1], s=100, c = 'black')

### After plotting clusters relating BMI to glucose level, we found a similar threshold at 150 mmol/L where patients above this level had a stroke. We also observed that the BMI didn't seem to have a large impact on the clustering; however, it is interesting to note that the red cluster is most dense between 20 and 50 BMI. This either means that most of the patients were at this BMI level, or that strokes happen more frequently at that level.

# Conclusions

### Overall, we were happy with the results. With classification we were able to predict the occurrence of a stroke, with regression we were able to predict the glucose level based on stroke occurrence and other factors, and finally with clustering we were able to visualize and prove which factors have the most impact in the likelihood of a stroke. Our conclusions are discussed more in depth in the write up for the project.