# **Pre-processing and Data Training**

In [2]:
# Import relevant Libraries and load cleaned csv file
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

df = pd.read_csv('../data/healthcare-dataset-stroke-data-cleaned.csv')

In [3]:
# Printing head
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.1,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


## *Encoding Data*

In [5]:
# Creating dummy variables
df_encoded = pd.get_dummies(df, drop_first=True)

# Verifying results
df_encoded.head()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Male,gender_Other,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,9046,67.0,0,1,228.69,36.6,1,True,False,True,False,True,False,False,True,True,False,False
1,51676,61.0,0,0,202.21,28.1,1,False,False,True,False,False,True,False,False,False,True,False
2,31112,80.0,0,1,105.92,32.5,1,True,False,True,False,True,False,False,False,False,True,False
3,60182,49.0,0,0,171.23,34.4,1,False,False,True,False,True,False,False,True,False,False,True
4,1665,79.0,1,0,174.12,24.0,1,False,False,True,False,False,True,False,False,False,True,False


## *Scaling Data*

In [7]:
# Creating X and y variables
X = df_encoded.drop("stroke", axis=1).values
y = df_encoded["stroke"].values

# Initiating scaler object
scaler = StandardScaler()

# Fitting to data
X = scaler.fit_transform(X)

# Formatting data in a DataFrame and attaching 'stroke' column on the end
df_scaled = pd.DataFrame(X, columns=df_encoded.drop("stroke", axis=1).columns)
df_scaled["stroke"] = df_encoded["stroke"]

# Verifying results
df_scaled.head()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,gender_Male,gender_Other,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,stroke
0,-1.298312,1.051434,-0.328602,4.185032,2.706375,1.005086,1.18999,-0.01399,0.723884,-0.065756,0.864297,-0.436881,-0.394112,0.98408,2.184951,-0.766774,-0.427313,1
1,0.716371,0.78607,-0.328602,-0.238947,2.121559,-0.098981,-0.840343,-0.01399,0.723884,-0.065756,-1.15701,2.288955,-0.394112,-1.016178,-0.457676,1.304165,-0.427313,1
2,-0.255478,1.62639,-0.328602,4.185032,-0.005028,0.472536,1.18999,-0.01399,0.723884,-0.065756,0.864297,-0.436881,-0.394112,-1.016178,-0.457676,1.304165,-0.427313,1
3,1.118363,0.255342,-0.328602,-0.238947,1.437358,0.719327,-0.840343,-0.01399,0.723884,-0.065756,0.864297,-0.436881,-0.394112,0.98408,-0.457676,-0.766774,2.340204,1
4,-1.647136,1.582163,3.043196,-0.238947,1.501184,-0.631531,-0.840343,-0.01399,0.723884,-0.065756,-1.15701,2.288955,-0.394112,-1.016178,-0.457676,1.304165,-0.427313,1


In [8]:
# Saving encoded and scaled DataFrame to csv file
df_scaled.to_csv('../data/healthcare-dataset-stroke-data-scaled.csv', index=False)

## *Train/Test Split*

In [10]:
# Splitting data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

## *Storing variables*

In [12]:
# Stroring variables to use in final notebook
%store X_train
%store X_test
%store y_train
%store y_test

# Deleting variables from memory
del X_train
del X_test
del y_train
del y_test

Stored 'X_train' (ndarray)
Stored 'X_test' (ndarray)
Stored 'y_train' (ndarray)
Stored 'y_test' (ndarray)
