In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Relative location of the stroke dataset CSV.
path = 'Resources/stroke_data.csv'
# Read the CSV into the DataFrame that the later cells operate on.
df = pd.read_csv(filepath_or_buffer=path)
# Show the first five rows as a quick preview.
df.head(n=5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,30669,Male,3.0,0,0,No,children,Rural,95.12,18.0,,0
1,30468,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
2,16523,Female,8.0,0,0,No,Private,Urban,110.89,17.6,,0
3,56543,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
4,46136,Male,14.0,0,0,No,Never_worked,Rural,161.28,19.1,,0


In [3]:
# Number of rows in the freshly loaded dataset.
df.shape[0]

43400

In [4]:
# Inspect the dtype pandas assigned to each column.
df.dtypes

id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [7]:
# Report the number of missing entries in every column.
# The counts are computed once for the whole frame and then printed per column.
missing_per_column = df.isnull().sum()
for column, missing in missing_per_column.items():
    print(f'{column}: {missing} null values')

id: 0 null values
gender: 0 null values
age: 0 null values
hypertension: 0 null values
heart_disease: 0 null values
ever_married: 0 null values
work_type: 0 null values
Residence_type: 0 null values
avg_glucose_level: 0 null values
bmi: 1462 null values
smoking_status: 13292 null values
stroke: 0 null values


In [8]:
# Discard every row that has a missing value in any column (the null report
# printed earlier shows these occur only in bmi and smoking_status), then
# show how many rows are left.
df = df.dropna(axis=0, how='any')
df.shape[0]

29072

In [10]:
# Count rows that are exact repeats of an earlier row.
# NOTE(review): the id column is still present at this point, so rows that
# differ only in id are not counted as duplicates - confirm this is intended.
duplicate_total = df.duplicated().sum()
print(f'Duplicate entries: {duplicate_total}')

Duplicate entries: 0


In [11]:
# Remove the id column.
# Reassign instead of mutating in place, and tolerate the column already being
# gone, so this cell can be re-run without raising KeyError.
df = df.drop(columns=['id'], errors='ignore')
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
1,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
3,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
6,Female,52.0,0,0,Yes,Private,Urban,77.59,17.7,formerly smoked,0
7,Female,75.0,0,1,Yes,Self-employed,Rural,243.53,27.0,never smoked,0
8,Female,32.0,0,0,Yes,Private,Rural,77.67,32.3,smokes,0


### Fixing gender column

In [13]:
# Tally how many rows carry each distinct gender value.
df['gender'].value_counts()

Female    17852
Male      11213
Other         7
Name: gender, dtype: int64

In [14]:
# Boolean mask that is True for every row whose gender is anything but 'Other'.
not_other_gender = ~(df['gender'] == 'Other')
# Keep only the rows selected by the mask, then re-check the category counts.
df = df[not_other_gender]
df['gender'].value_counts()

Female    17852
Male      11213
Name: gender, dtype: int64

### Looking at other categorical columns

In [17]:
# Tally how many rows carry each distinct ever_married value.
df['ever_married'].value_counts()

Yes    21687
No      7378
Name: ever_married, dtype: int64

In [16]:
# Tally how many rows carry each distinct Residence_type value.
df['Residence_type'].value_counts()

Urban    14592
Rural    14473
Name: Residence_type, dtype: int64

In [18]:
# Tally how many rows carry each distinct smoking_status value.
df['smoking_status'].value_counts()

never smoked       15746
formerly smoked     7093
smokes              6226
Name: smoking_status, dtype: int64

### Fixing work_type column

In [15]:
# Tally how many rows carry each distinct work_type value.
df['work_type'].value_counts()

Private          18950
Self-employed     5204
Govt_job          4195
children           615
Never_worked       101
Name: work_type, dtype: int64

In [19]:
# Fold the 'children' category into 'Never_worked'.
# The mapping is keyed by column name so the replacement is applied to
# work_type only; a flat {'children': ...} mapping passed to DataFrame.replace
# would rewrite that value in every column of the frame.
work_type_merge = {'work_type': {'children': 'Never_worked'}}
df = df.replace(work_type_merge)
df['work_type'].value_counts()

Private          18950
Self-employed     5204
Govt_job          4195
Never_worked       716
Name: work_type, dtype: int64