In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model  import LinearRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os

In [2]:
# Relative path to the stroke CSV, joined with the platform's path separator.
file_path = os.path.join("dataset",'stroke_data.csv')

In [3]:
# Read the CSV at `file_path` into a DataFrame; the bare `df` on the last
# line makes the notebook render the frame as the cell output.
df = pd.read_csv(file_path)
df

Unnamed: 0.1,Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
1,3,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
2,6,Female,52.0,0,0,Yes,Private,Urban,77.59,17.7,formerly smoked,0
3,7,Female,75.0,0,1,Yes,Self-employed,Rural,243.53,27.0,never smoked,0
4,8,Female,32.0,0,0,Yes,Private,Rural,77.67,32.3,smokes,0
...,...,...,...,...,...,...,...,...,...,...,...,...
29060,43395,Female,10.0,0,0,No,Never_worked,Urban,58.64,20.4,never smoked,0
29061,43396,Female,56.0,0,0,Yes,Govt_job,Urban,213.61,55.4,formerly smoked,0
29062,43397,Female,82.0,1,0,Yes,Private,Urban,91.94,28.9,formerly smoked,0
29063,43398,Male,40.0,0,0,Yes,Private,Urban,99.16,33.2,never smoked,0


In [4]:
def clean_smoking(smoking_status):
    """One-hot encode smoking-status labels into three parallel 0/1 lists.

    Args:
        smoking_status: Iterable of labels. The recognised labels are
            'never smoked', 'formerly smoked' and 'smokes'.

    Returns:
        Tuple ``(never_smoked_list, formerly_smoked_list, smokes_list)``.
        Every list holds exactly one entry per input element: 1 where the
        element equals the corresponding label, 0 otherwise. An element that
        matches none of the labels yields 0 in all three lists.
    """
    never_smoked_list = []
    formerly_smoked_list = []
    smokes_list = []

    for x in smoking_status:
        # Append to every list on every iteration. An unrecognised or missing
        # label becomes an all-zero row instead of being skipped, so the
        # outputs always have the same length as the input and stay aligned
        # row-for-row when assigned back as DataFrame columns.
        never_smoked_list.append(1 if x == 'never smoked' else 0)
        formerly_smoked_list.append(1 if x == 'formerly smoked' else 0)
        smokes_list.append(1 if x == 'smokes' else 0)

    return (never_smoked_list, formerly_smoked_list, smokes_list)

# Encode the smoking_status column, then attach the three indicator lists
# to the frame under column names matching the variables.
never_smoked, formerly_smoked, smokes = clean_smoking(df['smoking_status'])

df['never_smoked'], df['formerly_smoked'], df['smokes'] = (
    never_smoked,
    formerly_smoked,
    smokes,
)

In [5]:
def clean_work(work_type):
    """One-hot encode work-type labels into four parallel 0/1 lists.

    Args:
        work_type: Iterable of labels. The recognised labels are 'Private',
            'Self-employed', 'Govt_job' and 'Never_worked'.

    Returns:
        Tuple ``(Private_list, Self_employed_list, Govt_job_list,
        Never_worked_list)``. Every list holds exactly one entry per input
        element: 1 where the element equals the corresponding label, 0
        otherwise. An element that matches none of the labels yields 0 in
        all four lists.
    """
    Private_list = []
    Self_employed_list = []
    Govt_job_list = []
    Never_worked_list = []

    for x in work_type:
        # Append to every list on every iteration. An unrecognised or missing
        # label becomes an all-zero row instead of being skipped, so the
        # outputs always have the same length as the input and stay aligned
        # row-for-row when assigned back as DataFrame columns.
        Private_list.append(1 if x == 'Private' else 0)
        Self_employed_list.append(1 if x == 'Self-employed' else 0)
        Govt_job_list.append(1 if x == 'Govt_job' else 0)
        Never_worked_list.append(1 if x == 'Never_worked' else 0)

    return (Private_list, Self_employed_list, Govt_job_list, Never_worked_list)

# Encode the work_type column, then attach the four indicator lists
# to the frame under column names matching the variables.
Private, Self_employed, Govt_job, Never_worked = clean_work(df['work_type'])

df['Private'], df['Self_employed'], df['Govt_job'], df['Never_worked'] = (
    Private,
    Self_employed,
    Govt_job,
    Never_worked,
)

In [6]:
# Binary-encode the two-valued text columns, overwriting each column:
# Male / Yes / Urban -> 1 and Female / No / Rural -> 0.
df['gender'] = df['gender'].replace({'Female': 0, 'Male': 1})
df['ever_married'] = df['ever_married'].replace({'No': 0, 'Yes': 1})
df['residence_type'] = df['residence_type'].replace({'Rural': 0, 'Urban': 1})

In [7]:
# Remove the text columns that were expanded into indicator columns, plus
# 'Unnamed: 0' (presumably a saved row index from the CSV - confirm), and
# display the resulting frame.
df = df.drop(columns=['work_type', 'smoking_status', 'Unnamed: 0'])
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,residence_type,avg_glucose_level,bmi,stroke,never_smoked,formerly_smoked,smokes,Private,Self_employed,Govt_job,Never_worked
0,1,58.0,1,0,1,1,87.96,39.2,0,1,0,0,1,0,0,0
1,0,70.0,0,0,1,0,69.04,35.9,0,0,1,0,1,0,0,0
2,0,52.0,0,0,1,1,77.59,17.7,0,0,1,0,1,0,0,0
3,0,75.0,0,1,1,0,243.53,27.0,0,1,0,0,0,1,0,0
4,0,32.0,0,0,1,0,77.67,32.3,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29060,0,10.0,0,0,0,1,58.64,20.4,0,1,0,0,0,0,0,1
29061,0,56.0,0,0,1,1,213.61,55.4,0,0,1,0,0,0,1,0
29062,0,82.0,1,0,1,1,91.94,28.9,0,0,1,0,1,0,0,0
29063,1,40.0,0,0,1,1,99.16,33.2,0,1,0,0,1,0,0,0


In [8]:
# Pairwise Pearson correlation between all columns of df.
corrrelation = df.corr(method='pearson')

# PEARSON CORRELATION COEFFICIENT

In [9]:
# Display the 'stroke' column of the correlation matrix currently held in
# `corrrelation`, i.e. each column's correlation with 'stroke'.
corrrelation['stroke']

gender               0.012257
age                  0.154059
hypertension         0.078674
heart_disease        0.105142
ever_married         0.047724
residence_type       0.001962
avg_glucose_level    0.075447
bmi                 -0.004039
stroke               1.000000
never_smoked        -0.020755
formerly_smoked      0.027249
smokes              -0.003321
Private             -0.024582
Self_employed        0.048088
Govt_job            -0.009425
Never_worked        -0.022031
Name: stroke, dtype: float64

In [10]:
# Display the 'heart_disease' column of the correlation matrix currently held
# in `corrrelation`, i.e. each column's correlation with 'heart_disease'.
corrrelation['heart_disease']

gender               0.097634
age                  0.247430
hypertension         0.117962
heart_disease        1.000000
ever_married         0.095226
residence_type      -0.003127
avg_glucose_level    0.137482
bmi                  0.022740
stroke               0.105142
never_smoked        -0.083632
formerly_smoked      0.076389
smokes               0.021596
Private             -0.045288
Self_employed        0.081756
Govt_job            -0.011363
Never_worked        -0.037281
Name: heart_disease, dtype: float64

In [11]:
# Recompute the pairwise correlation matrix with method="kendall"; this
# rebinds `corrrelation`, replacing whatever matrix it held before.
corrrelation = df.corr(method="kendall")

# KENDALL CORRELATION COEFFICIENT

In [12]:
# Display the 'stroke' column of the correlation matrix currently held in
# `corrrelation`, i.e. each column's correlation with 'stroke'.
corrrelation['stroke']

gender               0.012257
age                  0.124671
hypertension         0.078674
heart_disease        0.105142
ever_married         0.047724
residence_type       0.001962
avg_glucose_level    0.038813
bmi                  0.000678
stroke               1.000000
never_smoked        -0.020755
formerly_smoked      0.027249
smokes              -0.003321
Private             -0.024582
Self_employed        0.048088
Govt_job            -0.009425
Never_worked        -0.022031
Name: stroke, dtype: float64

In [13]:
# Display the 'heart_disease' column of the correlation matrix currently held
# in `corrrelation`, i.e. each column's correlation with 'heart_disease'.
corrrelation['heart_disease']

gender               0.097634
age                  0.202823
hypertension         0.117962
heart_disease        1.000000
ever_married         0.095226
residence_type      -0.003127
avg_glucose_level    0.074154
bmi                  0.027242
stroke               0.105142
never_smoked        -0.083632
formerly_smoked      0.076389
smokes               0.021596
Private             -0.045288
Self_employed        0.081756
Govt_job            -0.011363
Never_worked        -0.037281
Name: heart_disease, dtype: float64

# SPEARMAN CORRELATION COEFFICIENT

In [14]:
# Recompute the pairwise correlation matrix with method="spearman"; this
# rebinds `corrrelation`, replacing whatever matrix it held before.
corrrelation = df.corr(method="spearman")

In [15]:
# Display the 'stroke' column of the correlation matrix currently held in
# `corrrelation`, i.e. each column's correlation with 'stroke'.
corrrelation['stroke']

gender               0.012257
age                  0.151572
hypertension         0.078674
heart_disease        0.105142
ever_married         0.047724
residence_type       0.001962
avg_glucose_level    0.047532
bmi                  0.000828
stroke               1.000000
never_smoked        -0.020755
formerly_smoked      0.027249
smokes              -0.003321
Private             -0.024582
Self_employed        0.048088
Govt_job            -0.009425
Never_worked        -0.022031
Name: stroke, dtype: float64

In [16]:
# Display the 'heart_disease' column of the correlation matrix currently held
# in `corrrelation`, i.e. each column's correlation with 'heart_disease'.
corrrelation['heart_disease']

gender               0.097634
age                  0.246587
hypertension         0.117962
heart_disease        1.000000
ever_married         0.095226
residence_type      -0.003127
avg_glucose_level    0.090814
bmi                  0.033290
stroke               0.105142
never_smoked        -0.083632
formerly_smoked      0.076389
smokes               0.021596
Private             -0.045288
Self_employed        0.081756
Govt_job            -0.011363
Never_worked        -0.037281
Name: heart_disease, dtype: float64