# **Logistic Regression**

## **Step 1: Importing The Required Libraries**

In [31]:
import numpy as np
import pandas as pd

df = pd.read_csv("/content/framingham_heart_disease.csv")
df.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


## **Step 2: Cleaning The Dataset**

In [5]:
df.isnull().sum()

male                 0
age                  0
education          105
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64

In [6]:
series = pd.isnull(df['cigsPerDay'])
df[series]

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
131,1,43,4.0,1,,0.0,0,0,0,222.0,109.5,69.0,25.5,75.0,,0
139,1,49,4.0,1,,0.0,0,0,0,256.0,127.5,81.5,28.21,93.0,85.0,1
1046,0,49,1.0,1,,0.0,0,0,0,280.0,120.0,80.0,22.33,90.0,75.0,0
1292,1,42,3.0,1,,0.0,0,0,0,225.0,122.5,80.0,25.54,90.0,90.0,0
1347,0,58,4.0,1,,0.0,0,1,0,270.0,195.0,117.5,23.35,75.0,,0
1451,1,54,1.0,1,,0.0,0,0,0,219.0,110.0,72.0,26.05,95.0,86.0,0
1497,1,55,1.0,1,,0.0,0,0,0,214.0,132.5,85.5,29.25,70.0,103.0,0
1610,0,61,1.0,1,,0.0,0,1,0,356.0,168.0,98.0,27.3,103.0,106.0,0
1625,0,49,2.0,1,,0.0,0,1,0,233.0,158.0,102.0,25.31,90.0,72.0,0
1870,0,47,2.0,1,,0.0,0,0,0,365.0,127.0,76.0,24.44,72.0,80.0,0


In [7]:
data = df.drop(['currentSmoker','education'], axis = 'columns')
data.head()

Unnamed: 0,male,age,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [8]:
cigs = data['cigsPerDay']
cigs.head()

0     0.0
1     0.0
2    20.0
3    30.0
4    23.0
Name: cigsPerDay, dtype: float64

In [9]:
cig = cigs.mean()

In [11]:
import math
integer_value = math.floor(cig)

cigs.fillna(integer_value, inplace = True)
data.isnull().sum()

male                 0
age                  0
cigsPerDay           0
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64

In [12]:
data.dropna( axis = 0, inplace = True)
data.isnull().sum()

male               0
age                0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
TenYearCHD         0
dtype: int64

## **Step 3: Analyzing The Dataset**

In [13]:
Heart_Attack = data[data.TenYearCHD == 1]
Heart_Attack.head()

Unnamed: 0,male,age,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
3,0,61,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
6,0,63,0.0,0.0,0,0,0,205.0,138.0,71.0,33.11,60.0,85.0,1
15,0,38,20.0,0.0,0,1,0,221.0,140.0,90.0,21.35,95.0,70.0,1
17,0,46,20.0,0.0,0,0,0,291.0,112.0,78.0,23.38,80.0,89.0,1
25,1,47,20.0,0.0,0,0,0,294.0,102.0,68.0,24.18,62.0,66.0,1


In [15]:
Heart_Attack = data[data.TenYearCHD == 1]
Heart_Attack.head()

Unnamed: 0,male,age,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
3,0,61,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
6,0,63,0.0,0.0,0,0,0,205.0,138.0,71.0,33.11,60.0,85.0,1
15,0,38,20.0,0.0,0,1,0,221.0,140.0,90.0,21.35,95.0,70.0,1
17,0,46,20.0,0.0,0,0,0,291.0,112.0,78.0,23.38,80.0,89.0,1
25,1,47,20.0,0.0,0,0,0,294.0,102.0,68.0,24.18,62.0,66.0,1


In [16]:

No_Heart_Attack = data[data.TenYearCHD == 0]
No_Heart_Attack.head()

Unnamed: 0,male,age,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
4,0,46,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0
5,0,43,0.0,0.0,0,1,0,228.0,180.0,110.0,30.3,77.0,99.0,0


In [17]:
data.groupby('TenYearCHD').mean()

Unnamed: 0_level_0,male,age,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose
TenYearCHD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,0.425312,48.740313,8.72375,0.024063,0.004063,0.27625,0.020625,235.347187,130.308437,82.182969,25.662769,75.609688,80.665
1,0.557491,54.233449,10.574913,0.06446,0.013937,0.506969,0.062718,246.311847,143.827526,87.16115,26.644094,76.378049,88.827526


In [18]:
final = data.drop(['diaBP','BMI','heartRate'], axis = 'columns')

In [19]:
No_Heart_Attack = final[final.TenYearCHD == 0]
No_Heart_Attack.head()

Unnamed: 0,male,age,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,glucose,TenYearCHD
0,1,39,0.0,0.0,0,0,0,195.0,106.0,77.0,0
1,0,46,0.0,0.0,0,0,0,250.0,121.0,76.0,0
2,1,48,20.0,0.0,0,0,0,245.0,127.5,70.0,0
4,0,46,23.0,0.0,0,0,0,285.0,130.0,85.0,0
5,0,43,0.0,0.0,0,1,0,228.0,180.0,99.0,0


In [20]:
Heart_Attack = final[final.TenYearCHD == 1]
Heart_Attack.head()

Unnamed: 0,male,age,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,glucose,TenYearCHD
3,0,61,30.0,0.0,0,1,0,225.0,150.0,103.0,1
6,0,63,0.0,0.0,0,0,0,205.0,138.0,85.0,1
15,0,38,20.0,0.0,0,1,0,221.0,140.0,70.0,1
17,0,46,20.0,0.0,0,0,0,291.0,112.0,89.0,1
25,1,47,20.0,0.0,0,0,0,294.0,102.0,66.0,1


In [21]:
final.groupby('TenYearCHD').mean()

Unnamed: 0_level_0,male,age,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,glucose
TenYearCHD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.425312,48.740313,8.72375,0.024063,0.004063,0.27625,0.020625,235.347187,130.308437,80.665
1,0.557491,54.233449,10.574913,0.06446,0.013937,0.506969,0.062718,246.311847,143.827526,88.827526


In [23]:
X = final[['male','age','cigsPerDay','BPMeds','prevalentStroke','prevalentHyp','diabetes','totChol','sysBP','glucose']]
y = final['TenYearCHD']

In [24]:
X

Unnamed: 0,male,age,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,glucose
0,1,39,0.0,0.0,0,0,0,195.0,106.0,77.0
1,0,46,0.0,0.0,0,0,0,250.0,121.0,76.0
2,1,48,20.0,0.0,0,0,0,245.0,127.5,70.0
3,0,61,30.0,0.0,0,1,0,225.0,150.0,103.0
4,0,46,23.0,0.0,0,0,0,285.0,130.0,85.0
...,...,...,...,...,...,...,...,...,...,...
4231,1,58,0.0,0.0,0,1,0,187.0,141.0,81.0
4232,1,68,0.0,0.0,0,1,0,176.0,168.0,79.0
4233,1,50,1.0,0.0,0,1,0,313.0,179.0,86.0
4234,1,51,43.0,0.0,0,0,0,207.0,126.5,68.0


In [25]:
y

0       0
1       0
2       0
3       1
4       0
       ..
4231    0
4232    1
4233    1
4234    0
4237    0
Name: TenYearCHD, Length: 3774, dtype: int64

## **Step 4: Preparing the Model**

In [26]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.20, random_state = 99)

In [27]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

In [28]:
model.fit(X_train, y_train)
model.score(X_test,y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.8556291390728477

In [30]:
y_pred = model.predict(X_test)

from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(y_test,y_pred)
matrix

array([[642,  10],
       [ 99,   4]])