In [1]:
import pandas as pd
import numpy as np
from statistics import mean

In [2]:
# load the data
data = pd.read_csv('normalized_data.csv')  # cleaned and normalized data
data = data.dropna()

# create feature vectors
bp_x = data.drop(labels=['ap_lo', 'ap_hi', 'cardio', 'bp', 'bmi'], axis=1)
bmi_x = data.drop(labels=['bmi', 'height', 'weight', 'bp'], axis=1)

# create label vectors
ap_lo = data['ap_lo']
ap_hi = data['ap_hi']
cardio = data['cardio']
bp = data['bp']
bmi = data['bmi']

In [3]:
bp_x.describe()

Unnamed: 0,age,height,weight,gender,smoke,alco,active,cholesterol_1,cholesterol_2,cholesterol_3,gluc_1,gluc_2
count,68023.0,68023.0,68023.0,68023.0,68023.0,68023.0,68023.0,68023.0,68023.0,68023.0,68023.0,68023.0
mean,0.671328,0.560808,0.333974,0.348691,0.088014,0.053585,0.803566,0.749791,0.135572,0.114638,0.85033,0.073607
std,0.1928,0.042097,0.075844,0.476559,0.283318,0.225198,0.397303,0.433137,0.342336,0.318587,0.35675,0.261133
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.527778,0.533333,0.285714,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
50%,0.694444,0.564103,0.322751,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
75%,0.805556,0.589744,0.375661,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [4]:
bmi_x.describe()

Unnamed: 0,age,ap_hi,ap_lo,gender,smoke,alco,active,cardio,cholesterol_1,cholesterol_2,cholesterol_3,gluc_1,gluc_2
count,68023.0,68023.0,68023.0,68023.0,68023.0,68023.0,68023.0,68023.0,68023.0,68023.0,68023.0,68023.0,68023.0
mean,0.671328,0.708437,0.58201,0.348691,0.088014,0.053585,0.803566,0.494906,0.749791,0.135572,0.114638,0.85033,0.073607
std,0.1928,0.045866,0.038051,0.476559,0.283318,0.225198,0.397303,0.499978,0.433137,0.342336,0.318587,0.35675,0.261133
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.527778,0.692308,0.576923,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,0.694444,0.692308,0.576923,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
75%,0.805556,0.74359,0.615385,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
ap_hi.describe()

count    68023.000000
mean         0.708437
std          0.045866
min          0.000000
25%          0.692308
50%          0.692308
75%          0.743590
max          1.000000
Name: ap_hi, dtype: float64

In [6]:
bmi.describe()

count    68023.000000
mean        27.530371
std          6.096176
min          3.471784
25%         23.875115
50%         26.346494
75%         30.119376
max        298.666667
Name: bmi, dtype: float64

In [7]:
bp.describe()

count    68023.000000
mean         1.212384
std          0.738666
min          0.000000
25%          1.000000
50%          1.000000
75%          2.000000
max          2.000000
Name: bp, dtype: float64

In [8]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.svm import SVR, SVC


def linear_model(features, target, cv=10):
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
    model = LinearRegression()
    model.fit(X_train, y_train)
    print("score:", model.score(X_test, y_test))
    print("score:", cross_val_score(model, features, target, cv=cv))
    print("coefs:", model.coef_)

In [9]:
def svr_model(features, target, cv=10):
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
    model = SVR(kernel='rbf', gamma=1e-5, C=10)
    model.fit(X_train, y_train)
    print(cross_val_score(model, features, target, cv=cv))

In [13]:
# parameters = {
#     "gamma" : [1e-7, 1e-5, 1e-3],
#     "C" : [1, 2, 10],
#     "kernel" : ['rbf', 'poly']
# }
# svr = SVR()
# clf = GridSearchCV(svr, parameters, cv=10)
# clf.fit(bp_x,ap_hi)
# print(clf.best_params_)

# gridsearch(bp_x, ap_hi, cv=5)
# optimal parameters for predicting ap_hi was kernel='rbf', gamma-1e-5, C=10

In [14]:
# linear regression to predict systolic blood pressure
linear_model(bp_x, ap_hi)

score: 0.102106746148
score: [ 0.0914036   0.09411018  0.10124877  0.11017023  0.10340927  0.10893004
  0.08409047  0.10112069  0.09635897  0.09789276]
coefs: [  4.23736965e-02  -4.45217842e-02   1.54149096e-01   7.97609019e-05
  -1.18446195e-03   7.81520774e-04   4.17717146e-04   8.26226057e+11
   8.26226057e+11   8.26226057e+11  -2.44140625e-04  -9.15527344e-04]


In [15]:
# SVR (with 10-fold CV) to predict systolic blood pressure
svr_model(bp_x, ap_hi)

[-0.04432627 -0.03529696 -0.04476734 -0.02437445 -0.03256923 -0.02924846
 -0.04247407 -0.0378978  -0.03861775 -0.04130921]


In [16]:
# linear regression to predict diastolic blood pressure
linear_model(bp_x, ap_lo)

score: 0.0771984295981
score: [ 0.07072502  0.07514136  0.0804462   0.08471265  0.09471651  0.08294947
  0.05719067  0.07909383  0.08716347  0.07785264]
coefs: [  2.70498562e-02  -2.34857499e-02   1.22674120e-01  -2.12556938e-04
  -2.06543746e-04   3.33388066e-04   2.99716073e-04   8.82721340e+11
   8.82721340e+11   8.82721340e+11  -2.44140625e-04   1.75476074e-04]


In [17]:
# SVR (with 10-fold CV) to predict diastolic blood pressure
svr_model(bp_x, ap_lo)

[-0.20285133 -0.2239342  -0.22238084 -0.20297716 -0.21970024 -0.21266146
 -0.19225696 -0.21719457 -0.22509295 -0.20935835]


Regression models seem to perform abysmally when predicting blood pressure. Let us move on to predicting BMI if height and weight data are removed from the dataset.

In [18]:
# linear model to predict bmi
linear_model(bmi_x, bmi)

score: 0.0598044323885
score: [ 0.07122785  0.03582598  0.04956207  0.03961792  0.05058673  0.07954697
  0.0494367   0.05885491  0.05556576  0.05726395]
coefs: [ 0.14019142 -0.50361754  0.58734375 -1.22441421 -0.06403012  0.78295317
 -0.11191594  1.68043354 -0.80402527  0.00590529  0.79811998 -0.50708857
  1.05521596]


In [19]:
# svr_model(bmi_x, bmi)  # don't run! doesn't terminate

In [22]:
def logistic_model(features, target, cv=10):
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
    model = LogisticRegression()
    model.fit(X_train, y_train)
    print("score:", model.score(X_test, y_test))
    print("score:", cross_val_score(model, features, target, cv=cv))
    print("coefs:", model.coef_)

logistic_model(bp_x, bp)

score: 0.497537669974
score: [ 0.49301779  0.49845656  0.49713362  0.49919153  0.4805233   0.4914731
  0.49470744  0.49750074  0.49433907  0.4956624 ]
coefs: [[-0.05188486  0.09892022  0.03000578 -0.51896969  0.22403249 -0.00807857
   0.08936248 -0.06982473 -0.16451351 -1.09805529  0.10613415  0.04626726]
 [ 0.04732943  0.15718325 -0.17742769  0.14129653 -0.18533395 -0.15629006
  -0.04934248  0.40579308 -0.34580198 -0.50356627 -0.1754845  -0.4890146 ]
 [-0.01843539 -0.27559852  0.16437231  0.18130682  0.05520637  0.15711056
  -0.00880755 -0.71903105  0.0480223   0.63374173  0.1090821   0.43016101]]


In [23]:
def svc_model(features, target, cv=10):
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
    model = SVC(kernel='rbf', gamma=1e-5, C=10)
    model.fit(X_train, y_train)
    print(cross_val_score(model, features, target, cv=cv))

#svc_model(bp_x, bp)  # don't run

In [24]:
from sklearn.neural_network import MLPRegressor, MLPClassifier

def mlp_model(features, target, mlp, cv=10):
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
    if mlp == 'reg':
        model = MLPRegressor(activation='relu', solver='sgd')
    else:  # mlp == 'class'
        model = MLPClassifier(activation='relu', solver='sgd')
    model.fit(X_train, y_train)
    print(cross_val_score(model, features, target, cv=cv))

In [25]:
# parameters = {
#     "hidden_layer_sizes" : [50, 100, 1000, 5000],
#     "solver" : ['lbfgs', 'sgd', 'adam'],
#     "alpha" : [0.00001, 0.0001, 0.001],
#     "early_stopping" : [True, False]
# }
# mlp = MLPRegressor()
# clf = GridSearchCV(mlp, parameters, cv=10)
# clf.fit(bp_x,ap_hi)
# print(clf.best_params_)

In [26]:
# parameters = {
#     "hidden_layer_sizes" : [50, 100, 1000, 5000],
#     "solver" : ['sgd', 'adam'],
#     "alpha" : [0.00001, 0.0001, 0.001],
#     "early_stopping" : [True, False]
# }
# mlp = MLPRegressor()
# clf = GridSearchCV(mlp, parameters, cv=10)
# clf.fit(bmi_x, bmi)
# print(clf.best_params_)

In [27]:
mlp_model(bp_x, ap_hi, 'reg')

[-0.31538253 -0.21865991 -0.4794237  -0.21387016 -0.42527471 -0.17799035
 -0.30886067 -0.49417838 -0.31949213 -0.27453083]


In [28]:
mlp_model(bmi_x, bmi, 'reg')

[ 0.07063023  0.03186548  0.04944479  0.04133396  0.04807632  0.07594965
  0.05284157  0.05531596  0.05136541  0.05950785]


In [29]:
mlp_model(bp_x, bp, 'class')  # 0.49 score

[ 0.49669264  0.49830957  0.49713362  0.5000735   0.4830222   0.49279624
  0.49485445  0.49705969  0.49316277  0.49742685]


In [30]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier


def rf_model(features, target, rf, cv=10):
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
    if rf == 'class':
        model = RandomForestClassifier()
    else:
        model = RandomForestRegressor()
    model.fit(X_train, y_train)
    print(cross_val_score(model, features, target, cv=cv))

In [31]:
rf_model(bp_x, ap_hi,'reg')
rf_model(bp_x, ap_lo,'reg')
rf_model(bmi_x, bmi, 'reg')
rf_model(bp_x, bp, 'class')  # 0.41 score

[-0.16773597 -0.1938024  -0.17566514 -0.13635245 -0.19027966 -0.1699366
 -0.17773642 -0.17855151 -0.19854142 -0.15137387]
[-0.18339704 -0.19213284 -0.16027802 -0.20232289 -0.1952052  -0.15041966
 -0.18107747 -0.17700604 -0.1833738  -0.18774514]
[-0.19293385 -0.11189565 -0.10372419 -0.07581162 -0.11166745 -0.16829259
 -0.11512757 -0.09485546 -0.20471728 -0.12164069]
[ 0.40805527  0.41246509  0.41584595  0.40673232  0.41525797  0.41090856
  0.40635107  0.42369891  0.41435083  0.41155712]


In [32]:
from sklearn.ensemble import ExtraTreesRegressor, ExtraTreesClassifier

def et_model(features, target, et, cv=10):
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
    if et == 'class':
        model = ExtraTreesClassifier()
    else:
        model = ExtraTreesRegressor()
    model.fit(X_train, y_train)
    print(cross_val_score(model, features, target, cv=cv))

et_model(bp_x, ap_hi, 'reg')
et_model(bp_x, ap_lo, 'reg')
et_model(bmi_x, bmi, 'reg')
et_model(bp_x, bp, 'class')

[-0.29112465 -0.35743206 -0.26031132 -0.28167414 -0.34715186 -0.2802849
 -0.33846927 -0.35338069 -0.30810271 -0.29212834]
[-0.29747785 -0.32039484 -0.32399852 -0.35336622 -0.33553613 -0.28414608
 -0.33036593 -0.32411645 -0.31564097 -0.32766111]
[-0.48723121 -0.15011243 -0.14801271 -0.09807763 -0.20914189 -0.20964528
 -0.20670117 -0.14513721 -0.31980214 -0.16621233]
[ 0.39453182  0.39791269  0.41393503  0.40687932  0.40555637  0.40767421
  0.39708909  0.40826228  0.40685193  0.40449934]
