In [1]:
# import necessary packages
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn import linear_model
from sklearn.model_selection import train_test_split



In [2]:
# Load the training and test sets from CSV.
train_raw = pd.read_csv('fhs_train_cleaned.csv')
test_data = pd.read_csv('fhs_test.csv')

# The training file carries an 'Unnamed: 0' column (presumably an index that
# was written out when the cleaned file was saved -- confirm); drop it.
train_data = train_raw.drop(columns='Unnamed: 0')

# Preview the first rows of the training data.
train_data.head()

Unnamed: 0,sex,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TEN_YEAR_CHD
0,1,58,1.0,0,0.0,0.0,0,0,0,220.0,143.0,104.0,29.85,75,87.0,1
1,0,40,1.0,1,15.0,0.0,0,0,0,199.0,122.0,82.0,22.16,85,77.0,0
2,1,38,2.0,1,43.0,0.0,0,1,0,170.0,130.0,94.0,23.9,110,75.0,0
3,0,43,1.0,0,0.0,0.0,0,0,0,202.0,124.0,92.0,21.26,75,74.0,0
4,0,54,1.0,0,0.0,0.0,0,1,0,237.0,171.5,105.5,34.25,91,104.0,0


In [3]:
# Model: TEN_YEAR_CHD ~ age + prevalentHyp + sysBP, fitted WITHOUT an intercept.
predictors = ['age', 'prevalentHyp', 'sysBP']

# Note: the outcome column is named 'TEN_YEAR_CHD' in the training frame
# but 'TenYearCHD' in the test frame.
y_train = train_data['TEN_YEAR_CHD']
d_train = train_data[predictors]

y_test = test_data['TenYearCHD']
d_test = test_data[predictors]

# Fit on the training set only; fit_intercept=False forces the fit through the origin.
reg = linear_model.LinearRegression(fit_intercept=False)
reg.fit(d_train, y_train)

# Predicted outcome values for the test rows.
y_reg = reg.predict(d_test)

# One row per predictor with its fitted coefficient.
results = pd.DataFrame({
    'variable': reg.feature_names_in_,
    'coefficient': reg.coef_,
})
results

Unnamed: 0,variable,coefficient
0,age,0.004097
1,prevalentHyp,0.118807
2,sysBP,-0.000605


In [4]:
# Test-set fit quality of the most recently fitted model: R^2 and RMSE.
r_squared = reg.score(d_test, y_test)  # coefficient of determination (R^2)
print('Rsq: ', r_squared)

# Root-mean-squared error between the observed and predicted test outcomes.
squared_errors = (y_test - y_reg) ** 2
rmse = np.sqrt(np.mean(squared_errors))
print('RMSE: ', rmse)

Rsq:  0.059236283803369316
RMSE:  0.34453084998865596


In [9]:
# Same three predictors as the first model, but fitted WITH an intercept.
predictors = ['age', 'prevalentHyp', 'sysBP']

y_train = train_data['TEN_YEAR_CHD']
d_train = train_data.loc[:, predictors]

y_test = test_data['TenYearCHD']
d_test = test_data.loc[:, predictors]

# LinearRegression() fits an intercept by default (fit_intercept=True).
reg = linear_model.LinearRegression().fit(d_train, y_train)

# Predicted outcome values for the test rows.
y_reg = reg.predict(d_test)

# Test-set fit quality.
print('Rsq: ', reg.score(d_test, y_test))  # R^2
rmse = np.sqrt(np.mean((y_test - y_reg) ** 2))
print('RMSE: ', rmse)  # root-mean-squared error

# Coefficient table. A bare expression is only rendered when it is the LAST
# statement of a cell, so the table is placed after the prints; in the
# original position (mid-cell) it was silently discarded.
results = pd.DataFrame({'variable': reg.feature_names_in_, 'coefficient': reg.coef_})
results

Rsq:  0.08090154901145519
RMSE:  0.34054056573208524


Running the regression on all 4 variables (age, prevalentHyp, sysBP, diaBP), first without an intercept and then with one:

In [5]:
# Model: TEN_YEAR_CHD ~ age + prevalentHyp + sysBP + diaBP, fitted WITHOUT an intercept.
predictors = ['age', 'prevalentHyp', 'sysBP', 'diaBP']

# Training design matrix and outcome.
d_train = train_data[predictors]
y_train = train_data['TEN_YEAR_CHD']

# Test design matrix and outcome (the outcome column is named differently in the test file).
d_test = test_data[predictors]
y_test = test_data['TenYearCHD']

# Fit on the training set with the intercept disabled.
reg = linear_model.LinearRegression(fit_intercept=False)
reg.fit(d_train, y_train)

# Predicted outcome values for the test rows.
y_reg = reg.predict(d_test)

# One row per predictor with its fitted coefficient.
coefficient_columns = {
    'variable': reg.feature_names_in_,
    'coefficient': reg.coef_,
}
results = pd.DataFrame(coefficient_columns)
results

Unnamed: 0,variable,coefficient
0,age,0.004502
1,prevalentHyp,0.102792
2,sysBP,0.002042
3,diaBP,-0.004433


In [6]:
# R^2 and RMSE on the test set for the model fitted in the previous cell.
r_squared = reg.score(d_test, y_test)
print('Rsq: ', r_squared)  # coefficient of determination

residuals = y_test - y_reg
rmse = np.sqrt(np.mean(residuals ** 2))
print('RMSE: ', rmse)  # root-mean-squared error

Rsq:  0.06328821885916713
RMSE:  0.34378809012451333


In [7]:
# Four-predictor model again, this time keeping the intercept.
predictors = ['age', 'prevalentHyp', 'sysBP', 'diaBP']

y_train = train_data['TEN_YEAR_CHD']
y_test = test_data['TenYearCHD']

d_train = train_data[predictors]
d_test = test_data[predictors]

# Default LinearRegression() settings, i.e. an intercept is fitted.
reg = linear_model.LinearRegression()
reg.fit(d_train, y_train)

# Predicted outcome values for the test rows.
y_reg = reg.predict(d_test)

# Tabulate the fitted coefficient of each predictor.
results = pd.DataFrame(
    {
        'variable': reg.feature_names_in_,
        'coefficient': reg.coef_,
    }
)
results

Unnamed: 0,variable,coefficient
0,age,0.007263
1,prevalentHyp,0.028865
2,sysBP,0.002477
3,diaBP,-0.00121


In [8]:
# Test-set metrics for the four-predictor model with intercept.
test_r_squared = reg.score(d_test, y_test)  # R^2
print('Rsq: ', test_r_squared)

mean_squared_error = np.mean((y_test - y_reg) ** 2)
rmse = np.sqrt(mean_squared_error)  # root-mean-squared error
print('RMSE: ', rmse)

Rsq:  0.0802647068492095
RMSE:  0.3406585253672308


Now with only two variables at a time (first age and sysBP, then diaBP and sysBP), each fitted with an intercept:

In [10]:
# Two-predictor model: TEN_YEAR_CHD ~ age + sysBP, fitted WITH an intercept.
predictors = ['age', 'sysBP']

y_train = train_data['TEN_YEAR_CHD']
d_train = train_data.loc[:, predictors]

y_test = test_data['TenYearCHD']
d_test = test_data.loc[:, predictors]

# LinearRegression() fits an intercept by default (fit_intercept=True).
reg = linear_model.LinearRegression().fit(d_train, y_train)

# Predicted outcome values for the test rows.
y_reg = reg.predict(d_test)

# Test-set fit quality.
print('Rsq: ', reg.score(d_test, y_test))  # R^2
rmse = np.sqrt(np.mean((y_test - y_reg) ** 2))
print('RMSE: ', rmse)  # root-mean-squared error

# Coefficient table. Only the LAST expression of a cell is rendered, so the
# table goes after the prints; left mid-cell it was never shown.
results = pd.DataFrame({'variable': reg.feature_names_in_, 'coefficient': reg.coef_})
results

Rsq:  0.07928160078218238
RMSE:  0.3408405418771412


In [13]:
# Blood-pressure-only model: TEN_YEAR_CHD ~ diaBP + sysBP, fitted WITH an intercept.
predictors = ['diaBP', 'sysBP']

y_train = train_data['TEN_YEAR_CHD']
d_train = train_data.loc[:, predictors]

y_test = test_data['TenYearCHD']
d_test = test_data.loc[:, predictors]

# LinearRegression() fits an intercept by default (fit_intercept=True).
reg = linear_model.LinearRegression().fit(d_train, y_train)

# Predicted outcome values for the test rows.
y_reg = reg.predict(d_test)

# Test-set fit quality.
print('Rsq: ', reg.score(d_test, y_test))  # R^2
rmse = np.sqrt(np.mean((y_test - y_reg) ** 2))
print('RMSE: ', rmse)  # root-mean-squared error

# Coefficient table. Only the LAST expression of a cell is rendered, so the
# table goes after the prints; left mid-cell it was never shown.
results = pd.DataFrame({'variable': reg.feature_names_in_, 'coefficient': reg.coef_})
results

Rsq:  0.0595109509595203
RMSE:  0.3444805513773597
