In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [3]:
data = pd.read_csv('linear_health_dataset.csv')

In [4]:
data

Unnamed: 0,Age,Gender,Education,Smoking,Activity,BMI,Waist,HDL,Triglycerides,MetabolicRiskScore
0,56,Male,HighSchool,No,Low,28.220249,78.908910,48.996843,233.136570,116.847116
1,69,Male,Bachelor,Yes,Low,23.796130,65.401435,50.834187,164.496459,118.518046
2,46,Female,HighSchool,No,Low,28.121568,76.417668,34.598058,145.768476,93.899151
3,32,Male,HighSchool,No,Moderate,28.448161,78.768517,65.125676,199.542320,95.023080
4,60,Male,Bachelor,No,Low,27.573376,67.421361,53.910602,91.924653,66.894203
...,...,...,...,...,...,...,...,...,...,...
995,18,Male,HighSchool,No,High,28.634908,85.904316,52.179185,196.883446,104.839808
996,35,Female,Master,No,Low,34.319839,87.182092,25.611334,171.222801,111.251640
997,49,Female,HighSchool,No,Low,27.448266,86.804165,35.691047,89.383815,92.915192
998,64,Male,HighSchool,No,Low,31.566515,82.327263,38.181466,145.154047,110.838767


In [5]:
x = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

In [6]:
print(np.unique(x[:,4]))

['High' 'Low' 'Moderate']


In [7]:
activity_order = [['Low', 'Moderate', 'High']]
education_order = [['HighSchool', 'Bachelor', 'Master', 'PhD']]
smoking_order = [['No', 'Yes']]

In [8]:
ct = ColumnTransformer(transformers= [('gender_ohe', OneHotEncoder(drop='first'), [1]), 
                                      ('activity_ord', OrdinalEncoder(categories=activity_order), [4]), 
                                      ('edu_ord', OrdinalEncoder(categories=education_order), [2]),
                                      ('smok_ord', OrdinalEncoder(categories=smoking_order), [3])], remainder= 'passthrough')

In [9]:
x = ct.fit_transform(x)

In [10]:
x.shape

(1000, 9)

In [11]:
x[:5]

array([[1.0, 0.0, 0.0, 0.0, 56, 28.220248555860667, 78.90891022566693,
        48.99684285969641, 233.13656956057565],
       [1.0, 0.0, 1.0, 1.0, 69, 23.79612961241608, 65.40143518720694,
        50.83418748440542, 164.49645851501805],
       [0.0, 0.0, 0.0, 0.0, 46, 28.121567884453256, 76.41766751838377,
        34.59805837191012, 145.7684759174094],
       [1.0, 1.0, 0.0, 0.0, 32, 28.44816058343621, 78.76851736312292,
        65.12567554823671, 199.54231984622592],
       [1.0, 0.0, 1.0, 0.0, 60, 27.57337553038356, 67.42136079952805,
        53.9106015602028, 91.92465297993007]], dtype=object)

In [12]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

In [13]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(x_train, y_train)

In [14]:
y_pred = regressor.predict(x_test)

In [15]:
from sklearn.metrics import r2_score
print(r2_score(y_test, y_pred))

0.7345426483070929


In [18]:
np.set_printoptions(precision= 2)
print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1))

[[ 58.73  79.94]
 [ 98.23  70.75]
 [ 70.69  68.36]
 [100.14  91.58]
 [ 73.82  69.95]
 [ 93.76  94.  ]
 [ 70.72  78.85]
 [ 74.64  70.65]
 [107.87  93.8 ]
 [ 81.74  68.36]
 [111.22 111.9 ]
 [ 81.31  83.66]
 [ 85.48  95.95]
 [ 67.74  71.35]
 [ 89.87  91.88]
 [ 70.41  58.22]
 [ 92.17  96.52]
 [106.49 110.8 ]
 [ 58.5   33.24]
 [ 66.27  75.24]
 [ 72.56  71.09]
 [ 89.74  89.36]
 [ 78.51  73.04]
 [ 90.6  105.22]
 [ 66.27  77.08]
 [ 67.5   71.44]
 [ 75.66  75.87]
 [ 64.76  70.44]
 [ 74.76  60.82]
 [ 99.42  88.42]
 [ 87.19  90.45]
 [ 77.18  72.58]
 [104.   103.96]
 [101.79 102.09]
 [ 86.85  95.2 ]
 [ 83.07  77.76]
 [ 85.06  84.47]
 [ 76.41  77.13]
 [103.94 109.65]
 [103.72 107.48]
 [ 73.93  60.06]
 [102.8   82.2 ]
 [ 49.64  50.89]
 [ 85.65  78.09]
 [ 68.16  71.98]
 [ 86.42  84.94]
 [129.39 126.19]
 [ 58.15  69.66]
 [ 94.79  81.68]
 [ 85.93 104.84]
 [ 66.59  88.97]
 [106.58 127.55]
 [ 46.65  48.46]
 [125.03 136.22]
 [ 89.86  97.83]
 [ 96.42  89.87]
 [ 65.27  63.71]
 [110.29 108.35]
 [107.1  112.3