# Linear regression
In this notebook, we fit a simple linear regression model to predict mean activity from non-wearable features taken from the UK Biobank.

In [1]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Fix the seed of NumPy's global random generator so that reruns are repeatable
SEED = 42
np.random.seed(SEED)

In [2]:
from accel.read_data import load_data, load_xy, dataset_split, to_numpy_cont

## Data prep

In [3]:
# Predictors read from the pre-processed dataset, and the response column we model.
features_of_interest = ['age_entry_years', 'sex', 'smoking', 'BMI', 'inc_ihd']
categorical_features = ['sex', 'smoking']
# Keep the column order of `features_of_interest`. A set difference would return the
# names in an arbitrary order that can change between kernel sessions, because string
# hashes are randomised per process.
numeric_features = [name for name in features_of_interest
                    if name not in categorical_features]
response = 'acc.overall.avg'

X, y = load_xy("../data/health_data/dataset-with-preprocessing-done.csv",
               features_of_interest,
               response
               )
# Check for null values: display every row that has at least one missing entry
# (an empty frame means there are none).
X[X.isnull().any(axis=1)]

Unnamed: 0,age_entry_years,sex,smoking,BMI,inc_ihd


In [4]:
# Size of the feature table as (rows, columns), followed by a preview of its first rows
n_rows, n_cols = X.shape
print((n_rows, n_cols))
X.head(5)

(91243, 5)


Unnamed: 0,age_entry_years,sex,smoking,BMI,inc_ihd
0,73.722108,Female,Never,27.6685,0
1,65.785079,Male,Previous,30.0408,0
2,63.841205,Male,Previous,27.9744,0
3,59.041752,Male,Previous,31.1317,0
4,56.873374,Male,Previous,22.9938,0


In [8]:
# One-hot encode the categorical columns; the remaining columns are left as they are.
# NOTE(review): every level is kept (`drop_first` stays at its default), so the
# indicator columns of one variable are collinear with a model intercept — confirm
# this is intended before interpreting individual coefficients.
X_enc = pd.get_dummies(data=X, columns=categorical_features)
X_enc.head(5)

Unnamed: 0,age_entry_years,BMI,inc_ihd,sex_Female,sex_Male,smoking_Current,smoking_Never,smoking_Previous
0,73.722108,27.6685,0,1,0,0,1,0
1,65.785079,30.0408,0,0,1,0,0,1
2,63.841205,27.9744,0,0,1,0,0,1
3,59.041752,31.1317,0,0,1,0,0,1
4,56.873374,22.9938,0,0,1,0,0,1


In [9]:
# Hold out 20% of the rows for testing; the remaining 80% are used for training
TEST_FRACTION = 0.2
X_train, X_test, y_train, y_test = dataset_split(X_enc, y, TEST_FRACTION)

for label, subset in (("Shape of training set:", X_train),
                      ("Shape of test set:", X_test)):
    print(label, subset.shape)

Shape of training set: (72994, 8)
Shape of test set: (18249, 8)


## Model

In [11]:
from time import time
from sklearn.linear_model import LinearRegression 
from accel.training import score

In [12]:
# Linear regression with scikit-learn's default settings, trained on the whole training split
clf = LinearRegression().fit(X_train, y_train)
clf

LinearRegression()

In [16]:
# Fitted slope coefficients, one per feature column of the training matrix.
# NOTE(review): presumably in the same order as the columns of X_enc — confirm
# that dataset_split preserves column order.
clf.coef_

array([-0.27522312, -0.46077662, -0.97409739,  0.00348093, -0.00348093,
       -1.45158243,  0.56899221,  0.88259022])

In [13]:
# Evaluate on the held-out test split: predict first, then report the metrics
y_pred = clf.predict(X_test)
score(y_test, y_pred)

MSE is:  60.18058919862818
R2 is:  0.13313451561829692
Explained variance is: 0.1332820822300962
