# ML - Regression

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import datasets

# Apply seaborn's "whitegrid" theme to the figures drawn in this notebook.
sns.set_theme(style="whitegrid")

# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline

## Motivation

Ten baseline variables — age, sex, body mass index, average blood pressure, and six blood serum measurements — were obtained for each of n = 442 diabetes patients, together with the response of interest: a quantitative measure of disease progression one year after baseline.

Source: https://scikit-learn.org/stable/datasets/toy_dataset.html#diabetes-dataset

In [None]:
# Fetch the diabetes toy dataset as pandas objects (features, target).
diabetes_X, diabetes_y = datasets.load_diabetes(
    return_X_y=True,
    as_frame=True,
)

# Put the features and the target side by side in one frame for exploration.
diabetes = pd.concat((diabetes_X, diabetes_y), axis="columns")
diabetes.head()

The first 10 columns are numeric predictive values.

Column 11 is the target: a quantitative measure of disease progression one year after baseline.


| Feature       | Description     |
| :------------- | :----------: |
| age | age in years|
| sex | sex |
| bmi | body mass index|
| bp | average blood pressure|
| s1 | tc, total serum cholesterol|
| s2 | ldl, low-density lipoproteins|
| s3 | hdl, high-density lipoproteins|
| s4 | tch, total cholesterol / HDL|
| s5 | ltg, possibly log of serum triglycerides level|
| s6 | glu, blood sugar level|

Note: Each of these 10 feature variables has been mean-centered and scaled by the standard deviation times the square root of n_samples (i.e. the sum of squares of each column totals 1).

In [None]:
# Summary statistics, transposed so each variable is a row.
diabetes.describe().transpose()

In [None]:
# Euclidean norm of every column (features and target alike); a quick way to
# check the unit sum-of-squares scaling that the dataset note describes.
diabetes.apply(np.linalg.norm, axis=0)

In [None]:
# Pairwise scatter plots (and per-variable distributions) for all columns.
sns.pairplot(data=diabetes)

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on all ten features; fit() returns the estimator.
model = LinearRegression().fit(diabetes_X, diabetes_y)

# Scored on the same data the model was fit on, so this is a training R^2.
linear_train_r2 = model.score(diabetes_X, diabetes_y)

print(f"Coefficients:\n {model.coef_.T}\n")
print(f"Score: {linear_train_r2}")


In [None]:
%%timeit
# Time constructing and fitting a fresh LinearRegression on the full dataset.
LinearRegression().fit(diabetes_X, diabetes_y)

In [None]:
# In-sample predictions from the linear model fitted earlier in the notebook.
y_pred = model.predict(diabetes_X)
# Points along the identity line y = x, from 0 up to the largest target value.
line = np.arange(0, diabetes_y.max(), 0.1)

fig, ax = plt.subplots(figsize=(6.5, 6.5))
sns.scatterplot(
    x=diabetes_y,
    y=y_pred,
    ax=ax
)
# Equal scaling on both axes so the identity line sits at 45 degrees.
ax.axis("equal")
ax.set_xlabel("y_true")
ax.set_ylabel("y_pred")
ax.set_title("Prediction vs True Values")
# Red reference line: a perfect prediction would fall exactly on it.
sns.lineplot(x=line, y=line, color="red", ax=ax)
# Use plt.show() rather than fig.show(): with the inline (non-GUI) backend
# selected by `%matplotlib inline`, Figure.show() cannot open a window and
# only emits a "non-GUI backend" UserWarning.
plt.show()

## Ridge Regression

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression (L2 penalty) with regularization strength alpha=0.5.
ridge_model = Ridge(alpha=0.5).fit(diabetes_X, diabetes_y)

# Training R^2: evaluated on the same data used for fitting.
ridge_train_r2 = ridge_model.score(diabetes_X, diabetes_y)

print(f"Coefficients:\n {ridge_model.coef_.T}\n")
print(f"Score: {ridge_train_r2}")

## Lasso Regression

In [None]:
from sklearn.linear_model import Lasso

# Lasso regression (L1 penalty) with regularization strength alpha=0.5.
lasso_model = Lasso(alpha=0.5).fit(diabetes_X, diabetes_y)

# Training R^2: evaluated on the same data used for fitting.
lasso_train_r2 = lasso_model.score(diabetes_X, diabetes_y)

print(f"Coefficients:\n {lasso_model.coef_.T}\n")
print(f"Score: {lasso_train_r2}")

## Elastic-Net Regression

In [None]:
from sklearn.linear_model import ElasticNet

# Elastic-Net mixes L1 and L2 penalties; l1_ratio=0.4 sets the L1 share.
elasticnet_model = ElasticNet(alpha=0.5, l1_ratio=0.4).fit(diabetes_X, diabetes_y)

# Training R^2: evaluated on the same data used for fitting.
elasticnet_train_r2 = elasticnet_model.score(diabetes_X, diabetes_y)

print(f"Coefficients:\n {elasticnet_model.coef_.T}\n")
print(f"Score: {elasticnet_train_r2}")

## Decision Tree

Tree-based methods are non-parametric models that involve stratifying or segmenting the predictor space into a number of simple regions. They are conceptually simple yet powerful. Their name comes from the fact that the set of splitting rules used to segment the predictor space can be summarized in a tree.

In [None]:
from sklearn.tree import DecisionTreeRegressor

# A float min_samples_leaf is a fraction: each leaf must hold at least 1% of
# the training samples. random_state pins the tree for reproducibility.
decision_tree_model = DecisionTreeRegressor(min_samples_leaf=0.01, random_state=42)
decision_tree_model.fit(diabetes_X, diabetes_y)

# Tree models expose no `coef_`, so only the (training) R^2 is reported.
decision_tree_train_r2 = decision_tree_model.score(diabetes_X, diabetes_y)
print(f"Score: {decision_tree_train_r2}")

## Random Forest

A random forest is an ensemble approach that combines many simple building-block models in order to obtain a single, potentially very powerful model. The blocks are decision trees, each of which uses a random sample of the predictors.

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Ensemble of 100 trees; random_state makes the run reproducible.
random_forest_model = RandomForestRegressor(n_estimators=100, random_state=42)
random_forest_model.fit(diabetes_X, diabetes_y)

# Training R^2: evaluated on the same data used for fitting.
random_forest_train_r2 = random_forest_model.score(diabetes_X, diabetes_y)
print(f"Score: {random_forest_train_r2}")