In [1]:
import warnings
# Suppress every warning for the rest of the kernel session.
# NOTE(review): with no category/module filter this also hides deprecation
# warnings raised by the libraries used below -- consider narrowing it.
warnings.filterwarnings('ignore')

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Childhood Respiratory Disease

Keywords: polynomial regression, multiple regression.

## Description

FEV (forced expiratory volume) is an index of pulmonary function that measures the volume of air expelled after one second of constant effort. The data contain determinations of FEV on 654 children ages 6-22 who were seen in the Childhood Respiratory Disease Study in 1980 in East Boston, Massachusetts. The data are part of a larger study to follow the change in pulmonary function over time in children.

ID	 - 	ID number
Age	 - 	years
FEV	 - 	litres
Height	 - 	inches
Sex	 - 	Male or Female
Smoker	 - 	Non = nonsmoker, Current = current smoker


## Source

Tager, I. B., Weiss, S. T., Rosner, B., and Speizer, F. E. (1979). Effect of parental cigarette smoking on pulmonary function in children. American Journal of Epidemiology, 110, 15-26.
Rosner, B. (1990). Fundamentals of Biostatistics, 3rd Edition. PWS-Kent, Boston, Massachusetts.


In [3]:
# Load the FEV study data from disk into a DataFrame and preview the
# first five rows.
smoking = pd.read_csv(filepath_or_buffer='../Resources/smoking.csv')
smoking.head(n=5)

Unnamed: 0,Id,Age,FEV,Height,Sex,Smoker
0,301,9,1.708,57.0,Female,Non
1,451,8,1.724,67.5,Female,Non
2,501,7,1.72,54.5,Female,Non
3,642,9,1.558,53.0,Male,Non
4,901,9,1.895,57.0,Male,Non


In [4]:
# Predictors: Age, Height, Sex and Smoker; target: FEV.
# Both selections use a list of labels, so X and y are DataFrames
# (y is a single-column frame, not a Series).
X = smoking.loc[:, ['Age', 'Height', 'Sex', 'Smoker']]
y = smoking.loc[:, ["FEV"]]

In [5]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, ElasticNet, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector

# Column-wise preprocessing: numeric columns are standardised, object
# (string) columns are one-hot encoded. Columns are picked by dtype, so the
# transformer does not need to know their names.
numeric_selector = make_column_selector(dtype_include=np.number)
categorical_selector = make_column_selector(dtype_include=object)

ct = make_column_transformer(
    (StandardScaler(), numeric_selector),
    (OneHotEncoder(), categorical_selector),
)

In [7]:
# Hold out part of the data for testing; the fixed seed keeps the split
# reproducible across kernel restarts.

### BEGIN SOLUTION
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)
### END SOLUTION

## Grid Search or Hyper-Parameter Tuning

Search for the best value of a model parameter over a given range of candidate values.

To leverage MLflow, we can't use the prebuilt `GridSearchCV` class from scikit-learn.

Instead, we program our own grid search.

In [8]:
import mlflow
from concurrent.futures import ThreadPoolExecutor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

class CustomGridSearchCV():
    """
    Hand-rolled grid search over ``alpha`` that records every step in MLflow.

    For each candidate alpha one parent MLflow run is opened; inside it, one
    nested run is created per cross-validation fold. Fold-level metrics are
    logged on the nested runs and their mean is logged on the parent run.

    name - estimator class name (e.g. "Ridge"); its lower-cased form is used
           as the step prefix of the ``<step>__alpha`` parameter, which is
           the step name ``make_pipeline`` generates for that estimator
    model - scikit-learn model (expected to be a pipeline whose final step
            exposes an ``alpha`` parameter)
    X, Y - features and target; rows are selected positionally via ``.iloc``
    alpha_params - list of alpha parameter values to try
    max_p - max number of cpus to use (stored only; the search currently
            runs sequentially)
    n_splits - number of cross-validation folds
    """
    def __init__(self, name, model, X, Y, alpha_params, max_p=4, n_splits=5):
        self.name = name
        self.model = model
        self.X = X
        self.Y = Y
        self.alpha_params = alpha_params
        self.max_p = max_p
        self.n_splits = n_splits
        # Honour the requested number of folds instead of a hard-coded 5.
        # ``random_state`` only has an effect (and is only accepted by recent
        # scikit-learn releases) together with ``shuffle=True``.
        self.k_fold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
        self.tracking_client = mlflow.tracking.MlflowClient()

    def quantify(self, model, train, test):
        """
        Score a fitted model on one fold.

        model - fitted estimator exposing ``predict`` and ``score``
        train, test - positional row indices of the training / held-out rows

        Returns a dict with the keys "test_MSE", "test_r2", "train_MSE" and
        "train_r2"; the ``*_r2`` values are whatever ``model.score`` returns.
        """
        X_test, Y_test = self.X.iloc[test], self.Y.iloc[test]
        X_train, Y_train = self.X.iloc[train], self.Y.iloc[train]
        test_MSE = mean_squared_error(Y_test, model.predict(X_test))
        test_r2 = model.score(X_test, Y_test)
        train_MSE = mean_squared_error(Y_train, model.predict(X_train))
        train_r2 = model.score(X_train, Y_train)
        return {"test_MSE": test_MSE, "test_r2": test_r2, "train_MSE": train_MSE, "train_r2": train_r2}

    def run_one(self, experiment_id):
        """
        Build the callable that cross-validates a single alpha value.

        experiment_id - accepted for interface compatibility; not used

        The returned function takes ``alpha_param``, must be called while the
        parent MLflow run is active, and returns None.
        """

        def func(alpha_param):
            # Read the step prefix from the instance rather than from a
            # notebook-level global, so the class works on its own.
            key = f"{self.name.lower()}__alpha"
            # set_params mutates and returns self.model; refitting on every
            # fold discards the previous fold's fitted state.
            model = self.model.set_params(**{key: alpha_param})
            mse_list, r2_list = ([], [])
            for i, (train, test) in enumerate(self.k_fold.split(self.X, self.Y)):
                run_name = f"K = {i+1}"
                # Fit and log while the child run is still open so that its
                # duration and final status reflect the actual work.
                with mlflow.start_run(nested=True, run_name=run_name) as child_run:
                    child_run_id = child_run.info.run_id
                    model.fit(self.X.iloc[train], self.Y.iloc[train])
                    metrics = self.quantify(model, train, test)
                    # "MSE"/"R2" are the held-out-fold scores: selecting alpha
                    # on training scores would always favour the weakest
                    # regularisation. Training scores are kept for reference.
                    self.tracking_client.log_metric(child_run_id, "MSE", metrics["test_MSE"])
                    self.tracking_client.log_metric(child_run_id, "R2", metrics["test_r2"])
                    self.tracking_client.log_metric(child_run_id, "train_MSE", metrics["train_MSE"])
                    self.tracking_client.log_metric(child_run_id, "train_R2", metrics["train_r2"])
                mse_list.append(metrics["test_MSE"])
                r2_list.append(metrics["test_r2"])
            # Back on the parent run: record the setting and the fold average.
            mlflow.log_params({"name": self.name,"alpha": alpha_param,"alpha_name": f"{self.name} alpha={alpha_param}"})
            mlflow.log_metric("MSE", np.mean(mse_list))
            mlflow.log_metric("R2", np.mean(r2_list))

        return func

    def run_all(self):
        """Run the cross-validated search once for every value in ``self.alpha_params``."""
        # Iterate the instance's own candidates, not a same-named global.
        for alpha_param in self.alpha_params:
            run_name = f"{self.name} - alpha={alpha_param}"
            with mlflow.start_run(run_name=run_name) as run:
                experiment_id = run.info.experiment_id
                self.run_one(experiment_id)(alpha_param)

In [9]:
# Candidate estimators and the regularisation strengths to try for each.
lr_models = [LinearRegression(), ElasticNet(), Ridge(), Lasso()]
alpha_params = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0]
for model in lr_models:
    name = type(model).__name__
    # Estimators without an ``alpha`` parameter (plain LinearRegression)
    # have nothing to tune here, so skip them.
    if "alpha" not in model.get_params():
        continue
    # Prepend the shared preprocessing so scaling/encoding is refit per fold.
    model = make_pipeline(ct, model)
    # Tune on the training split only: searching over the full data would
    # leak the held-out test rows into the choice of alpha.
    grid = CustomGridSearchCV(name, model, X_train, y_train, alpha_params, max_p=4, n_splits=5)
    grid.run_all()