In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler

from sklearn.feature_selection import SelectKBest, RFE, f_regression

from pydataset import data

# Regression Model Exercises

1. Select a dataset with a continuous target variable.

2. Be sure your data is prepared (no missing values, numeric datatypes) and split into samples.

3. Work through all of the steps outlined in the lesson, from setting the baseline to selecting a model and evaluating the final model on your test data.

## Acquire and Prepare Data

In [2]:
# Load the 'epilepsy' dataset through pydataset's `data` helper.
epilepsy = data('epilepsy')
# Inspect column dtypes and non-null counts before any preparation.
epilepsy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 236 entries, 1 to 593
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   treatment     236 non-null    object
 1   base          236 non-null    int64 
 2   age           236 non-null    int64 
 3   seizure.rate  236 non-null    int64 
 4   period        236 non-null    int64 
 5   subject       236 non-null    int64 
dtypes: int64(5), object(1)
memory usage: 12.9+ KB


In [3]:
# Summary statistics, transposed so that each column becomes a row.
epilepsy.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
base,236.0,31.220339,26.705051,6.0,12.0,22.0,41.0,151.0
age,236.0,28.338983,6.26129,18.0,23.0,28.0,32.0,42.0
seizure.rate,236.0,8.262712,12.35636,0.0,2.75,4.0,9.0,102.0
period,236.0,2.5,1.12041,1.0,1.75,2.5,3.25,4.0
subject,236.0,30.0,17.065581,1.0,15.0,30.0,45.0,59.0


In [4]:
# Swap the dot for an underscore so the target column is reachable through
# attribute access (e.g. `y_train.seizure_rate` further down).
epilepsy = epilepsy.rename(columns = {'seizure.rate' : 'seizure_rate'})

In [5]:
# Confirm the renamed column is present.
epilepsy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 236 entries, 1 to 593
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   treatment     236 non-null    object
 1   base          236 non-null    int64 
 2   age           236 non-null    int64 
 3   seizure_rate  236 non-null    int64 
 4   period        236 non-null    int64 
 5   subject       236 non-null    int64 
dtypes: int64(5), object(1)
memory usage: 12.9+ KB


In [6]:
# One-hot encode the only non-numeric column; both indicator columns are kept
# (drop_first=False).
# NOTE(review): the two indicators always sum to 1, so together they are
# perfectly collinear with an intercept -- confirm that keeping both is intended.
dummy_df = pd.get_dummies(
    epilepsy[['treatment']],
    dummy_na=False,
    drop_first=False,
)

# Remove the raw text column and append the indicators in one step.
epilepsy = pd.concat([epilepsy.drop(columns='treatment'), dummy_df], axis=1)
epilepsy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 236 entries, 1 to 593
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   base                 236 non-null    int64
 1   age                  236 non-null    int64
 2   seizure_rate         236 non-null    int64
 3   period               236 non-null    int64
 4   subject              236 non-null    int64
 5   treatment_Progabide  236 non-null    uint8
 6   treatment_placebo    236 non-null    uint8
dtypes: int64(5), uint8(2)
memory usage: 11.5 KB


## Split Data

In [7]:
# Hold out 20% of the rows as the final test set ...
train_val, test = train_test_split(epilepsy, test_size = 0.2, random_state = 24)
# ... then carve 30% of the remainder off as the validation set. The fixed
# random_state keeps both splits reproducible.
# NOTE(review): 236 rows = 59 subjects x 4 periods, so presumably each subject
# appears once per period; a row-wise split can place the same subject in
# several samples -- confirm this is acceptable for the evaluation.
train, validate = train_test_split(train_val, test_size = 0.3, random_state = 24)

In [8]:
# Separate predictors from the target for each sample. The target is selected
# with a list (`[[...]]`), so every y_* stays a single-column DataFrame.
X_train = train.drop(columns='seizure_rate')
y_train = train[['seizure_rate']]

X_validate = validate.drop(columns='seizure_rate')
y_validate = validate[['seizure_rate']]

X_test = test.drop(columns='seizure_rate')
y_test = test[['seizure_rate']]

## Creating a Baseline

In [9]:
# Let's compare a median baseline versus a mean baseline.
# Each baseline predicts one constant (computed from train) for every train row.
# `len(y_train)` is the row count; `.size` is rows * columns and only matches
# the row count while y_train has exactly one column.
baseline = pd.DataFrame({
    'median' : [y_train.seizure_rate.median()] * len(y_train),
    'mean' : [y_train.seizure_rate.mean()] * len(y_train)
})
baseline.shape

(131, 2)

In [10]:
# Peek at the constant predictions: every row holds the same median / mean.
baseline.head()

Unnamed: 0,median,mean
0,4.0,7.183206
1,4.0,7.183206
2,4.0,7.183206
3,4.0,7.183206
4,4.0,7.183206


In [11]:
# RMSE is computed as sqrt(MSE) rather than via `squared=False`: that keyword
# was deprecated in scikit-learn 1.4 and is rejected by later releases, while
# the square root of the default MSE gives the same value on every version.
print(f'RMSE median baseline: {np.sqrt(mean_squared_error(y_train, baseline["median"]))}')
print(f'RMSE mean baseline: {np.sqrt(mean_squared_error(y_train, baseline["mean"]))}')

RMSE median baseline: 10.099127012381503
RMSE mean baseline: 9.584339585667749


We'll use the mean as our baseline, since it has the lower RMSE on train (9.58 vs. 10.10).

## Create Models

In [12]:
# Mean-baseline predictions for validate: still the *train* mean, repeated once
# per validation row. `len()` is used because `.size` counts cells
# (rows * columns), not rows.
baseline_validate = pd.DataFrame({
    'mean' : [y_train.seizure_rate.mean()] * len(y_validate)
})

# We'll keep all our results in this dictionary, keyed by model name.
# RMSE is sqrt(MSE); the `squared=False` keyword is avoided because it was
# deprecated in scikit-learn 1.4 and is rejected by later releases.
results = {
    'baseline' : {
        'RMSE_train' : np.sqrt(mean_squared_error(y_train, baseline['mean'])),
        'RMSE_validate' : np.sqrt(mean_squared_error(y_validate, baseline_validate['mean']))
    }
}

# Estimators are stored here under the same names used in `results`.
models = {}

### Linear Regression

In [13]:
# Linear regression on the four features kept by recursive feature
# elimination (RFE). `k` is reused by the model cells that follow.
k = 4

model = models['linear regression'] = LinearRegression()

# Run RFE on train and reduce train to the k surviving columns.
rfe = RFE(model, n_features_to_select=k)
X_train_rfe = rfe.fit_transform(X_train, y_train)

# Fit the stored estimator on the reduced matrix; this is the object that
# later cells call `.predict` on.
model.fit(X_train_rfe, y_train)

# Names of the selected columns.
X_train.columns[rfe.support_]

Index(['base', 'age', 'period', 'treatment_placebo'], dtype='object')

In [14]:
# Reduce validate to the same columns RFE kept on train.
X_validate_rfe = rfe.transform(X_validate)

# RMSE is sqrt(MSE); the `squared=False` keyword is avoided because it was
# deprecated in scikit-learn 1.4 and is rejected by later releases.
results['linear regression'] = {
    'RMSE_train' : np.sqrt(mean_squared_error(y_train, model.predict(X_train_rfe))),
    'RMSE_validate' : np.sqrt(mean_squared_error(y_validate, model.predict(X_validate_rfe)))
}

### LassoLars

In [15]:
# LassoLars with its default settings, again restricted to k RFE-selected
# features.
# NOTE(review): in the results table this model's train/validate RMSE are
# identical to the mean baseline's, which suggests the default penalty shrinks
# every coefficient to zero -- consider tuning `alpha`.
model = models['lasso lars'] = LassoLars()

# Run RFE on train and reduce train to the k surviving columns.
rfe = RFE(model, n_features_to_select=k)
X_train_rfe = rfe.fit_transform(X_train, y_train)

# Fit the stored estimator on the reduced matrix.
model.fit(X_train_rfe, y_train)

# Names of the selected columns.
X_train.columns[rfe.support_]

Index(['period', 'subject', 'treatment_Progabide', 'treatment_placebo'], dtype='object')

In [16]:
# Reduce validate to the same columns RFE kept on train.
X_validate_rfe = rfe.transform(X_validate)

# RMSE is sqrt(MSE); the `squared=False` keyword is avoided because it was
# deprecated in scikit-learn 1.4 and is rejected by later releases.
results['lasso lars'] = {
    'RMSE_train' : np.sqrt(mean_squared_error(y_train, model.predict(X_train_rfe))),
    'RMSE_validate' : np.sqrt(mean_squared_error(y_validate, model.predict(X_validate_rfe)))
}

### Tweedie Regressor

In [17]:
# Tweedie GLM with power=1, restricted to k RFE-selected features.
model = TweedieRegressor(power=1)
models['tweedie regressor'] = model

# This estimator expects a 1-D target. Passing the single-column DataFrame
# `y_train` makes every fit (each RFE round plus the final fit) emit a
# DataConversionWarning, so the target is passed as a Series instead.
rfe = RFE(model, n_features_to_select=k)
rfe.fit(X_train, y_train.seizure_rate)

# Reduce train to the selected columns and fit the stored estimator on them.
X_train_rfe = rfe.transform(X_train)
model.fit(X_train_rfe, y_train.seizure_rate)

# Names of the selected columns.
X_train.columns[rfe.get_support()]

  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


Index(['base', 'age', 'period', 'treatment_placebo'], dtype='object')

In [18]:
# Reduce validate to the same columns RFE kept on train.
X_validate_rfe = rfe.transform(X_validate)

# RMSE is sqrt(MSE); the `squared=False` keyword is avoided because it was
# deprecated in scikit-learn 1.4 and is rejected by later releases.
results['tweedie regressor'] = {
    'RMSE_train' : np.sqrt(mean_squared_error(y_train, model.predict(X_train_rfe))),
    'RMSE_validate' : np.sqrt(mean_squared_error(y_validate, model.predict(X_validate_rfe)))
}

### Polynomial Regression

In [19]:
# Degree-2 expansion of the predictors (squares and pairwise products, no
# constant column), followed by linear regression on k RFE-selected terms.
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
poly.fit(X_train)

# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; use whichever one the installed version provides so
# the cell runs on both old and new releases.
X_train_poly = pd.DataFrame(
    poly.transform(X_train),
    columns=(
        poly.get_feature_names_out(X_train.columns)
        if hasattr(poly, 'get_feature_names_out')
        else poly.get_feature_names(X_train.columns)
    ),
    index=X_train.index
)

model = LinearRegression()
models['polynomial regression'] = model

# Run RFE over the expanded feature set.
rfe = RFE(model, n_features_to_select=k)
rfe.fit(X_train_poly, y_train)

# Reduce train to the selected terms and fit the stored estimator on them.
X_train_rfe = rfe.transform(X_train_poly)
model.fit(X_train_rfe, y_train)

# Names of the selected polynomial terms.
X_train_poly.columns[rfe.get_support()]

Index(['base', 'age', 'period', 'treatment_Progabide^2'], dtype='object')

In [20]:
# Expand validate with the transformer fitted on train.
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; use whichever one the installed version provides.
X_validate_poly = pd.DataFrame(
    poly.transform(X_validate),
    columns=(
        poly.get_feature_names_out(X_validate.columns)
        if hasattr(poly, 'get_feature_names_out')
        else poly.get_feature_names(X_validate.columns)
    ),
    index=X_validate.index
)

# Keep only the terms RFE selected on train.
X_validate_rfe = rfe.transform(X_validate_poly)

# RMSE is sqrt(MSE); the `squared=False` keyword is avoided because it was
# deprecated in scikit-learn 1.4 and is rejected by later releases.
results['polynomial regression'] = {
    'RMSE_train' : np.sqrt(mean_squared_error(y_train, model.predict(X_train_rfe))),
    'RMSE_validate' : np.sqrt(mean_squared_error(y_validate, model.predict(X_validate_rfe)))
}

### Polynomial Regression Interactions Only

In [21]:
# Degree-2 expansion limited to interaction terms (no squared terms, no
# constant column), followed by linear regression on k RFE-selected terms.
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
poly.fit(X_train)

# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; use whichever one the installed version provides so
# the cell runs on both old and new releases.
X_train_poly = pd.DataFrame(
    poly.transform(X_train),
    columns=(
        poly.get_feature_names_out(X_train.columns)
        if hasattr(poly, 'get_feature_names_out')
        else poly.get_feature_names(X_train.columns)
    ),
    index=X_train.index
)

model = LinearRegression()
models['polynomial regression interaction only'] = model

# Run RFE over the expanded feature set.
rfe = RFE(model, n_features_to_select=k)
rfe.fit(X_train_poly, y_train)

# Reduce train to the selected terms and fit the stored estimator on them.
X_train_rfe = rfe.transform(X_train_poly)
model.fit(X_train_rfe, y_train)

# Names of the selected terms.
X_train_poly.columns[rfe.get_support()]

Index(['base', 'treatment_Progabide', 'treatment_placebo',
       'period treatment_placebo'],
      dtype='object')

In [22]:
# Expand validate with the transformer fitted on train.
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; use whichever one the installed version provides.
X_validate_poly = pd.DataFrame(
    poly.transform(X_validate),
    columns=(
        poly.get_feature_names_out(X_validate.columns)
        if hasattr(poly, 'get_feature_names_out')
        else poly.get_feature_names(X_validate.columns)
    ),
    index=X_validate.index
)

# Keep only the terms RFE selected on train.
X_validate_rfe = rfe.transform(X_validate_poly)

# RMSE is sqrt(MSE); the `squared=False` keyword is avoided because it was
# deprecated in scikit-learn 1.4 and is rejected by later releases.
results['polynomial regression interaction only'] = {
    'RMSE_train' : np.sqrt(mean_squared_error(y_train, model.predict(X_train_rfe))),
    'RMSE_validate' : np.sqrt(mean_squared_error(y_validate, model.predict(X_validate_rfe)))
}

## Results

In [23]:
# One row per model, one column per metric (train vs. validate RMSE).
pd.DataFrame(results).T

Unnamed: 0,RMSE_train,RMSE_validate
baseline,9.58434,17.115481
linear regression,5.875694,12.532288
lasso lars,9.58434,17.115481
tweedie regressor,5.167301,10.806308
polynomial regression,5.875694,12.532288
polynomial regression interaction only,5.928869,12.630305


In [24]:
# R^2 of the Tweedie model on validate. `rfe` has been overwritten by later
# cells, so the columns are listed by hand; they must stay identical (names and
# order) to the features RFE selected when this model was fitted.
r2_score(y_validate, models['tweedie regressor'].predict(X_validate[['base', 'age', 'period', 'treatment_placebo']]))

0.587062505042971

In [25]:
# R^2 of the plain linear regression on validate, for comparison. The
# hand-written column list must stay identical (names and order) to the
# features RFE selected when this model was fitted.
r2_score(y_validate, models['linear regression'].predict(X_validate[['base', 'age', 'period', 'treatment_placebo']]))

0.4446197643699248

In [27]:
# How does the best model perform on test?
# The Tweedie regressor had the lowest validate RMSE, so it is the one scored
# on the held-out test set. The column list must match the features RFE
# selected when the model was fitted.
r2_score(y_test, models['tweedie regressor'].predict(X_test[['base', 'age', 'period', 'treatment_placebo']]))

0.8441519120719873

In [29]:
# Test-set RMSE of the Tweedie model, computed as sqrt(MSE). The
# `squared=False` keyword is avoided because it was deprecated in
# scikit-learn 1.4 and is rejected by later releases.
# The column list must match the features RFE selected when the model was fitted.
np.sqrt(
    mean_squared_error(
        y_test,
        models['tweedie regressor'].predict(X_test[['base', 'age', 'period', 'treatment_placebo']])
    )
)

4.872431442114245