In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler

from sklearn.feature_selection import SelectKBest, RFE, f_regression

from pydataset import data

# Regression Model Exercises

1. Select a dataset with a continuous target variable.

2. Be sure your data is prepared (no missing values, numeric datatypes) and split into samples.

3. Work through all of the steps outlined in the lesson, from setting the baseline to selecting a model and evaluating the final model on your test data.

## Acquire and Prepare Data

In [21]:
# Load the 'epilepsy' dataset through pydataset's `data` helper (imported above).
epilepsy = data('epilepsy')
# Inspect column names, dtypes and non-null counts before any preparation.
epilepsy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 236 entries, 1 to 593
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   treatment     236 non-null    object
 1   base          236 non-null    int64 
 2   age           236 non-null    int64 
 3   seizure.rate  236 non-null    int64 
 4   period        236 non-null    int64 
 5   subject       236 non-null    int64 
dtypes: int64(5), object(1)
memory usage: 12.9+ KB


In [22]:
# Summary statistics for the numeric columns, transposed so that each
# variable is a row and each statistic is a column.
epilepsy.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
base,236.0,31.220339,26.705051,6.0,12.0,22.0,41.0,151.0
age,236.0,28.338983,6.26129,18.0,23.0,28.0,32.0,42.0
seizure.rate,236.0,8.262712,12.35636,0.0,2.75,4.0,9.0,102.0
period,236.0,2.5,1.12041,1.0,1.75,2.5,3.25,4.0
subject,236.0,30.0,17.065581,1.0,15.0,30.0,45.0,59.0


In [24]:
# Replace the dot in 'seizure.rate' with an underscore so the column name is
# a valid Python identifier (usable with attribute access and formula strings).
epilepsy = epilepsy.rename({'seizure.rate' : 'seizure_rate'}, axis = 'columns')

In [25]:
# Re-inspect the frame to confirm the column is now named `seizure_rate`.
epilepsy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 236 entries, 1 to 593
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   treatment     236 non-null    object
 1   base          236 non-null    int64 
 2   age           236 non-null    int64 
 3   seizure_rate  236 non-null    int64 
 4   period        236 non-null    int64 
 5   subject       236 non-null    int64 
dtypes: int64(5), object(1)
memory usage: 12.9+ KB


In [27]:
# One-hot encode the categorical `treatment` column so every feature is numeric.
#
# * The guard makes the cell safe to re-run: once `treatment` has been replaced
#   by its dummy columns there is nothing left to encode, and the original
#   unguarded code would raise a KeyError on a second execution.
# * `dtype = np.uint8` pins the dummy dtype; pandas >= 2.0 otherwise returns
#   bool columns, while older versions return uint8.
#
# NOTE(review): with drop_first = False both treatment_* columns are kept; they
# always sum to 1, so they are collinear with a model intercept. Confirm this is
# intended before interpreting linear-model coefficients.
if 'treatment' in epilepsy.columns:
    dummy_df = pd.get_dummies(
        epilepsy[['treatment']],
        dummy_na = False,
        drop_first = False,
        dtype = np.uint8,
    )
    # Drop the source column and append the dummies at the end of the frame.
    epilepsy = pd.concat([epilepsy.drop(columns = 'treatment'), dummy_df], axis = 1)

epilepsy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 236 entries, 1 to 593
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   base                 236 non-null    int64
 1   age                  236 non-null    int64
 2   seizure_rate         236 non-null    int64
 3   period               236 non-null    int64
 4   subject              236 non-null    int64
 5   treatment_Progabide  236 non-null    uint8
 6   treatment_placebo    236 non-null    uint8
dtypes: int64(5), uint8(2)
memory usage: 11.5 KB


## Split Data

In [28]:
# Fixed seed so the three samples (and every metric computed from them) are
# identical on each Restart Kernel -> Run All; without it the split changes on
# every execution.
RANDOM_SEED = 123

# First hold out 20% of the rows as the final test set, then split the
# remaining 80% into train (70%) and validate (30%).
#
# NOTE(review): the summary statistics above show 236 rows, subjects 1-59 and
# periods 1-4, i.e. repeated measurements per subject. A row-wise random split
# can place the same subject in more than one sample - consider a grouped split
# on `subject` if that leakage matters.
train_val, test = train_test_split(epilepsy, test_size = 0.2, random_state = RANDOM_SEED)
train, validate = train_test_split(train_val, test_size = 0.3, random_state = RANDOM_SEED)

In [29]:
# Separate the predictors (X_*) from the target (y_*) for each sample.
# The target is selected with a list of column names, so every y_* remains a
# one-column DataFrame rather than a Series.
# NOTE(review): the `subject` identifier column stays in X_* - confirm it is
# meant to be used as a feature.

# Train
X_train = train.drop(columns = 'seizure_rate')
y_train = train[['seizure_rate']]

# Validate
X_validate = validate.drop(columns = 'seizure_rate')
y_validate = validate[['seizure_rate']]

# Test
X_test = test.drop(columns = 'seizure_rate')
y_test = test[['seizure_rate']]

## 