# Regression Exercises: Feature Engineering

In [273]:
import pandas as pd
import numpy as np
import wrangle as w
import warnings
from pydataset import data
from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.linear_model import LinearRegression
import sklearn.preprocessing
warnings.filterwarnings("ignore")

## 1. Load the tips dataset.

In [274]:
tips = data('tips')

In [275]:
tips.info()

<class 'pandas.core.frame.DataFrame'>
Index: 244 entries, 1 to 244
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 15.2+ KB


#### 1.a. Create a column named price_per_person. This should be the total bill divided by the party size.

In [276]:
tips['price_per_person'] = tips['total_bill']/tips['size']

#### 1.b. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?
><b> ANSWER: total_bill should be by far the most important, followed by party size. </b>

#### 1.c. Use Select K Best to select the top 2 features for predicting tip amount. What are they?


In [277]:
# split data
train, validate, test = w.split_function(tips)

In [278]:
train.sample(10)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
160,16.49,2.0,Male,No,Sun,Dinner,4,4.1225
231,24.01,2.0,Male,Yes,Sat,Dinner,4,6.0025
75,14.73,2.2,Female,No,Sat,Dinner,2,7.365
101,11.35,2.5,Female,Yes,Fri,Dinner,2,5.675
111,14.0,3.0,Male,No,Sat,Dinner,2,7.0
64,18.29,3.76,Male,Yes,Sat,Dinner,4,4.5725
19,16.97,3.5,Female,No,Sun,Dinner,3,5.656667
240,29.03,5.92,Male,No,Sat,Dinner,3,9.676667
222,13.42,3.48,Female,Yes,Fri,Lunch,2,6.71
15,14.83,3.02,Female,No,Sun,Dinner,2,7.415


In [279]:
# Manually encode binary categorical variables
train['gender_encoded'] = train.sex.map({'Female': 1, 'Male': 0})
train['smoker_encoded'] = train.smoker.map({'Yes': 1, 'No': 0})
train['time_encoded'] = train.time.map({'Lunch': 1, 'Dinner': 0})

# Use pd.get_dummies to encode nonbinary categorical variable 'day'
dummy_train = pd.get_dummies(train[['day']],drop_first=True).astype(int)

# concat dummy_train to train
train = pd.concat([train, dummy_train], axis=1)

In [280]:
train.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size',
       'price_per_person', 'gender_encoded', 'smoker_encoded', 'time_encoded',
       'day_Sat', 'day_Sun', 'day_Thur'],
      dtype='object')

In [281]:
# split train into x_train and y_train
x_train = train.drop(columns=['tip','smoker','day','time','sex'])
y_train = train.tip
y_train = pd.DataFrame(y_train)

In [282]:
# scale
scaler = sklearn.preprocessing.MinMaxScaler()
scaler.fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_train_scaled = pd.DataFrame(x_train_scaled)

In [283]:
# initialize f_selector object: f_regression stats test, 2 features
f_selector = SelectKBest(f_regression, k=2)

# fit object to find top 2 features correlated with y
f_selector.fit(x_train_scaled, y_train)

# show me which columns were selected
feature_mask = f_selector.get_support()
f_feature = x_train.iloc[:,feature_mask].columns.tolist()
f_feature

['total_bill', 'size']

#### 1.d. Use Recursive Feature Elimination to select the top 2 features for tip amount. What are they?


In [284]:
# initialize LinearRegression algorithm
lm = LinearRegression()

# create rfe object: number of features=2
rfe = RFE(lm, n_features_to_select=2)

# fit object
rfe.fit(x_train_scaled,y_train)

# mask to get selected columns
feature_mask = rfe.support_

# show me selected columns
rfe_feature = x_train.iloc[:,feature_mask].columns.tolist()
rfe_feature

['total_bill', 'price_per_person']

#### 1.e. Why do you think Select K Best and Recursive Feature Elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?
> <b> ANSWER: Because SelectKBest selects based on hypothesis testing it may select two features that are not independent from each other. On the other had, RFE actually tests each in the model and scores the model. When including 3 features it picks the same three. </b>

## 2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.


In [285]:
def select_kbest(x,y,k=2):
    # initialize f_selector object: f_regression stats test
    f_selector = SelectKBest(f_regression, k=k)
    
    # fit object to find top 2 features correlated with y
    f_selector.fit(x, y)

    # show me which columns were selected
    feature_mask = f_selector.get_support()
    f_feature = x.iloc[:,feature_mask].columns.tolist()
    return f_feature

In [286]:
select_kbest(x_train, y_train)

['total_bill', 'size']

## 3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top n features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.



In [287]:
def rfe(x,y,n_features_to_select=2):
    # initialize LinearRegression algorithm
    lm = LinearRegression()
    
    # create rfe object
    rfe = RFE(lm, n_features_to_select=n_features_to_select)   
    
    # fit object
    rfe.fit(x,y)
    
    # mask to get selected columns
    feature_mask = rfe.support_
    
    # show me selected columns
    rfe_feature = x.iloc[:,feature_mask].columns.tolist()
    return rfe_feature

In [288]:
rfe(x_train,y_train)

['time_encoded', 'day_Thur']

## 4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both Select K Best and Recursive Feature Elimination (use the functions you just built to help you out).

In [289]:
swiss = data('swiss')

In [290]:
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [291]:
# split data
train, validate, test = w.split_function(swiss)

In [292]:
# split train into x_train and y_train
x_train = train.drop(columns=['Fertility'])
y_train = train.Fertility

In [293]:
# scale
scaler = sklearn.preprocessing.MinMaxScaler()
scaler.fit(x_train)
x_train_scaled = scaler.transform(x_train)

In [294]:
x_train_scaled = pd.DataFrame(x_train_scaled)

In [295]:
select_kbest(x_train, y_train, 3)

['Examination', 'Education', 'Catholic']

In [296]:
rfe(x_train, y_train, 3)

['Examination', 'Education', 'Infant.Mortality']