# Feature Engineering Exercises

In [1]:
import pandas as pd
import numpy as np
from pydataset import data
import matplotlib.pyplot as plt

from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

Do your work for this exercise in a jupyter notebook named `feature_engineering` within the `regression-exercises` repo. Add, commit, and push your work.

## 1.

Load the `tips` dataset.

In [2]:
# load the `tips` dataset through pydataset's `data` loader and preview the first rows
tips = data('tips')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
# which categories does the `day` column contain, and how many rows fall in each?
tips.day.value_counts()

Sat     87
Sun     76
Thur    62
Fri     19
Name: day, dtype: int64

**Next things to do:**
- Encode all binary categorical columns using:
    - tips['smoker'] = (tips.smoker == 'Yes').astype(int)
    - tips['dinner'] = (tips.time == 'Dinner').astype(int)
    - same for sex
- use pd.get_dummies for day column
    - Should I drop or keep the last column
    - I get that it is redundant, but how can you tell if it would have had impact or been an important feature?

In [4]:
# change binary categorical columns to 1s and 0s
# `smoker` is overwritten in place, so the cell must stay correct when re-run:
# comparing only against 'Yes' would turn an already-encoded column into all 0s.
# Accepting the encoded value 1 as well makes the cell idempotent.
tips['smoker'] = tips.smoker.isin(['Yes', 1]).astype(int)
# `dinner` and `male` are new columns derived from untouched string columns,
# so they are already safe to recompute.
tips['dinner'] = (tips.time == 'Dinner').astype(int)
tips['male'] = (tips.sex == 'Male').astype(int)

In [5]:
# confirm the encoding: smoker is now 0/1 and the dinner / male columns were added
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,dinner,male
1,16.99,1.01,Female,0,Sun,Dinner,2,1,0
2,10.34,1.66,Male,0,Sun,Dinner,3,1,1
3,21.01,3.5,Male,0,Sun,Dinner,3,1,1
4,23.68,3.31,Male,0,Sun,Dinner,2,1,1
5,24.59,3.61,Female,0,Sun,Dinner,4,1,0


In [6]:
# One-hot encode `day` into day_Fri / day_Sat / day_Sun / day_Thur.
# dtype=int keeps the indicators as 0/1 integers on every pandas version
# (pandas >= 2.0 would otherwise produce True/False columns).
# The guard makes the cell safe to re-run: after the first run the `day`
# column no longer exists and get_dummies would raise a KeyError.
if 'day' in tips.columns:
    tips = pd.get_dummies(data=tips, columns=['day'], dtype=int)

In [7]:
# inspect the frame after one-hot encoding: `day` is replaced by the four day_* indicator columns
tips

Unnamed: 0,total_bill,tip,sex,smoker,time,size,dinner,male,day_Fri,day_Sat,day_Sun,day_Thur
1,16.99,1.01,Female,0,Dinner,2,1,0,0,0,1,0
2,10.34,1.66,Male,0,Dinner,3,1,1,0,0,1,0
3,21.01,3.50,Male,0,Dinner,3,1,1,0,0,1,0
4,23.68,3.31,Male,0,Dinner,2,1,1,0,0,1,0
5,24.59,3.61,Female,0,Dinner,4,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
240,29.03,5.92,Male,0,Dinner,3,1,1,0,1,0,0
241,27.18,2.00,Female,1,Dinner,2,1,0,0,1,0,0
242,22.67,2.00,Male,1,Dinner,2,1,1,0,1,0,0
243,17.82,1.75,Male,0,Dinner,2,1,1,0,1,0,0


- Create a column named `tip_percentage`. This should be the tip amount divided by the total bill.

In [8]:
# tip as a fraction of the total bill (e.g. 0.16 means a 16% tip)
tip_fraction = tips['tip'] / tips['total_bill']
tips['tip_percentage'] = tip_fraction
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,time,size,dinner,male,day_Fri,day_Sat,day_Sun,day_Thur,tip_percentage
1,16.99,1.01,Female,0,Dinner,2,1,0,0,0,1,0,0.059447
2,10.34,1.66,Male,0,Dinner,3,1,1,0,0,1,0,0.160542
3,21.01,3.5,Male,0,Dinner,3,1,1,0,0,1,0,0.166587
4,23.68,3.31,Male,0,Dinner,2,1,1,0,0,1,0,0.13978
5,24.59,3.61,Female,0,Dinner,4,1,0,0,0,1,0,0.146808


- Create a column named `price_per_person`. This should be the total bill divided by the party size.

In [9]:
# average spend per diner; `size` needs bracket access because `tips.size`
# resolves to the DataFrame's own element-count attribute, not the column
party_size = tips['size']
tips['price_per_person'] = tips['total_bill'] / party_size
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,time,size,dinner,male,day_Fri,day_Sat,day_Sun,day_Thur,tip_percentage,price_per_person
1,16.99,1.01,Female,0,Dinner,2,1,0,0,0,1,0,0.059447,8.495
2,10.34,1.66,Male,0,Dinner,3,1,1,0,0,1,0,0.160542,3.446667
3,21.01,3.5,Male,0,Dinner,3,1,1,0,0,1,0,0.166587,7.003333
4,23.68,3.31,Male,0,Dinner,2,1,1,0,0,1,0,0.13978,11.84
5,24.59,3.61,Female,0,Dinner,4,1,0,0,0,1,0,0.146808,6.1475


- Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?

**I think total_bill, size, and price_per_person will be best predictors of tip amount**

- Use select k best and recursive feature elimination to select the top 2 features for predicting tip amount. What are they?

### Target = tip

In [10]:
# split data: hold out 20% as test, then take 25% of the remaining 80%
# (20% of the full data) as validate, which leaves 60% for train
train_validate, test = train_test_split(tips, random_state=123, test_size=0.20)
train, validate = train_test_split(train_validate, random_state=123, test_size=.25)

In [11]:
# (rows, columns) of each split, in train / validate / test order
train.shape, validate.shape, test.shape

((146, 14), (49, 14), (49, 14))

In [12]:
# `wrangle` supplies the add_scaled_columns helper used below to scale the continuous columns
# NOTE(review): wrangle is not part of this notebook — presumably a local wrangle.py; confirm it is importable
import wrangle as w

In [13]:
# add MinMax-scaled copies (suffix `_scaled`) of the continuous columns to every split
# NOTE(review): add_scaled_columns lives in wrangle (not shown) — confirm it fits the scaler on train only
continuous_cols = ['total_bill', 'size', 'price_per_person']
train_scaled, validate_scaled, test_scaled = w.add_scaled_columns(
    train, validate, test, MinMaxScaler(), continuous_cols
)

In [14]:
# confirm the three *_scaled columns were appended next to the original columns
train_scaled.head()

Unnamed: 0,total_bill,tip,sex,smoker,time,size,dinner,male,day_Fri,day_Sat,day_Sun,day_Thur,tip_percentage,price_per_person,total_bill_scaled,size_scaled,price_per_person_scaled
225,13.42,1.58,Male,1,Lunch,2,0,1,1,0,0,0,0.117735,6.71,0.228679,0.2,0.211566
182,23.33,5.65,Male,1,Dinner,2,1,1,0,0,1,0,0.242177,11.665,0.447636,0.2,0.499564
103,44.3,2.5,Female,1,Dinner,3,1,0,0,1,0,0,0.056433,14.766667,0.910959,0.4,0.679841
165,17.51,3.0,Female,1,Dinner,2,1,0,0,0,1,0,0.171331,8.755,0.319046,0.2,0.330427
74,25.28,5.0,Female,1,Dinner,2,1,0,0,1,0,0,0.197785,12.64,0.49072,0.2,0.556234


In [15]:
# Build the feature matrix (X) and target series (y) for each split; target = tip.
# Dropped from X: the target itself, tip_percentage (derived from the target),
# the raw string columns, and the unscaled originals of the *_scaled columns.
non_feature_cols = ['tip', 'sex', 'time', 'tip_percentage', 'total_bill', 'size', 'price_per_person']

X_train, y_train = train_scaled.drop(columns=non_feature_cols), train_scaled.tip
X_validate, y_validate = validate_scaled.drop(columns=non_feature_cols), validate_scaled.tip
X_test, y_test = test_scaled.drop(columns=non_feature_cols), test_scaled.tip

In [16]:
# only numeric model inputs should remain: the 0/1 indicators and the *_scaled columns
X_train.head()

Unnamed: 0,smoker,dinner,male,day_Fri,day_Sat,day_Sun,day_Thur,total_bill_scaled,size_scaled,price_per_person_scaled
225,1,0,1,1,0,0,0,0.228679,0.2,0.211566
182,1,1,1,0,0,1,0,0.447636,0.2,0.499564
103,1,1,0,0,1,0,0,0.910959,0.4,0.679841
165,1,1,0,0,0,1,0,0.319046,0.2,0.330427
74,1,1,0,0,1,0,0,0.49072,0.2,0.556234


#### KBest

In [17]:
# create the selector: keep the 2 features with the highest f_regression score
kbest = SelectKBest(score_func=f_regression, k=2)

In [18]:
# fit the selector: score every column of X_train against y_train (tip)
kbest.fit(X_train, y_train)

SelectKBest(k=2, score_func=<function f_regression at 0x7fb86b0219d0>)

In [19]:
# get_support() is a boolean mask over the columns; use it to recover the selected names
X_train.columns[kbest.get_support()].tolist()

['total_bill_scaled', 'size_scaled']

#### RFE

In [20]:
# create the eliminator: a linear model is refit repeatedly until 2 features remain
# NOTE: the name `rfe` is rebound to a function later in this notebook (exercise 3),
# so the cells using this object must run before that definition.
linear_model = LinearRegression()
rfe = RFE(estimator=linear_model, n_features_to_select=2)

In [21]:
# fit the eliminator on the training features with tip as the target
rfe.fit(X_train, y_train)

RFE(estimator=LinearRegression(), n_features_to_select=2)

In [22]:
# names of the 2 columns that survived elimination
X_train.columns[rfe.get_support()].tolist()

['total_bill_scaled', 'price_per_person_scaled']

In [23]:
# label each RFE rank with its column name
rankings = pd.Series(rfe.ranking_, index=X_train.columns)

In [24]:
# rank 1 marks the selected features; higher ranks were eliminated earlier
rankings.sort_values()

total_bill_scaled          1
price_per_person_scaled    1
day_Sun                    2
male                       3
day_Fri                    4
day_Sat                    5
dinner                     6
size_scaled                7
day_Thur                   8
smoker                     9
dtype: int64

- Use select k best and recursive feature elimination to select the top 2 features for predicting tip percentage. What are they?

### Target = tip_percentage

In [25]:
# Build the feature matrix (X) and target series (y) for each split; target = tip_percentage.
# Dropped from X: the target itself, tip (the target is computed from it),
# the raw string columns, and the unscaled originals of the *_scaled columns.
non_feature_cols = ['tip', 'sex', 'time', 'tip_percentage', 'total_bill', 'size', 'price_per_person']

X_train, y_train = train_scaled.drop(columns=non_feature_cols), train_scaled.tip_percentage
X_validate, y_validate = validate_scaled.drop(columns=non_feature_cols), validate_scaled.tip_percentage
X_test, y_test = test_scaled.drop(columns=non_feature_cols), test_scaled.tip_percentage

In [26]:
# same feature columns as before; only the target (y) changed
X_train.head()

Unnamed: 0,smoker,dinner,male,day_Fri,day_Sat,day_Sun,day_Thur,total_bill_scaled,size_scaled,price_per_person_scaled
225,1,0,1,1,0,0,0,0.228679,0.2,0.211566
182,1,1,1,0,0,1,0,0.447636,0.2,0.499564
103,1,1,0,0,1,0,0,0.910959,0.4,0.679841
165,1,1,0,0,0,1,0,0.319046,0.2,0.330427
74,1,1,0,0,1,0,0,0.49072,0.2,0.556234


#### KBest

In [27]:
# create a fresh selector: keep the 2 features with the highest f_regression score
kbest = SelectKBest(score_func=f_regression, k=2)

In [28]:
# fit the selector: score every column of X_train against y_train (tip_percentage)
kbest.fit(X_train, y_train)

SelectKBest(k=2, score_func=<function f_regression at 0x7fb86b0219d0>)

In [29]:
# names of the 2 columns selected for the tip_percentage target
X_train.columns[kbest.get_support()].tolist()

['total_bill_scaled', 'price_per_person_scaled']

#### RFE

In [30]:
# create a fresh eliminator: a linear model is refit repeatedly until 2 features remain
# NOTE: the name `rfe` is rebound to a function later in this notebook (exercise 3),
# so the cells using this object must run before that definition.
linear_model = LinearRegression()
rfe = RFE(estimator=linear_model, n_features_to_select=2)

In [31]:
# fit the eliminator on the training features with tip_percentage as the target
rfe.fit(X_train, y_train)

RFE(estimator=LinearRegression(), n_features_to_select=2)

In [32]:
# names of the 2 columns that survived elimination for the tip_percentage target
X_train.columns[rfe.get_support()].tolist()

['size_scaled', 'price_per_person_scaled']

In [33]:
# label each RFE rank with its column name
rankings = pd.Series(rfe.ranking_, index=X_train.columns)

In [34]:
# rank 1 marks the selected features; higher ranks were eliminated earlier
rankings.sort_values()

size_scaled                1
price_per_person_scaled    1
total_bill_scaled          2
day_Sun                    3
smoker                     4
male                       5
day_Sat                    6
day_Fri                    7
day_Thur                   8
dinner                     9
dtype: int64

- Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

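**They probably gave different features because SelectKBest scores each column in isolation (a univariate F-test against the target), while RFE fits a model on all columns together and repeatedly removes the weakest one, so it takes the relationships between columns into account**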

**Each has provided different results for varying numbers of features selected**

## 2.

Write a function named `select_kbest` that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the `SelectKBest` class. Test your function with the `tips` dataset. You should see the same results as when you did the process manually.

In [35]:
def select_kbest(X, y, k):
    """Return the names of the top-k features chosen by SelectKBest.

    Parameters
    ----------
    X : DataFrame of predictors (its ``columns`` supply the feature names).
    y : target values passed to ``SelectKBest.fit``.
    k : number of features to keep.

    Returns
    -------
    list of column labels from ``X`` selected by an ``f_regression``-scored
    ``SelectKBest``, in the column order of ``X``.
    """
    selector = SelectKBest(f_regression, k=k).fit(X, y)
    # get_support() is a boolean mask aligned with X's columns
    keep_mask = selector.get_support()
    return X.columns[keep_mask].tolist()

In [36]:
# should match the manual SelectKBest result for the tip_percentage target above
select_kbest(X_train, y_train, 2)

['total_bill_scaled', 'price_per_person_scaled']

## 3.

Write a function named `rfe` that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the `RFE` class. Test your function with the `tips` dataset. You should see the same results as when you did the process manually.

In [37]:
def rfe(X, y, n):
    """Return the names of the n features kept by recursive feature elimination.

    Parameters
    ----------
    X : DataFrame of predictors (its ``columns`` supply the feature names).
    y : target values passed to ``RFE.fit``.
    n : number of features to keep.

    Returns
    -------
    list of column labels from ``X`` retained by an ``RFE`` wrapped around a
    ``LinearRegression`` estimator, in the column order of ``X``.
    """
    # local name deliberately differs from the function name to avoid shadowing it
    eliminator = RFE(estimator=LinearRegression(), n_features_to_select=n)
    eliminator.fit(X, y)
    kept_mask = eliminator.get_support()
    return X.columns[kept_mask].tolist()

In [38]:
# should match the manual RFE result for the tip_percentage target above
rfe(X_train, y_train, 2)

['size_scaled', 'price_per_person_scaled']

In [39]:
def show_rfe_feature_ranking(X, y):
    """Rank every feature in X using recursive feature elimination.

    RFE is run down to a single remaining feature around a
    ``LinearRegression`` estimator, and its ``ranking_`` values are paired
    with the column names of ``X``.

    Parameters
    ----------
    X : DataFrame of predictors.
    y : target values passed to ``RFE.fit``.

    Returns
    -------
    pandas Series indexed by column name holding each feature's rank,
    sorted in ascending order.
    """
    eliminator = RFE(estimator=LinearRegression(), n_features_to_select=1).fit(X, y)
    return pd.Series(eliminator.ranking_, index=X.columns).sort_values()

In [40]:
# full RFE ordering of the tips features for the tip_percentage target (1 = last feature standing)
show_rfe_feature_ranking(X_train, y_train)

price_per_person_scaled     1
size_scaled                 2
total_bill_scaled           3
day_Sun                     4
smoker                      5
male                        6
day_Sat                     7
day_Fri                     8
day_Thur                    9
dinner                     10
dtype: int64

## 4.

Load the `swiss` dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [41]:
# load the `swiss` dataset through pydataset's `data` loader
swiss = data('swiss')

In [42]:
# preview the columns: Fertility is the target, the rest are candidate features
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [43]:
# replace the dot in the column name so it is a valid Python identifier
swiss = swiss.rename(columns={'Infant.Mortality' : 'Infant_Mortality'})

In [44]:
# split data: hold out 20% as test, then take 25% of the remaining 80%
# (20% of the full data) as validate, which leaves 60% for train
train_validate, test = train_test_split(swiss, random_state=123, test_size=0.20)
train, validate = train_test_split(train_validate, random_state=123, test_size=.25)

In [45]:
# (rows, columns) of each split, in train / validate / test order
train.shape, validate.shape, test.shape

((27, 6), (10, 6), (10, 6))

In [46]:
# add MinMax-scaled copies (suffix `_scaled`) of every predictor to each split
# NOTE(review): add_scaled_columns lives in wrangle (not shown) — confirm it fits the scaler on train only
predictor_cols = ['Agriculture', 'Examination', 'Education', 'Catholic', 'Infant_Mortality']
train_scaled, validate_scaled, test_scaled = w.add_scaled_columns(
    train, validate, test, MinMaxScaler(), predictor_cols
)

In [47]:
# confirm the five *_scaled columns were appended next to the original columns
train_scaled.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant_Mortality,Agriculture_scaled,Examination_scaled,Education_scaled,Catholic_scaled,Infant_Mortality_scaled
Rive Droite,44.7,46.6,16,29,50.43,18.2,0.47439,0.40625,0.903226,0.492786,0.316327
Aubonne,66.9,67.5,14,7,2.27,19.1,0.729268,0.34375,0.193548,0.0,0.408163
Rolle,60.5,60.8,16,10,7.72,16.3,0.647561,0.40625,0.290323,0.055766,0.122449
Lavaux,65.1,73.0,19,9,2.84,20.0,0.796341,0.5,0.258065,0.005832,0.5
Nyone,56.6,50.9,22,12,15.14,16.7,0.526829,0.59375,0.354839,0.131689,0.163265


In [48]:
# Build the feature matrix (X) and target series (y) for each split; target = Fertility.
# X keeps only the *_scaled columns: the target and the unscaled originals are dropped.
unscaled_cols = ['Agriculture', 'Examination', 'Education', 'Catholic', 'Infant_Mortality']
non_feature_cols = ['Fertility'] + unscaled_cols

X_train, y_train = train_scaled.drop(columns=non_feature_cols), train_scaled.Fertility
X_validate, y_validate = validate_scaled.drop(columns=non_feature_cols), validate_scaled.Fertility
X_test, y_test = test_scaled.drop(columns=non_feature_cols), test_scaled.Fertility

In [49]:
# only the five *_scaled predictor columns should remain
X_train.head()

Unnamed: 0,Agriculture_scaled,Examination_scaled,Education_scaled,Catholic_scaled,Infant_Mortality_scaled
Rive Droite,0.47439,0.40625,0.903226,0.492786,0.316327
Aubonne,0.729268,0.34375,0.193548,0.0,0.408163
Rolle,0.647561,0.40625,0.290323,0.055766,0.122449
Lavaux,0.796341,0.5,0.258065,0.005832,0.5
Nyone,0.526829,0.59375,0.354839,0.131689,0.163265


In [50]:
# top 3 features for Fertility according to SelectKBest (f_regression)
select_kbest(X_train, y_train, 3)

['Examination_scaled', 'Catholic_scaled', 'Infant_Mortality_scaled']

In [51]:
# top 3 features for Fertility according to RFE around a linear regression
rfe(X_train, y_train, 3)

['Examination_scaled', 'Education_scaled', 'Infant_Mortality_scaled']

In [52]:
# full RFE ordering of the swiss features (1 = last feature standing)
show_rfe_feature_ranking(X_train, y_train)

Examination_scaled         1
Infant_Mortality_scaled    2
Education_scaled           3
Agriculture_scaled         4
Catholic_scaled            5
dtype: int64

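**The two methods agree on `Examination_scaled` and `Infant_Mortality_scaled`, but differ on the third feature: SelectKBest picks `Catholic_scaled` while RFE picks `Education_scaled`**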