In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector
from sklearn.preprocessing import MinMaxScaler

from pydataset import data

# Feature Engineering Exercises

## 1

Load the tips dataset.

In [2]:
tips = data('tips')

### a

Create a column named price_per_person. This should be the total bill divided by the party size.

In [3]:
tips.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244 entries, 1 to 244
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 15.2+ KB


In [4]:
tips['price_per_person'] = tips['total_bill'] / tips['size']

In [5]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,6.1475


### b

Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?

I think total bill and party size would be the most important features for predicting tip amount.

### c

Use select k best to select the top 2 features for predicting tip amount. What are they?

In [6]:
tips.sex = tips.sex.map({'Male' : 0, 'Female' : 1})
tips.smoker = tips.smoker.map({'No' : 0, 'Yes' : 1})
tips.day = tips.day.map({'Thur' : 0, 'Fri' : 1, 'Sat' : 2, 'Sun' : 3})
tips.time = tips.time.map({'Lunch' : 0, 'Dinner' : 1})

In [7]:
train_validate, test = train_test_split(tips, test_size = 0.2)
train, validate = train_test_split(train_validate, test_size = 0.3)

scaler = MinMaxScaler()

columns = train.drop(columns = 'tip').columns

train[columns] = scaler.fit_transform(train[columns])
validate[columns] = scaler.transform(validate[columns])
test[columns] = scaler.transform(test[columns])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


In [8]:
kbest = SelectKBest(f_regression, k = 2)
kbest.fit(train.drop(columns = 'tip'), train.tip)

SelectKBest(k=2, score_func=<function f_regression at 0x7f7c11054b80>)

In [9]:
train.drop(columns = 'tip').columns[kbest.get_support()]

Index(['total_bill', 'size'], dtype='object')

### d

Use recursive feature elimination to select the top 2 features for tip amount. What are they?

In [10]:
model = LinearRegression()
rfe = RFE(model, n_features_to_select = 2)
rfe.fit(train.drop(columns = 'tip'), train.tip)

RFE(estimator=LinearRegression(), n_features_to_select=2)

In [11]:
pd.DataFrame({'rfe_ranking': rfe.ranking_}, index=train.drop(columns = 'tip').columns)

Unnamed: 0,rfe_ranking
total_bill,1
sex,6
smoker,4
day,5
time,3
size,2
price_per_person,1


In [12]:
train.drop(columns = 'tip').columns[rfe.get_support()]

Index(['total_bill', 'price_per_person'], dtype='object')

### e

Why do you think select k best and recursive feature elimination might give different answers for the top features?

RFE probably gives a different answer because it uses an actual regression model to determine which features are most important whereas select k best uses statistical testing.

Does this change as you change the number of features your are selecting?

In [13]:
kbest = SelectKBest(f_regression, k = 3)
kbest.fit(train.drop(columns = 'tip'), train.tip)
train.drop(columns = 'tip').columns[kbest.get_support()]

Index(['total_bill', 'size', 'price_per_person'], dtype='object')

In [14]:
rfe = RFE(model, n_features_to_select = 3)
rfe.fit(train.drop(columns = 'tip'), train.tip)
train.drop(columns = 'tip').columns[rfe.get_support()]

Index(['total_bill', 'size', 'price_per_person'], dtype='object')

They both select the same features when selecting the top 3.

In [15]:
kbest = SelectKBest(f_regression, k = 4)
kbest.fit(train.drop(columns = 'tip'), train.tip)
train.drop(columns = 'tip').columns[kbest.get_support()]

Index(['total_bill', 'time', 'size', 'price_per_person'], dtype='object')

In [16]:
rfe = RFE(model, n_features_to_select = 4)
rfe.fit(train.drop(columns = 'tip'), train.tip)
train.drop(columns = 'tip').columns[rfe.get_support()]

Index(['total_bill', 'time', 'size', 'price_per_person'], dtype='object')

They both select the same features when selecting the top 4.

## 2

Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [17]:
def select_kbest(X, y, k):
    kbest = SelectKBest(f_regression, k = k)
    kbest.fit(X, y)
    return list(X.columns[kbest.get_support()])

In [18]:
select_kbest(train.drop(columns = 'tip'), train.tip, 2)

['total_bill', 'size']

## 3

Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [19]:
def rfe(X, y, k):
    model = LinearRegression()
    rfe = RFE(model, n_features_to_select = k)
    rfe.fit(X, y)
    return list(X.columns[rfe.get_support()])

In [20]:
rfe(train.drop(columns = 'tip'), train.tip, 2)

['total_bill', 'price_per_person']

## 4

Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [21]:
swiss = data('swiss')
swiss.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47 entries, Courtelary to Rive Gauche
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Fertility         47 non-null     float64
 1   Agriculture       47 non-null     float64
 2   Examination       47 non-null     int64  
 3   Education         47 non-null     int64  
 4   Catholic          47 non-null     float64
 5   Infant.Mortality  47 non-null     float64
dtypes: float64(4), int64(2)
memory usage: 2.6+ KB


In [22]:
train_validate, test = train_test_split(swiss, test_size = 0.2)
train, validate = train_test_split(train_validate, test_size = 0.3)

In [23]:
select_kbest(train.drop(columns = 'Fertility'), train.Fertility, 3)

['Examination', 'Education', 'Catholic']

In [24]:
rfe(train.drop(columns = 'Fertility'), train.Fertility, 3)

['Examination', 'Education', 'Infant.Mortality']