# Feature Engineering

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

## 1. Load the `tips` dataset.

In [2]:
# Load the 'tips' dataset through pydataset and preview the first rows.
import pydataset
df = pydataset.data('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


### a. Create a column named tip_percentage. This should be the tip amount divided by the total bill.

In [3]:
# Tip as a fraction of the bill (0.15 means a 15% tip).
df = df.assign(tip_percentage=df['tip'] / df['total_bill'])
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
2,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
3,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
4,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
5,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


### b. Create a column named price_per_person. This should be the total bill divided by the party size.

In [4]:
# Bill split evenly across the party. 'size' is accessed with brackets
# because df.size is the DataFrame attribute, not the column.
df = df.assign(price_per_person=df['total_bill'] / df['size'])
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,0.160542,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,0.166587,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,0.13978,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,0.146808,6.1475


### c. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?

total bill

### d. Use all the other numeric features to predict tip amount. Use select k best and recursive feature elimination to select the top 2 features. What are they?

In [5]:
# Split data: 80% of the rows go to train and the remainder to test.
# random_state is fixed so the split is the same on every run.
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, train_size = .8, random_state = 123)


In [6]:
# Target is the tip amount; predictors are the remaining numeric columns.
# y_train is kept as a one-column DataFrame.
y_train = train[['tip']]
X_train = train.loc[:, ['total_bill', 'size', 'tip_percentage', 'price_per_person']]
X_train.head()

Unnamed: 0,total_bill,size,tip_percentage,price_per_person
24,39.42,4,0.192288,9.855
191,15.69,2,0.095602,7.845
210,12.76,2,0.174765,6.38
11,10.27,2,0.166504,5.135
197,10.34,2,0.193424,5.17


> Using SelectKBest

In [7]:
from sklearn.feature_selection import SelectKBest, f_regression

In [8]:
# Build the selector: f_regression is the scoring test applied to each
# feature, and k is how many of the top-scoring features to keep.
f_selector = SelectKBest(score_func=f_regression, k=2)

In [9]:
# Fit the object to our data.
# In doing this, our selector is scoring, ranking, and identifying the top k features.
# y_train is the one-column DataFrame holding the target chosen above.
f_selector.fit(X_train, y_train)

SelectKBest(k=2, score_func=<function f_regression at 0x1a16406290>)

In [10]:
# Transform our dataset to reduce it to the k best features.
# X_reduced is not used by the later cells shown here; the feature names are
# recovered from the selector's support mask instead.
X_reduced = f_selector.transform(X_train)


In [11]:
# Boolean mask over X_train's columns: True marks a selected feature.
f_support = f_selector.get_support()

print(f_support) 

[ True  True False False]


In [12]:
# Index the column labels directly with the boolean mask, then convert the
# surviving labels to a plain Python list.
f_feature = X_train.columns[f_support].tolist()

print(f"{len(f_feature)} selected features")
print(f_feature)

2 selected features
['total_bill', 'size']


> Using recursive feature elimination

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

In [14]:
# Initialize the linear regression object (the estimator RFE will fit repeatedly)
lm = LinearRegression()

In [15]:
# Initialize the RFE object, keeping the 2 best features.
# n_features_to_select is passed by keyword: it is keyword-only in current
# scikit-learn releases, where the positional form RFE(lm, 2) raises TypeError.
rfe = RFE(lm, n_features_to_select=2)

In [16]:
# Fit RFE on the training data and transform X_train in one call.
# The fitted selector (rfe) is what the following cells read the mask from.
X_rfe = rfe.fit_transform(X_train,y_train)  

In [17]:
# Column mask taken from the fitted selector; used below to pick column names.
mask = rfe.support_

In [18]:
# Names of the columns the mask keeps, as a plain list.
rfe_features = X_train.columns[mask].tolist()

In [19]:
# Report how many features were kept, then which ones.
print(f"{len(rfe_features)} selected features")
print(rfe_features)

2 selected features
['total_bill', 'tip_percentage']


### e. Use all the other numeric features to predict tip percentage. Use select k best and recursive feature elimination to select the top 2 features. What are they?

> Using SelectKBest

In [20]:
# Target is now tip_percentage; predictors are the other numeric columns.
# This rebinds X_train / y_train from the previous section.
y_train = train[['tip_percentage']]
X_train = train.loc[:, ['total_bill', 'size', 'tip', 'price_per_person']]
X_train.head()

Unnamed: 0,total_bill,size,tip,price_per_person
24,39.42,4,7.58,9.855
191,15.69,2,1.5,7.845
210,12.76,2,2.23,6.38
11,10.27,2,1.71,5.135
197,10.34,2,2.0,5.17


In [21]:
# Initialize the selector: score with f_regression, keep the top 2 features
f_selector = SelectKBest(score_func=f_regression, k=2)

In [22]:
# Fit the selector on the training data and reduce X_train to the selected
# columns in a single call.
X_reduced = f_selector.fit_transform(X_train, y_train)

In [23]:
# Boolean mask of the kept columns, then the matching column names.
f_support = f_selector.get_support()
f_feature = X_train.columns[f_support].tolist()

print(f"{len(f_feature)} selected features")
print(f_feature)

2 selected features
['tip', 'price_per_person']


> RFE

In [24]:
# Initialize the linear regression object
lm = LinearRegression()
# Initialize the RFE object. n_features_to_select is passed by keyword: it is
# keyword-only in current scikit-learn, where RFE(lm, 2) raises TypeError.
rfe = RFE(lm, n_features_to_select=2)
# Fit and transform data using RFE
X_rfe = rfe.fit_transform(X_train, y_train)
# Make a column mask
mask = rfe.support_
# Use the mask to select columns
rfe_features = X_train.loc[:, mask].columns.tolist()

print(str(len(rfe_features)), 'selected features')
print(rfe_features)

2 selected features
['size', 'tip']


### f. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

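They use different methods, so the selected features can differ. SelectKBest scores each feature on its own with a univariate test (here `f_regression`) and keeps the k highest scores. RFE instead fits a model (here `LinearRegression`) on all features, drops the weakest one, and repeats, so each feature is judged in the context of the others.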

## 2. Write a function named `select_kbest`
that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [25]:
def select_kbest(X, y, k):
    """Return the names of the top k features chosen by SelectKBest.

    Features are scored with f_regression.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictors; the column labels are what gets returned.
    y : array-like
        Target values, passed straight to the selector's fit.
    k : int
        Number of features to select.

    Returns
    -------
    list
        Column labels of the k selected features, in X's column order.
    """
    # k is passed by keyword: it is keyword-only in current scikit-learn,
    # where the positional form SelectKBest(f_regression, k) raises TypeError.
    f_selector = SelectKBest(score_func=f_regression, k=k)
    # Only the fitted mask is needed, so fit without transforming.
    f_selector.fit(X, y)
    # get_support() is a boolean mask aligned with X's columns.
    f_support = f_selector.get_support()
    return X.columns[f_support].tolist()
    

In [26]:
# X_train / y_train are still the tip_percentage predictors and target defined
# earlier, so this should match the manual SelectKBest result for that target.
select_kbest(X_train, y_train, 2)

['tip', 'price_per_person']

## 3. Write a function named `rfe` 
that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [27]:
def rfe(X, y, k):
    """Return the names of the k features kept by recursive feature elimination.

    RFE is run with a LinearRegression estimator.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictors; the column labels are what gets returned.
    y : array-like
        Target values, passed straight to the selector's fit.
    k : int
        Number of features to keep.

    Returns
    -------
    list
        Column labels of the k retained features, in X's column order.
    """
    lm = LinearRegression()
    # The local is named `selector` so it does not shadow this function's own
    # name. n_features_to_select is passed by keyword: it is keyword-only in
    # current scikit-learn, where the positional form RFE(lm, k) raises TypeError.
    selector = RFE(lm, n_features_to_select=k)
    # Only the fitted mask is needed, so fit without transforming.
    selector.fit(X, y)
    # support_ is a boolean mask aligned with X's columns.
    mask = selector.support_
    return X.columns[mask].tolist()

In [28]:
# `rfe` now refers to the function defined above (the name previously held an
# RFE instance). Same tip_percentage X_train / y_train as the manual run.
rfe(X_train, y_train, 2)

['size', 'tip']

## 4. Load the `swiss` dataset 
and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [29]:
# Load the 'swiss' dataset. Note this rebinds `df`, which previously held the
# tips data; the earlier train/test frames are separate objects.
df = pydataset.data('swiss')
df.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [30]:
# Target is Fertility (kept as a one-column DataFrame); predictors are every
# other column.
y = df[['Fertility']]
X = df.drop(columns=['Fertility'])

In [31]:
# SelectKBest: top 3 predictors of Fertility, using the helper defined above
select_kbest(X, y, 3)

['Examination', 'Education', 'Catholic']

In [32]:
# Recursive feature elimination: top 3 predictors of Fertility, using the
# rfe helper defined above
rfe(X, y, 3)

['Examination', 'Education', 'Infant.Mortality']