# Exercise (Feature Engineering)

Do your work for this exercise in a jupyter notebook named feature_engineering within the regression-exercises repo. Add, commit, and push your work.

## 1. 

Load the tips dataset.



In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from wrangle import check_columns, box_plotter
from prepare import standard_scaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

In [2]:
tips = sns.load_dataset('tips')

check_columns(tips)

Unnamed: 0,Column Name,Number of Unique Values,Unique Values,Number of Null Values,Proportion of Null Values,dtype
0,total_bill,229,"[16.99, 10.34, 21.01, 23.68, 24.59, 25.29, 8.7...",0,0.0,float64
1,tip,123,"[1.01, 1.66, 3.5, 3.31, 3.61, 4.71, 2.0, 3.12,...",0,0.0,float64
2,sex,2,"['Female', 'Male'] Categories (2, object): ['M...",0,0.0,category
3,smoker,2,"['No', 'Yes'] Categories (2, object): ['Yes', ...",0,0.0,category
4,day,4,"['Sun', 'Sat', 'Thur', 'Fri'] Categories (4, o...",0,0.0,category
5,time,2,"['Dinner', 'Lunch'] Categories (2, object): ['...",0,0.0,category
6,size,6,"[2, 3, 4, 1, 6, 5]",0,0.0,int64


### 1a. 
 
Create a column named price_per_person. This should be the total bill divided by the party size.



In [3]:
tips['price_per_person']= tips['total_bill'] / tips['size']

tips['price_per_person'] = tips['price_per_person'].round(2)

In [4]:
# encode the categorical

# create an instance of the OneHotEncoder
encoder = OneHotEncoder(sparse=False)

# fit the encoder to the categorical variables in the dataframe
encoder.fit(tips[['sex', 'smoker', 'day', 'time']])

# transform the categorical variables into one-hot encoded features
one_hot_encoded = encoder.transform(tips[['sex', 'smoker', 'day', 'time']])

# create a new dataframe with the one-hot encoded features
one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(['sex', 'smoker', 'day', 'time']))

# concatenate the original dataframe with the one-hot encoded dataframe
tips = pd.concat([tips, one_hot_df], axis=1)

# drop the original categorical columns
tips.drop(['sex', 'smoker', 'day', 'time'], axis=1, inplace=True)



In [5]:
X_tips = tips.drop(columns=['tip'])
y_tips = tips['tip']

In [6]:
# Scale total_bill and price_per_person using StandardScaler

cols = ['total_bill', 'price_per_person']

standard_scaler(tips, cols)

Unnamed: 0,total_bill,tip,size,price_per_person,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
0,-0.314711,1.01,2,0.206929,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,-1.063235,1.66,3,-1.526068,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.137780,3.50,3,-0.305405,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.438315,3.31,2,1.358822,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,0.540745,3.61,4,-0.597677,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,1.040511,5.92,3,0.616109,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
240,0.832275,2.00,2,1.960557,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
241,0.324630,2.00,2,1.186898,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
242,-0.221287,1.75,2,0.351346,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


### 1b.

Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?



*I think size and total_bill will be most important, due to directly increasing the tip based on percentages.*

*I also think time and day will be important due to dinner typically being a larger meal and people typically going out for dinner on weekends.*

### 1c. 

Use Select K Best to select the top 2 features for predicting tip amount. What are they?



In [7]:
from sklearn.feature_selection import SelectKBest, f_regression

# parameters: f_regression stats test, give me 8 features
f_selector = SelectKBest(f_regression, k=2)

# find the top 8 X's correlated with y
f_selector.fit(X_tips, y_tips)

# boolean mask of whether the column was selected or not. 
feature_mask = f_selector.get_support()

# get list of top K features. 
f_feature = X_tips.iloc[:,feature_mask].columns.tolist()

f_feature

['total_bill', 'size']

### 1d. 

Use Recursive Feature Elimination to select the top 2 features for tip amount. What are they?



In [8]:
# initialize the ML algorithm
lm = LinearRegression()

# create the rfe object, indicating the ML object (lm) and the number of features I want to end up with. 
rfe = RFE(lm, n_features_to_select=2)

# fit the data using RFE
rfe.fit(X_tips,y_tips)  

# get the mask of the columns selected
feature_mask = rfe.support_

# get list of the column names. 
rfe_feature = X_tips.iloc[:,feature_mask].columns.tolist()

rfe_feature

['day_Fri', 'day_Sun']

In [9]:
# create the rfe object, indicating the ML object (lm) and the number of features I want to end up with. 
rfe = RFE(lm, n_features_to_select=10)

# fit the data using RFE
rfe.fit(X_tips,y_tips)  

# get the mask of the columns selected
feature_mask = rfe.support_

# get list of the column names. 
rfe_feature = X_tips.iloc[:,feature_mask].columns.tolist()

rfe_feature

['total_bill',
 'size',
 'price_per_person',
 'smoker_No',
 'smoker_Yes',
 'day_Fri',
 'day_Sat',
 'day_Sun',
 'day_Thur',
 'time_Lunch']

### 1e.

Why do you think Select K Best and Recursive Feature Elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

**`SelectKBest` might select features that have high individual scores but low importance when combined with other features**

**`RFE` might select features that have lower individual scores but higher importance when combined with other features.**

---

`SelectKBest` selects the top k features based on a univariate statistical test, such as the F-test or mutual information. It evaluates each feature independently and selects the k features with the highest scores.

`RFE` selects the top n features by recursively eliminating the least important features based on their coefficients or feature importance scores. It evaluates the features in a nested manner and selects the n features that contribute the most to the model's performance.

## 2.

Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [10]:
from sklearn.feature_selection import SelectKBest, f_regression

def select_kbest(X, y, k):
    """
    Selects the top k features based on the SelectKBest class and a univariate statistical test.
    Returns the names of the top k selected features.
    """
    # create an instance of the SelectKBest class
    selector = SelectKBest(score_func=f_regression, k=k)

    # fit the selector to the predictors and target
    selector.fit(X, y)

    # get the indices of the top k selected features
    top_k_indices = selector.get_support(indices=True)

    # get the names of the top k selected features
    top_k_features = X.columns[top_k_indices].tolist()

    return top_k_features

## 3.

Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top n features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [11]:
def rfe(X, y, n):
    """
    Selects the top n features based on the RFE class and a linear regression model.
    Returns the names of the top n selected features.
    """
    # create an instance of the RFE class
    estimator = LinearRegression()
    selector = RFE(estimator, n_features_to_select=n)

    # fit the selector to the predictors and target
    selector.fit(X, y)

    # get the indices of the top n selected features
    top_n_indices = selector.get_support(indices=True)

    # get the names of the top n selected features
    top_n_features = X.columns[top_n_indices].tolist()

    return top_n_features

## 4.

Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both Select K Best and Recursive Feature Elimination (use the functions you just built to help you out).

In [12]:
# Load swiss dataset

from pydataset import data

swiss = data('swiss')

In [13]:
check_columns(swiss)

Unnamed: 0,Column Name,Number of Unique Values,Unique Values,Number of Null Values,Proportion of Null Values,dtype
0,Fertility,46,"[80.2, 83.1, 92.5, 85.8, 76.9, 76.1, 83.8, 92.4, 82.4, 82.9, 87.1, 64.1, 66.9, 68.9, 6...",0,0.0,float64
1,Agriculture,47,"[17.0, 45.1, 39.7, 36.5, 43.5, 35.3, 70.2, 67.8, 53.3, 45.2, 64.5, 62.0, 67.5, 60.7, 6...",0,0.0,float64
2,Examination,22,"[15, 6, 5, 12, 17, 9, 16, 14, 21, 19, 22, 18, 26, 31, 20, 25, 3, 7, 13, 29, 35, 37]",0,0.0,int64
3,Education,19,"[12, 9, 5, 7, 15, 8, 13, 6, 2, 28, 20, 10, 3, 1, 19, 11, 32, 53, 29]",0,0.0,int64
4,Catholic,46,"[9.96, 84.84, 93.4, 33.77, 5.16, 90.57, 92.85, 97.16, 97.67, 91.38, 98.61, 8.52, 2.27,...",0,0.0,float64
5,Infant.Mortality,37,"[22.2, 20.2, 20.3, 20.6, 26.6, 23.6, 24.9, 21.0, 24.4, 24.5, 16.5, 19.1, 22.7, 18.7, 2...",0,0.0,float64


In [14]:
X_swiss = swiss.drop('Fertility', axis=1)
y_swiss = swiss['Fertility']

In [15]:
# No outliers, so we scale using StandardScaler

cols = ['Agriculture', 'Examination', 'Education', 'Catholic', 'Infant.Mortality']

X_swiss_scaled = standard_scaler(X_swiss, cols)

In [16]:
check_columns(X_swiss_scaled)

Unnamed: 0,Column Name,Number of Unique Values,Unique Values,Number of Null Values,Proportion of Null Values,dtype
0,Agriculture,47,"[-1.4980910547294009, -0.24744070328747875, -0.48777920498806154, -0.6302020208106293,...",0,0.0,float64
1,Examination,22,"[-0.1887046103263814, -1.3290196127272287, -1.4557212796606562, -0.5688096111266638, 0...",0,0.0,float64
2,Education,19,"[0.10736079119826608, -0.20801153294664063, -0.6285079651398495, -0.4182597490432451, ...",0,0.0,float64
3,Catholic,46,"[-0.7558104598091429, 1.0590752555328953, 1.26654616529956, -0.1787213988810722, -0.87...",0,0.0,float64
4,Infant.Mortality,37,"[0.78341570968399, 0.08934335614680794, 0.12404697382366753, 0.22815782685424507, 2.31...",0,0.0,float64


In [17]:
select_kbest(X_swiss_scaled, y_swiss, 3)

['Examination', 'Education', 'Catholic']

In [18]:
rfe(X_swiss_scaled, y_swiss, 3)

['Agriculture', 'Education', 'Catholic']

# Notes