# Feature Engineering

In this lesson we discuss some methods for *automated* feature engineering, specifically feature selection.

While these methods can produce useful results, they are but a single piece of of the feature engineering puzzle.

## Setup

In [3]:
import pandas as pd
import numpy as np
import pydataset

from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

tips = pydataset.data('tips')
tips['smoker'] = (tips.smoker == 'Yes').astype(int)
tips['dinner'] = (tips.time == 'Dinner').astype(int)

In [4]:
X = tips[['total_bill', 'size', 'smoker', 'dinner']]
y = tips.tip

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=123)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Select K Best

- uses an f regression test
- looks at each feature in isolation
- is a model with that feature better than no model at all

In [9]:
kbest = SelectKBest(f_regression, k=1)   #k=1 selects top  features to apply on dataset 
kbest.fit(X_train_scaled, y_train)

#tests to see if any coefficients are of linear value 
#looks at every feature in dataset against the target variable. is this target predicitive etc...

SelectKBest(k=1, score_func=<function f_regression at 0x7f8f5a843af0>)

In [11]:
kbest.get_support()     #an array indicates which of the columns were chosen for k-best 

array([ True, False, False, False])

In [12]:
X_train.columns[kbest.get_support()] #grabs the cols, and tells us which were our top two features/cols

Index(['total_bill'], dtype='object')

In [13]:
X_kbest = kbest.transform(X_train_scaled) #effectively drops the cols that were not chosen
X_kbest.shape

(195, 1)

<div style="border: 1px solid black; border-radius: 3px; background: palegreen; padding: .5em 1em;">
    <p style="font-size: 1.3em; font-weight: bold">Mini Exercise</p>
    <ol>
        <li>Use <code>pydataset</code> to load the <code>swiss</code> dataset.</li>
        <li>Split the swiss dataset into X and y, and train and test. The goal is to predict <code>Fertility</code>.</li>
        <li>Use <code>SelectKBest</code> to find the top 3 features that predict fertility in the swiss data set.</li>
    </ol>
</div>

In [15]:
swiss_data = pydataset.data('swiss')

In [16]:
def split_data(df):
    '''
    take in a DataFrame and return train, validate, and test DataFrames; stratify on species.
    return train, validate, test DataFrames.
    '''
    
    # splits df into train_validate and test using train_test_split() stratifying on species to get an even mix of each species
    train_validate, test = train_test_split(df, test_size=.2, random_state=123, stratify=df.species)
    
    # splits train_validate into train and validate using train_test_split() stratifying on species to get an even mix of each species
    train, validate = train_test_split(train_validate, 
                                       test_size=.3, 
                                       random_state=123, 
                                       stratify=train_validate.species)
    return train, validate, test

In [17]:
swiss_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47 entries, Courtelary to Rive Gauche
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Fertility         47 non-null     float64
 1   Agriculture       47 non-null     float64
 2   Examination       47 non-null     int64  
 3   Education         47 non-null     int64  
 4   Catholic          47 non-null     float64
 5   Infant.Mortality  47 non-null     float64
dtypes: float64(4), int64(2)
memory usage: 2.6+ KB


In [18]:
X = swiss_data[['Fertility', 'Agriculture', 'Examination', 'Education', 'Catholic', 'Infant.Mortality']]

y = swiss_data['Fertility']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=123)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [19]:
kbest = SelectKBest(f_regression, k=3)   #k=1 selects top  features to apply on dataset 
kbest.fit(X_train_scaled, y_train)

#tests to see if any coefficients are of linear value 
#looks at every feature in dataset against the target variable. is this target predicitive etc...

SelectKBest(k=3, score_func=<function f_regression at 0x7f8f5a843af0>)

In [20]:
kbest.get_support()

array([False, False,  True,  True,  True, False])

In [21]:
#get support for columns provided 
X_train.columns[kbest.get_support()]

Index(['Examination', 'Education', 'Catholic'], dtype='object')

In [22]:
X_kbest = kbest.transform(X_train_scaled) #effectively drops the cols that were not chosen
X_kbest.shape

#Top three features chose Examination, Education, Catholic 

(37, 3)

## Recursive Feature Elimination

- Fits a model and eliminates the worst performing features
- More computationally expensive
- Looks at all the features together

In [26]:

X = tips[['total_bill', 'size', 'smoker', 'dinner']]
y = tips.tip

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=123)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [27]:
#takes a look at all thefeatures together instead of looking at them in isolation in comparison to other features

rfe = RFE(estimator=LinearRegression(), n_features_to_select=3)

rfe.fit(X_train_scaled, y_train)
rfe.get_support()

array([ True,  True,  True, False])

In [29]:
X_train.columns[rfe.get_support()]

Index(['total_bill', 'size', 'smoker'], dtype='object')

In [30]:
pd.Series(rfe.ranking_, index=X_train.columns)

#which features are selected and what their individual ranking are


total_bill    1
size          1
smoker        1
dinner        2
dtype: int64

<div style="border: 1px solid black; border-radius: 3px; background: palegreen; padding: .5em 1em;">
    <p style="font-size: 1.3em; font-weight: bold">Mini Exercise</p>
    <ol>
        <li>Use <code>RFE</code> and <code>LinearRegression</code> to find the top 3 features that predict fertility in the swiss data set.</li>
        <li>Are the results different than what select k best gave you?</li>
    </ol>
</div>

In [31]:
swiss_data = pydataset.data('swiss')

In [32]:
def split_data(df):
    '''
    take in a DataFrame and return train, validate, and test DataFrames; stratify on species.
    return train, validate, test DataFrames.
    '''
    
    # splits df into train_validate and test using train_test_split() stratifying on species to get an even mix of each species
    train_validate, test = train_test_split(df, test_size=.2, random_state=123, stratify=df.species)
    
    # splits train_validate into train and validate using train_test_split() stratifying on species to get an even mix of each species
    train, validate = train_test_split(train_validate, 
                                       test_size=.3, 
                                       random_state=123, 
                                       stratify=train_validate.species)
    return train, validate, test

In [37]:
X = swiss_data[['Fertility', 'Agriculture', 'Examination', 'Education', 'Catholic', 'Infant.Mortality']]

y = swiss_data.drop(columns='Fertility')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=123)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [38]:
rfe = RFE(estimator=LinearRegression(), n_features_to_select=3)

rfe.fit(X_train_scaled, y_train)
rfe.get_support()

array([False,  True, False,  True,  True, False])

In [39]:
X_train.columns[rfe.get_support()]

Index(['Agriculture', 'Education', 'Catholic'], dtype='object')

In [40]:
pd.Series(rfe.ranking_, index=X_train.columns)

#which features are selected and what their individual ranking are

Fertility           4
Agriculture         1
Examination         2
Education           1
Catholic            1
Infant.Mortality    3
dtype: int64

In [None]:
#RFE looks at all of the features and how they interact with one another, more computationally expensive
