In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

import env
import wrangle
import explore
import evaluate

import warnings
warnings.filterwarnings("ignore")

from pydataset import data

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, f_regression, RFE

### 1. Tips Data

In [2]:
df = data('tips')

In [3]:
df = df.rename(columns={'size':'party_size'})

#### A. Create a column named price_per_person. This should be the total bill divided by the party size.

In [4]:
df['price_per_person'] = df.total_bill / df.party_size

In [5]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,party_size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,6.1475


#### B. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? 

total_bill, party_size, and price_per_person

In [6]:
# identify target
target = 'tip'

# identify quantitative features
quant_features = [col for col in df.columns if (df[col].dtype != 'object') & (col != target)]

# identify categorical features
categ_features = [col for col in df.columns if (df[col].dtype == 'object') & (col != target)]

In [7]:
# encode categorical features
for feature in categ_features:
    dummy_df = pd.get_dummies(df[feature],
                              prefix=f'enc_{df[feature].name}',
                              drop_first=True)
    df = pd.concat([df, dummy_df], axis=1)
    
# identify encoded features
enc_features = [col for col in df.columns if col.startswith('enc_')]

In [8]:
quant_features

['total_bill', 'party_size', 'price_per_person']

In [9]:
train, test, validate = wrangle.train_test_validate_split(df)

train	 n = 136
test	 n = 49
validate n = 59


In [10]:
# scale quantitative features

# establish empty dataframes for storing scaled dataset
train_scaled = pd.DataFrame(index=train.index)
validate_scaled = pd.DataFrame(index=validate.index)
test_scaled = pd.DataFrame(index=test.index)

# screate and fit the scaler
scaler = MinMaxScaler().fit(train[quant_features])

# adding scaled features to scaled dataframes
train_scaled[quant_features] = scaler.transform(train[quant_features])
validate_scaled[quant_features] = scaler.transform(validate[quant_features])
test_scaled[quant_features] = scaler.transform(test[quant_features])

# add 'scaled' prefix to columns
for feature in quant_features:
    train_scaled = train_scaled.rename(columns={feature: f'scaled_{feature}'})
    validate_scaled = validate_scaled.rename(columns={feature: f'scaled_{feature}'})
    test_scaled = test_scaled.rename(columns={feature: f'scaled_{feature}'})
    
# concat scaled features to original train, validate, test df's
train = pd.concat([train, train_scaled], axis=1)
validate = pd.concat([validate, validate_scaled], axis=1)
test = pd.concat([test, test_scaled], axis=1)

#identify scaled features
scaled_features = [col for col in train.columns if col.startswith('scaled_')]

In [11]:
# divide samples into x and y using only model-appropriate (encoded and scaled) features for x

x_train = train[scaled_features + enc_features]
y_train = train[target]

x_validate = validate[scaled_features + enc_features]
y_validate = validate[target]

x_test = test[scaled_features + enc_features]
y_test = test[target]

#### C. Use select k best to select the top 2 features for predicting tip amount. What are they?

In [12]:
selector = SelectKBest(f_regression, k=2)
selector.fit(x_train, y_train)
feature_mask = selector.get_support()
k_best_features = x_train.iloc[:,feature_mask].columns.tolist()
k_best_features

['scaled_total_bill', 'scaled_party_size']

#### D. Use recursive feature elimination to select the top 2 features for tip amount. What are they?

In [13]:
rfe = RFE(LinearRegression(), n_features_to_select=2)
x_rfe = rfe.fit_transform(x_train, y_train)
feature_mask = rfe.support_
rfe_features = x_train.loc[:,feature_mask].columns.to_list()
rfe_features

['scaled_total_bill', 'scaled_price_per_person']

#### E. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features your are selecting?

SelectKBest does not take into account the way features interact with each other, and only selects the features that are most independently correlated with the target. In contrast, RFE examines model performance with different combinations of features, which does take into account the features interactions with each other. It is for this reason that these two methods may sometimes produce different results. 

When parameters are changed to select the 3 best features, both methods produce the same 3 features. (see below)

In [14]:
# SelectKBest
selector = SelectKBest(f_regression, k=3)
selector.fit(x_train, y_train)
feature_mask = selector.get_support()
k_best_features = x_train.iloc[:,feature_mask].columns.tolist()

# RFE
rfe = RFE(LinearRegression(), n_features_to_select=3)
x_rfe = rfe.fit_transform(x_train, y_train)
feature_mask = rfe.support_
rfe_features = x_train.loc[:,feature_mask].columns.to_list()

k_best_features, rfe_features

(['scaled_total_bill', 'scaled_party_size', 'scaled_price_per_person'],
 ['scaled_total_bill', 'scaled_party_size', 'scaled_price_per_person'])

### 2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.


In [15]:
from sklearn.feature_selection import SelectKBest, f_regression

def select_kbest(x, y, k):
    selector = SelectKBest(f_regression, k=k)
    selector.fit(x, y)
    feature_mask = selector.get_support()
    k_best_features = x_train.iloc[:,feature_mask].columns.tolist()
    return k_best_features

In [16]:
select_kbest(x_train, y_train, 2)

['scaled_total_bill', 'scaled_party_size']

### 3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.


In [17]:
from sklearn.feature_selection import RFE

def rfe(x, y, k):
    rfe = RFE(LinearRegression(), n_features_to_select=k)
    x_rfe = rfe.fit_transform(x_train, y_train)
    feature_mask = rfe.support_
    rfe_features = x_train.loc[:,feature_mask].columns.to_list()
    return rfe_features

In [18]:
rfe(x_train, y_train, 2)

['scaled_total_bill', 'scaled_price_per_person']

### 4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).


In [19]:
df = data('swiss')

In [20]:
for col in df.columns:
    df = df.rename(columns={col: col.lower().replace('.', '_')})

In [21]:
df.head()

Unnamed: 0,fertility,agriculture,examination,education,catholic,infant_mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [22]:
target = 'fertility'
quant_features = [col for col in df.columns if (df[col].dtype != 'object') & (col != 'target')]

In [23]:
train, test, validate = wrangle.train_test_validate_split(df)

train	 n = 25
test	 n = 10
validate n = 12


In [24]:
# scale quantitative features

# establish empty dataframes for storing scaled dataset
train_scaled = pd.DataFrame(index=train.index)
validate_scaled = pd.DataFrame(index=validate.index)
test_scaled = pd.DataFrame(index=test.index)

# screate and fit the scaler
scaler = MinMaxScaler().fit(train[quant_features])

# adding scaled features to scaled dataframes
train_scaled[quant_features] = scaler.transform(train[quant_features])
validate_scaled[quant_features] = scaler.transform(validate[quant_features])
test_scaled[quant_features] = scaler.transform(test[quant_features])

# add 'scaled' prefix to columns
for feature in quant_features:
    train_scaled = train_scaled.rename(columns={feature: f'scaled_{feature}'})
    validate_scaled = validate_scaled.rename(columns={feature: f'scaled_{feature}'})
    test_scaled = test_scaled.rename(columns={feature: f'scaled_{feature}'})
    
# concat scaled features to original train, validate, test df's
train = pd.concat([train, train_scaled], axis=1)
validate = pd.concat([validate, validate_scaled], axis=1)
test = pd.concat([test, test_scaled], axis=1)

#identify scaled features
scaled_features = [col for col in train.columns if col.startswith('scaled_')]

In [25]:
x_train = train[scaled_features]
y_train = train[target]

x_validate = validate[scaled_features]
y_validate = validate[target]

x_test = test[scaled_features]
y_test = test[target]

In [26]:
select_kbest(x_train, y_train, 3)

['scaled_examination', 'scaled_education', 'scaled_infant_mortality']

In [27]:
rfe(x_train, y_train, 3)

['scaled_fertility', 'scaled_catholic', 'scaled_infant_mortality']