In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import pydataset

from sklearn.feature_selection import f_regression, SelectKBest, RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [2]:
tips=pydataset.data('tips')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


- Create a column named price_per_person. This should be the total bill divided by the party size.
- Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?

In [3]:
tips['ppp'] = tips.total_bill/tips.size
tips['tip_rate'] = tips.tip/tips.total_bill
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,ppp,tip_rate
1,16.99,1.01,Female,No,Sun,Dinner,2,0.009947,0.059447
2,10.34,1.66,Male,No,Sun,Dinner,3,0.006054,0.160542
3,21.01,3.5,Male,No,Sun,Dinner,3,0.012301,0.166587
4,23.68,3.31,Male,No,Sun,Dinner,2,0.013864,0.13978
5,24.59,3.61,Female,No,Sun,Dinner,4,0.014397,0.146808


- Use select k best to select the top 2 features for predicting tip amount. What are they?
- Use recursive feature elimination to select the top 2 features for tip amount. What are they?

In [4]:
X = tips[['size', 'total_bill','ppp']]
y = tips.tip

In [5]:
kbest=SelectKBest(f_regression, k=2)
kbest.fit(X,y)
print('Top 2 features per K-best:')
X.columns[kbest.get_support()]

Top 2 features per K-best:


Index(['total_bill', 'ppp'], dtype='object')

#### Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features your are selecting?
- kbest doesn't take into account how features interact
- Yes as the number of features will change the rate of selection

#### Write a function named `select_kbest` that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.


In [6]:
def select_kbest(X, y , k):
    kbest = SelectKBest(k=k)
    kbest.fit(X, y)
    mask = kbest.get_support()
    return X.columns[mask]

In [7]:
select_kbest(tips[['total_bill', 'ppp','size']], tips.tip, 2)

Index(['total_bill', 'ppp'], dtype='object')

#### Write a function named `rfe` that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [8]:
def rfe( X, y ,k, model=LinearRegression()):
    rfe = RFE(model, n_features_to_select = k)
    rfe.fit(X, y)
    mask = rfe.get_support()
    return X.columns[mask]

In [9]:
rfe(tips[['total_bill', 'ppp','size']], tips.tip, 2)

Index(['total_bill', 'size'], dtype='object')

#### Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [10]:
from pydataset import data
from wrangle import split_data
swiss = data('swiss')

In [11]:
train, validate, test = split_data(swiss)

Data split as follows: Train 56.00%, Validate 24.00%, Test 20.00%


In [12]:
train = train.reset_index()

In [13]:
train.columns

Index(['index', 'Fertility', 'Agriculture', 'Examination', 'Education',
       'Catholic', 'Infant.Mortality'],
      dtype='object')

In [14]:
# Scale the data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [15]:
scaled_data = scaler.fit_transform(train[['Agriculture', 'Examination', 'Education', 'Catholic',
       'Infant.Mortality']])
scaled_data_df = pd.DataFrame(data = scaled_data, columns = ['Agriculture_scaled', 'Examination_scaled', 'Education_scaled', 'Catholic_scaled',
       'Infant.Mortality_scaled'])

In [16]:
train = pd.concat([train, scaled_data_df], axis = 1)

In [17]:
X = train[['Agriculture_scaled', 'Examination_scaled', 'Education_scaled', 'Catholic_scaled',
       'Infant.Mortality_scaled']]
y = train.Fertility

In [18]:
X.head()

Unnamed: 0,Agriculture_scaled,Examination_scaled,Education_scaled,Catholic_scaled,Infant.Mortality_scaled
0,0.647561,0.40625,0.290323,0.054508,0.122449
1,0.796341,0.5,0.258065,0.004508,0.5
2,0.526829,0.59375,0.354839,0.130533,0.163265
3,0.953659,0.0,0.032258,0.997029,0.0
4,0.509756,0.375,0.225806,0.03791,0.755102


In [19]:
select_kbest(X, y, 3)

Index(['Education_scaled', 'Catholic_scaled', 'Infant.Mortality_scaled'], dtype='object')

In [20]:
rfe(X,y,3)

Index(['Agriculture_scaled', 'Examination_scaled', 'Infant.Mortality_scaled'], dtype='object')

### Validating against non-scaled items

In [22]:
X = train[['Agriculture', 'Examination', 'Education', 'Catholic',
       'Infant.Mortality']]
select_kbest(X, y, 3)

Index(['Education', 'Catholic', 'Infant.Mortality'], dtype='object')

In [23]:
rfe(X,y,3)

Index(['Agriculture', 'Examination', 'Infant.Mortality'], dtype='object')

### Takeaway:
1. Same results scaled as non-scaled
2. matplotlib needed for encoding