In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, f_regression, RFE
import warnings
warnings.filterwarnings("ignore")
from pydataset import data

import split_scale

## Load the tips dataset

In [2]:
# Fetch the "tips" example dataset through seaborn and preview the first five rows.
df = sns.load_dataset(name='tips')
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


### Create a column named tip_percentage. This should be the tip amount divided by the total bill.

In [3]:
# Tip expressed as a fraction of the bill (e.g. 0.15 is a 15% tip).
df['tip_percentage'] = df['tip'] / df['total_bill']
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


### Create a column named price_per_person. This should be the total bill divided by the party size.

In [4]:
# Bracket access is required for the party-size column: `df.size` is the
# DataFrame attribute (total element count), not the 'size' column.
df['price_per_person'] = df['total_bill'] / df['size']
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447,8.495
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542,3.446667
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587,7.003333
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978,11.84
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808,6.1475


### Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?

In [5]:
# tip amount: total_bill, size
# tip percentage: total_bill, size

### Use all the other numeric features to predict tip amount. Use select k best and recursive feature elimination to select the top 2 features. What are they?

In [6]:
# Split the tips data into train and test frames with the project-local
# split_scale helper, then preview the training rows.
# NOTE(review): 0.8 is presumably the train fraction, and the helper presumably
# fixes its own random seed -- confirm both in split_scale.py.
train, test = split_scale.split_my_data(df, 0.8)
train.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
211,25.89,5.16,Male,Yes,Sat,Dinner,4,0.199305,6.4725
159,16.49,2.0,Male,No,Sun,Dinner,4,0.121286,4.1225
26,13.37,2.0,Male,No,Sat,Dinner,2,0.149589,6.685
180,34.65,3.68,Male,Yes,Sun,Dinner,4,0.106205,8.6625
68,20.23,2.01,Male,No,Sat,Dinner,2,0.099357,10.115


In [7]:
# Target: tip amount. Predictors: every other numeric column in the frame.
X_train = train.loc[:, ['total_bill', 'size', 'tip_percentage', 'price_per_person']]
y_train = train.loc[:, ['tip']]

In [8]:
# SelectKBest: keep the 2 predictors with the highest F-regression scores
f_selector = SelectKBest(score_func=f_regression, k=2)
f_selector.fit(X_train, y_train)
# Boolean mask over the columns of X_train (True = selected)
f_support = f_selector.get_support()
f_feature = X_train.columns[f_support].tolist()
print(f_feature)

['total_bill', 'size']


In [9]:
# RFE: repeatedly fit a linear regression and eliminate predictors until 2 remain
lm = LinearRegression()
# n_features_to_select is passed by keyword: current scikit-learn releases
# reject it as a positional argument (TypeError).
rfe = RFE(lm, n_features_to_select=2)
X_rfe = rfe.fit_transform(X_train, y_train)
# Boolean mask over the columns of X_train (True = kept by RFE)
mask = rfe.support_
rfe_features = X_train.loc[:, mask].columns.tolist()
print(rfe_features)

['total_bill', 'tip_percentage']


### Use all the other numeric features to predict tip percentage. Use select k best and recursive feature elimination to select the top 2 features. What are they?

In [10]:
# Target: tip percentage. Predictors: every other numeric column in the frame.
X_train2 = train.loc[:, ['total_bill', 'size', 'tip', 'price_per_person']]
y_train2 = train.loc[:, ['tip_percentage']]

In [11]:
# SelectKBest: keep the 2 predictors with the highest F-regression scores
f_selector2 = SelectKBest(score_func=f_regression, k=2)
f_selector2.fit(X_train2, y_train2)
# Boolean mask over the columns of X_train2 (True = selected)
f_support2 = f_selector2.get_support()
f_feature2 = X_train2.columns[f_support2].tolist()
print(f_feature2)

['total_bill', 'tip']


In [12]:
# RFE: repeatedly fit a linear regression and eliminate predictors until 2 remain
lm2 = LinearRegression()
# n_features_to_select is passed by keyword: current scikit-learn releases
# reject it as a positional argument (TypeError).
rfe2 = RFE(lm2, n_features_to_select=2)
X_rfe2 = rfe2.fit_transform(X_train2, y_train2)
# Boolean mask over the columns of X_train2 (True = kept by RFE)
mask2 = rfe2.support_
rfe_features2 = X_train2.loc[:, mask2].columns.tolist()
print(rfe_features2)

['total_bill', 'tip']


### Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

In [13]:
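# because each algorithm ranks the features in a different way:
#   - SelectKBest scores every feature on its own against the target (here with f_regression)
#   - RFE fits the model on all features together and repeatedly eliminates the weakest one
# so some features might be more attractive to one algorithm than to the other,
# because of their distribution, their scale, or their correlation to the predicted variable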

## Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [14]:
def select_kbest(X, y, k):
    """
    Select the k best features according to SelectKBest with f_regression.

    Takes in 3 variables:
        X is a DataFrame of predictor variables (its column names are used for the result),
        y is the variable to be predicted,
        k is the number of features to select
    Returns a list with the names of the k best features, in the column order of X
    """
    f_selector = SelectKBest(f_regression, k=k).fit(X, y)
    # get_support() is a boolean mask aligned with X's columns.
    # Return the names themselves; the previous `return print(...)` always
    # handed None back to the caller.
    return X.columns[f_selector.get_support()].tolist()

In [15]:
# Top 2 predictors of tip amount via the select_kbest helper
select_kbest(X=X_train, y=y_train, k=2)

['total_bill', 'size']


## Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [16]:
def select_rfe(X, y, k):
    """
    Select k features by recursive feature elimination around a LinearRegression.

    Takes in 3 variables:
        X is a DataFrame of predictor variables (its column names are used for the result),
        y is the variable to be predicted,
        k is the number of features to keep
    Returns a list with the names of the k features RFE kept, in the column order of X
    """
    lm = LinearRegression()
    # n_features_to_select is passed by keyword: current scikit-learn releases
    # reject it as a positional argument (TypeError).
    rfe = RFE(lm, n_features_to_select=k)
    # Only the fitted support mask is needed, so fit() replaces fit_transform().
    rfe.fit(X, y)
    # Return the names so callers can use them, instead of only printing them.
    return X.columns[rfe.support_].tolist()

In [17]:
# Top 2 predictors of tip amount via the select_rfe helper
select_rfe(X=X_train, y=y_train, k=2)

['total_bill', 'tip_percentage']


## Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [18]:
# Load the "swiss" dataset through pydataset and preview the first five rows.
swiss = data('swiss')
swiss.head(n=5)

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [19]:
# Split the swiss data with the project-local split_scale helper.
# Despite the "X" in their names, both results are full frames that still
# contain the Fertility target; predictors and target are separated afterwards.
# NOTE(review): 0.8 is presumably the train fraction -- confirm in split_scale.py.
swiss_X_train, swiss_X_test = split_scale.split_my_data(swiss, 0.8)

In [20]:
# Target: Fertility. Predictors: every remaining column of the training frame.
swiss_y = swiss_X_train.loc[:, ['Fertility']]
swiss_X = swiss_X_train.drop(columns=['Fertility'])

In [21]:
# Top 3 predictors of Fertility via the select_kbest helper
select_kbest(X=swiss_X, y=swiss_y, k=3)

['Agriculture', 'Examination', 'Education']


In [22]:
# Top 3 predictors of Fertility via the select_rfe helper
select_rfe(X=swiss_X, y=swiss_y, k=3)

['Agriculture', 'Education', 'Infant.Mortality']
