In [1]:
import pandas as pd 
import numpy as np 
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector
import seaborn as sns
from pydataset import data

In [2]:
tips = sns.load_dataset("tips")


In [3]:
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [5]:
#Create a column called price per person that divides total bill by party size 
tips['price_per_person']= (tips['total_bill']/tips['size']).round(2)

In [6]:
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45
2,21.01,3.50,Male,No,Sun,Dinner,3,7.00
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15
...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,9.68
240,27.18,2.00,Female,Yes,Sat,Dinner,2,13.59
241,22.67,2.00,Male,Yes,Sat,Dinner,2,11.34
242,17.82,1.75,Male,No,Sat,Dinner,2,8.91


In [7]:
#creating sample of train data from overall dataset 
train = tips.sample(frac=.5)

In [8]:
train

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
85,34.83,5.17,Female,No,Thur,Lunch,4,8.71
113,23.95,2.55,Male,No,Sun,Dinner,2,11.98
11,35.26,5.0,Female,No,Sun,Dinner,4,8.82
170,50.81,10.0,Male,Yes,Sat,Dinner,3,16.94
37,16.93,3.07,Female,No,Sat,Dinner,3,5.64
219,30.14,3.09,Female,Yes,Sat,Dinner,4,7.54
128,11.38,2.0,Female,No,Thur,Lunch,2,5.69
179,34.63,3.55,Male,Yes,Sun,Dinner,2,17.32
98,21.01,3.0,Male,Yes,Fri,Dinner,2,10.5
122,14.26,2.5,Male,No,Thur,Lunch,2,7.13


In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 122 entries, 85 to 130
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   total_bill        122 non-null    float64 
 1   tip               122 non-null    float64 
 2   sex               122 non-null    category
 3   smoker            122 non-null    category
 4   day               122 non-null    category
 5   time              122 non-null    category
 6   size              122 non-null    int64   
 7   price_per_person  122 non-null    float64 
dtypes: category(4), float64(3), int64(1)
memory usage: 5.8 KB


In [10]:
train['size']=train['size'].astype(float)

In [11]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 122 entries, 85 to 130
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   total_bill        122 non-null    float64 
 1   tip               122 non-null    float64 
 2   sex               122 non-null    category
 3   smoker            122 non-null    category
 4   day               122 non-null    category
 5   time              122 non-null    category
 6   size              122 non-null    float64 
 7   price_per_person  122 non-null    float64 
dtypes: category(4), float64(4)
memory usage: 5.8 KB


In [12]:
dummy_df = pd.get_dummies(train[['smoker',\
                              'sex', \
                              'day', \
                              'time']], dummy_na=False, \
                              drop_first=True)

In [13]:
train = pd.concat([train, dummy_df], axis=1)

In [15]:
train

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,smoker_No,sex_Female,day_Fri,day_Sat,day_Sun,time_Dinner
85,34.83,5.17,Female,No,Thur,Lunch,4.0,8.71,1,1,0,0,0,0
113,23.95,2.55,Male,No,Sun,Dinner,2.0,11.98,1,0,0,0,1,1
11,35.26,5.0,Female,No,Sun,Dinner,4.0,8.82,1,1,0,0,1,1
170,50.81,10.0,Male,Yes,Sat,Dinner,3.0,16.94,0,0,0,1,0,1
37,16.93,3.07,Female,No,Sat,Dinner,3.0,5.64,1,1,0,1,0,1
219,30.14,3.09,Female,Yes,Sat,Dinner,4.0,7.54,0,1,0,1,0,1
128,11.38,2.0,Female,No,Thur,Lunch,2.0,5.69,1,1,0,0,0,0
179,34.63,3.55,Male,Yes,Sun,Dinner,2.0,17.32,0,0,0,0,1,1
98,21.01,3.0,Male,Yes,Fri,Dinner,2.0,10.5,0,0,1,0,0,1
122,14.26,2.5,Male,No,Thur,Lunch,2.0,7.13,1,0,0,0,0,0


In [None]:
tips.pairplot

In [None]:
tips.corr()

In [16]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 122 entries, 85 to 130
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   total_bill        122 non-null    float64 
 1   tip               122 non-null    float64 
 2   sex               122 non-null    category
 3   smoker            122 non-null    category
 4   day               122 non-null    category
 5   time              122 non-null    category
 6   size              122 non-null    float64 
 7   price_per_person  122 non-null    float64 
 8   smoker_No         122 non-null    uint8   
 9   sex_Female        122 non-null    uint8   
 10  day_Fri           122 non-null    uint8   
 11  day_Sat           122 non-null    uint8   
 12  day_Sun           122 non-null    uint8   
 13  time_Dinner       122 non-null    uint8   
dtypes: category(4), float64(4), uint8(6)
memory usage: 6.5 KB


In [19]:
#selecting columns to be placed in model
cols =['total_bill','size','price_per_person','sex_Female','day_Fri','day_Sat','day_Sun','smoker_No','time_Dinner']

In [20]:
#designating x and y train
X_train, y_train = train[cols], train['tip']


## use k best to select the top 2 features for predicting tip amount what are they?

In [21]:
# make the thing
kbest = SelectKBest(f_regression, k=2)
# fit the thing
_ = kbest.fit(X_train, y_train)

In [22]:
# statistical f-value:
#high number for score is good 
kbest.scores_ 
#p value: 
kbest.pvalues_

array([8.63723697e-18, 1.64897823e-07, 5.35351845e-06, 8.84465504e-01,
       9.01365395e-01, 9.09836554e-01, 2.00516602e-01, 3.65645327e-01,
       5.23740958e-02])

In [23]:
kbest_results = pd.DataFrame(
    dict(p=kbest.pvalues_, f=kbest.scores_),
                             index = X_train.columns)

#f score ratio between explained variance and unexplained variance , higher f score the better. 
#How much error is explained in test

In [24]:
kbest_results

Unnamed: 0,p,f
total_bill,8.637237e-18,102.503155
size,1.648978e-07,30.92943
price_per_person,5.353518e-06,22.701573
sex_Female,0.8844655,0.021205
day_Fri,0.9013654,0.015425
day_Sat,0.9098366,0.012879
day_Sun,0.2005166,1.656774
smoker_No,0.3656453,0.824646
time_Dinner,0.0523741,3.8396


In [25]:
# get-support() will output a boolean mask to tell me which features were selected
# we can apply this mask to the columns in our original dataframe
# not helpful if you only have 10 columns (good for hundreds of columns in data set)
# Function is picking best columns (2 in this example because K was 2)
X_train.columns[kbest.get_support()]

Index(['total_bill', 'size'], dtype='object')

In [26]:
# kbest transform will convert our information to the selected feature subspace
# this provides numpy array
kbest.transform(X_train)[:5]

array([[34.83,  4.  ],
       [23.95,  2.  ],
       [35.26,  4.  ],
       [50.81,  3.  ],
       [16.93,  3.  ]])

In [27]:
# convert kbest transform back to dataframe
X_train_transformed = pd.DataFrame(
    kbest.transform(X_train),
    columns=X_train.columns[kbest.get_support()],
    index=X_train.index
)

In [28]:
X_train_transformed.head()

Unnamed: 0,total_bill,size
85,34.83,4.0
113,23.95,2.0
11,35.26,4.0
170,50.81,3.0
37,16.93,3.0


## use recursive feature elimination to select the top 2 features. What are they?

In [31]:
from sklearn.linear_model import LinearRegression

In [32]:
# make a model object to use in RFE process.
# The model is here to give us metrics on feature importance and model score
# allowing us to recursively reduce the number of features to reach our desired space
model = LinearRegression()

In [33]:
# make thing
rfe = RFE(model, n_features_to_select=2)
# fit thing
rfe.fit(X_train, y_train)

RFE(estimator=LinearRegression(), n_features_to_select=2)

In [34]:
#create rankings 
# keeping 1 and removing 2
rfe.ranking_

array([2, 3, 7, 1, 5, 6, 4, 8, 1])

In [35]:
# converting ranking back into dataframe
pd.DataFrame(
{
    'rfe_ranking': rfe.ranking_
},index = X_train.columns)

Unnamed: 0,rfe_ranking
total_bill,2
size,3
price_per_person,7
sex_Female,1
day_Fri,5
day_Sat,6
day_Sun,4
smoker_No,8
time_Dinner,1


## Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

##  Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [37]:
def select_kbest(X,y,z):
    '''This function will provide the results of the best features for 
    predicting the target using Select K best '''
    
    # make the thing
    kbest = SelectKBest(f_regression, k=z)
    # fit the thing
    _ = kbest.fit(X, y)
    
    # statistical f-value:
    kbest.scores_ 
    #p value: 
    kbest.pvalues_
    X.columns[kbest.get_support()]
    
    #placing kbest results into dataframe
    kbest_results = pd.DataFrame(
    dict(p=kbest.pvalues_, f=kbest.scores_),
                             index = X.columns)
    
    #lists best features based on value of k
    return X.columns[kbest.get_support()]
    

## Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [38]:
def select_rfe(X,y,k):
    '''this function will provide the top k features baed on the RFE class'''

    #creating linearRegression model for rfe 
    model = LinearRegression()
    
    # make rfe
    rfe = RFE(model, n_features_to_select=k)
    
    # fit rfe on data
    rfe.fit(X, y)
    
    #apply ranking to features
    rfe.ranking_
    
    #turn ranking results into dataframe
    pd.DataFrame(
    {
        'rfe_ranking': rfe.ranking_
    },index = X.columns)
    
    #boolean mask for ranking of features 
    rfe.get_support()
    
    #transform get support into dataframe of top k features
    X_transformed = pd.DataFrame(
    rfe.transform(X),
    index = X.index,
    columns = X.columns[rfe.support_])
    
    #shows k top columns
    return X_transformed

## Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [39]:
swiss = data("swiss")


In [40]:
swiss

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6
Porrentruy,76.1,35.3,9,7,90.57,26.6
Broye,83.8,70.2,16,7,92.85,23.6
Glane,92.4,67.8,14,8,97.16,24.9
Gruyere,82.4,53.3,12,7,97.67,21.0
Sarine,82.9,45.2,16,13,91.38,24.4


In [41]:
swiss.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47 entries, Courtelary to Rive Gauche
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Fertility         47 non-null     float64
 1   Agriculture       47 non-null     float64
 2   Examination       47 non-null     int64  
 3   Education         47 non-null     int64  
 4   Catholic          47 non-null     float64
 5   Infant.Mortality  47 non-null     float64
dtypes: float64(4), int64(2)
memory usage: 2.6+ KB


In [42]:
#assign everything to X except Fertility because we are using it as our y
X= swiss.drop(columns=['Fertility'])
#assign y_train 
y = swiss['Fertility']

In [43]:
#Use the "select_Kbest" function we created to find our top 3 features
select_kbest(X, y,3)

In [None]:
#Use the "rfe" function we created to find our top 3 features
rfe(X, y, 3)

In [None]:
# creating sample of data 
X_sample = swiss.sample(frac=.5)

In [None]:
#create a function to split our data
def split_data(df):
    '''
    split our data,
    takes in a pandas dataframe
    returns: three pandas dataframes, train, test, and validate
    '''
    #create train_validate and test datasets
    train, test = train_test_split(df, train_size = 0.8, random_state = 123)
    #create train and validate datasets
    train, validate = train_test_split(train, train_size = 0.7, random_state = 123)

    # Have function print datasets shape
    print(f'train -> {train.shape}')
    print(f'validate -> {validate.shape}')
    print(f'test -> {test.shape}')
   
    return train, validate, test

In [None]:
#assign to variable #need scaler for each independent variable
#create it
scaler = MinMaxScaler()

In [None]:
cols = ['Agriculture','Examination','Education','Catholic','infant_mortality']

In [None]:
#fit it
scaler.fit(X_train)
MinMaxScaler()

In [None]:
X

In [None]:
#use it
#make a new column within train
#use 'transform' instead of 'predict'
X_train_scaled = scaler.transform(X_train)
X_validate_scaled = scaler.transform(X_validate)
X_test_scaled = scaler.transform(X_test)