In [55]:
# imports
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from pydataset import data

from wrangle import split_data
from explore import determine_variable_type
from preprocess import scale_data

from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_regression

from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

1. Load the tips dataset.

In [2]:
# load the 'tips' dataset through pydataset's data() helper
df = data('tips')

In [3]:
# preview the first five rows
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
# (rows, columns) of the loaded frame
df.shape

(244, 7)

> a. Create a column named price_per_person. This should be the total bill divided by the party size.

In [5]:
# per-person spend: total bill divided by the party size
df['price_per_person'] = df['total_bill'] / df['size']

> b. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?

In [6]:
# I think the total bill and time will be most important

> c. Use select k best to select the top 2 features for predicting tip amount. What are they?

In [7]:
# name of the column we are trying to predict
target = 'tip'

In [12]:
# double check: display the target column
df[target]

1      1.01
2      1.66
3      3.50
4      3.31
5      3.61
       ... 
240    5.92
241    2.00
242    2.00
243    1.75
244    3.00
Name: tip, Length: 244, dtype: float64

In [40]:
# get the column names grouped by variable type
# (determine_variable_type lives in the local explore module; the meaning of the
#  three returned lists is assumed from the names they are unpacked into - confirm there)
explore_cols, cat_cols, num_cols = determine_variable_type(df)

In [9]:
# split the frame into train / validate / test
# (split_data lives in the local wrangle module; proportions and seed are set there, not here)
train, validate, test = split_data(df)

>**Preprocessing:** X, y

In [13]:
# pull the target column out of each split
y_train, y_validate, y_test = (part[target] for part in (train, validate, test))

In [20]:
# sanity check: the three target splits together cover every row of df
len(y_train) + len(y_validate) + len(y_test) == len(df)

True

In [14]:
# predictors: every split with the target column dropped
X_train, X_validate, X_test = (part.drop(columns=target) for part in (train, validate, test))

In [21]:
# sanity check: the three predictor splits together cover every row of df
len(X_train) + len(X_validate) + len(X_test) == len(df)

True

>**Preprocessing:** scaling continuous cols

In [41]:
# inspecting the numeric columns before scaling
# ('tip' is the target and is removed from this list in the next cell; 'size' stays in)
num_cols

['total_bill', 'tip', 'size', 'price_per_person']

In [42]:
# removing tip (the target) from the numeric columns; size is kept
# NOTE: list.remove mutates num_cols in place and raises ValueError if this cell is run twice
num_cols.remove('tip')

In [43]:
# scaling data with the local preprocess.scale_data helper
# (presumably min-max scales num_cols, fitting on X_train only - confirm in preprocess.py)
X_train_scaled, X_validate_scaled, X_test_scaled = scale_data(X_train, num_cols, 'minmax', X_validate, X_test )

>**Preprocessing:** feature selection

In [44]:
# initialize SelectKBest with the f_regression score function, keeping the top k=2 features
f_selector = SelectKBest(f_regression, k=2)

In [45]:
# fitting the selector on the scaled training predictors and the training target
f_selector.fit(X_train_scaled, y_train)

In [None]:
# reducing the training predictors to the k best features chosen by the fitted selector
X_reduced = f_selector.transform(X_train_scaled)

In [48]:
# checking the shape: the second dimension should equal k (2)
X_reduced.shape

(136, 2)

In [51]:
# getting the feature mask (one entry per input column, marking the selected ones)
f_support = f_selector.get_support()

In [52]:
# names of the columns the selector kept
f_features = X_train_scaled.columns[f_support].to_list()

In [54]:
# the two features SelectKBest scored as most important for predicting the target (tip)
print(f_features)

['total_bill', 'size']


> d. Use recursive feature elimination to select the top 2 features for tip amount. What are they?

In [56]:
# initialize the linear regression estimator that RFE will fit repeatedly
lm = LinearRegression()

In [57]:
# initialize RFE object around the linear model, keeping 2 features
# NOTE(review): a function also named `rfe` is defined later in this notebook;
# running that cell rebinds this name, so this object is no longer reachable as `rfe`
rfe = RFE(lm, n_features_to_select=2)

In [58]:
# fitting RFE on the scaled training data and transforming it down to the selected features
X_rfe = rfe.fit_transform(X_train_scaled, y_train)

In [59]:
# getting the feature mask from the fitted RFE object
rfe_support = rfe.get_support()

In [60]:
# names of the columns RFE kept
rfe_features = X_train_scaled.columns[rfe_support].to_list()

In [61]:
# the two features RFE kept as the best predictors of tip
rfe_features

['total_bill', 'size']

> e. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

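I think it's because they use different metrics to evaluate which features stay: SelectKBest scores each feature on its own (here with a univariate F-test), while RFE repeatedly fits a model and drops the least important feature. The difference in evaluation metrics will impact which features are selected.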

Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [62]:
# add a min-max scaled copy of every numeric column, named '<col>_scaled'
# NOTE(review): fits a fresh MinMaxScaler on X_train and mutates X_train in place
X_train[[f'{i}_scaled' for i in num_cols]] = MinMaxScaler().fit_transform(X_train[num_cols])

In [63]:
# append dummy-encoded versions of the categorical columns (first level of each dropped)
# NOTE(review): X_train is rebuilt from itself, so re-running this cell appends the dummy columns again
X_train = pd.concat([X_train, pd.get_dummies(X_train[cat_cols], drop_first=True)], axis=1)

Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [None]:
def select_kbest(X, y, k=2):
    '''
    Return the names of the k best features of X for predicting y.

    Arguments:
        X: dataframe of independent features
        y: single pandas Series (the target)
        k: kwarg, the number of best features to select
    Actions:
        Fits SelectKBest with the f_regression score function on X and y.
    Returns:
        pandas Index holding the names of the k selected columns of X.
    Modules:
        from sklearn.feature_selection import SelectKBest, f_regression
    '''
    kbest = SelectKBest(f_regression, k=k)
    kbest.fit(X, y)

    # mask has one entry per column of X, marking the selected features
    mask = kbest.get_support()

    return X.columns[mask]


# the function was originally defined under this misspelled name;
# keep it as an alias so existing calls keep working
select_kbeat = select_kbest

Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [None]:
def rfe(X, y, k=2):
    '''
    Return the names of the k features of X kept by recursive feature elimination.

    Arguments:
        X: dataframe of independent features
        y: single pandas Series (the target)
        k: kwarg, the number of best recursive features to select
    Actions:
        Fits RFE, wrapped around a LinearRegression estimator, on X and y.
    Returns:
        pandas Index holding the names of the k selected columns of X.
    Modules:
        from sklearn.linear_model import LinearRegression
        from sklearn.feature_selection import RFE

    NOTE: this definition rebinds the notebook-level name `rfe`, which an
    earlier cell assigns to an RFE object.
    '''
    # was `linreg()`, a name that is never defined in this notebook
    model = LinearRegression()

    rfe_lin = RFE(model, n_features_to_select=k)

    # was `ref_lin.fit(...)`, a typo that raised NameError
    rfe_lin.fit(X, y)

    # mask has one entry per column of X, marking the selected features
    mask = rfe_lin.get_support()

    return X.columns[mask]

#### Actions: 
* In preprocessing, add a function that takes in train, validate, and test and returns X_train, y_train, etc.