In [81]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from pydataset import data
from regprepare import get_auto_mpg, train_val_test

In [117]:
# Load the `tips` sample dataset through pydataset's `data` helper.
tips = data('tips')
# Bare expression: the frame itself is rendered as this cell's output.
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.50,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
240,29.03,5.92,Male,No,Sat,Dinner,3
241,27.18,2.00,Female,Yes,Sat,Dinner,2
242,22.67,2.00,Male,Yes,Sat,Dinner,2
243,17.82,1.75,Male,No,Sat,Dinner,2


In [118]:
# Average spend per diner. Bracket access is used for both columns because
# `tips.size` would resolve to the DataFrame's `size` attribute, not the column.
tips['price_per_person'] = tips['total_bill'] / tips['size']
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,6.1475


In [119]:
# Pairwise correlations between the numeric columns only. The text columns
# (sex, smoker, day, time) are excluded explicitly: pandas 2.x no longer drops
# non-numeric columns silently, so a bare `tips.corr()` raises on them.
tips.select_dtypes(include='number').corr()

Unnamed: 0,total_bill,tip,size,price_per_person
total_bill,1.0,0.675734,0.598315,0.647497
tip,0.675734,1.0,0.489299,0.347393
size,0.598315,0.489299,1.0,-0.175412
price_per_person,0.647497,0.347393,-0.175412,1.0


#### It looks like time, price per person, and total bill would be the best indicators

In [120]:
# Split the tips frame into three partitions with the project helper imported
# from `regprepare`.
# NOTE(review): split proportions and any random seed live inside
# `train_val_test` and are not visible here — confirm the split is reproducible.
train, val, test = train_val_test(tips)

In [121]:
# Scaler used to rescale the continuous tips features.
mms = MinMaxScaler()

In [122]:
# Fit the scaler on the training partition's continuous columns and overwrite
# those columns in place with the scaled values.
continuous_cols = ['total_bill', 'price_per_person']
train[continuous_cols] = mms.fit_transform(train[continuous_cols])

In [123]:
# Spot-check the training rows after scaling.
train.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
195,0.240346,4.0,Male,Yes,Thur,Lunch,2,0.311207
77,0.270084,3.08,Male,Yes,Sat,Dinner,2,0.349713
42,0.259876,2.54,Male,No,Sun,Dinner,2,0.336494
109,0.277186,3.76,Male,No,Sat,Dinner,2,0.358908
224,0.227031,3.0,Female,No,Fri,Lunch,3,0.1409


In [124]:
# Feature matrix (the two scaled continuous columns) and the regression target.
feature_cols = ['total_bill', 'price_per_person']
X_train_scaled = train[feature_cols]
y_train = train['tip']

In [125]:
# Univariate selector that keeps the 2 features with the best F-regression scores.
f_selector = SelectKBest(score_func=f_regression, k=2)

In [126]:
# Score every candidate feature against the tip amount.
f_selector.fit(X_train_scaled, y_train)

In [127]:
# Boolean mask over the fitted columns; True marks a selected feature.
f_select_mask = f_selector.get_support()

In [128]:
# total_bill and price_per_person are the two best features, but they were also
# the only two continuous columns offered, so there were no other options.
X_train_scaled.loc[:, f_select_mask].head()

Unnamed: 0,total_bill,price_per_person
195,0.240346,0.311207
77,0.270084,0.349713
42,0.259876,0.336494
109,0.277186,0.358908
224,0.227031,0.1409


In [129]:
# One-hot encode the categorical columns (party size is treated as categorical
# here) so they can be ranked alongside the continuous features.
# NOTE(review): every level gets its own indicator column, so each dummy group
# sums to 1 and the encoded columns are linearly dependent; consider
# `drop_first=True` if coefficient-based rankings look unstable.
categorical_cols = ['sex', 'smoker', 'day', 'time', 'size']
X_train = pd.get_dummies(train, columns=categorical_cols)

In [130]:
# Preview the encoded frame (the target column `tip` is still present here).
X_train.head()

Unnamed: 0,total_bill,tip,price_per_person,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch,size_1,size_2,size_3,size_4,size_5,size_6
195,0.240346,4.0,0.311207,0,1,0,1,0,0,0,1,0,1,0,1,0,0,0,0
77,0.270084,3.08,0.349713,0,1,0,1,0,1,0,0,1,0,0,1,0,0,0,0
42,0.259876,2.54,0.336494,0,1,1,0,0,0,1,0,1,0,0,1,0,0,0,0
109,0.277186,3.76,0.358908,0,1,1,0,0,1,0,0,1,0,0,1,0,0,0,0
224,0.227031,3.0,0.1409,1,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0


In [131]:
# Remove the target from the feature matrix. `errors='ignore'` keeps this cell
# idempotent: re-running it after `tip` is already gone no longer raises KeyError.
X_train = X_train.drop(columns='tip', errors='ignore')

In [132]:
# Number of candidate features after encoding.
X_train.shape[1]

18

In [133]:
# Recursive feature elimination around a plain linear model, keeping 2 features.
# NOTE: a later cell defines a function that is also named `rfe`; once that cell
# has run, this name no longer refers to the estimator created here.
lm = LinearRegression()
rfe = RFE(estimator=lm, n_features_to_select=2)

In [134]:
# Run the elimination over all encoded tips features.
rfe.fit(X_train, y_train)

In [135]:
# Pair each feature name with the rank RFE assigned to it (1 = selected).
columns = list(X_train.columns)
ranks = rfe.ranking_

In [136]:
# Two-column table: RFE rank next to the feature it belongs to.
feature_ranks = pd.DataFrame(dict(ranking=ranks, feature=columns))

In [137]:
# Best-ranked features first, with a fresh 0..n-1 index.
feature_ranks.sort_values('ranking').reset_index(drop=True)

Unnamed: 0,ranking,feature
0,1,total_bill
1,1,price_per_person
2,2,size_5
3,3,size_1
4,4,size_2
5,5,size_6
6,6,sex_Female
7,7,size_3
8,8,time_Lunch
9,9,smoker_No


In [138]:
## Both methods pick the same 2 features; however, they could give different results because RFE is given all columns (including the encoded categorical ones)
## The selection will also change as larger k values are allowed, because more categorical features will become relevant

In [139]:
def select_kbest(X_train, y_train, stat_test, k_value):
    """Preview the `k_value` columns of `X_train` chosen by SelectKBest.

    Parameters
    ----------
    X_train : DataFrame of candidate features.
    y_train : target values aligned with `X_train`.
    stat_test : scoring function handed to SelectKBest (e.g. `f_regression`).
    k_value : number of features to keep.

    Returns
    -------
    DataFrame holding only the selected columns, limited to its first rows
    by `.head()` (a preview, not the full training frame).
    """
    selector = SelectKBest(stat_test, k=k_value)
    selector.fit(X_train, y_train)
    # Boolean mask aligned with X_train's column order.
    keep = selector.get_support()
    chosen = X_train.iloc[:, keep]
    return chosen.head()


In [140]:
# Same selection as the step-by-step cells above, now through the helper.
select_kbest(X_train_scaled, y_train, f_regression, 2)

Unnamed: 0,total_bill,price_per_person
195,0.240346,0.311207
77,0.270084,0.349713
42,0.259876,0.336494
109,0.277186,0.358908
224,0.227031,0.1409


In [141]:
def rfe(X_train, y_train, model, k):
    """Rank every column of `X_train` with recursive feature elimination.

    Parameters
    ----------
    X_train : DataFrame of candidate features.
    y_train : target values aligned with `X_train`.
    model : estimator instance wrapped by RFE.
    k : number of features RFE should keep.

    Returns
    -------
    DataFrame with columns `ranking` and `feature`, sorted by ascending rank
    (the kept features carry rank 1) and re-indexed from 0.
    """
    eliminator = RFE(model, n_features_to_select=k)
    eliminator.fit(X_train, y_train)
    ranking_table = pd.DataFrame({
        'ranking': eliminator.ranking_,
        'feature': X_train.columns.tolist(),
    })
    return ranking_table.sort_values('ranking').reset_index(drop=True)


In [142]:
# Same ranking as the step-by-step cells above, now through the helper.
rfe(X_train, y_train, LinearRegression(), 2)

Unnamed: 0,ranking,feature
0,1,total_bill
1,1,price_per_person
2,2,size_5
3,3,size_1
4,4,size_2
5,5,size_6
6,6,sex_Female
7,7,size_3
8,8,time_Lunch
9,9,smoker_No


In [143]:
# Load the `swiss` sample dataset through pydataset's `data` helper.
swiss = data('swiss')

In [144]:
# Preview the raw columns; `Fertility` is used as the target further down.
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [145]:
# Rescale every predictor with min-max scaling. A dedicated scaler is used
# because calling `mms.fit_transform` again would overwrite the min/max learned
# from the tips training data, leaving `mms` unusable for the tips val/test splits.
swiss_predictors = ['Agriculture', 'Examination', 'Education', 'Catholic', 'Infant.Mortality']
swiss[swiss_predictors] = MinMaxScaler().fit_transform(swiss[swiss_predictors])

In [146]:
# Predictors and target for the swiss data.
# NOTE: this rebinds `X_train` / `y_train`, which held the tips training data
# until now, and uses the whole swiss frame (no split is made in this notebook).
y_train = swiss['Fertility']
X_train = swiss.drop('Fertility', axis=1)

In [147]:
# Top 3 swiss predictors of Fertility according to univariate F-regression.
select_kbest(X_train, y_train, f_regression, 3)

Unnamed: 0,Examination,Education,Catholic
Courtelary,0.352941,0.211538,0.079816
Delemont,0.088235,0.153846,0.845069
Franches-Mnt,0.058824,0.076923,0.93255
Moutier,0.264706,0.115385,0.323148
Neuveville,0.411765,0.269231,0.030761


In [148]:
# Rank the swiss predictors with RFE around a linear model, keeping 3.
rfe(X_train, y_train, LinearRegression(), 3)

Unnamed: 0,ranking,feature
0,1,Agriculture
1,1,Education
2,1,Infant.Mortality
3,2,Catholic
4,3,Examination
