# FEATURE SELECTION
- Goal: leave this section with a dataframe with the **features** to be used to build your model.

- Are there **new features** you could create based on existing features that might be helpful?

- You could use **feature selection techniques** to see if there are any that are not adding value to the model.

- `feature_selection.py`: to run whatever functions need to be run to end with a dataframe that contains the features that will be used to model the data.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

from split_scale import split_scale

In [2]:
train, test, scaler = split_scale()
train.head()

Unnamed: 0,home_size,bedroomcnt,bathroomcnt,tax_rate,yearbuilt,regionidcounty,home_value,bedrm_per_bathrm
14140,0.361642,0.6,0.0,0.066475,0.496296,1.0,0.421163,0.6
12285,0.225008,0.4,0.0,0.397378,0.496296,1.0,0.302009,0.4
13357,0.347853,0.6,0.5,0.197533,0.503704,1.0,0.324614,0.3
14130,0.282357,0.6,0.5,0.08981,0.518519,1.0,0.228193,0.3
1187,0.524914,0.6,1.0,0.668349,0.814815,1.0,0.248124,0.2


## Select feature 

In [3]:
X = train.drop(columns = 'home_value')
y = train[['home_value']]
X.head()

Unnamed: 0,home_size,bedroomcnt,bathroomcnt,tax_rate,yearbuilt,regionidcounty,bedrm_per_bathrm
14140,0.361642,0.6,0.0,0.066475,0.496296,1.0,0.6
12285,0.225008,0.4,0.0,0.397378,0.496296,1.0,0.4
13357,0.347853,0.6,0.5,0.197533,0.503704,1.0,0.3
14130,0.282357,0.6,0.5,0.08981,0.518519,1.0,0.3
1187,0.524914,0.6,1.0,0.668349,0.814815,1.0,0.2


### SelectKBest

In [4]:
from sklearn.feature_selection import SelectKBest, f_regression
f_selector = SelectKBest(f_regression, k = 3)
f_selector.fit(X,y)
X_reduced = f_selector.transform(X)
f_support = f_selector.get_support()
feature_selected = X.loc[:,f_support].columns.tolist()

In [5]:
feature_eleminated = X.loc[:,~f_support].columns.tolist()
df = train.drop(columns = feature_eleminated)
df.head()

Unnamed: 0,home_size,bathroomcnt,tax_rate,home_value
14140,0.361642,0.0,0.066475,0.421163
12285,0.225008,0.0,0.397378,0.302009
13357,0.347853,0.5,0.197533,0.324614
14130,0.282357,0.5,0.08981,0.228193
1187,0.524914,1.0,0.668349,0.248124


### RFE

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

In [7]:
lm = LinearRegression()
rfe = RFE(lm, 3)
X_rfe = rfe.fit_transform(X,y)
feature_selected = X.loc[:,rfe.support_].columns

In [8]:
feature_eleminated = X.loc[:,~rfe.support_].columns.tolist()

In [9]:
df = train.drop(columns = feature_eleminated)
df.head()

Unnamed: 0,home_size,tax_rate,yearbuilt,home_value
14140,0.361642,0.066475,0.496296,0.421163
12285,0.225008,0.397378,0.496296,0.302009
13357,0.347853,0.197533,0.503704,0.324614
14130,0.282357,0.08981,0.518519,0.228193
1187,0.524914,0.668349,0.814815,0.248124


### Validate function

In [10]:
from feature_selection import k_best

In [11]:
train, selector = k_best(train, 3)
train.head()

Unnamed: 0,home_size,bathroomcnt,tax_rate,home_value
14140,0.361642,0.0,0.066475,0.421163
12285,0.225008,0.0,0.397378,0.302009
13357,0.347853,0.5,0.197533,0.324614
14130,0.282357,0.5,0.08981,0.228193
1187,0.524914,1.0,0.668349,0.248124


In [12]:
from feature_selection import rfe

In [13]:
train, selector = rfe(train, 3)
train.head()

Unnamed: 0,home_size,bathroomcnt,tax_rate,home_value
14140,0.361642,0.0,0.066475,0.421163
12285,0.225008,0.0,0.397378,0.302009
13357,0.347853,0.5,0.197533,0.324614
14130,0.282357,0.5,0.08981,0.228193
1187,0.524914,1.0,0.668349,0.248124
