# FEATURE SELECTION
- Goal: leave this section with a dataframe with the **features** to be used to build your model.

- Are there **new features** you could create based on existing features that might be helpful?

- You could use **feature selection techniques** to check whether any features are not adding value to the model.

- `feature_selection.py`: runs whatever functions are needed to produce a dataframe containing only the features that will be used to model the data.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

from split_scale import split_data

In [2]:
# split_data is imported from the project's split_scale module; its two
# return values are unpacked as the train and test frames.
train, test = split_data()

# Preview the first five training rows.
train.head(n=5)

Unnamed: 0,home_size,bedroomcnt,bathroomcnt,home_value
5364,1449.0,3.0,2.0,363906.0
10814,3739.0,4.0,3.0,1203071.0
10863,1920.0,3.0,2.5,507625.0
12983,1574.0,4.0,3.0,569000.0
325,1992.0,3.0,3.0,592398.0


## Select features

In [3]:
# Target: the home_value column, kept as a single-column DataFrame
# (double brackets) rather than a Series.
y = train[['home_value']]

# Predictors: every remaining column of the training frame.
X = train.drop('home_value', axis=1)

X.head(n=5)

Unnamed: 0,home_size,bedroomcnt,bathroomcnt
5364,1449.0,3.0,2.0
10814,3739.0,4.0,3.0
10863,1920.0,3.0,2.5
12983,1574.0,4.0,3.0
325,1992.0,3.0,3.0


### SelectKBest

In [4]:
from sklearn.feature_selection import SelectKBest, f_regression
# Score every candidate feature against the target with a univariate F-test
# (f_regression) and keep the 2 highest-scoring ones.
f_selector = SelectKBest(f_regression, k=2)

# f_regression expects a 1-D target. y is a single-column DataFrame, so flatten
# it explicitly instead of relying on scikit-learn's implicit conversion, which
# emits a DataConversionWarning (hidden here by the global warnings filter).
f_selector.fit(X, np.ravel(y))

# Array holding only the selected columns of X.
X_reduced = f_selector.transform(X)

# Boolean mask over X's columns: True where the feature was kept.
f_support = f_selector.get_support()
feature_selected = X.loc[:, f_support].columns.tolist()

In [5]:
# Names of the columns SelectKBest rejected (inverse of the support mask).
feature_eleminated = X.columns[~f_support].tolist()

# Remove the rejected columns from the training frame; the target column stays.
df = train.drop(feature_eleminated, axis=1)
df.head(n=5)

Unnamed: 0,home_size,bathroomcnt,home_value
5364,1449.0,2.0,363906.0
10814,3739.0,3.0,1203071.0
10863,1920.0,2.5,507625.0
12983,1574.0,3.0,569000.0
325,1992.0,3.0,592398.0


### RFE

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

In [7]:
# Recursive feature elimination: repeatedly fit the linear model and discard
# the weakest feature until 2 remain.
lm = LinearRegression()

# n_features_to_select is passed by keyword: the positional form RFE(lm, 2)
# was deprecated and is rejected by recent scikit-learn releases.
rfe = RFE(lm, n_features_to_select=2)
X_rfe = rfe.fit_transform(X, y)

# rfe.support_ is a boolean mask over X's columns (True = kept). Converted to
# a plain list to match the SelectKBest cell's feature_selected.
feature_selected = X.loc[:, rfe.support_].columns.tolist()

In [8]:
# Names of the columns RFE rejected: every column whose support flag is False.
feature_eleminated = [
    column for column, kept in zip(X.columns, rfe.support_) if not kept
]

In [9]:
# Remove the RFE-rejected columns from the training frame; the target stays.
df = train.drop(feature_eleminated, axis=1)
df.head(n=5)

Unnamed: 0,bedroomcnt,bathroomcnt,home_value
5364,3.0,2.0,363906.0
10814,4.0,3.0,1203071.0
10863,3.0,2.5,507625.0
12983,4.0,3.0,569000.0
325,3.0,3.0,592398.0


### Validate the `select_feature` function

In [10]:
from feature_selection import select_feature

In [11]:
# select_feature comes from the project's feature_selection module; its two
# return values are unpacked as the reduced frame and a selector object.
# NOTE(review): the second argument is presumably the number of features to
# keep -- confirm against feature_selection.py.
train, selector = select_feature(train, 2)

# The preview should show the same columns as the SelectKBest cell's result.
train.head(n=5)

Unnamed: 0,home_size,bathroomcnt,home_value
5364,1449.0,2.0,363906.0
10814,3739.0,3.0,1203071.0
10863,1920.0,2.5,507625.0
12983,1574.0,3.0,569000.0
325,1992.0,3.0,592398.0
