## Feature Selection
Having engineered our features, we now select the most informative ones. Reducing the feature set mitigates the curse of dimensionality and lowers the risk of the ML model overfitting the training data.

In [34]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso

import warnings
warnings.filterwarnings("ignore")

In [35]:
# Read the three engineered splits; the first CSV column is used as the
# row index of each frame.
X_train, X_val, X_test = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ('Xtrain.csv', 'Xval.csv', 'Xtest.csv')
)

# (rows, columns) of the training split, target column still included
X_train.shape

(70374, 1012)

In [36]:
# Preview the first five rows of the training split.
X_train.head(n=5)

Unnamed: 0,goal,disable_communication,backers_count,final_status,AU,CA,GB,US,AUD,CAD,...,year,year old,years,yet,york,young,youth,youtube,zombie,zombies
0,0.0,0.0,0.002695,0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.009434,0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.022911,0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.152291,1,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
# Separate the label from the predictors. `pop` removes the column from
# each frame in place, so after this cell the X_* frames hold features only
# (re-running the cell without reloading the CSVs raises a KeyError).
target_col = 'final_status'
y_train = X_train.pop(target_col)
y_val = X_val.pop(target_col)
y_test = X_test.pop(target_col)

In [38]:
# L1-penalised linear model used as the importance estimator; the seed is
# fixed for reproducibility.
lasso = Lasso(alpha=0.0002, random_state=200)

# Wrap the estimator so that features with negligible coefficients are
# dropped, then fit on the training split only.
selector = SelectFromModel(lasso)
selector.fit(X_train, y_train)

SelectFromModel(estimator=Lasso(alpha=0.0002, copy_X=True, fit_intercept=True,
                                max_iter=1000, normalize=False, positive=False,
                                precompute=False, random_state=200,
                                selection='cyclic', tol=0.0001,
                                warm_start=False),
                max_features=None, norm_order=1, prefit=False, threshold=None)

In [39]:
# Boolean mask over the training columns: True where the selector kept the feature.
support_mask = selector.get_support()
selected_feats = X_train.columns[support_mask]

# Show the retained column names as a plain Python list.
selected_feats.tolist()

['disable_communication',
 'backers_count',
 'AU',
 'GB',
 'US',
 'GBP',
 'USD',
 '2013',
 '2014',
 '3d',
 'action',
 'album',
 'android',
 'animated',
 'anthology',
 'app',
 'art',
 'band',
 'based',
 'burning man',
 'business',
 'card game',
 'cd',
 'clothing',
 'comedy',
 'create',
 'debut',
 'debut album',
 'detroit',
 'documentary',
 'dream',
 'edition',
 'energy',
 'ep',
 'explores',
 'fantasy',
 'fashion',
 'feature',
 'feature film',
 'festival',
 'film',
 'final',
 'first',
 'first full',
 'folk',
 'food',
 'food truck',
 'free',
 'full length',
 'game',
 'get',
 'girl',
 'help',
 'help us',
 'hip',
 'hip hop',
 'independent',
 'installation',
 'last',
 'length album',
 'life',
 'like',
 'little',
 'local',
 'looking',
 'man',
 'many',
 'mobile',
 'need help',
 'new',
 'new album',
 'night',
 'online',
 'painting',
 'paintings',
 'people',
 'piece',
 'platform',
 'playing cards',
 'pre order',
 'presents',
 'press',
 'print',
 'prints',
 'produce',
 'real',
 'reality',
 'recor

In [40]:
# Number of features retained by the selector.
selected_feats.size

114

Lasso regression kept only 114 of the 1011 candidate features (the 1012 loaded columns minus the `final_status` target).

In [41]:
# Persist the retained feature names (one per row, no index column) so a
# later notebook can load the same subset.
feature_names = pd.Series(selected_feats)
feature_names.to_csv('selected_features.csv', index=False)

With the best features selected and persisted to `selected_features.csv`, we will start building models in the ML model-building notebook.