In [1]:
# to handle datasets
import pandas as pd
import numpy as np

# for plotting
import matplotlib.pyplot as plt

# to build the models
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

# Show every column when a DataFrame is displayed (None lifts the column limit).
# Call the documented top-level pd.set_option; the pd.pandas self-reference
# only resolves by accident and is not part of the public API.
pd.set_option('display.max_columns', None)

In [2]:
# Read the engineered train and test feature sets from disk.
#
# Both CSV files were built and saved in the previous lecture. If they are
# missing, work through the previous notebook first to create them.

X_train = pd.read_csv(filepath_or_buffer='xtrain.csv')
X_test = pd.read_csv(filepath_or_buffer='xtest.csv')

# preview the first rows of the training features
X_train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,total_rooms_per_person,bedrooms_per_room,income_per_person,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,total_bedrooms_na,bedrooms_per_room_na
0,0.677291,0.137088,0.098039,0.311189,0.011639,0.215874,0.225651,0.257838,0.039722,0.057105,0.01605,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.190239,0.551541,1.0,0.498354,0.076195,0.407729,0.456587,0.268265,0.028884,0.118484,0.0025,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.63247,0.137088,0.490196,0.453651,0.05959,0.362548,0.428111,0.236783,0.03029,0.137886,0.003273,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.606574,0.156217,0.72549,0.345507,0.032123,0.327834,0.350665,0.066578,0.016699,0.220469,0.001633,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.596614,0.163656,1.0,0.497877,0.09311,0.374837,0.495768,0.184591,0.037711,0.170295,0.002363,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Read the train and test targets (note: the target is log transformed).
y_train = pd.read_csv(filepath_or_buffer='ytrain.csv')
y_test = pd.read_csv(filepath_or_buffer='ytest.csv')

# preview the first rows of the training target
y_train.head()

Unnamed: 0,median_ces
0,13.122365
1,12.506177
2,12.18638
3,11.685197
4,12.323856


In [4]:
# Wrap a Lasso (L1-penalised) regression in a feature selector.
# random_state is pinned to 0 for reproducibility; per the scikit-learn docs it
# is only consulted when Lasso runs with selection='random'.
sel_ = SelectFromModel(
    estimator=Lasso(alpha=0.001, random_state=0),
)

# fit the Lasso on the training data so the selector can flag the features to keep
sel_.fit(X=X_train, y=y_train)

In [5]:
# number of features the selector kept (True entries in the support mask)
np.sum(sel_.get_support())

10

In [6]:
# Show the boolean support mask, one entry per column of X_train in order:
# True marks a feature the selector kept, False one it dropped.

sel_.get_support()

array([ True,  True,  True,  True, False,  True,  True,  True, False,
        True, False,  True,  True, False, False, False, False, False])

In [7]:
# Summarise the selection: total, kept, and zeroed-out feature counts.

# names of the columns the selector kept
support_mask = sel_.get_support()
selected_feats = X_train.columns[support_mask]

# gather the counts first, then report them
n_total = X_train.shape[1]
n_selected = len(selected_feats)
n_zero_coef = np.sum(sel_.estimator_.coef_ == 0)

print('total features: {}'.format(n_total))
print('selected features: {}'.format(n_selected))
print('features with coefficients shrank to zero: {}'.format(n_zero_coef))

total features: 18
selected features: 10
features with coefficients shrank to zero: 8


In [8]:
# display the names of the selected features (a pandas Index of column labels)
selected_feats

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'population', 'households', 'median_income', 'bedrooms_per_room',
       'ocean_proximity_<1H OCEAN', 'ocean_proximity_INLAND'],
      dtype='object')

In [9]:
# Persist the selected feature names to features.txt, one name per line.
# writelines over a generator streams each line to the file without building
# a throwaway list of write() return values.
with open("features.txt", "w") as feature_file:
    feature_file.writelines(str(feat) + "\n" for feat in selected_feats)