In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## for feature selection

from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel


## visualise all the columns in the dataset

In [2]:
# Load the training table from the file 'X_training' (parsed as CSV).
dataset = pd.read_csv('X_training')

In [3]:
# Preview the first rows to sanity-check the loaded columns.
dataset.head()

Unnamed: 0,Id,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,...,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,LotFrontagenan,MasVnrAreanan,GarageYrBltnan
0,1,12.247694,0.235294,0.0,0.418208,0.366344,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.090909,0.5,1.0,0.0,0.0,0.0,0.0
1,2,12.109011,0.0,0.0,0.495064,0.391317,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.363636,0.25,1.0,0.0,0.0,0.0,0.0
2,3,12.317167,0.235294,0.0,0.434909,0.422359,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.727273,0.5,1.0,0.0,0.0,0.0,0.0
3,4,11.849398,0.294118,0.0,0.388581,0.390295,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.090909,0.0,1.0,1.0,0.0,0.0,0.0
4,5,12.429216,0.235294,0.0,0.513123,0.468761,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.5,1.0,0.0,0.0,0.0,0.0


In [4]:
# Regression target: the 'SalePrice' column of the training table.
# NOTE(review): the previewed values (~12) look log-transformed rather than raw prices — confirm upstream.
y_train = dataset['SalePrice']

In [5]:
# Feature matrix: every column of the training table except 'SalePrice' (the target) and 'Id'.
x_train = dataset.drop(columns=['SalePrice', 'Id'])

In [6]:
#### Feature selection with an L1-penalised (Lasso) linear model

# Lasso shrinks the coefficients of weak predictors to zero; a smaller alpha
# penalises less and therefore keeps more features.
# random_state is pinned so the fit is repeatable; use the same settings when
# preparing the prediction (test) dataset.
lasso_estimator = Lasso(alpha=0.005, random_state=0)

# SelectFromModel keeps the features whose fitted coefficients are (effectively) non-zero.
feature_sel_model = SelectFromModel(estimator=lasso_estimator)
feature_sel_model.fit(x_train, y_train)


SelectFromModel(estimator=Lasso(alpha=0.005, random_state=0))

In [7]:
# Boolean mask aligned with x_train's columns: True marks a feature the selector kept.
feature_sel_model.get_support()

array([False,  True, False, False, False, False,  True, False, False,
       False, False, False,  True, False,  True, False,  True, False,
        True,  True, False, False, False, False, False, False,  True,
       False, False, False, False, False,  True, False, False, False,
       False, False, False,  True,  True, False,  True, False, False,
        True,  True, False,  True, False, False, False,  True, False,
        True,  True, False,  True, False,  True,  True, False, False,
        True,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False])

In [8]:
# Show the selector's full configuration, including the wrapped Lasso's hyper-parameters.
feature_sel_model.get_params()

{'estimator__alpha': 0.005,
 'estimator__copy_X': True,
 'estimator__fit_intercept': True,
 'estimator__max_iter': 1000,
 'estimator__normalize': False,
 'estimator__positive': False,
 'estimator__precompute': False,
 'estimator__random_state': 0,
 'estimator__selection': 'cyclic',
 'estimator__tol': 0.0001,
 'estimator__warm_start': False,
 'estimator': Lasso(alpha=0.005, random_state=0),
 'max_features': None,
 'norm_order': 1,
 'prefit': False,
 'threshold': None}

In [9]:
## Summarise how many features were kept versus dropped

# column labels picked out by the selector's boolean mask
select_feature = x_train.columns[feature_sel_model.get_support()]

# report the counts
print(f'Total features: {x_train.shape[1]}')
print(f'Total selected features: {len(select_feature)}')
print(f'Features with cofficent shrink to zero: {(feature_sel_model.estimator_.coef_ == 0).sum()}')

Total features: 82
Total selected features: 23
Features with cofficent shrink to zero: 59


In [10]:
# Restrict the training matrix to the columns the selector retained.
x_train = x_train.loc[:, select_feature]

In [11]:
# Display the reduced training matrix (selected features only).
x_train

Unnamed: 0,MSZoning,LotShape,Condition1,BldgType,OverallQual,YearBuilt,YearRemodAdd,ExterQual,BsmtFinType1,HeatingQC,...,BsmtFullBath,FullBath,KitchenQual,Functional,Fireplaces,GarageType,GarageFinish,GarageCars,GarageCond,PavedDrive
0,0.0,1.0,0.0,0.0,0.666667,0.036765,0.098361,0.0,0.50,0.000000,...,0.333333,0.666667,0.0,1.0,0.000000,0.0,0.333333,0.50,1.0,1.0
1,0.0,1.0,1.0,0.0,0.555556,0.227941,0.524590,1.0,0.00,0.000000,...,0.000000,0.666667,1.0,1.0,0.333333,0.0,0.333333,0.50,1.0,1.0
2,0.0,0.0,0.0,0.0,0.666667,0.051471,0.114754,0.0,0.50,0.000000,...,0.333333,0.666667,0.0,1.0,0.333333,0.0,0.333333,0.50,1.0,1.0
3,0.0,0.0,0.0,0.0,0.666667,0.669118,0.606557,1.0,0.00,0.333333,...,0.333333,0.333333,0.0,1.0,0.333333,0.5,1.000000,0.75,1.0,1.0
4,0.0,0.0,0.0,0.0,0.777778,0.058824,0.147541,0.0,0.50,0.000000,...,0.333333,0.666667,0.0,1.0,0.333333,0.0,0.333333,0.75,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,0.0,1.0,0.0,0.0,0.555556,0.058824,0.131148,1.0,1.00,0.000000,...,0.000000,0.666667,1.0,1.0,0.333333,0.0,0.333333,0.50,1.0,1.0
1456,0.0,1.0,0.0,0.0,0.555556,0.235294,0.377049,1.0,0.00,1.000000,...,0.333333,0.666667,1.0,0.0,0.666667,0.0,1.000000,0.50,1.0,1.0
1457,0.0,1.0,0.0,0.0,0.666667,0.507353,0.081967,0.5,0.50,0.000000,...,0.000000,0.666667,0.0,1.0,0.666667,0.0,0.333333,0.25,1.0,1.0
1458,0.0,1.0,0.0,0.0,0.444444,0.441176,0.245902,1.0,0.50,0.333333,...,0.333333,0.333333,0.0,1.0,0.000000,0.0,1.000000,0.25,1.0,1.0


In [12]:
# Load the table to predict on from the file 'X_testing' (parsed as CSV).
df_test = pd.read_csv('X_testing')

In [13]:
# Preview the first rows of the test table.
df_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,MiscVal,MoSold,YrSold,SaleType,SaleCondition,LotFrontagenan,MasVnrAreanan,BsmtFullBathnan,BsmtHalfBathnan,GarageYrBltnan
0,1461,0.0,0.6,0.593445,0.56636,1.0,0.5,1.0,1.0,0.0,...,0.0,0.454545,1.0,1.0,0.8,0.0,0.0,0.0,0.0,0.0
1,1462,0.0,0.8,0.598957,0.622527,1.0,0.5,0.0,1.0,0.0,...,0.735294,0.454545,1.0,1.0,0.8,0.0,0.0,0.0,0.0,0.0
2,1463,0.235294,0.8,0.558854,0.614005,1.0,0.5,0.0,1.0,0.0,...,0.0,0.181818,1.0,1.0,0.8,0.0,0.0,0.0,0.0,0.0
3,1464,0.235294,0.8,0.582212,0.524583,1.0,0.5,0.0,1.0,0.0,...,0.0,0.454545,1.0,1.0,0.8,0.0,0.0,0.0,0.0,0.0
4,1465,0.588235,0.8,0.317987,0.335596,1.0,0.5,0.0,0.333333,0.0,...,0.0,0.0,1.0,1.0,0.8,0.0,0.0,0.0,0.0,0.0


In [14]:
# Fit an ordinary least-squares linear regression on the selected features.
from sklearn.linear_model import LinearRegression
model_1 = LinearRegression()
# fit() returns the estimator itself, so the cell echoes its repr.
model_1.fit(x_train, y_train)

LinearRegression()

In [15]:
# R^2 computed on the same data the model was fitted on (in-sample), so it does
# not measure performance on unseen rows.
model_1.score(x_train, y_train)

0.8735469555514143

In [16]:
# Take from the test table the same feature columns, in the same order, that the model was trained on.
# NOTE(review): despite its name, y_test holds input features, not target values.
y_test = df_test[x_train.columns]
y_test

Unnamed: 0,MSZoning,LotShape,Condition1,BldgType,OverallQual,YearBuilt,YearRemodAdd,ExterQual,BsmtFinType1,HeatingQC,...,BsmtFullBath,FullBath,KitchenQual,Functional,Fireplaces,GarageType,GarageFinish,GarageCars,GarageCond,PavedDrive
0,0.6,1.0,0.125,0.00,0.444444,0.384615,0.822581,1.000000,0.833333,1.0,...,0.000000,0.25,1.0,1.0,0.00,0.166667,1.000000,0.2,1.0,1.0
1,0.8,0.0,0.250,0.00,0.555556,0.407692,0.870968,1.000000,0.000000,1.0,...,0.000000,0.25,0.5,1.0,0.00,0.166667,1.000000,0.2,1.0,1.0
2,0.8,0.0,0.250,0.00,0.444444,0.107692,0.225806,1.000000,0.333333,0.5,...,0.000000,0.50,1.0,1.0,0.25,0.166667,0.000000,0.4,1.0,1.0
3,0.8,0.0,0.250,0.00,0.555556,0.100000,0.225806,1.000000,0.333333,0.0,...,0.000000,0.50,0.5,1.0,0.25,0.166667,0.000000,0.4,1.0,1.0
4,0.8,0.0,0.250,1.00,0.777778,0.146154,0.322581,0.666667,0.000000,0.0,...,0.000000,0.50,0.5,1.0,0.00,0.166667,0.666667,0.4,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,1.0,1.0,0.250,0.75,0.333333,0.284615,0.612903,1.000000,1.000000,0.5,...,0.000000,0.25,1.0,1.0,0.00,1.000000,0.333333,0.0,0.6,1.0
1455,1.0,1.0,0.250,1.00,0.333333,0.284615,0.612903,1.000000,0.833333,1.0,...,0.000000,0.25,1.0,1.0,0.00,0.666667,1.000000,0.2,1.0,1.0
1456,0.8,1.0,0.250,0.00,0.444444,0.361538,0.193548,1.000000,0.000000,0.0,...,0.333333,0.25,1.0,1.0,0.25,0.833333,1.000000,0.4,1.0,1.0
1457,0.8,1.0,0.250,0.00,0.444444,0.115385,0.258065,1.000000,0.333333,1.0,...,0.000000,0.25,1.0,1.0,0.00,1.000000,0.333333,0.0,0.6,1.0


In [21]:
# Predict for every test row; outputs are on the same scale as y_train.
# NOTE(review): if SalePrice was log-transformed upstream, these need inverting before being used as prices — confirm.
model_1_pred = model_1.predict(y_test)

In [22]:
# Inspect the predicted values.
model_1_pred

array([11.42979477, 11.79418268, 11.95215873, ..., 11.90073553,
       11.53840036, 12.25045562])

In [24]:
# (rows, columns) of the test table.
df_test.shape

(1459, 85)

In [25]:
# Load 'test.csv'; its rows are later placed side by side with the predictions.
# NOTE(review): presumably the original, unprocessed test set in the same row order as 'X_testing' — confirm.
i = pd.read_csv('test.csv')

In [42]:
# Build the prediction column here instead of relying on `j` already existing:
# in top-to-bottom order no earlier cell defines `j`, so a fresh
# "Restart & Run All" would otherwise fail here with a NameError.
j = pd.DataFrame(model_1_pred, columns=['SalePrice_Prediction'])

# Place the predictions next to the test rows; axis=1 aligns on the row index.
# NOTE(review): assumes `i` has a default 0..n-1 index with one row per prediction — confirm.
z = pd.concat([i, j], axis=1)

In [38]:
# One-column frame holding the model's predictions, labelled 'SalePrice_Prediction'.
j = pd.DataFrame({'SalePrice_Prediction': model_1_pred})

In [39]:
# Display the prediction frame.
j

Unnamed: 0,SalePrice_Prediction
0,11.429795
1,11.794183
2,11.952159
3,12.049211
4,12.064648
...,...
1454,11.269068
1455,11.290195
1456,11.900736
1457,11.538400


In [43]:
# Save the test rows together with their predictions. index=False keeps the
# row index out of the file, so re-reading it does not produce a spurious
# unnamed first column.
z.to_csv('pred_1.csv', index=False)