In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [3]:
data = pd.read_csv('train_modified.csv')

In [4]:
data.head()

Unnamed: 0,Id,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,...,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,LotFrontagenan,MasVnrAreanan,GarageYrBltnan
0,1,12.247694,0.235294,0.75,0.418208,0.366344,1.0,1.0,0.0,0.333333,...,1.0,1.0,0.0,0.090909,0.5,0.666667,0.75,0.0,0.0,0.0
1,2,12.109011,0.0,0.75,0.495064,0.391317,1.0,1.0,0.0,0.333333,...,1.0,1.0,0.0,0.363636,0.25,0.666667,0.75,0.0,0.0,0.0
2,3,12.317167,0.235294,0.75,0.434909,0.422359,1.0,1.0,0.333333,0.333333,...,1.0,1.0,0.0,0.727273,0.5,0.666667,0.75,0.0,0.0,0.0
3,4,11.849398,0.294118,0.75,0.388581,0.390295,1.0,1.0,0.333333,0.333333,...,1.0,1.0,0.0,0.090909,0.0,0.666667,0.0,0.0,0.0,0.0
4,5,12.429216,0.235294,0.75,0.513123,0.468761,1.0,1.0,0.333333,0.333333,...,1.0,1.0,0.0,1.0,0.5,0.666667,0.75,0.0,0.0,0.0


In [5]:
X = data.drop(['Id','SalePrice'],axis=1)
y = data['SalePrice']

In [6]:
# Splitting dataset for validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [7]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [8]:
### Apply Feature Selection
# first, I specify the Lasso Regression model, and I
# select a suitable alpha (equivalent of penalty).
# The bigger the alpha the less features that will be selected.

# Then I use the selectFromModel object from sklearn, which
# will select the features which coefficients are non-zero

feature_sel_model = SelectFromModel(Lasso(alpha=0.005, random_state=0)) # remember to set the seed, the random state in this function
feature_sel_model.fit(X_train, y_train)

SelectFromModel(estimator=Lasso(alpha=0.005, copy_X=True, fit_intercept=True,
                                max_iter=1000, normalize=False, positive=False,
                                precompute=False, random_state=0,
                                selection='cyclic', tol=0.0001,
                                warm_start=False),
                max_features=None, norm_order=1, prefit=False, threshold=None)

In [9]:
feature_sel_model.get_support()

array([ True,  True, False, False, False, False,  True, False, False,
       False, False,  True, False, False, False, False,  True,  True,
       False,  True,  True, False, False, False, False, False, False,
       False, False,  True, False,  True, False, False, False, False,
       False, False, False,  True,  True, False,  True, False, False,
        True,  True, False, False, False, False, False,  True, False,
       False,  True,  True,  True, False,  True,  True,  True, False,
        True,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
       False])

In [10]:
# let's print the number of total and selected features

# this is how we can make a list of the selected features
selected_feat = X_train.columns[(feature_sel_model.get_support())]

# let's print some stats
print('total features: {}'.format((X_train.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
print('features with coefficients shrank to zero: {}'.format(X_train.shape[1]-len(selected_feat)))

total features: 82
selected features: 25
features with coefficients shrank to zero: 57


In [11]:
selected_feat

Index(['MSSubClass', 'MSZoning', 'LotShape', 'Neighborhood', 'OverallQual',
       'OverallCond', 'YearRemodAdd', 'RoofStyle', 'BsmtQual', 'BsmtExposure',
       'HeatingQC', 'CentralAir', '1stFlrSF', 'GrLivArea', 'BsmtFullBath',
       'KitchenQual', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageCond', 'PavedDrive',
       'SaleCondition'],
      dtype='object')

In [12]:
X_train=X_train[selected_feat]

In [13]:
X_test=X_test[selected_feat]

In [14]:
regressor = LinearRegression()

In [15]:
regressor.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [16]:
#To retrieve the intercept:
print(regressor.intercept_)

10.064897859115126


In [17]:
#For retrieving the slope:
print(regressor.coef_)

[-0.06941323  0.12799298  0.0601533   0.23248659  0.54491702  0.3272563
 -0.02173207  0.02270173  0.17800935  0.07560025  0.05017014  0.07088214
  0.27132963  1.09166027  0.1910211   0.09956717  0.09468351 -0.005944
  0.0098844   0.0351538   0.10092633  0.12330622  0.02025892  0.03655594
  0.11240912]


In [18]:
y_pred = regressor.predict(X_test)

In [19]:
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df

Unnamed: 0,Actual,Predicted
529,12.209188,12.301561
491,11.798104,12.002838
459,11.608236,11.723759
279,12.165251,12.298491
655,11.385092,11.429639
...,...,...
326,12.688499,12.497057
440,13.226723,12.992400
1387,11.820410,11.999524
1323,11.320554,11.320286


In [20]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 0.10113642233111274
Mean Squared Error: 0.02343663347734811
Root Mean Squared Error: 0.15309027884666


In [21]:
test_data = pd.read_csv('test_modified.csv')

In [22]:
test_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,MiscVal,MoSold,YrSold,SaleType,SaleCondition,LotFrontagenan,MasVnrAreanan,BsmtFullBathnan,BsmtHalfBathnan,GarageYrBltnan
0,1461,0.0,0.25,0.593445,0.56636,1.0,0.5,0.666667,0.0,1.0,...,0.0,0.454545,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0
1,1462,0.0,0.5,0.598957,0.622527,1.0,0.5,0.333333,0.0,1.0,...,0.735294,0.454545,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0
2,1463,0.235294,0.5,0.558854,0.614005,1.0,0.5,0.333333,0.0,1.0,...,0.0,0.181818,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0
3,1464,0.235294,0.5,0.582212,0.524583,1.0,0.5,0.333333,0.0,1.0,...,0.0,0.454545,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0
4,1465,0.588235,0.5,0.317987,0.335596,1.0,0.5,0.333333,0.666667,1.0,...,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
X_test2 = test_data.drop(['Id'],axis=1)

In [24]:
X_test2

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,MiscVal,MoSold,YrSold,SaleType,SaleCondition,LotFrontagenan,MasVnrAreanan,BsmtFullBathnan,BsmtHalfBathnan,GarageYrBltnan
0,0.000000,0.25,0.593445,0.566360,1.0,0.5,0.666667,0.000000,1.0,0.50,...,0.000000,0.454545,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.50,0.598957,0.622527,1.0,0.5,0.333333,0.000000,1.0,0.25,...,0.735294,0.454545,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0
2,0.235294,0.50,0.558854,0.614005,1.0,0.5,0.333333,0.000000,1.0,0.50,...,0.000000,0.181818,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0
3,0.235294,0.50,0.582212,0.524583,1.0,0.5,0.333333,0.000000,1.0,0.50,...,0.000000,0.454545,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0
4,0.588235,0.50,0.317987,0.335596,1.0,0.5,0.333333,0.666667,1.0,0.50,...,0.000000,0.000000,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,0.823529,0.75,0.000000,0.075426,1.0,0.5,0.666667,0.000000,1.0,0.50,...,0.000000,0.454545,0.0,0.5,0.0,0.0,0.0,0.0,0.0,1.0
1455,0.823529,0.75,0.000000,0.069418,1.0,0.5,0.666667,0.000000,1.0,0.50,...,0.000000,0.272727,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0
1456,0.000000,0.50,0.900992,0.715051,1.0,0.5,0.666667,0.000000,1.0,0.50,...,0.000000,0.727273,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0
1457,0.382353,0.50,0.480351,0.537007,1.0,0.5,0.666667,0.000000,1.0,0.50,...,0.041176,0.545455,0.0,0.5,0.0,0.0,0.0,0.0,0.0,1.0


In [25]:
X_test2 = X_test2[selected_feat]

In [26]:
X_test2

Unnamed: 0,MSSubClass,MSZoning,LotShape,Neighborhood,OverallQual,OverallCond,YearRemodAdd,RoofStyle,BsmtQual,BsmtExposure,...,KitchenQual,Fireplaces,FireplaceQu,GarageType,GarageFinish,GarageCars,GarageArea,GarageCond,PavedDrive,SaleCondition
0,0.000000,0.25,0.666667,0.45,0.444444,0.625,0.822581,0.5,0.75,0.00,...,0.75,0.00,0.6,0.333333,0.666667,0.2,0.490591,0.000000,0.5,0.0
1,0.000000,0.50,0.333333,0.45,0.555556,0.625,0.870968,0.0,0.75,0.00,...,0.50,0.00,0.6,0.333333,0.666667,0.2,0.209677,0.000000,0.5,0.0
2,0.235294,0.50,0.333333,0.00,0.444444,0.500,0.225806,0.5,0.25,0.00,...,0.75,0.25,0.2,0.333333,0.000000,0.4,0.323925,0.000000,0.5,0.0
3,0.235294,0.50,0.333333,0.00,0.555556,0.625,0.225806,0.5,0.75,0.00,...,0.50,0.25,0.0,0.333333,0.000000,0.4,0.315860,0.000000,0.5,0.0
4,0.588235,0.50,0.333333,0.55,0.777778,0.500,0.322581,0.5,0.25,0.00,...,0.50,0.00,0.6,0.333333,0.333333,0.4,0.340054,0.000000,0.5,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,0.823529,0.75,0.666667,1.00,0.333333,0.750,0.612903,0.5,0.75,0.00,...,0.75,0.00,0.6,0.666667,1.000000,0.0,0.000000,0.333333,0.5,0.0
1455,0.823529,0.75,0.666667,1.00,0.333333,0.500,0.612903,0.5,0.75,0.00,...,0.75,0.00,0.6,0.833333,0.666667,0.2,0.192204,0.000000,0.5,0.5
1456,0.000000,0.50,0.666667,0.70,0.444444,0.750,0.193548,0.5,0.75,0.00,...,0.75,0.25,0.2,0.500000,0.666667,0.4,0.387097,0.000000,0.5,0.5
1457,0.382353,0.50,0.666667,0.70,0.444444,0.500,0.258065,0.5,0.25,0.75,...,0.75,0.00,0.6,0.666667,1.000000,0.0,0.000000,0.333333,0.5,0.0


In [27]:
np.isnan(X_test2)

Unnamed: 0,MSSubClass,MSZoning,LotShape,Neighborhood,OverallQual,OverallCond,YearRemodAdd,RoofStyle,BsmtQual,BsmtExposure,...,KitchenQual,Fireplaces,FireplaceQu,GarageType,GarageFinish,GarageCars,GarageArea,GarageCond,PavedDrive,SaleCondition
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1455,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1456,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1457,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [28]:
np.where(np.isnan(X_test2))

(array([1116, 1116], dtype=int64), array([20, 21], dtype=int64))

In [29]:
X_test2 = np.nan_to_num(X_test2)

In [30]:
X_test2

array([[0.        , 0.25      , 0.66666667, ..., 0.        , 0.5       ,
        0.        ],
       [0.        , 0.5       , 0.33333333, ..., 0.        , 0.5       ,
        0.        ],
       [0.23529412, 0.5       , 0.33333333, ..., 0.        , 0.5       ,
        0.        ],
       ...,
       [0.        , 0.5       , 0.66666667, ..., 0.        , 0.5       ,
        0.5       ],
       [0.38235294, 0.5       , 0.66666667, ..., 0.33333333, 0.5       ,
        0.        ],
       [0.23529412, 0.5       , 0.66666667, ..., 0.        , 0.5       ,
        0.        ]])

In [31]:
y_pred2=regressor.predict(X_test2)

In [32]:
y_pred2

array([11.45025586, 11.66344213, 11.51362836, ..., 11.90564567,
       11.43677815, 11.96691529])

In [33]:
y_pred2 = np.exp(y_pred2)

In [34]:
y_pred2

array([ 93925.37493867, 116243.46773661, 100070.31445654, ...,
       148100.39490516,  92667.96865854, 157458.19818199])

In [36]:
output = pd.DataFrame({'Id': test_data['Id'], 'Saleprice': y_pred2})
output.to_csv('LR1.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
