In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [36]:
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [37]:
data = pd.read_csv('train_modified.csv')

In [38]:
data.head()

Unnamed: 0,Id,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,...,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,LotFrontagenan,MasVnrAreanan,GarageYrBltnan
0,1,12.247694,0.235294,0.75,0.418208,0.366344,1.0,1.0,0.0,0.333333,...,1.0,1.0,0.0,0.090909,0.5,0.666667,0.75,0.0,0.0,0.0
1,2,12.109011,0.0,0.75,0.495064,0.391317,1.0,1.0,0.0,0.333333,...,1.0,1.0,0.0,0.363636,0.25,0.666667,0.75,0.0,0.0,0.0
2,3,12.317167,0.235294,0.75,0.434909,0.422359,1.0,1.0,0.333333,0.333333,...,1.0,1.0,0.0,0.727273,0.5,0.666667,0.75,0.0,0.0,0.0
3,4,11.849398,0.294118,0.75,0.388581,0.390295,1.0,1.0,0.333333,0.333333,...,1.0,1.0,0.0,0.090909,0.0,0.666667,0.0,0.0,0.0,0.0
4,5,12.429216,0.235294,0.75,0.513123,0.468761,1.0,1.0,0.333333,0.333333,...,1.0,1.0,0.0,1.0,0.5,0.666667,0.75,0.0,0.0,0.0


In [39]:
X = data.drop(['Id','SalePrice'],axis=1)
y = data['SalePrice']

In [40]:
# Splitting dataset for validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [41]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [42]:
### Apply Feature Selection
# first, I specify the Lasso Regression model, and I
# select a suitable alpha (equivalent of penalty).
# The bigger the alpha the less features that will be selected.

# Then I use the selectFromModel object from sklearn, which
# will select the features which coefficients are non-zero

feature_sel_model = SelectFromModel(Lasso(alpha=0.005, random_state=0)) # remember to set the seed, the random state in this function
feature_sel_model.fit(X_train, y_train)

SelectFromModel(estimator=Lasso(alpha=0.005, copy_X=True, fit_intercept=True,
                                max_iter=1000, normalize=False, positive=False,
                                precompute=False, random_state=0,
                                selection='cyclic', tol=0.0001,
                                warm_start=False),
                max_features=None, norm_order=1, prefit=False, threshold=None)

In [43]:
feature_sel_model.get_support()

array([ True,  True, False, False, False, False,  True, False, False,
       False, False,  True, False, False, False, False,  True,  True,
       False,  True,  True, False, False, False, False, False, False,
       False, False,  True, False,  True, False, False, False, False,
       False, False, False,  True,  True, False,  True, False, False,
        True,  True, False, False, False, False, False,  True, False,
       False,  True,  True,  True, False,  True,  True,  True, False,
        True,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
       False])

In [44]:
# let's print the number of total and selected features

# this is how we can make a list of the selected features
selected_feat = X_train.columns[(feature_sel_model.get_support())]

# let's print some stats
print('total features: {}'.format((X_train.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
print('features with coefficients shrank to zero: {}'.format(X_train.shape[1]-len(selected_feat)))

total features: 82
selected features: 25
features with coefficients shrank to zero: 57


In [45]:
selected_feat

Index(['MSSubClass', 'MSZoning', 'LotShape', 'Neighborhood', 'OverallQual',
       'OverallCond', 'YearRemodAdd', 'RoofStyle', 'BsmtQual', 'BsmtExposure',
       'HeatingQC', 'CentralAir', '1stFlrSF', 'GrLivArea', 'BsmtFullBath',
       'KitchenQual', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageCond', 'PavedDrive',
       'SaleCondition'],
      dtype='object')

In [46]:
X_train=X_train[selected_feat]

In [47]:
X_test=X_test[selected_feat]

In [48]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor()

In [49]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [50]:
from sklearn.model_selection import RandomizedSearchCV

In [51]:
# Set up the random search with 4-fold cross validation
random_cv = RandomizedSearchCV(estimator=regressor,
            param_distributions=random_grid,
            cv=5, n_iter=50,
            scoring = 'neg_mean_absolute_error',n_jobs = 4,
            verbose = 5, 
            return_train_score = True,
            random_state=42)

In [52]:
random_cv.fit(X_train,y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:   13.8s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:  2.2min
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:  6.9min
[Parallel(n_jobs=4)]: Done 250 out of 250 | elapsed:  8.8min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators='warn',
                                                   n_jobs=None, oob_score=False,
                                                   random_sta...


In [53]:
random_cv.best_estimator_, random_cv.best_params_

(RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=60,
                       max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=600,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False),
 {'n_estimators': 600,
  'min_samples_split': 5,
  'min_samples_leaf': 1,
  'max_features': 'sqrt',
  'max_depth': 60,
  'bootstrap': False})

In [54]:
regressor=RandomForestRegressor(n_estimators=1000,min_samples_split=2,min_samples_leaf=1,max_features='sqrt',max_depth=110,bootstrap=True)

In [55]:
regressor.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=110,
                      max_features='sqrt', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=1000,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [56]:
y_pred = regressor.predict(X_test)

In [57]:
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df

Unnamed: 0,Actual,Predicted
529,12.209188,12.281820
491,11.798104,11.867800
459,11.608236,11.701288
279,12.165251,12.289686
655,11.385092,11.426706
...,...,...
326,12.688499,12.425288
440,13.226723,12.993792
1387,11.820410,11.976297
1323,11.320554,11.245848


In [58]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 0.0915640691669946
Mean Squared Error: 0.017397260606886355
Root Mean Squared Error: 0.131898675531206


In [83]:
test_data = pd.read_csv('test_modified.csv')

In [84]:
test_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,MiscVal,MoSold,YrSold,SaleType,SaleCondition,LotFrontagenan,MasVnrAreanan,BsmtFullBathnan,BsmtHalfBathnan,GarageYrBltnan
0,1461,0.0,0.25,0.593445,0.56636,1.0,0.5,0.666667,0.0,1.0,...,0.0,0.454545,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0
1,1462,0.0,0.5,0.598957,0.622527,1.0,0.5,0.333333,0.0,1.0,...,0.735294,0.454545,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0
2,1463,0.235294,0.5,0.558854,0.614005,1.0,0.5,0.333333,0.0,1.0,...,0.0,0.181818,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0
3,1464,0.235294,0.5,0.582212,0.524583,1.0,0.5,0.333333,0.0,1.0,...,0.0,0.454545,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0
4,1465,0.588235,0.5,0.317987,0.335596,1.0,0.5,0.333333,0.666667,1.0,...,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0


In [85]:
X_test2 = test_data.drop(['Id'],axis=1)

In [86]:
#X_test2.shape, X_train.shape
#X_test2 = X_test2[best_feat]
X_test2

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,MiscVal,MoSold,YrSold,SaleType,SaleCondition,LotFrontagenan,MasVnrAreanan,BsmtFullBathnan,BsmtHalfBathnan,GarageYrBltnan
0,0.000000,0.25,0.593445,0.566360,1.0,0.5,0.666667,0.000000,1.0,0.50,...,0.000000,0.454545,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.50,0.598957,0.622527,1.0,0.5,0.333333,0.000000,1.0,0.25,...,0.735294,0.454545,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0
2,0.235294,0.50,0.558854,0.614005,1.0,0.5,0.333333,0.000000,1.0,0.50,...,0.000000,0.181818,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0
3,0.235294,0.50,0.582212,0.524583,1.0,0.5,0.333333,0.000000,1.0,0.50,...,0.000000,0.454545,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0
4,0.588235,0.50,0.317987,0.335596,1.0,0.5,0.333333,0.666667,1.0,0.50,...,0.000000,0.000000,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,0.823529,0.75,0.000000,0.075426,1.0,0.5,0.666667,0.000000,1.0,0.50,...,0.000000,0.454545,0.0,0.5,0.0,0.0,0.0,0.0,0.0,1.0
1455,0.823529,0.75,0.000000,0.069418,1.0,0.5,0.666667,0.000000,1.0,0.50,...,0.000000,0.272727,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0
1456,0.000000,0.50,0.900992,0.715051,1.0,0.5,0.666667,0.000000,1.0,0.50,...,0.000000,0.727273,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0
1457,0.382353,0.50,0.480351,0.537007,1.0,0.5,0.666667,0.000000,1.0,0.50,...,0.041176,0.545455,0.0,0.5,0.0,0.0,0.0,0.0,0.0,1.0


In [87]:
X_test2 = X_test2[selected_feat]

In [88]:
X_test2

Unnamed: 0,MSSubClass,MSZoning,LotShape,Neighborhood,OverallQual,OverallCond,YearRemodAdd,RoofStyle,BsmtQual,BsmtExposure,...,KitchenQual,Fireplaces,FireplaceQu,GarageType,GarageFinish,GarageCars,GarageArea,GarageCond,PavedDrive,SaleCondition
0,0.000000,0.25,0.666667,0.45,0.444444,0.625,0.822581,0.5,0.75,0.00,...,0.75,0.00,0.6,0.333333,0.666667,0.2,0.490591,0.000000,0.5,0.0
1,0.000000,0.50,0.333333,0.45,0.555556,0.625,0.870968,0.0,0.75,0.00,...,0.50,0.00,0.6,0.333333,0.666667,0.2,0.209677,0.000000,0.5,0.0
2,0.235294,0.50,0.333333,0.00,0.444444,0.500,0.225806,0.5,0.25,0.00,...,0.75,0.25,0.2,0.333333,0.000000,0.4,0.323925,0.000000,0.5,0.0
3,0.235294,0.50,0.333333,0.00,0.555556,0.625,0.225806,0.5,0.75,0.00,...,0.50,0.25,0.0,0.333333,0.000000,0.4,0.315860,0.000000,0.5,0.0
4,0.588235,0.50,0.333333,0.55,0.777778,0.500,0.322581,0.5,0.25,0.00,...,0.50,0.00,0.6,0.333333,0.333333,0.4,0.340054,0.000000,0.5,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,0.823529,0.75,0.666667,1.00,0.333333,0.750,0.612903,0.5,0.75,0.00,...,0.75,0.00,0.6,0.666667,1.000000,0.0,0.000000,0.333333,0.5,0.0
1455,0.823529,0.75,0.666667,1.00,0.333333,0.500,0.612903,0.5,0.75,0.00,...,0.75,0.00,0.6,0.833333,0.666667,0.2,0.192204,0.000000,0.5,0.5
1456,0.000000,0.50,0.666667,0.70,0.444444,0.750,0.193548,0.5,0.75,0.00,...,0.75,0.25,0.2,0.500000,0.666667,0.4,0.387097,0.000000,0.5,0.5
1457,0.382353,0.50,0.666667,0.70,0.444444,0.500,0.258065,0.5,0.25,0.75,...,0.75,0.00,0.6,0.666667,1.000000,0.0,0.000000,0.333333,0.5,0.0


In [89]:
np.isnan(X_test2)

Unnamed: 0,MSSubClass,MSZoning,LotShape,Neighborhood,OverallQual,OverallCond,YearRemodAdd,RoofStyle,BsmtQual,BsmtExposure,...,KitchenQual,Fireplaces,FireplaceQu,GarageType,GarageFinish,GarageCars,GarageArea,GarageCond,PavedDrive,SaleCondition
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1455,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1456,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1457,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [90]:
np.where(np.isnan(X_test2))

(array([1116, 1116], dtype=int64), array([20, 21], dtype=int64))

In [91]:
X_test2 = np.nan_to_num(X_test2)

In [92]:
X_test2

array([[0.        , 0.25      , 0.66666667, ..., 0.        , 0.5       ,
        0.        ],
       [0.        , 0.5       , 0.33333333, ..., 0.        , 0.5       ,
        0.        ],
       [0.23529412, 0.5       , 0.33333333, ..., 0.        , 0.5       ,
        0.        ],
       ...,
       [0.        , 0.5       , 0.66666667, ..., 0.        , 0.5       ,
        0.5       ],
       [0.38235294, 0.5       , 0.66666667, ..., 0.33333333, 0.5       ,
        0.        ],
       [0.23529412, 0.5       , 0.66666667, ..., 0.        , 0.5       ,
        0.        ]])

In [93]:
y_pred2=regressor.predict(X_test2)

In [94]:
y_pred2

array([11.70199628, 11.7419769 , 11.77304992, ..., 11.89544485,
       11.68780908, 12.12068358])

In [95]:
y_pred2 = np.exp(y_pred2)

In [96]:
y_pred2

array([120812.64969593, 125740.67170616, 129709.15029292, ...,
       146597.32949915, 119110.75789185, 183630.99895245])

In [98]:
output = pd.DataFrame({'Id': test_data['Id'], 'Saleprice': y_pred2})
output.to_csv('RFR2.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
