# **Notebook for Home Price Predictions** #


<table align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/ageron/handson-ml/blob/master/01_the_machine_learning_landscape.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
</table>

# **Setup**

In [None]:
from google.colab import drive

# Mount point used for Google Drive; echoed so the path shows up in the
# cell output before the mount happens.
ROOT = "/content/drive"
print(ROOT)

# Attach Google Drive at ROOT.
drive.mount(ROOT)

/content/drive
Mounted at /content/drive


In [None]:
%cd drive/MyDrive/kaggle-home-prices/

/content/drive/MyDrive/kaggle-home-prices


# **Load Data**

In [None]:
import os

import numpy as np # linear algebra
import pandas as pd

# Walk data/ recursively, print every file found, and load train.csv and
# test.csv when they are encountered (a later match overrides an earlier one).
loaded = {}
for dirname, _, filenames in os.walk('data/'):
    for filename in filenames:
        path = os.path.join(dirname, filename)
        if filename in ("test.csv", "train.csv"):
            loaded[filename] = pd.read_csv(path)
        print(path)

# Fail here with a clear message instead of carrying an empty DataFrame
# forward and hitting an unrelated-looking error in a later cell.
missing = [name for name in ("train.csv", "test.csv") if name not in loaded]
if missing:
    raise FileNotFoundError(
        "Could not find {} under 'data/' (cwd: {})".format(", ".join(missing), os.getcwd())
    )

test_set = loaded["test.csv"]
train_set = loaded["train.csv"]


data/data_description.txt
data/sample_submission.csv
data/test.csv
data/train.csv


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows as a local practice test split; the
# fixed random_state keeps the split identical between runs.
practice_train_set, practice_test_set = train_test_split(
    train_set,
    test_size=0.2,
    random_state=42,
)

# **Visualization**

In [None]:
# Summarise every column of the training frame: non-null count and dtype.
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
train_set.hist(bins=50,figsize=(20,15))
plt.show()

KeyboardInterrupt: ignored

In [None]:
# Pairwise correlations between the numeric columns only. Selecting them
# explicitly keeps this working on pandas >= 2.0, where DataFrame.corr()
# raises on object (string) columns instead of silently dropping them.
corr_matrix = train_set.select_dtypes(include="number").corr()

# Correlation of every numeric column with the target, strongest first.
sale_price_corr = corr_matrix["SalePrice"].sort_values(ascending=False)
print(sale_price_corr)

In [None]:
from pandas.plotting import scatter_matrix

# SalePrice together with six numeric features, plotted pairwise.
scatter_columns = [
    "SalePrice",
    "OverallQual",
    "GrLivArea",
    "GarageCars",
    "GarageArea",
    "TotalBsmtSF",
    "1stFlrSF",
]
scatter_matrix(train_set[scatter_columns], figsize=(12, 10), alpha=0.05)

In [None]:
# Column labels ordered by their correlation with SalePrice, highest first.
keys = corr_matrix["SalePrice"].sort_values(ascending=False).index
def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    Slices are produced in order with ``lst[start:start + n]``, so the last
    one may be shorter than ``n``. ``lst`` must support ``len()`` and slicing.
    """
    starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in starts)
split_keys = list(chunks(keys, 5))

# The first chunk is skipped; every later chunk of five columns is plotted
# pairwise together with SalePrice.
for key_list in split_keys[1:]:
    new_keys = key_list.append(pd.Index(["SalePrice"]))
    print(new_keys)
    scatter_matrix(train_set[new_keys], figsize=(12, 10), alpha=0.05)


# **Transformers**

In [None]:
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn_pandas import CategoricalImputer
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor


class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that pulls a fixed set of columns out of a DataFrame.

    Parameters
    ----------
    attribute_names : list
        Column labels to keep; stored unchanged on the instance.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """No-op fit: nothing is learned and the instance itself is returned."""
        return self

    def transform(self, X):
        """Return the configured columns of ``X`` via ``.values``."""
        selected = X[self.attribute_names]
        return selected.values

# Separate the target from the features of the practice training split.
labels = practice_train_set["SalePrice"].copy()
train_saleprice_dropped = practice_train_set.drop("SalePrice", axis=1)

# Partition feature columns by dtype: object columns go to the categorical
# pipeline, everything else to the numeric one.
num_attribs = list(train_saleprice_dropped.select_dtypes(exclude='object').keys())
cat_attribs = list(train_saleprice_dropped.select_dtypes(include='object').keys())
print(num_attribs)
print(cat_attribs)

# threshold=-np.inf together with max_features=10 makes SelectFromModel keep
# the 10 highest-importance columns. The forest is seeded so the selected
# columns are the same on every run.
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
    ('feature_selector', SelectFromModel(RandomForestRegressor(random_state=42), threshold=-np.inf, max_features=10)),
])

# NOTE(review): newer scikit-learn releases rename OneHotEncoder's `sparse`
# argument to `sparse_output` -- update if the environment is upgraded.
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('imputer', CategoricalImputer()),
    ('one_hot_encoder', OneHotEncoder(sparse=False, handle_unknown='ignore')),
    ('feature_selector', SelectFromModel(RandomForestRegressor(random_state=42), threshold=-np.inf, max_features=10)),
])

# Concatenate the 10 numeric and 10 categorical columns side by side.
full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])
practice_train_set_prepared = full_pipeline.fit_transform(train_saleprice_dropped, labels)
 


NameError: ignored

In [None]:
from sklearn.base import clone
from sklearn.model_selection import KFold

# 10-fold shuffled CV splitter shared by cv_rmse; seeded for repeatable folds.
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)

# Target and features for the complete labelled data set.
train_set_labels = train_set["SalePrice"].copy()
y = train_set_labels
new_train = train_set.drop("SalePrice", axis=1)

# Fit an unfitted *copy* of the pipeline on the full data. Refitting
# full_pipeline itself would overwrite the state learned from the practice
# split, so later full_pipeline.transform(...) calls would no longer match
# the features the practice model was trained on.
full_data_pipeline = clone(full_pipeline)
train_set_prepared = full_data_pipeline.fit_transform(new_train, train_set_labels)

def rmsle(y, y_pred):
    """Return the root-mean-squared error between ``y`` and ``y_pred``.

    Despite the name, no logarithm is applied here: to obtain a root mean
    squared *log* error, pass values that are already log-transformed.

    The error is computed with numpy directly so the function does not
    depend on ``mean_squared_error`` having been imported by another cell.

    Parameters
    ----------
    y, y_pred : array-like
        True and predicted values; must have the same shape.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    y_true = np.asarray(y, dtype=float)
    y_hat = np.asarray(y_pred, dtype=float)
    if y_true.shape != y_hat.shape:
        raise ValueError(
            "y and y_pred must have the same shape, got {} and {}".format(y_true.shape, y_hat.shape)
        )
    return np.sqrt(np.mean((y_true - y_hat) ** 2))

def cv_rmse(model, X=train_set_prepared):
    """Return the per-fold RMSE of ``model`` under cross-validation.

    Scores ``model`` on ``X`` against the module-level target ``y`` using the
    module-level ``kfolds`` splitter, then converts the negated MSE scores
    into RMSE values (one per fold).

    Note that the default for ``X`` is bound when this ``def`` is executed,
    so reassigning ``train_set_prepared`` afterwards does not change it.
    """
    # Imported locally: the cell defining this function does not import
    # cross_val_score, so a top-to-bottom run would otherwise depend on a
    # later cell having been executed first.
    from sklearn.model_selection import cross_val_score

    neg_mse_scores = cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=kfolds)
    return np.sqrt(-neg_mse_scores)



In [None]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.ensemble import GradientBoostingRegressor


# Two grids: a wide search over tree count / feature subset / depth /
# learning rate, and a small one over tree count and feature subset only.
param_grid = [
    {
        'n_estimators': [80, 100, 120, 140],
        'max_features': [14, 16, 18, 20],
        'max_depth': [10, 15, 20],
        'learning_rate': [0.01, 0.1, 1],
    },
    {
        'n_estimators': [3, 10, 30],
        'max_features': [2, 3, 4, 5, 6],
    },
]

# Despite the variable name this is a gradient-boosting model. It is seeded
# so the search picks the same winner on every run.
forest_reg = GradientBoostingRegressor(random_state=42)
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(practice_train_set_prepared, labels)

# Per-fold RMSE of the best configuration, via cv_rmse's default data.
print(cv_rmse(grid_search.best_estimator_))

[31656.79556838 31829.11691288 22135.56884528 24391.40316072
 41028.87560055 31469.52389818 34551.42748977 26136.23956755
 29459.48612441 22205.61486703]


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error


# Two grids: a wide search over tree count / feature subset / depth /
# learning rate, and a small one over tree count and feature subset only.
param_grid = [
    {
        'n_estimators': [80, 100, 120, 140],
        'max_features': [14, 16, 18, 20],
        'max_depth': [10, 15, 20],
        'learning_rate': [0.01, 0.1, 1],
    },
    {
        'n_estimators': [3, 10, 30],
        'max_features': [2, 3, 4, 5, 6],
    },
]

# Despite the variable name this is a gradient-boosting model. It is seeded
# so the search picks the same winner on every run.
forest_reg = GradientBoostingRegressor(random_state=42)
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(practice_train_set_prepared, labels)

print(grid_search.best_estimator_)
feature_importances = grid_search.best_estimator_.feature_importances_
# Name kept for compatibility with the rest of the notebook; the estimator
# held here is the best gradient-boosting model, not a linear regression.
lin_reg = grid_search.best_estimator_

# Score on the held-out practice *test* split. Evaluating on
# practice_train_set (the rows the model was fitted on) would report a
# training error rather than a generalisation estimate.
test_sale_price_dropped = practice_test_set.drop("SalePrice", axis=1)
practice_test_labels = practice_test_set["SalePrice"].copy()
practice_test_set_prepared = full_pipeline.transform(test_sale_price_dropped)
practice_data_predictions = lin_reg.predict(practice_test_set_prepared)

lin_mse = mean_squared_error(practice_test_labels, practice_data_predictions)
lin_rmse = np.sqrt(lin_mse)
print("RMSE", lin_rmse)

# Importances of the prepared columns in ascending order.
l = np.sort(feature_importances)
print(l)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)
RMSE 33467.8292845672
[0.00000000e+00 5.43333071e-04 6.05159552e-04 2.71551777e-03
 3.29291232e-03 5.27363790e-03 6.57638182e-03 7.10427186e-03
 8.53668059e-03 9.37831216e-03 1.46304960e-02 1.99391448e-02
 2.60853393e-02 2.89922786e-02 3.70514053e-02 4.14459883e-02
 4.64732158e-02 5.54482962e-02 1.39388890e-01 5.46518739e-01]


In [None]:
import csv

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Lasso

# Refit the preprocessing on every labelled row before training the final
# model. The target column is dropped first so it is never handed to the
# feature pipeline.
train_set_labels = train_set["SalePrice"].copy()
new_train = train_set.drop("SalePrice", axis=1)
train_set_prepared = full_pipeline.fit_transform(new_train, train_set_labels)


param_grid = [
    {'alpha': [0.001, 0.01, 0.1]}
]
lasso_reg = Lasso()
grid_search = GridSearchCV(lasso_reg, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(train_set_prepared, train_set_labels)

print(grid_search.best_estimator_)

# Lasso is a linear model: it exposes coef_ (one weight per prepared column)
# and has no feature_importances_ attribute, which raised AttributeError.
feature_importances = grid_search.best_estimator_.coef_
lin_reg = grid_search.best_estimator_


test_set_prepared = full_pipeline.transform(test_set)
final_predictions = lin_reg.predict(test_set_prepared)

print(final_predictions)


# Write the submission file: a header row, then one (Id, SalePrice) row per
# test example. The context manager closes the file even if a write fails,
# and newline='' is what the csv module expects for files it writes to.
with open('submission.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["Id", "SalePrice"])
    writer.writerows(zip(test_set["Id"], final_predictions))

GradientBoostingRegressor(alpha=0.1, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)
[131466.80247101 165110.67543194 173263.28928813 ... 153699.81387269
 124934.55985653 226272.40056187]
