# **Notebook for Home Price Predictions** #


<table align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/ageron/handson-ml/blob/master/01_the_machine_learning_landscape.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
</table>

## **Setup**

In [1]:
from google.colab import drive # import drive from google colab

ROOT = "/content/drive"     # default location for the drive
print(ROOT)                 # print content of ROOT (Optional)

drive.mount(ROOT)

/content/drive
Mounted at /content/drive


In [2]:
%cd drive/MyDrive/kaggle-home-prices/

/content/drive/MyDrive/kaggle-home-prices


## **Load Data**

In [12]:
import numpy as np # linear algebra
import pandas as pd 

os.chdir("data")

In [13]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample_submission = pd.read_csv("sample_submission.csv")

In [14]:
from sklearn.model_selection import train_test_split
practice_train_set, practice_test_set = train_test_split(train, test_size=0.2, random_state=42)

## **Vizualization**

In [None]:
train_set.info()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
train_set.hist(bins=50,figsize=(20,15))
plt.show()

KeyboardInterrupt: ignored

In [None]:
corr_matrix = train_set.corr()
sale_price_corr = corr_matrix["SalePrice"].sort_values(ascending=False)
print(sale_price_corr)

In [None]:
from pandas.plotting import scatter_matrix

scatter_matrix(train_set[["SalePrice", "OverallQual", "GrLivArea", "GarageCars", "GarageArea", "TotalBsmtSF", "1stFlrSF"]], figsize=(12,10),alpha=0.05)

In [None]:
keys = corr_matrix["SalePrice"].sort_values(ascending=False).keys()
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]
split_keys = list(chunks(keys, 5))
count = 0
for key_list in split_keys:
  if count == 0:
    count += 1
    continue  
  new_keys = key_list.append(pd.Index(["SalePrice"]))
  print(new_keys)
  scatter_matrix(train_set[new_keys], figsize=(12,10),alpha=0.05)


## **Transformers**

In [15]:
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn_pandas import CategoricalImputer
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor


class DataFrameSelector(BaseEstimator, TransformerMixin):
  def __init__(self, attribute_names):
    self.attribute_names = attribute_names
  def fit(self, X, y=None):
    return self  
  def transform(self, X):
    return X[self.attribute_names].values  


y = train["SalePrice"].copy()
X = train.drop("SalePrice", axis=1)

num_attribs = list(X.select_dtypes(exclude='object').keys())
cat_attribs = list(X.select_dtypes(include='object').keys())

y_practice = practice_train_set["SalePrice"].copy()
X_practice = practice_train_set.drop("SalePrice", axis=1)
y_practice_test = practice_test_set["SalePrice"].copy()
X_practice_test = practice_test_set.drop("SalePrice", axis=1)

num_pipeline= Pipeline([('selector', DataFrameSelector(num_attribs)), ('imputer', SimpleImputer(strategy='median')), ('std_scaler', StandardScaler()), ('feature_selector', SelectFromModel(RandomForestRegressor(), threshold=-np.inf, max_features=10)), ])
cat_pipeline= Pipeline([('selector', DataFrameSelector(cat_attribs)), ('imputer', CategoricalImputer()), ('one_hot_encoder', OneHotEncoder(sparse=False, handle_unknown='ignore')), ])
full_pipeline = FeatureUnion(transformer_list=[("num_pipeline", num_pipeline), ("cat_pipeline", cat_pipeline),])

X_train = full_pipeline.fit_transform(X_practice, y_practice)
X_test = full_pipeline.transform(X_practice_test)

##**Helper Functions**

###**Generate RMSE**

In [21]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

def get_RMSE(model, y, X):
  preds = model.predict(X)
  lin_mse = mean_squared_error(y, preds)
  lin_rmse = np.sqrt(lin_mse)
  print("RMSE", lin_rmse)
  lin_log_mse = mean_squared_error(np.log(y), np.log(preds))
  lin_log_rmse = np.sqrt(lin_log_mse)
  print("log RMSE", lin_log_rmse)

###**Write to CSV**

In [23]:
import csv

def write_to_csv(predictions):
  # open the file in the write mode
  f = open('submission.csv', 'w')
  writer = csv.writer(f)
  writer.writerow(["Id", "SalePrice"])
  for count, id in enumerate(test_set["Id"]):
    writer.writerow([id, final_predictions[count]])
  f.close()  

##**RandomForestRegressor**

In [34]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

param_grid = [
              {'n_estimators': [80, 100, 120, 140], 'max_features': [14, 16, 18, 20]},
              {'bootstrap': [False], 'n_estimators': [3,10, 30], 'max_features':[2, 3, 4, 5 , 6]},
]
forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_log_error')
grid_search.fit(X_train, y_practice)
print(grid_search.best_estimator_)
RFR = grid_search.best_estimator_

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features=20, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=140, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)


In [35]:
get_RMSE(RFR, y_practice_test, X_test)

RMSE 33621.76711128175
log RMSE 0.16784807299822968


##**GradientBoostingRegressor**

In [37]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.ensemble import GradientBoostingRegressor


param_grid = [
              {'n_estimators': [60, 80, 100, 120], 'max_features': [14, 16, 18, 20], 'max_depth': [8, 10, 12, 15, 20], 'learning_rate': [0.01, 0.1, 0.2, 0.4]},
]
gb_reg = GradientBoostingRegressor()
grid_search = GridSearchCV(gb_reg, param_grid, cv=5, scoring='neg_mean_squared_log_error')
grid_search.fit(X_train, y_practice)
print(grid_search.best_estimator_)
GBR = grid_search.best_estimator_

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=8,
                          max_features=14, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=120,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)


In [39]:
get_RMSE(GBR, y_practice_test, X_test)

##RMSE 32435.559566198066
##log RMSE 0.15779955466674883

RMSE 28420.24993072146
log RMSE 0.1532245379240831


##**LassoRegressor**

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso

param_grid = [
              {'alpha': [0.001, 0.01, 0.1], 'tol': [0.0001, 0.001, 0.01], 'max_iter': [1000]}
]
lasso_reg = Lasso()
grid_search = GridSearchCV(lasso_reg, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_practice)
print(grid_search.best_estimator_)
LassoR = grid_search.best_estimator_
get_RMSE(LassoR, y_practice_test, X_test)

##**SupportVectorRegressor**

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

param_grid = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}, {'kernel': ['poly'], 'coef0': [2, 3, 4, 5]}]
sv_reg = SVR()
grid_search = GridSearchCV(sv_reg, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_practice)
print(grid_search.best_estimator_)
SupportVR = grid_search.best_estimator_
get_RMSE(SupportVR, y_practice_test, X_test)