<a href="https://colab.research.google.com/github/byruzyayandy1/House_Prediction/blob/master/Prediting_House_Prices.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
from IPython.display import clear_output 

In [0]:
! wget https://www.theschool.ai/wp-content/uploads/2019/02/train.csv
! wget https://www.theschool.ai/wp-content/uploads/2019/02/test.csv
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
clear_output()
print("Data Downloaded")

Data Downloaded


In [0]:
df_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [0]:
df_train = df_train.drop("Id",axis=1)
df_train.shape

(1460, 80)

In [0]:
#Check how many NaN values have each feature
counting_nan = pd.DataFrame({"Counting nan": df_train.isna().sum().sort_values(ascending=False)})
counting_nan.head(10)

Unnamed: 0,Counting nan
PoolQC,1453
MiscFeature,1406
Alley,1369
Fence,1179
FireplaceQu,690
LotFrontage,259
GarageType,81
GarageCond,81
GarageFinish,81
GarageQual,81


In [0]:
def preprocess(df):
  df.drop('PoolQC', axis=1, inplace=True)
  df.drop('MiscFeature', axis=1, inplace=True)
  df.drop('Alley', axis=1, inplace=True)
  df.drop('Fence', axis=1, inplace=True)
  df.drop('FireplaceQu', axis=1, inplace=True)
  df.drop('LotFrontage', axis=1, inplace=True)
  
  numeric_variables = list(df.select_dtypes(include=['int64','float']).columns.values)
  df[numeric_variables]=df[numeric_variables].apply(lambda x: x.fillna(x.median()),axis=0)
  
  categorical_variables = list(df.select_dtypes(exclude=['int64','float','bool']).columns.values)
  df[categorical_variables]=df[categorical_variables].apply(lambda x: x.fillna("None"),axis=0)
  
  df=pd.get_dummies(df)#one hot encoding
  return df

In [0]:
df_train =  preprocess(df_train)
df_test = preprocess(df_test)
df_train.shape

(1460, 280)

# Training the models

In [0]:
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestRegressor 
from sklearn.metrics import r2_score, make_scorer, mean_squared_error, mean_absolute_error
from time import time

In [0]:
X_train, X_test, y_train, y_test = train_test_split(df_train.drop("SalePrice",axis=1), np.log(df_train["SalePrice"]), random_state=42)

In [0]:
(X_train.shape, X_test.shape)

((1095, 279), (365, 279))

## LASSO,RIDGE, AND DECISSION TREE REGRESSION 

In [0]:
from sklearn.model_selection import GridSearchCV # Search over specified parameter values for an estimator.
from sklearn.model_selection import RandomizedSearchCV # Search over specified parameter values for an estimator.
from sklearn.model_selection import ShuffleSplit # Random permutation cross-validator
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor

**Lasso with RandomizedSearchCV**

In [0]:
from sklearn.linear_model import Lasso
lasso_reg = Lasso(random_state=42)

In [0]:
cv_sets_lasso = ShuffleSplit(random_state = 30)

parameters_lasso = {
    'alpha':np.linspace(0.0005,0.005,100),
    "max_iter":np.linspace(40000,200000,num=100,dtype=int),
}
scorer_lasso = make_scorer(r2_score)

grid_obj_lasso = RandomizedSearchCV(lasso_reg,
                                   parameters_lasso,
                                   scoring=scorer_lasso,
                                   cv=cv_sets_lasso,
                                   random_state=99)

grid_fit_lasso = grid_obj_lasso.fit(X_train, y_train)

lasso_opt = grid_fit_lasso.best_estimator_
grid_fit_lasso.best_params_

{'alpha': 0.0012727272727272728, 'max_iter': 149898}

In [0]:
lasso_opt.fit(X_train, y_train)
lasso_opt_predict = lasso_opt.predict(X_test)

In [0]:
lasso_r2 = r2_score(y_test, lasso_opt_predict)
lasso_mse = mean_squared_error(y_test, lasso_opt_predict)
lasso_mae = mean_absolute_error(y_test, lasso_opt_predict)
print("R2 : {} \nMSE: {}\nMAE: {}".format(lasso_r2,lasso_mse,lasso_mae))

R2 : 0.8986717636540241 
MSE: 0.017545937450267533
MAE: 0.0930376155302393


**Lasso with GridSearchCV**

In [0]:
lasso_reg2 = Lasso(random_state=42)
alphas = np.logspace(-4,-0.5,30)
tuned_parameters = [{'alpha': alphas}]
n_folds = 5

In [0]:
clf = GridSearchCV(estimator=lasso_reg2,
                   param_grid = tuned_parameters, 
                   cv=n_folds, 
                   refit=True)

clf.fit(X_train, y_train)

lasso_opt2 = clf.best_estimator_ 
lasso_opt2_predict =  lasso_opt2.predict(X_test)

In [0]:
lasso2_r2 = r2_score(y_test, lasso_opt2_predict)
lasso2_mse = mean_squared_error(y_test, lasso_opt2_predict)
lasso2_mae = mean_absolute_error(y_test, lasso_opt2_predict)
print("R2 : {} \nMSE: {}\nMAE: {}".format(lasso2_r2,lasso2_mse,lasso2_mae))

R2 : 0.918203051038265 
MSE: 0.014163911283375706
MAE: 0.08306859272846598


**Ridge Regression**

In [0]:
ridgeReg = Ridge(alpha=0.05, normalize=True)
ridge_params = [{'alpha': np.linspace(0.005,0.5,100),
           'random_state':np.linspace(0,10,10,dtype=int) 
          }]

In [0]:
clf_ridge = GridSearchCV(estimator=ridgeReg,
                   param_grid = ridge_params, 
                   cv=5, 
                   refit=True)

clf_ridge.fit(X_train, y_train)

ridgeReg_opt = clf_ridge.best_estimator_ 
ridgeReg_opt_predict = ridgeReg_opt.predict(X_test)

In [0]:
clf_ridge.best_params_

{'alpha': 0.5, 'random_state': 0}

In [0]:
ridgeReg_r2 = r2_score(y_test, ridgeReg_opt_predict)
ridgeReg_mse = mean_squared_error(y_test, ridgeReg_opt_predict)
ridgeReg_mae = mean_absolute_error(y_test, ridgeReg_opt_predict)
print("R2 : {} \nMSE: {}\nMAE: {}".format(ridgeReg_r2,ridgeReg_mse,ridgeReg_mae))

R2 : 0.90709074723873 
MSE: 0.01608811123420961
MAE: 0.08865049397751779


**Decission Tree Regressor**

In [0]:
dtree = DecisionTreeRegressor(max_depth=8,random_state=51)
params = [{'max_depth': np.linspace(1,10,10,dtype=int),
           'random_state':np.linspace(0,100,10,dtype=int) 
          }]

In [0]:
clf_tree = GridSearchCV(estimator=dtree,
                   param_grid = params, 
                   cv=5, 
                   refit=True)

clf_tree.fit(X_train, y_train)

dtree_opt = clf_tree.best_estimator_ 
dtree_opt_predict = dtree_opt.predict(X_test)

In [0]:
dtree_r2 = r2_score(y_test, dtree_opt_predict)
dtree_mse = mean_squared_error(y_test, dtree_opt_predict)
dtree_mae = mean_absolute_error(y_test, dtree_opt_predict)
print("R2 : {} \nMSE: {}\nMAE {}".format(dtree_r2,dtree_mse,dtree_mae))

R2 : 0.789166414314997 
MSE: 0.0365078189480557
MAE 0.1399271976760597
