In [79]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


## Loading data

In [80]:
# Read the competition's training and test tables, in that order.
traindata, testdata = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/house-prices-advanced-regression-techniques/train.csv",
        "../input/house-prices-advanced-regression-techniques/test.csv",
    )
)

In [81]:
# Preview the first rows of the training frame.
traindata.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [82]:
# Per-column non-null counts and dtypes of the training frame (shows which columns have gaps).
traindata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [83]:
# Per-column non-null counts and dtypes of the test frame (shows which columns have gaps).
testdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          107 non-null    object 
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1457 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallC

## Cleaning

In [84]:
# Stack the train and test features (identifier and target removed) so both
# receive the same preprocessing, then drop four columns that are almost
# entirely empty.
alldata = pd.concat(
    (
        traindata.drop(columns=['Id','SalePrice']),
        testdata.drop(columns='Id'),
    )
).drop(columns=[ 'Alley', 'PoolQC', 'Fence', 'MiscFeature'])

In [85]:
def clean_data(data):
    """Impute missing values and standardise the numeric columns of a frame.

    Numeric columns (dtype ``int64`` or ``float64``) have their missing
    values replaced by the column mean truncated to an integer, and are then
    rescaled to zero mean and unit sample standard deviation. Columns of
    dtype ``object`` have their missing values replaced by the column's most
    frequent value.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. It is left untouched; all work is done on a copy.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy of ``data`` with the same columns and index.

    Notes
    -----
    Degenerate columns are handled instead of raising or producing NaN:
    a numeric column with no observed values is filled with 0, a numeric
    column with zero (or undefined) spread is set to 0.0, and an object
    column with no observed values is left as it is.
    """
    # Work on a copy so the caller's frame is never modified behind its back.
    cleaned = data.copy()

    # Select numeric data. The real column labels are kept (no str() cast) so
    # frames with non-string labels are indexed correctly.
    numeric_cols = [
        col for col in cleaned.columns
        if cleaned[col].dtype == 'int64' or cleaned[col].dtype == 'float64'
    ]
    for col in numeric_cols:
        col_mean = cleaned[col].mean()
        # Fill NA values with the integer-truncated mean; the mean is NaN when
        # the column holds no observed values, which int() cannot convert.
        fill_value = 0 if pd.isna(col_mean) else int(col_mean)
        filled = cleaned[col].fillna(fill_value)

        # Normalize. A constant (or single-row) column has no usable spread;
        # dividing by it would fill the column with NaN/inf.
        spread = filled.std()
        if pd.isna(spread) or spread == 0:
            cleaned[col] = 0.0
        else:
            cleaned[col] = (filled - filled.mean()) / spread

    # Select object data.
    object_cols = [col for col in cleaned.columns if cleaned[col].dtype == 'object']
    for col in object_cols:
        # Fill NA values with the mode; mode() is empty when every value is missing.
        modes = cleaned[col].mode()
        if not modes.empty:
            cleaned[col] = cleaned[col].fillna(modes.iloc[0])

    return cleaned

In [86]:
# Impute/scale the combined frame, then expand every categorical column into
# indicator (one-hot) columns.
alldata = pd.get_dummies(clean_data(alldata))

# The training rows were stacked first, so the first n_train rows of the
# combined frame are the training features and the remainder the test features.
n_train = len(traindata)
traindata_r, testdata_r = alldata.iloc[:n_train], alldata.iloc[n_train:]

# Model the log of the sale price: the transformed target is better behaved.
pricedata = traindata['SalePrice'].copy()
pricedata_r = np.log(pricedata)

## Split data

In [87]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    traindata_r,
    pricedata_r,
    test_size=0.2,
    random_state=2,
)

## Forecast Models

Three models are compared: an artificial neural network, a random forest, and XGBRegressor. Hyperparameters are tuned with randomized search. The model that gives the best results on the held-out test data is chosen.

### Artificial neural network

In [88]:
import tensorflow as tf   
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense
tf.compat.v1.set_random_seed(2)
from numpy.random import seed
seed(1)
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, truncnorm, randint

def create_model(n,ln,fun,layer2,epochs):
    """Build and compile a small fully connected Keras network for regression.

    The input width is taken from the module-level ``x_train`` (its number of
    columns), so ``x_train`` must exist before this function is called.

    Parameters
    ----------
    n : int
        Number of units in each hidden layer. Cast to ``int`` once and used
        for both hidden layers.
    ln : float
        Learning rate passed to the Adam optimizer.
    fun : str
        Activation used by the hidden layers (e.g. ``'tanh'`` or ``'relu'``).
    layer2 : bool
        When true, a second hidden layer of the same width is added.
    epochs : int
        Not used while building the network. It is accepted so that a
        hyper-parameter dict that also carries ``epochs`` can be unpacked
        straight into this function.

    Returns
    -------
    A compiled ``Sequential`` model with a single linear output unit, trained
    against mean squared error.
    """
    units = int(n)
    # Fixed seed so repeated builds start from the same weights.
    initializer = tf.compat.v1.keras.initializers.glorot_uniform(seed=0)

    model = Sequential()
    model.add(Dense(units, input_dim=x_train.shape[1], activation=fun,
                    use_bias=True, kernel_initializer=initializer))  # Hidden 1
    if layer2:
        model.add(Dense(units, activation=fun, use_bias=True,
                        kernel_initializer=initializer))  # Hidden 2
    model.add(Dense(1, kernel_initializer=initializer))  # Output

    optimizer = tf.keras.optimizers.Adam(learning_rate=ln)
    model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['mse'])
    return model

# Range of hyperparameters explored by the randomized search.
ln = [0.1, 0.01, 0.001, 0.0001]  # learning rates
n = randint(1, 100)              # hidden-layer width
layer2 = [False, True]           # without / with a second hidden layer
epochs = randint(10, 200)        # number of training epochs
fun = ['tanh', 'relu']           # hidden-layer activation
param_grid = dict(n=n, ln=ln, fun=fun, layer2=layer2, epochs=epochs)

model = KerasRegressor(build_fn=create_model, verbose=0)

random = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_jobs=1, verbose=2)
random_result = random.fit(x_train, y_train)

# Rebuild a network with the best configuration found and train it on the
# training split, tracking the loss on the held-out split as well.
model_RN = create_model(**random_result.best_params_)
history = model_RN.fit(x_train, y_train,
                       validation_data=(x_test, y_test),
                       verbose=0, epochs=random_result.best_params_['epochs'])

# The target was log-transformed, so exponentiate both sides to report the
# error in price units.
y_pred = model_RN.predict(x_test)
# RMSE is computed as sqrt(MSE): the `squared=False` shortcut is deprecated
# and rejected by recent scikit-learn releases.
RMSE = np.sqrt(mean_squared_error(np.exp(y_test), np.exp(y_pred)))
print(RMSE)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END ....epochs=47, fun=relu, layer2=False, ln=0.1, n=10; total time=   2.3s
[CV] END ....epochs=47, fun=relu, layer2=False, ln=0.1, n=10; total time=   2.3s
[CV] END ....epochs=47, fun=relu, layer2=False, ln=0.1, n=10; total time=   2.3s
[CV] END ....epochs=47, fun=relu, layer2=False, ln=0.1, n=10; total time=   2.3s
[CV] END ....epochs=47, fun=relu, layer2=False, ln=0.1, n=10; total time=   2.5s
[CV] END ....epochs=143, fun=relu, layer2=False, ln=0.1, n=2; total time=   5.6s
[CV] END ....epochs=143, fun=relu, layer2=False, ln=0.1, n=2; total time=   5.6s
[CV] END ....epochs=143, fun=relu, layer2=False, ln=0.1, n=2; total time=   5.9s
[CV] END ....epochs=143, fun=relu, layer2=False, ln=0.1, n=2; total time=   5.6s
[CV] END ....epochs=143, fun=relu, layer2=False, ln=0.1, n=2; total time=   5.6s
[CV] END ..epochs=81, fun=relu, layer2=False, ln=0.001, n=26; total time=   3.6s
[CV] END ..epochs=81, fun=relu, layer2=False, ln

### Random forest regressor

In [89]:
# Range of hyperparameters explored by the randomized search.
max_depth = randint(5, 20)
n_estimators = randint(2, 1000)
param_grid = dict(max_depth=max_depth, n_estimators=n_estimators)

forest_reg = RandomForestRegressor()

random = RandomizedSearchCV(estimator=forest_reg, param_distributions=param_grid, n_jobs=1, verbose=2)
random_result = random.fit(x_train, y_train)

# Fit a fresh forest with the best configuration found.
model_RF = RandomForestRegressor(**random_result.best_params_)
history = model_RF.fit(x_train, y_train)

# The target was log-transformed, so exponentiate both sides to report the
# error in price units.
y_pred = model_RF.predict(x_test)
# RMSE is computed as sqrt(MSE): the `squared=False` shortcut is deprecated
# and rejected by recent scikit-learn releases.
RMSE = np.sqrt(mean_squared_error(np.exp(y_test), np.exp(y_pred)))
print(RMSE)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END ......................max_depth=19, n_estimators=73; total time=   1.4s
[CV] END ......................max_depth=19, n_estimators=73; total time=   1.3s
[CV] END ......................max_depth=19, n_estimators=73; total time=   1.3s
[CV] END ......................max_depth=19, n_estimators=73; total time=   1.3s
[CV] END ......................max_depth=19, n_estimators=73; total time=   1.3s
[CV] END ......................max_depth=8, n_estimators=456; total time=   5.5s
[CV] END ......................max_depth=8, n_estimators=456; total time=   5.4s
[CV] END ......................max_depth=8, n_estimators=456; total time=   5.5s
[CV] END ......................max_depth=8, n_estimators=456; total time=   5.5s
[CV] END ......................max_depth=8, n_estimators=456; total time=   5.6s
[CV] END .....................max_depth=10, n_estimators=563; total time=   8.2s
[CV] END .....................max_depth=10, n_es

### XGBRegressor

In [90]:
# Range of hyperparameters explored by the randomized search.
max_depth = randint(5, 20)
n_estimators = randint(2, 1000)
param_grid = dict(max_depth=max_depth, n_estimators=n_estimators)

XGB = XGBRegressor()

random = RandomizedSearchCV(estimator=XGB, param_distributions=param_grid, n_jobs=1, verbose=2)
random_result = random.fit(x_train, y_train)

# Fit a fresh booster with the best configuration found.
model_XGB = XGBRegressor(**random_result.best_params_)
history = model_XGB.fit(x_train, y_train)

# The target was log-transformed, so exponentiate both sides to report the
# error in price units.
y_pred = model_XGB.predict(x_test)
# RMSE is computed as sqrt(MSE): the `squared=False` shortcut is deprecated
# and rejected by recent scikit-learn releases.
RMSE = np.sqrt(mean_squared_error(np.exp(y_test), np.exp(y_pred)))
print(RMSE)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END .....................max_depth=10, n_estimators=195; total time=   1.3s
[CV] END .....................max_depth=10, n_estimators=195; total time=   1.3s
[CV] END .....................max_depth=10, n_estimators=195; total time=   1.3s
[CV] END .....................max_depth=10, n_estimators=195; total time=   1.3s
[CV] END .....................max_depth=10, n_estimators=195; total time=   1.3s
[CV] END ......................max_depth=9, n_estimators=925; total time=   4.9s
[CV] END ......................max_depth=9, n_estimators=925; total time=   4.1s
[CV] END ......................max_depth=9, n_estimators=925; total time=   4.2s
[CV] END ......................max_depth=9, n_estimators=925; total time=   4.2s
[CV] END ......................max_depth=9, n_estimators=925; total time=   4.2s
[CV] END ......................max_depth=7, n_estimators=813; total time=   3.7s
[CV] END ......................max_depth=7, n_es

In [91]:
# The models were fitted on log(SalePrice), so exponentiate the XGBoost
# predictions for the competition test rows to get back to price units.
prediction = np.exp(model_XGB.predict(testdata_r))

In [92]:
# Pair each test-set Id with its predicted price and write the submission
# file without the index column.
final = testdata[['Id']].assign(SalePrice=prediction)
final.to_csv('./final.csv', index=False)