# **House price prediction with PyCaret Library**

# 1. Installation

Install the PyCaret, pandas, and pandas-profiling libraries.

In [1]:
# Optional one-time install of the notebook's dependencies; uncomment to run.
# Tip: inside Jupyter, `%pip install ...` targets the kernel's own environment,
# whereas `!pip` may install into a different Python.
# NOTE(review): the setup() call in this notebook passes arguments such as `silent`
# that look specific to PyCaret 2.x - consider pinning versions; confirm against your environment.
#!pip install pycaret
#!pip install pandas
#!pip install pandas_profiling

# 2. Import libraries

In [2]:
from pycaret.regression import *
import pandas as pd 
import pandas_profiling as pp

# 3. Read train and test csv files

In [3]:
# Locations of the two input CSV files, relative to the notebook.
train_path, test_path = '../data/train.csv', '../data/test.csv'

# train_house is later passed to setup(); test_house is later passed to predict_model().
train_house = pd.read_csv(train_path)
test_house = pd.read_csv(test_path)

# 4. Pandas profiling train data analysis

Exploratory data analysis to understand the distribution of each variable in the dataset.

#pp.ProfileReport(train_house)

# 5. PyCaret Regression

The `setup` function initializes the training environment and creates the transformation pipeline. It must be called before executing any other PyCaret function. It takes two mandatory parameters, **data** and **target**; all other parameters are **optional**.

In [5]:
# Initialize the PyCaret regression experiment: declares the target, the feature
# typing (categorical / ordinal), and the preprocessing steps applied before modelling.
regression_setup = setup(
    data=train_house,  # Training dataset
    target='SalePrice',  # Name of the target column
    train_size=0.7,  # Proportion of the dataset used for training and validation
    session_id=2324,  # Fixed random seed so the split and models are reproducible
                      # (2324 is the seed shown in this notebook's recorded outputs)
    ignore_features=['Id'],  # Identifier column, excluded from modelling
    # Nominal categorical features, chosen from the pandas-profiling analysis
    categorical_features=[
        'MSZoning', 'Exterior1st', 'Exterior2nd', 'Functional', 'SaleType',
        'Street', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'Neighborhood',
        'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
        'MasVnrType', 'ExterCond', 'Foundation', 'BsmtCond',
        'BsmtFinType1', 'BsmtFinType2', 'Heating', 'CentralAir',
        'Electrical', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
        'SaleCondition',
    ],
    # Ordered categorical features; each list runs from the lowest to the highest level
    ordinal_features={
        'ExterQual': ['Fa', 'TA', 'Gd', 'Ex'],
        'BsmtQual': ['Fa', 'TA', 'Gd', 'Ex'],
        'BsmtExposure': ['No', 'Mn', 'Av', 'Gd'],
        'HeatingQC': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
        'KitchenQual': ['Fa', 'TA', 'Gd', 'Ex'],
    },
    handle_unknown_categorical=True,  # Handle category levels that appear only in unseen data
    unknown_categorical_method='most_frequent',  # ...by replacing them with the most frequent level seen in training
    normalize=True,  # Rescale the numeric features
    normalize_method='zscore',  # Standardize: z = (x - mean) / std
    remove_outliers=True,  # Drop outliers from the training data (PCA / Singular Value Decomposition based)
    profile=False,  # Skip the interactive pandas-profiling EDA report (set True to display it)
    silent=True,  # Do not pause for confirmation of the inferred data types
)

Unnamed: 0,Description,Value
0,session_id,2324
1,Target,SalePrice
2,Original Data,"(1460, 81)"
3,Missing Values,True
4,Numeric Features,18
5,Categorical Features,61
6,Ordinal Features,True
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(970, 336)"


## 5.1 Model Comparison

This function trains and evaluates the performance of all estimators available in the model library using cross-validation. The output of this function is a score grid with average cross-validated scores. Metrics evaluated during CV can be accessed using the **get_metrics** function. Custom metrics can be added or removed using the **add_metric** and **remove_metric** functions.

In [6]:
# Estimator IDs to train and compare; only the models listed here are evaluated.
candidate_models = ['ransac', 'tr', 'rf', 'et', 'ada', 'gbr']

# The call is the cell's last expression, so the returned best model is displayed.
compare_models(include=candidate_models)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,16028.7483,593994662.4724,24030.2338,0.8905,0.1291,0.094,0.177
rf,Random Forest Regressor,16964.1324,740939475.2581,26772.6053,0.8631,0.1431,0.1007,0.708
et,Extra Trees Regressor,18440.997,878411497.901,29328.776,0.8382,0.1516,0.1077,0.533
ada,AdaBoost Regressor,23191.6135,1042491314.7221,32108.2394,0.8066,0.1842,0.147,0.148
tr,TheilSen Regressor,19761762.3194,9.051843275988291e+16,130566046.4314,-18479728.8058,2.4993,160.2983,203.882
ransac,Random Sample Consensus,160424416521260.4,5.4560045973475176e+29,571689046829576.9,-9.56503856155315e+19,7.6579,1036449298.9191,1.231


GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=2324, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

## 5.2 Model Creation
This function trains and evaluates the performance of a given estimator using cross validation. The output of this function is a score grid with CV scores by fold.

In [7]:
# Train the Gradient Boosting Regressor (estimator ID 'gbr') and score it
# with 5-fold cross-validation; the fitted estimator is kept in `model`.
model = create_model(estimator='gbr', fold=5)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,16276.3632,513372519.0048,22657.7254,0.9198,0.1262,0.0959
1,14939.8013,601398017.2435,24523.4177,0.8976,0.1064,0.079
2,17958.7871,731713693.8945,27050.2069,0.869,0.1517,0.1066
3,16159.0117,638472143.0636,25268.0063,0.8643,0.1479,0.1054
4,15836.4285,571494241.2754,23905.9457,0.8836,0.1309,0.0925
Mean,16234.0784,611290122.8964,24681.0604,0.8869,0.1326,0.0959
SD,981.3584,72815000.8891,1461.2939,0.0202,0.0163,0.01


## 5.3 Evaluate Model
This function displays a user interface for analyzing performance of a trained model. 
It calls the plot_model function internally.

In [8]:
# Open the interactive plot selector for the trained model (rendered as a widget in the cell output).
evaluate_model(model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

IntProgress(value=0, description='Processing: ', max=5)

## 5.4 Predict Model
This function generates predictions with a trained model. The predicted value is added to the returned DataFrame as the `Label` column.

In [9]:
# Score the test set with the trained model; the result is the input frame
# plus a `Label` column holding the predicted value.
pred_house =  predict_model(model, data=test_house)
# Preview the first rows of the predictions.
pred_house.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,Label
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,6,2010,WD,Normal,131961.621224
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,,,Gar2,12500,6,2010,WD,Normal,162123.343288
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,,0,3,2010,WD,Normal,187084.545696
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,,,,0,6,2010,WD,Normal,185807.533408
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,0,,,,0,1,2010,WD,Normal,183578.079615


# 6. Result 

## 6.1 Rename the given columns in the dataframe.

In [10]:
# Rename the prediction column `Label` to `SalePrice`.
# Reassign the result of rename() instead of using inplace=True, so the frame
# returned by predict_model is not mutated behind the reader's back.
pred_house = pred_house.rename(columns={'Label': 'SalePrice'})

## 6.2 Convert the dataframe into a csv file.

In [11]:
# Keep only the identifier and the predicted price.
submission = pred_house[['Id', 'SalePrice']]

# Write the result to pred_house.csv, without the DataFrame index (row labels).
submission.to_csv('pred_house.csv', index=False)