# **House Price Prediction**

In [None]:
import numpy as np
import pandas as pd
import os

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import IsolationForest
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import explained_variance_score
from sklearn.metrics import max_error
from sklearn.metrics import mean_squared_log_error

# Loading the train data and initial exploration

In [None]:
# Read the training split into `housing`, then print its per-column
# dtype / non-null summary.
train_csv = "../input/house-prices-advanced-regression-techniques/train.csv"
housing = pd.read_csv(train_csv)
housing.info()

* Dropping the columns with a significant amount of null values, since they carry too little information to be relevant. 

In [None]:
# Remove, by name, the four columns the notebook treats as too sparse to keep.
sparse_cols = ['Alley', 'PoolQC', 'Fence', 'MiscFeature']
housing = housing.drop(columns=sparse_cols)

In [None]:
# Number of missing values in each remaining column.
housing.isnull().sum()

In [None]:
# Summary statistics, transposed so that each feature is a row.
housing.describe().transpose()

## Looking at the correlation between the columns to see which columns can cause collinearity or multicollinearity

In [None]:
# Correlation is only defined for numeric columns. Recent pandas raises on the
# object-typed columns instead of silently skipping them, so restrict the
# frame to numeric dtypes before correlating.
numeric_corr = housing.select_dtypes(include=[np.number]).corr()

fig, ax = plt.subplots(figsize=(14, 14))
# Draw explicitly on the axes created above; the trailing `;` suppresses the
# notebook's echo of the returned Axes object.
sns.heatmap(numeric_corr, cmap="Blues", ax=ax);

## Select numerical columns with higher correlation to ```SalePrice```. Will use these numerical columns for further exploration, which provides more precise information about pricing strategy. 

In [None]:
def select_cols_corr(df_corr, target_col, min_corr, max_corr):
    """Return the rows of a correlation table that fall strictly inside a band.

    Parameters
    ----------
    df_corr : correlation matrix as a DataFrame (e.g. the output of ``.corr()``).
    target_col : name of the column of ``df_corr`` to inspect.
    min_corr, max_corr : exclusive lower / upper bounds on the correlation.

    Returns
    -------
    DataFrame with two columns: the former index of ``df_corr`` (the feature
    names) and the correlation with ``target_col``, limited to the rows where
    ``min_corr < correlation < max_corr``.
    """
    # Move the feature names out of the index so they come back as a column.
    corr_table = df_corr[target_col].reset_index()
    corr_values = corr_table.iloc[:, 1]
    inside_band = corr_values.lt(max_corr) & corr_values.gt(min_corr)
    return corr_table.loc[inside_band, :]

# Restrict to numeric columns before correlating: recent pandas raises on the
# object-typed columns rather than silently skipping them.
select_cols_corr(housing.select_dtypes(include=[np.number]).corr(), 'SalePrice', min_corr=.4, max_corr=.95)

In [None]:
# Names of the numeric features whose correlation with SalePrice lies in
# (0.4, 0.95). The frame is limited to numeric dtypes first because recent
# pandas raises when asked to correlate object-typed columns.
num_col = select_cols_corr(
    housing.select_dtypes(include=[np.number]).corr(),
    'SalePrice',
    min_corr=.4,
    max_corr=.95,
).iloc[:, 0].tolist()

In [None]:
# Names of all object-typed columns; these are treated as categorical features.
object_frame = housing.select_dtypes(include=["object"])
cat_col = object_frame.columns.tolist()

### Dropping all columns except for numerical, categorical and 'SalePrice'

In [None]:
# Keep only the selected numeric features, the categorical features and the
# target; every other column is dropped from `housing` in place.
keep_cols = cat_col + num_col + ['SalePrice']
unused_cols = housing.columns.difference(keep_cols)
housing.drop(unused_cols, axis=1, inplace=True)

* Some columns have missing values; these will be filled with ```SimpleImputer```, and the categorical columns then encoded with ```OneHotEncoder```, while creating the pipeline. 

# Split the dataset

* Split the dataset into train and test sets. Fit the model on the train set and evaluate it on the test set to check the accuracy. 

In [None]:
# Features are every retained column except the target.
target_name = 'SalePrice'
X = housing.drop(columns=[target_name])
y = housing[target_name]

# 75% of the rows go to training; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=.75, random_state=8
)

# Re-wrap both feature splits as DataFrames labelled with X's columns.
X_train = pd.DataFrame(X_train, columns=X.columns)
X_test = pd.DataFrame(X_test, columns=X.columns)

# Create Pipeline

I will check different ```regressions``` and ```performance metrics``` to evaluate the best model for house price prediction:

Below ```performance metrics``` will be computed: 

* Mean Absolute Error (MAE): ```absolute``` value of the difference between the ```predicted``` value and the ```true``` value. Tells us how big of an error we can expect from the forecast on average.
* Mean Squared Error (MSE): average ```squared difference``` between the ```predicted``` values and the ```true``` value.
* Root Mean Squared Error (RMSE): estimator measuring the ```quality of the fit``` of the model. ```Small``` RMSE means ```predicted``` value to be close to ```true``` values.
* R² score (R_SQR): proportion of the variance of the ```dependent``` variable that is explained by the ```independent``` variables. The best possible score is ```1```; it can be negative when the model fits worse than always predicting the mean.
* Explained variance score: computes the explained ```variance regression score```. The best possible score is ```1.0```, lower values are worse.
* Max error: computes the ```maximum residual error```, a metric that captures the worst case error between the ```predicted``` value and the ```true``` value. 
* Root Mean Squared Logarithmic Error (RMSLE): square root of the risk metric corresponding to the ```expected``` value of the squared logarithmic (quadratic) error or loss.

## LinearRegression

In [None]:
# Categorical pipeline - fill missing values with the most frequent category,
# then one-hot encode (dropping the first level of each feature).
# handle_unknown='ignore' makes the fitted encoder emit all zeros for a
# category it did not see during fit instead of raising, so the pipeline can
# transform rows outside the data it was fitted on. Combined with
# drop='first' this needs scikit-learn >= 1.0.
cat_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('one_hot_enc', OneHotEncoder(drop='first', handle_unknown='ignore'))
])

# Numerical pipeline - fill missing values with the column mean, then scale
# with MinMaxScaler to preserve the shape of the original distribution.
num_pipeline = Pipeline(steps=[
    ('num_impute', SimpleImputer(strategy='mean')),
    ('scale', MinMaxScaler())
])

# Route each column list to its own sub-pipeline.
# NOTE(review): this single `full_processor` object is passed to every model
# pipeline in the notebook; it appears they all share its fitted state, so
# fitting one pipeline refits the preprocessor seen by the others - confirm.
full_processor = ColumnTransformer(transformers=[
    ('number', num_pipeline, num_col),
    ('category', cat_pipeline, cat_col)
])

lin_model_pipeline = Pipeline(steps=[
    ('processor', full_processor),
    ('model', LinearRegression())
])

# Fit preprocessing and the linear model on the training split only.
lm = lin_model_pipeline.fit(X_train, y_train)

In [None]:
# Hyper-parameters of the regression step inside the fitted pipeline.
lin_model_pipeline.named_steps['model'].get_params()

In [None]:
def plot_predictions(y_true, y_pred):
    """Print regression metrics and draw three diagnostic plots.

    Parameters
    ----------
    y_true : observed target values (a pandas Series in this notebook).
    y_pred : predicted values, same length and order as ``y_true``.

    Prints MSE, RMSE, MAE, R², explained variance, max error and RMSLE,
    followed by the upper / lower end of the range shared by both inputs.
    Then shows: (1) predictions vs. reference with a red identity line,
    (2) a histogram of the errors ``y_pred - y_true`` with a red line at 0,
    (3) the errors against the true value with a red line at 0.
    Returns nothing.
    """
    mse = mean_squared_error(y_true, y_pred)

    # mean_squared_log_error rejects negative values, which an unconstrained
    # regressor can produce; report NaN rather than abort the whole report.
    if np.min(y_pred) < 0 or np.min(y_true) < 0:
        rmsle = float('nan')
    else:
        rmsle = mean_squared_log_error(y_true, y_pred)**0.5

    print(
        f"""
        MSE: {mse}
        RMSE: {mse**0.5}
        MAE: {mean_absolute_error(y_true, y_pred)}
        R_SQR: {r2_score(y_true, y_pred)}
        EXV: {explained_variance_score(y_true, y_pred)}
        ME: {max_error(y_true, y_pred)}
        RMSLE: {rmsle}
        """
    )

    # End points of the identity line: the range covered by BOTH inputs.
    max_preds = min(np.max(y_pred), np.max(y_true))
    min_preds = max(np.min(y_pred), np.min(y_true))
    print(max_preds, min_preds)

    # Predictions against reference values; a perfect model lies on the line.
    plt.figure(figsize=(8, 8))
    sns.scatterplot(x=y_pred, y=y_true)
    sns.lineplot(x=[min_preds, max_preds], y=[min_preds, max_preds], color='red')
    plt.ylabel('Reference')
    plt.xlabel('Predictions')
    plt.show()

    # Distribution of the errors. axvline spans the full axes height, so the
    # zero marker no longer depends on a hard-coded bar height.
    errors = y_pred - y_true
    plt.subplots(figsize=(10, 6))
    sns.histplot(errors)
    plt.axvline(x=0, color='red')
    plt.show()

    # np.asarray drops any index on y_pred so the two columns are paired by
    # position rather than re-aligned by label.
    p_df = (
        pd.DataFrame({'y_true': y_true, 'y_pred': np.asarray(y_pred)})
        .assign(error=lambda x: x['y_pred'] - x['y_true'])
        .sort_values(by='y_true')
    )

    # Error against the true value. axhline spans the full axes width instead
    # of a hard-coded price range.
    plt.subplots(figsize=(10, 6))
    sns.scatterplot(data=p_df, x='y_true', y='error')
    plt.axhline(y=0, color='red')
    plt.show()
    
# Training-split diagnostics for the linear regression pipeline.
lin_train_pred = lin_model_pipeline.predict(X_train)
plot_predictions(y_train, lin_train_pred)

In [None]:
# Score the held-out split with a model fitted on the TRAINING split only.
# Fitting on X_test here would make the "test" metrics in-sample and useless
# as an estimate of generalisation.
# The encoder is told to ignore categories absent from the training split
# (encoded as all zeros; with drop='first' this needs scikit-learn >= 1.0),
# otherwise predicting on X_test can raise on an unseen category.
lm_test = lin_model_pipeline.set_params(
    processor__category__one_hot_enc__handle_unknown='ignore'
).fit(X_train, y_train)

# Predictions next to the observed prices for the held-out rows.
results = pd.DataFrame({
    'prediction': lin_model_pipeline.predict(X_test),
    'true_value': y_test
})

results.head()

In [None]:
# Held-out-split diagnostics for the linear regression pipeline.
lin_test_pred = lin_model_pipeline.predict(X_test)
plot_predictions(y_test, lin_test_pred)

In [None]:
# Fitted coefficients of the linear model, wrapped in a DataFrame for display.
linear_step = lin_model_pipeline.named_steps['model']
coef_lr = pd.DataFrame(linear_step.coef_)
coef_lr

In [None]:
# Fitted intercept of the linear model.
intercept_lr = lin_model_pipeline.named_steps['model'].intercept_
intercept_lr

## RandomForestRegressor

In [None]:
# Same preprocessing as the linear model, followed by a random forest.
# The forest is seeded (same value as the train/test split) so that reruns,
# and refits on the same data, reproduce the same model and metrics.
rf_pipeline = Pipeline(steps=[
    ('processor', full_processor),
    ('model', RandomForestRegressor(random_state=8))
])

rf = rf_pipeline.fit(X_train, y_train)

In [None]:
# Training-split diagnostics for the random forest pipeline.
rf_train_pred = rf_pipeline.predict(X_train)
plot_predictions(y_train, rf_train_pred)

In [None]:
# Score the held-out split with a forest fitted on the TRAINING split only.
# Fitting on X_test here would make the "test" metrics in-sample.
# The encoder is told to ignore categories absent from the training split
# (encoded as all zeros; with drop='first' this needs scikit-learn >= 1.0),
# otherwise predicting on X_test can raise on an unseen category.
rf_test = rf_pipeline.set_params(
    processor__category__one_hot_enc__handle_unknown='ignore'
).fit(X_train, y_train)

# Predictions next to the observed prices for the held-out rows.
results = pd.DataFrame({
    'prediction': rf_pipeline.predict(X_test),
    'true_value': y_test
})

results.head()

In [None]:
# Held-out-split diagnostics for the random forest pipeline.
rf_test_pred = rf_pipeline.predict(X_test)
plot_predictions(y_test, rf_test_pred)

## Ridge Regression

In [None]:
# Same preprocessing, followed by a ridge regression with default settings.
ridge_steps = [
    ('processor', full_processor),
    ('model', Ridge()),
]
ridge_pipeline = Pipeline(steps=ridge_steps)

rd = ridge_pipeline.fit(X_train, y_train)

In [None]:
# Training-split diagnostics for the ridge pipeline.
ridge_train_pred = ridge_pipeline.predict(X_train)
plot_predictions(y_train, ridge_train_pred)

In [None]:
# Score the held-out split with a ridge model fitted on the TRAINING split
# only. Fitting on X_test here would make the "test" metrics in-sample.
# The encoder is told to ignore categories absent from the training split
# (encoded as all zeros; with drop='first' this needs scikit-learn >= 1.0),
# otherwise predicting on X_test can raise on an unseen category.
rd_test = ridge_pipeline.set_params(
    processor__category__one_hot_enc__handle_unknown='ignore'
).fit(X_train, y_train)

# Predictions next to the observed prices for the held-out rows.
results = pd.DataFrame({
    'prediction': ridge_pipeline.predict(X_test),
    'true_value': y_test
})

results.head()

In [None]:
# Held-out-split diagnostics for the ridge pipeline.
ridge_test_pred = ridge_pipeline.predict(X_test)
plot_predictions(y_test, ridge_test_pred)

## Lasso Regression

In [None]:
# Same preprocessing, followed by a Lasso regression. The estimator must be
# Lasso here: using Ridge would only repeat the previous section's model
# under a different name.
lasso_pipeline = Pipeline(steps=[
    ('processor', full_processor),
    ('model', Lasso())
])

ls = lasso_pipeline.fit(X_train, y_train)

In [None]:
# Training-split diagnostics for the lasso pipeline.
lasso_train_pred = lasso_pipeline.predict(X_train)
plot_predictions(y_train, lasso_train_pred)

In [None]:
# Score the held-out split with a model fitted on the TRAINING split only.
# Fitting on X_test here would make the "test" metrics in-sample.
# The encoder is told to ignore categories absent from the training split
# (encoded as all zeros; with drop='first' this needs scikit-learn >= 1.0),
# otherwise predicting on X_test can raise on an unseen category.
ls_test = lasso_pipeline.set_params(
    processor__category__one_hot_enc__handle_unknown='ignore'
).fit(X_train, y_train)

# Predictions next to the observed prices for the held-out rows.
results = pd.DataFrame({
    'prediction': lasso_pipeline.predict(X_test),
    'true_value': y_test
})

results.head()

In [None]:
# Held-out-split diagnostics for the lasso pipeline.
lasso_test_pred = lasso_pipeline.predict(X_test)
plot_predictions(y_test, lasso_test_pred)

## GradientBoostingRegressor

In [None]:
# Same preprocessing, followed by gradient boosting. The model is seeded
# (same value as the train/test split) so reruns reproduce the same metrics.
gbr_pipeline = Pipeline(steps=[
    ('processor', full_processor),
    ('model', GradientBoostingRegressor(random_state=8))
])

# Bound to its own name so the fitted ridge pipeline held in `rd` is not
# overwritten.
gbr = gbr_pipeline.fit(X_train, y_train)

In [None]:
# Training-split diagnostics for the gradient boosting pipeline.
gbr_train_pred = gbr_pipeline.predict(X_train)
plot_predictions(y_train, gbr_train_pred)

In [None]:
# Score the held-out split with a model fitted on the TRAINING split only.
# Fitting on X_test here would make the "test" metrics in-sample.
# The encoder is told to ignore categories absent from the training split
# (encoded as all zeros; with drop='first' this needs scikit-learn >= 1.0),
# otherwise predicting on X_test can raise on an unseen category.
gbr_test = gbr_pipeline.set_params(
    processor__category__one_hot_enc__handle_unknown='ignore'
).fit(X_train, y_train)

# Predictions next to the observed prices for the held-out rows.
results = pd.DataFrame({
    'prediction': gbr_pipeline.predict(X_test),
    'true_value': y_test
})
results.head()

In [None]:
# Held-out-split diagnostics for the gradient boosting pipeline.
gbr_test_pred = gbr_pipeline.predict(X_test)
plot_predictions(y_test, gbr_test_pred)

## Logarithmic  transformation

* Random Forest Regression

In [None]:
# Refit the random forest on log(SalePrice), then map the predictions back
# with exp so the metrics are reported on the original price scale.
log_y_train = np.log(y_train)
rf_pipeline.fit(X_train, log_y_train)
rf_log_pred = np.exp(rf_pipeline.predict(X_train))
plot_predictions(y_train, rf_log_pred)

* Gradient Boosting Regression (GBR)

In [None]:
# Refit gradient boosting on log(SalePrice), then map the predictions back
# with exp so the metrics are reported on the original price scale.
log_y_train = np.log(y_train)
gbr_pipeline.fit(X_train, log_y_train)
gbr_log_pred = np.exp(gbr_pipeline.predict(X_train))
plot_predictions(y_train, gbr_log_pred)

# Loading the test data

In [None]:
# Read the competition's test split and print its per-column summary.
test_csv = "../input/house-prices-advanced-regression-techniques/test.csv"
house_test = pd.read_csv(test_csv)
house_test.info()

In [None]:
# `house_test` feeds the submission below (both its features and its Id
# column), so it must hold the competition's TEST rows. Loading train.csv
# here would produce a submission of predictions for the training houses.
house_test = pd.read_csv("../input/house-prices-advanced-regression-techniques/test.csv")
house_test.info()

In [None]:
# Keep only the feature columns the pipelines were built on.
model_features = cat_col + num_col
extra_cols = house_test.columns.difference(model_features)
final_submission = house_test.drop(extra_cols, axis=1)

## Fit the model

In [None]:
# Final model: refit gradient boosting on every labelled row, using the
# log-transformed target.
log_sale_price = np.log(y)
gbr_pipeline.fit(X, log_sale_price)

In [None]:
# The model predicts log-prices; exp maps them back to the price scale.
log_price_preds = gbr_pipeline.predict(final_submission)
SalePricePreds = np.exp(log_price_preds)

### Create the DF and save for submission file 

In [None]:
# Build the two-column submission table. The frame itself is kept in
# `submission_file`: to_csv returns None when given a path, so chaining it
# onto the constructor would leave the variable holding None.
submission_file = pd.DataFrame({
    'Id': house_test['Id'],
    'SalePrice': SalePricePreds
})

# index=False is the documented way to omit the row index from the file.
submission_file.to_csv('submission.csv', index=False)

# Load final submission file 

In [None]:
# Read the file back to check what was actually written.
submission_path = "submission.csv"
house_submission = pd.read_csv(submission_path)
house_submission