# <p style="padding:10px;background-color:#2856D4;margin:0;color:#FFFFFF;font-family:newtimeroman;font-size:100%;text-align:center;border-radius: 15px;overflow:hidden;font-weight:500">Housing Price Prediction</p>


<img src="https://assets.weforum.org/article/image/large_15gAQdddPzO5Cn18bJ--4zQTUHrfpVxWQbAQ6jhvTSc.jpg" alt="Housing price prediction banner" style="width: 800px;height:300px">

# <p style="padding:10px;background-color:#2856D4;margin:0;color:#FFFFFF;font-family:newtimeroman;font-size:100%;text-align:center;border-radius: 15px;overflow:hidden;font-weight:500">Table of Contents</p>

- Imports
- EDA
- Data Processing + Encoding
- Model Making
- ML Model Ensembling + Cross Validation
- Stacking ML Model Result


### Imports

In [None]:
import numpy as np  
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import RobustScaler, QuantileTransformer
from sklearn.metrics import mean_absolute_error,mean_squared_error

from catboost import CatBoostRegressor
from sklearn.linear_model import Ridge

In [None]:
# Read the competition's train and test CSV files, then preview the training rows.
house_train, house_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/home-data-for-ml-course/train.csv',
        '../input/home-data-for-ml-course/test.csv',
    )
)
house_train.head()

In [None]:
# Print the column names, non-null counts and dtypes of the training frame.
house_train.info()

In [None]:
# Rank the numeric columns by their correlation with the target (SalePrice),
# strongest positive correlation first.
# numeric_only=True is needed on pandas >= 2.0, where corr() no longer skips
# non-numeric (string) columns silently and raises on them instead.
house_train.corr(numeric_only=True)['SalePrice'].sort_values(ascending=False)

### • For Living Area

In [None]:
# Living area against sale price; marker size encodes the sale type.
scatter_ax = sns.scatterplot(x="GrLivArea", y="SalePrice", size='SaleType', data=house_train)
# Swap the size legend for an empty, frameless one so it does not cover the points.
scatter_ax.legend([], [], frameon=False)

In [None]:
# Drop the outliers: rows with GrLivArea above 3000 that sold for less than 300000.
# NOTE(review): the previous comment here mentioned a cutoff of 4000, but the filter
# below uses 3000 - confirm which threshold is intended.
house_train=house_train.drop(house_train[(house_train['GrLivArea']>3000) & (house_train['SalePrice']<300000)].index)

### • For Open Porch 

In [None]:
# Open porch area against sale price; marker size encodes the sale type.
scatter_ax = sns.scatterplot(x="OpenPorchSF", y="SalePrice", size='SaleType', data=house_train)
# Swap the size legend for an empty, frameless one so it does not cover the points.
scatter_ax.legend([], [], frameon=False)

In [None]:
# Remove rows that combine OpenPorchSF above 500 with a sale price below 100000.
porch_outliers = (house_train['OpenPorchSF'] > 500) & (house_train['SalePrice'] < 100000)
house_train = house_train.drop(house_train.loc[porch_outliers].index)

### • For Finished Square Feet

In [None]:
# BsmtFinSF1 against sale price; marker size encodes the sale type.
scatter_ax = sns.scatterplot(x="BsmtFinSF1", y="SalePrice", size='SaleType', data=house_train)
# Swap the size legend for an empty, frameless one so it does not cover the points.
scatter_ax.legend([], [], frameon=False)

In [None]:
# Remove rows that combine BsmtFinSF1 above 5000 with a sale price below 300000.
bsmt_fin_outliers = (house_train['BsmtFinSF1'] > 5000) & (house_train['SalePrice'] < 300000)
house_train = house_train.drop(house_train.loc[bsmt_fin_outliers].index)

### • For Year Built

In [None]:
# Construction year against sale price; marker size encodes the sale type.
scatter_ax = sns.scatterplot(x="YearBuilt", y="SalePrice", size='SaleType', data=house_train)
# Swap the size legend for an empty, frameless one so it does not cover the points.
scatter_ax.legend([], [], frameon=False)

In [None]:
# Remove rows built before 1900 that nevertheless sold for more than 400000.
year_built_outliers = (house_train['YearBuilt'] < 1900) & (house_train['SalePrice'] > 400000)
house_train = house_train.drop(house_train.loc[year_built_outliers].index)

### • For Total sqft of Basement Area

In [None]:
# Total basement area against sale price; marker size encodes the sale type.
scatter_ax = sns.scatterplot(x="TotalBsmtSF", y="SalePrice", size='SaleType', data=house_train)
# Swap the size legend for an empty, frameless one so it does not cover the points.
scatter_ax.legend([], [], frameon=False)

In [None]:
# Remove rows that combine TotalBsmtSF above 3000 with a sale price below 300000.
total_bsmt_outliers = (house_train['TotalBsmtSF'] > 3000) & (house_train['SalePrice'] < 300000)
house_train = house_train.drop(house_train.loc[total_bsmt_outliers].index)

### • For Overall Condition

In [None]:
# Overall condition rating against sale price; marker size encodes the sale type.
scatter_ax = sns.scatterplot(x="OverallCond", y="SalePrice", size='SaleType', data=house_train)
# Swap the size legend for an empty, frameless one so it does not cover the points.
scatter_ax.legend([], [], frameon=False)

In [None]:
# Remove rows rated OverallCond == 2 that nevertheless sold for more than 300000.
overall_cond_outliers = (house_train['OverallCond'] == 2) & (house_train['SalePrice'] > 300000)
house_train = house_train.drop(house_train.loc[overall_cond_outliers].index)

### Correlation Graph

In [None]:
# Pairwise correlations between the numeric columns.
# numeric_only=True is needed on pandas >= 2.0, where corr() raises on
# non-numeric (string) columns instead of silently skipping them.
corr = house_train.corr(numeric_only=True)

# Generate a mask for the upper triangle (the matrix is symmetric, so only
# the lower triangle is drawn)
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 15, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio.
# vmax=.3 saturates the colour scale at a correlation of 0.3.
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

### Distribution of the Target

In [None]:
# NOTE(review): `norm` is not used in the visible cells; the import is kept in
# case later code relies on it.
from scipy.stats import norm

# Histogram of the target with a kernel-density curve overlaid.
# `kde` is a boolean switch; the string 'kde' only worked because it is truthy.
sns.displot(data=house_train, x="SalePrice", kde=True)
  

We use a distribution plot to check the skewness of the target variable. Ideally it would be close to a normal distribution, but here it is right-skewed, so we apply a log transform to the target variable when making the X, y split.

## Data Processing

First we join the train and test data together, so that data processing and encoding only have to be done once and both sets are made ready for the model at the same time.

In [None]:
# Stack the training rows on top of the test rows so both go through the same
# preprocessing. ignore_index=True gives the combined frame a fresh 0..n-1 index;
# without it the train and test row labels overlap and label-based lookups
# would match more than one row.
all_data = pd.concat([house_train, house_test], ignore_index=True)
all_data

### Changing Data Types, Combining Data, Making New Features

In [None]:
# Derive the engineered features in one pass. Each lambda receives the frame as
# built so far, so later features can use the ones defined just above them.
all_data = all_data.assign(
    # Cast MSSubClass to string so it is one-hot encoded like the other text columns.
    MSSubClass=lambda df: df['MSSubClass'].astype(str),
    # Years between construction and sale.
    time=lambda df: df['YrSold'] - df['YearBuilt'],
    # Living area plus basement area.
    Total_Area=lambda df: df['GrLivArea'] + df['TotalBsmtSF'],
    # ... plus garage area.
    Final_Area=lambda df: df['Total_Area'] + df['GarageArea'],
    # Sum and difference of the two overall ratings.
    Overall_Plus=lambda df: df['OverallQual'] + df['OverallCond'],
    Overall_Mi=lambda df: df['OverallQual'] - df['OverallCond'],
    # Built area plus a tenth of the lot area.
    area=lambda df: df['Final_Area'] + df['LotArea'] * 0.1,
)
all_data

Since we need only the features, 'Id' is not necessary, so we drop it. SalePrice is the target variable, so we drop that from the feature set as well.

In [None]:
# Keep the features only: remove the row identifier and the target.
all_data2 = all_data.drop(['Id', 'SalePrice'], axis=1)

# Expand the text columns into indicator columns, then mark whatever is still
# missing with -1.
all_data3 = pd.get_dummies(all_data2).fillna(-1)

# Scale the feature values with RobustScaler and put the column labels back
# on the resulting array.
rs = RobustScaler()
all_data4 = pd.DataFrame(rs.fit_transform(all_data3), columns=all_data3.columns)

In [None]:
# Put the original Id values back on the scaled frame so the test rows can be
# indexed by them.
new_id = all_data['Id'].to_list()
all_data4['Id'] = new_id

all_data5 = all_data4

# The first len(house_train) rows are the training rows, the rest the test rows.
n_train_rows = len(house_train)

# Training features: Id is not a feature, so it is dropped.
train2 = all_data5.iloc[:n_train_rows].drop(columns='Id')

# Test features: Id becomes the index so it can be written to the submission.
test2 = all_data5.iloc[n_train_rows:].set_index('Id')

In [None]:
# Preview the scaled feature frame (train and test rows together, with Id re-attached).
all_data5.head()

## Modelling

In [None]:
# The target is modelled as log1p(SalePrice); np.expm1 is its exact inverse.
log1p_price = np.log1p(house_train['SalePrice'])

# Hold out 20% of the training rows for validation (shuffled, fixed seed).
x_train, x_valid, y_train, y_valid = train_test_split(
    train2,
    log1p_price,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

### Catboost Regression Model

In [None]:
# Fit CatBoost on the hold-out split, using the validation part as the
# evaluation set for early stopping.
cat = CatBoostRegressor(iterations=4000,
                        verbose = 500,
                        eval_metric='MAE',
                        max_depth = 6,
                        subsample=0.7,
                        learning_rate = 0.04)
cat.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], early_stopping_rounds=1000)

# The hold-out target was built with np.log1p, so it has to be inverted with
# np.expm1 (not np.exp) to report the error on the original price scale.
print('MAE : ', mean_absolute_error(np.expm1(y_valid), np.expm1(cat.predict(x_valid))))

# Test-set predictions from this single model; these are still on the log1p scale.
result_cat = cat.predict(test2)

### Ridge Regression

In [None]:
# Fit a Ridge regression on the same hold-out split.
rg = Ridge(alpha=20, random_state = 42)

rg.fit(x_train, y_train)

pred = rg.predict(x_valid)

# The hold-out target was built with np.log1p, so it has to be inverted with
# np.expm1 (not np.exp) to report the error on the original price scale.
print('MAE : ', mean_absolute_error(np.expm1(y_valid), np.expm1(pred)))

# Test-set predictions from this single model; these are still on the log1p scale.
result_ridge = rg.predict(test2)

### Catboost Regression KFold Cross Validation (5 Folds)

In [None]:
# Cross-validated CatBoost: train one model per fold and average the folds'
# test-set predictions on the original price scale.
# The target here is log(SalePrice), so np.exp is the matching inverse.
y = np.log(house_train['SalePrice'])
kfold = KFold(n_splits=5, random_state = 42, shuffle = True)
# Take the averaging divisor from the splitter so it cannot drift from n_splits.
n_folds = kfold.get_n_splits()

# Running mean of the per-fold test predictions (becomes an array on the first add).
result_cat = 0

for fold, (train_index, valid_index) in enumerate(kfold.split(train2)):
    x_train, y_train = train2.iloc[train_index], y.iloc[train_index]
    x_valid, y_valid = train2.iloc[valid_index], y.iloc[valid_index]
    
    cat = CatBoostRegressor(iterations=4000,
                        verbose = 500,
                        eval_metric='MAE',
                        max_depth = 6,
                        subsample=0.7,
                        learning_rate = 0.04)
    print('----------Fold', fold+1, 'Started!--------')
    
    # The held-out fold doubles as the evaluation set for early stopping.
    cat.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], early_stopping_rounds=1000)
    
    print('Fold', fold+1, 'MAE :', mean_absolute_error(np.exp(y_valid), np.exp(cat.predict(x_valid))))
    print('----------Fold', fold+1, 'Finished!--------')
    
    # Each fold contributes an equal share of the final prediction.
    result_cat += np.exp(cat.predict(test2)) / n_folds

print('All Folds Completed!')

### Ridge Regression KFold Cross Validation (5 Folds)

In [None]:
# Cross-validated Ridge: train one model per fold and average the folds'
# test-set predictions on the original price scale.
# `y` is the log(SalePrice) series defined in the CatBoost cross-validation cell,
# so np.exp is the matching inverse.
kfold = KFold(n_splits=5, random_state = 42, shuffle = True)
# Take the averaging divisor from the splitter so it cannot drift from n_splits.
n_folds = kfold.get_n_splits()

# Running mean of the per-fold test predictions (becomes an array on the first add).
result_ridge = 0

for fold, (train_index, valid_index) in enumerate(kfold.split(train2)):
    x_train, y_train = train2.iloc[train_index], y.iloc[train_index]
    x_valid, y_valid = train2.iloc[valid_index], y.iloc[valid_index]
    
    ridge = Ridge(alpha = 10)
    print('----------Fold', fold+1, 'Start!--------')
    
    ridge.fit(x_train, y_train)
    
    print('Fold', fold+1, 'MAE :', mean_absolute_error(np.exp(y_valid), np.exp(ridge.predict(x_valid))))
    print('----------Fold', fold+1, 'Done!--------')
    
    # Each fold contributes an equal share of the final prediction.
    result_ridge += np.exp(ridge.predict(test2)) / n_folds

print('All Done!')

## Exporting Output & Stacking (50% from Each Model)

In [None]:
# Combine the two models' predictions with equal weights.
blended_price = 0.5 * result_cat + 0.5 * result_ridge

# Build the submission table (test Ids taken from test2's index) and write it to disk.
output = pd.DataFrame({'Id': test2.index, 'SalePrice': blended_price})
output.to_csv('submission.csv', index=False)
output

### Conclusion

We can see that processing the data plays a vital role in making the model more accurate and helps in making good predictions. This dataset has over 33 columns of categorical and 47 columns of numerical data. Processing them, including scaling, encoding, filling missing values and making new features based on existing ones, is very helpful when building more complex models.

The algorithms perform better by making the data better.