# Real Estate Price Prediction

## Exploratory Data Analysis

The Background Introduction  
What are the things that a potential home buyer considers before purchasing a house? The location, the size of the property, vicinity to offices, schools, parks, restaurants, hospitals or the stereotypical white picket fence? What about the most important factor — the price?

The Approach  
1. Perform some quick EDA (Exploratory Data Analysis)
2. Feature Engineering
3. Data Cleaning
4. Encoding, Scaling and Preprocessing
5. Training Machine Learning Models
6. Cross Validation and Ensembling Predictions

Models Used for Prediction:
  *   1. Multivariate Linear Regression

The target dataset is a set of real estate listings whose price is predicted from the following fields.  

Analyze information  
3 numeric features: Bedroom, Total_Sqft, Bathroom  
4 features that may be dropped: Area Type, Society, Balcony, Availability  
1 target variable: Price  

In [2]:
# import from google colab
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/3.0 Colab Project/1.0 Colab Playground
!pip install -r requirements.txt

Mounted at /content/drive
/content/drive/MyDrive/3.0 Colab Project/1.0 Colab Playground
Collecting catboost (from -r requirements.txt (line 1))
  Downloading catboost-1.2.3-cp310-cp310-manylinux2014_x86_64.whl (98.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting execdata (from -r requirements.txt (line 2))
  Downloading execdata-5.2.2-py3-none-any.whl (15 kB)
Collecting fuzzy_pandas (from -r requirements.txt (line 3))
  Downloading fuzzy_pandas-0.1-py3-none-any.whl (5.2 kB)
Collecting pickle5 (from -r requirements.txt (line 4))
  Downloading pickle5-0.0.11.tar.gz (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.1/132.1 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyinstaller (from -r requirements.txt (line 5))
  Downloading pyinstaller-6.5.0-py3-none-manylinux2014_x86_64.whl (679 kB)
[2K     [

In [3]:
# Verify the package installation by attempting an import.
try:
  import tqdm
  print('package is installed.')
except ImportError:
  # Only a failed import means "not installed"; any other error should surface
  # instead of being hidden by a bare except.
  print('Package not found')

package is installed.


In [4]:
# Configuration Setting
# from google.colab import data_table
# data_table.enable_dataframe_formatter()

In [5]:
# Import necessary packages
import warnings
warnings.filterwarnings('ignore')
import execdata as exe
from tabulate import tabulate
import pickle
import time
from rich.progress import track

import re
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick # for showing percentage in it
import seaborn as sns
import itertools
%matplotlib inline
matplotlib.rcParams["figure.figsize"] = (20,10)
sns.set(style='darkgrid', font_scale=1.4)

#Imbalanced Dataset Learning
from imblearn.combine import SMOTEENN

#Scikit-Learn Fundamental Packages
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, LabelEncoder
from sklearn.model_selection import cross_val_score, train_test_split, ShuffleSplit, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, classification_report
from sklearn.impute import SimpleImputer

# Models
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDRegressor, Lasso
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from lightgbm import LGBMClassifier
from xgboost import XGBRegressor, XGBClassifier
from catboost import CatBoostRegressor, CatBoostClassifier

In [6]:
# for i in track(range(15), description="[green]Loading..."):
#   time.sleep(1)

In [7]:
# Google Drive mount point and the workspace that holds the portfolio projects.
root_dir = '/content/drive/MyDrive'
work_dir = '/content/drive/MyDrive/3.0 Colab Project/3.0 Github Portfolio Project'
# Folder of this project inside the workspace.
folder_dir = 'S6PO3OP-Real Estate Price Prediction'
# Input data and generated results live in sibling sub-folders of the project.
data_dir = '/'.join([work_dir, folder_dir, 'data'])
result_dir = '/'.join([work_dir, folder_dir, 'result'])

In [8]:
import os
# Create the data folder if needed. makedirs also creates missing parent
# folders, and exist_ok=True makes the cell safe to re-run.
os.makedirs(data_dir, exist_ok=True)

In [9]:
import os
# Create the result folder if needed. makedirs also creates missing parent
# folders, and exist_ok=True makes the cell safe to re-run.
os.makedirs(result_dir, exist_ok=True)

In [10]:
# Make the data folder the working directory; the path is quoted because it contains spaces.
%cd '{data_dir}'
# Print the working directory to confirm the switch.
!pwd

/content/drive/MyDrive/3.0 Colab Project/3.0 Github Portfolio Project/S6PO3OP-Real Estate Price Prediction/data
/content/drive/MyDrive/3.0 Colab Project/3.0 Github Portfolio Project/S6PO3OP-Real Estate Price Prediction/data


In [11]:
# Alternative of Reading the dataset
# pwd = os.getcwd()
# data_dir = os.path.join(pwd, '50_Startups.csv')
# df = pd.read_csv(data_dir, encoding = 'utf-8')
# File names of the prepared dataset. Both entries currently name the same
# file; the second is meant as a fallback if the first cannot be used.
dataset_url_1 = 'S603_Mugged_Data.csv'
dataset_url_2 = 'S603_Mugged_Data.csv'
# The prepared CSV is read from result_dir, not from the current data_dir.
primary_csv = os.path.join(result_dir, dataset_url_1)
secondary_csv = os.path.join(result_dir, dataset_url_2)
# Load both into data frames and preview the first one.
df = pd.read_csv(primary_csv, encoding='utf-8')
df2 = pd.read_csv(secondary_csv, encoding='utf-8')
df.head(3)

Unnamed: 0,Location,Bedroom,Total_Sqft,Bathroom,Price,Price/Sqft
0,1st Block Jayanagar,4,2850.0,4.0,42800000,15017.54
1,1st Block Jayanagar,3,1630.0,3.0,19400000,11901.84
2,1st Block Jayanagar,3,1875.0,2.0,23500000,12533.33


In [12]:
# Name of the column the models are trained to predict.
target_feature = 'Price'
target_feature

'Price'

## Modeling

In [13]:
# Preview the first 20 rows of the loaded data before encoding.
df.head(20)

Unnamed: 0,Location,Bedroom,Total_Sqft,Bathroom,Price,Price/Sqft
0,1st Block Jayanagar,4,2850.0,4.0,42800000,15017.54
1,1st Block Jayanagar,3,1630.0,3.0,19400000,11901.84
2,1st Block Jayanagar,3,1875.0,2.0,23500000,12533.33
3,1st Block Jayanagar,3,1200.0,2.0,13000000,10833.33
4,1st Block Jayanagar,2,1235.0,2.0,14800000,11983.81
5,1st Block Jayanagar,4,2750.0,4.0,41300000,15018.18
6,1st Block Jayanagar,4,2450.0,4.0,36800000,15020.41
7,1st Block Koramangala,2,1415.0,2.0,11000000,7773.85
8,1st Block Koramangala,2,860.0,2.0,6550000,7616.28
9,1st Block Koramangala,4,3000.0,3.0,30000000,10000.0


In [14]:
# One-hot encode the locality: one indicator column per distinct Location value.
location_dummies = pd.get_dummies(df['Location'])
location_dummies.head(3)

Unnamed: 0,1st Block Jayanagar,1st Block Koramangala,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,6th Phase JP Nagar,7th Phase JP Nagar,8th Phase JP Nagar,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# (rows, number of distinct locations) of the one-hot matrix.
location_dummies.shape

(10128, 255)

In [16]:
# Replace the text Location column with its indicator columns. The 'Others'
# indicator is left out, so a row with all location flags at 0 means 'Others'.
numeric_part = df.drop(columns='Location')
encoded_locations = location_dummies.drop(columns='Others')
df2 = pd.concat([numeric_part, encoded_locations], axis=1)

In [17]:
# Preview the frame after one-hot encoding the location.
df2.head(5)

Unnamed: 0,Bedroom,Total_Sqft,Bathroom,Price,Price/Sqft,1st Block Jayanagar,1st Block Koramangala,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,4,2850.0,4.0,42800000,15017.54,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3,1630.0,3.0,19400000,11901.84,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,1875.0,2.0,23500000,12533.33,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,1200.0,2.0,13000000,10833.33,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,1235.0,2.0,14800000,11983.81,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Price/Sqft equals Price divided by Total_Sqft (e.g. 42800000 / 2850 = 15017.54),
# so keeping it as a feature would leak the target into the model inputs.
df3 = df2.drop(columns='Price/Sqft')
df3.head()

Unnamed: 0,Bedroom,Total_Sqft,Bathroom,Price,1st Block Jayanagar,1st Block Koramangala,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,4,2850.0,4.0,42800000,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3,1630.0,3.0,19400000,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,1875.0,2.0,23500000,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,1200.0,2.0,13000000,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,1235.0,2.0,14800000,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Training Data Preparation

In [19]:
# Reminder of which column is the prediction target.
print(target_feature)

Price


In [20]:
# Separate df3 into features X and target y with the execdata helper.
# NOTE(review): the helper's internals are not visible here; presumably X is df3
# without the target column (the X preview below shows no Price column).
X, y = exe.eda.sep(df3, target_feature)

In [21]:
# Display the feature matrix (the notebook truncates the middle rows and columns).
X

Unnamed: 0,Bedroom,Total_Sqft,Bathroom,1st Block Jayanagar,1st Block Koramangala,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,4,2850.0,4.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3,1630.0,3.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,1875.0,2.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,1200.0,2.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,1235.0,2.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10123,3,1676.0,3.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
10124,3,2503.0,3.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
10125,3,1855.0,3.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
10126,3,1876.0,3.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [22]:
# Split df3 into train/test features and targets with the execdata helper.
# NOTE(review): the split ratio and random seed are chosen inside the helper and
# are not visible here; the shapes printed further down show 8102 train and 2026 test rows.
X_train, X_test, y_train, y_test = exe.eda.sep_split(df3,target_feature)

In [23]:
# Switch to the result folder so the files written below with relative names land there.
%cd '{result_dir}'
# Print the working directory to confirm the switch.
!pwd

/content/drive/MyDrive/3.0 Colab Project/3.0 Github Portfolio Project/S6PO3OP-Real Estate Price Prediction/result
/content/drive/MyDrive/3.0 Colab Project/3.0 Github Portfolio Project/S6PO3OP-Real Estate Price Prediction/result


In [24]:
# Report the dimensions of each split, one line per split.
for shape_label, split_data in (
    ("X_train dataset's shape is:", X_train),
    ("y_train dataset's shape is:", y_train),
    ("X_test dataset's shape is:", X_test),
    ("y_test dataset's shape is:", y_test),
):
  print(shape_label, split_data.shape)

X_train dataset's shape is: (8102, 257)
y_train dataset's shape is: (8102,)
X_test dataset's shape is: (2026, 257)
y_test dataset's shape is: (2026,)


In [25]:
# Persist every split as CSV in the current working directory.
for split_data, csv_name in (
    (X_train, 'S603_X_train.csv'),
    (y_train, 'S603_y_train.csv'),
    (X_test, 'S603_X_test.csv'),
    (y_test, 'S603_y_test.csv'),
):
  split_data.to_csv(csv_name)

In [26]:
# First training targets; the shuffled index labels show the rows were randomly sampled.
y_train.head()

8999     8400000
5178     4300000
2435     8600000
3784     8300000
5782    11000000
Name: Price, dtype: int64

In [27]:
# First training feature rows, matching the targets shown above by index label.
X_train.head()

Unnamed: 0,Bedroom,Total_Sqft,Bathroom,1st Block Jayanagar,1st Block Koramangala,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
8999,3,1240.0,2.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5178,2,1003.0,2.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2435,4,1905.0,3.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3784,4,1200.0,4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5782,6,2400.0,7.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Apply Machine Learning Algorithms

In [28]:
# Baseline model: ordinary least-squares linear regression (a regressor,
# despite the `_clf` suffix of the variable name).
lr_clf = LinearRegression().fit(X_train, y_train)
# For a regressor, score() is the R^2 on the held-out test split.
lr_clf.score(X=X_test, y=y_test)

0.8237990217519291

In [29]:
# Five random 80/20 re-splits with a fixed seed, each scored with a fresh linear model.
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=66)
cross_val_score(estimator=LinearRegression(), X=X, y=y, cv=cv)

array([0.82379902, 0.74651919, 0.7550879 , 0.845826  , 0.81919772])

In [30]:
def find_best_model_using_gridsearchcv(X, y):
  """Grid-search several regressors and summarise the best result of each.

  Every candidate model is tuned with ``GridSearchCV`` on the same five
  shuffled 80/20 splits (fixed seed 66) so the scores are comparable.

  Parameters
  ----------
  X : feature matrix passed to ``GridSearchCV.fit``.
  y : target values passed to ``GridSearchCV.fit``.

  Returns
  -------
  pandas.DataFrame
      One row per algorithm with the columns ``model``, ``best_score``
      (mean cross-validated score of the best parameter set) and
      ``best_params``.
  """
  algos = {
      'linear_regression':{
          'model': LinearRegression(),
          'params': {
              'positive': [True, False]
          }
      },
      'lasso':{
          'model': Lasso(),
          'params':{
            'alpha': [1,2],
            'selection': ['random', 'cyclic']
          }
      },
      'decision_tree':{
          'model': DecisionTreeRegressor(),
          'params': {
              # 'mse' was renamed to 'squared_error'; recent scikit-learn releases
              # reject the old name, which makes those candidates fail and score NaN.
              'criterion': ['squared_error', 'friedman_mse'],
              'splitter': ['best', 'random']
          }
      }
  }
  scores = []
  cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=66)
  for algo_name, config in algos.items():
    gs = GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
    gs.fit(X, y)
    scores.append({
        'model':algo_name,
        'best_score': gs.best_score_,
        'best_params': gs.best_params_
    })
  return pd.DataFrame(scores, columns=['model','best_score','best_params'])


In [31]:
# Run the model comparison on the full feature matrix and target.
score = find_best_model_using_gridsearchcv(X,y)

In [32]:
# Best cross-validated score and parameters per algorithm.
score

Unnamed: 0,model,best_score,best_params
0,linear_regression,0.798086,{'positive': False}
1,lasso,0.798089,"{'alpha': 2, 'selection': 'random'}"
2,decision_tree,0.630117,"{'criterion': 'friedman_mse', 'splitter': 'best'}"


In [33]:
def predict_price(location, sqft, bedroom, bathroom):
  """Predict the price of a single property with the fitted ``lr_clf`` model.

  A feature row laid out like ``X.columns`` is built: the first three
  positions hold bedroom, square footage and bathroom (the column order shown
  for ``X``), and the indicator column matching ``location`` is set to 1.

  Parameters
  ----------
  location : name of a location indicator column in ``X``. A name with no
      matching column leaves every indicator at 0, which is how the dropped
      'Others' category is encoded, instead of raising ``IndexError``.
  sqft : total area, written to position 1.
  bedroom : number of bedrooms, written to position 0.
  bathroom : number of bathrooms, written to position 2.

  Returns
  -------
  The model's prediction for that single row.
  """
  x = np.zeros(len(X.columns))
  x[0] = bedroom
  x[1] = sqft
  x[2] = bathroom
  # np.where returns an empty array when the location is unknown, so check the
  # match count before indexing into it.
  matches = np.where(X.columns == location)[0]
  if matches.size > 0:
    x[matches[0]] = 1
  return lr_clf.predict([x])[0]

In [34]:
# Sanity check: position of one location's indicator column within X.columns.
loc_index = np.where(X.columns == '1st Phase JP Nagar')[0][0]
loc_index

5

In [35]:
# Example: 1000 sqft, 2 bedrooms, 2 bathrooms in 1st Phase JP Nagar.
price = predict_price('1st Phase JP Nagar', 1000, 2, 2)

In [36]:
# Scale the prediction down by 100,000 for readability
# (presumably to lakhs — TODO confirm the currency unit of Price).
final_price = float(price/100000)
final_price

89.61524590067906

In [37]:
# Same location with a larger area (1111 sqft), scaled down by 100,000 like final_price.
price2 = predict_price('1st Phase JP Nagar', 1111, 2, 2)
price2 = float(price2/100000)
price2

97.93230110228095

In [38]:
# Same property size in a different location (Indira Nagar), scaled down by 100,000.
price3 = predict_price('Indira Nagar', 1111, 2, 2)
price3 = float(price3/100000)
price3

167.0839464336903

In [39]:
# Indira Nagar again with 1200 sqft, 3 bedrooms and 3 bathrooms, scaled down by 100,000.
price4 = predict_price('Indira Nagar', 1200, 3, 3)
price4 = float(price4/100000)
price4

172.9161823331508

In [42]:
# Confirm the working directory before writing the model files with relative names.
!pwd

/content/drive/MyDrive/3.0 Colab Project/3.0 Github Portfolio Project/S6PO3OP-Real Estate Price Prediction/result


In [44]:
# Serialise the fitted linear model into the current working directory for later reuse.
with open('banglore_home_prices_model.pickle', 'wb') as model_file:
  pickle.dump(lr_clf, model_file)

In [45]:
import json
# Lower-cased feature names in the column order of X, stored as JSON
# (presumably to be loaded alongside the pickled model — confirm with the consumer).
feature_names = [col.lower() for col in X.columns]
columns = {'data_columns': feature_names}
with open('columns.json', 'w') as columns_file:
  columns_file.write(json.dumps(columns))