**IMPORTING NECESSARY LIBRARIES**

In [None]:
import pandas as pd

**LOADING DATASET**

In [None]:
# Read the raw house-price listings into a DataFrame.
# NOTE(review): the absolute /content path only resolves on a Colab-style
# filesystem — confirm before running elsewhere.
data = pd.read_csv('/content/bengaluru_house_prices.csv')

**INFORMATION ABOUT DATASET**

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


**EXTRACTING REQUIRED FEATURES**

In [None]:
# Discard the columns that are not used as model features further down.
data = data.drop(columns=['area_type', 'availability', 'society'])

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13319 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13247 non-null  float64
 4   balcony     12711 non-null  float64
 5   price       13320 non-null  float64
dtypes: float64(3), object(3)
memory usage: 624.5+ KB


**HANDLING MISSING OR NULL VALUES**

In [None]:
# The info() output above shows one missing location (13319 of 13320 non-null).
# NOTE(review): 'Whitefield' is presumably the most frequent location — TODO confirm.
data['location'] = data['location'].fillna(value='Whitefield')

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13247 non-null  float64
 4   balcony     12711 non-null  float64
 5   price       13320 non-null  float64
dtypes: float64(3), object(3)
memory usage: 624.5+ KB


In [None]:
# 73 rows have no bathroom count (13247 of 13320 non-null); fill them with 1.
# NOTE(review): the choice of 1 is not justified in the notebook — TODO confirm.
data['bath'] = data['bath'].fillna(value=1)

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13320 non-null  float64
 4   balcony     12711 non-null  float64
 5   price       13320 non-null  float64
dtypes: float64(3), object(3)
memory usage: 624.5+ KB


In [None]:
# 609 rows have no balcony count (12711 of 13320 non-null); fill them with 1.
# NOTE(review): the choice of 1 is not justified in the notebook — TODO confirm.
data['balcony'] = data['balcony'].fillna(value=1)

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13320 non-null  float64
 4   balcony     13320 non-null  float64
 5   price       13320 non-null  float64
dtypes: float64(3), object(3)
memory usage: 624.5+ KB


**PREPROCESSING AREA AND SIZE COLUMNS**

In [None]:
def convert_sqft(x, default=1000):
    """Convert a raw ``total_sqft`` entry to a numeric area.

    Three textual forms are recognised (case-insensitive, surrounding
    whitespace ignored):

    * a plain number, e.g. ``'1056'``;
    * a range ``'low - high'``, which is replaced by its midpoint;
    * a number followed by ``sqft``, e.g. ``'1200 sqft'``.

    Anything else (other units, free text, missing values) yields ``default``.

    Parameters
    ----------
    x : object
        Raw cell value; it is stringified before parsing.
    default : float, optional
        Value returned when ``x`` cannot be parsed to a finite number.
        Defaults to 1000, the fallback this notebook has always used.

    Returns
    -------
    float
        The parsed area, or ``default`` when parsing fails.
    """
    import math  # local import so the cell does not depend on another cell

    text = str(x).strip().lower()
    try:
        if '-' in text:
            # Range such as '1133 - 1384': use the midpoint of the first two parts.
            low, high = text.split('-')[:2]
            value = (float(low) + float(high)) / 2
        elif 'sqft' in text:
            value = float(text.replace('sqft', '').strip())
        else:
            value = float(text)
    except ValueError:
        # Only parse failures are expected here; anything else should surface.
        return default
    # float() accepts 'nan' and 'inf', so a missing value would otherwise leak
    # through as a non-finite area.
    return value if math.isfinite(value) else default
def extract_bhk(x):
    """Extract the leading room count from a ``size`` entry.

    Entries containing ``bhk`` or ``bedroom`` (case-insensitive), such as
    ``'2 BHK'`` or ``'4 Bedroom'``, return their first whitespace-separated
    token as an int. Every other value — including missing values and entries
    whose first token is not an integer — returns 1.

    Parameters
    ----------
    x : object
        Raw cell value; it is stringified before parsing.

    Returns
    -------
    int
        The parsed room count, or 1 as the fallback.
    """
    text = str(x).lower()
    if 'bhk' not in text and 'bedroom' not in text:
        return 1
    try:
        return int(text.split()[0])
    except (ValueError, IndexError):
        # First token is not an integer (e.g. '3bhk' written without a space).
        return 1

# Replace the two raw text columns with their parsed numeric values.
for column_name, parser in (('total_sqft', convert_sqft), ('size', extract_bhk)):
    data[column_name] = data[column_name].apply(parser)

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13320 non-null  int64  
 2   total_sqft  13320 non-null  float64
 3   bath        13320 non-null  float64
 4   balcony     13320 non-null  float64
 5   price       13320 non-null  float64
dtypes: float64(4), int64(1), object(1)
memory usage: 624.5+ KB


In [None]:
data.head(10)

Unnamed: 0,location,size,total_sqft,bath,balcony,price
0,Electronic City Phase II,2,1056.0,2.0,1.0,39.07
1,Chikka Tirupathi,4,2600.0,5.0,3.0,120.0
2,Uttarahalli,3,1440.0,2.0,3.0,62.0
3,Lingadheeranahalli,3,1521.0,3.0,1.0,95.0
4,Kothanur,2,1200.0,2.0,1.0,51.0
5,Whitefield,2,1170.0,2.0,1.0,38.0
6,Old Airport Road,4,2732.0,4.0,1.0,204.0
7,Rajaji Nagar,4,3300.0,4.0,1.0,600.0
8,Marathahalli,3,1310.0,3.0,1.0,63.25
9,Gandhi Bazar,6,1020.0,6.0,1.0,370.0


**LOADING THE PREPROCESSORS AND SCALERS**

In [None]:
from sklearn.preprocessing import LabelEncoder,MinMaxScaler

**ENCODING THE LOCATION VALUES**

In [None]:
# Fit the encoder on every location string, then replace the column with the
# integer codes. The fitted encoder is kept because it is saved and reused at
# prediction time further down.
location_encoder = LabelEncoder().fit(data['location'])
data['location'] = location_encoder.transform(data['location'])

In [None]:
data.head(10)

Unnamed: 0,location,size,total_sqft,bath,balcony,price
0,419,2,1056.0,2.0,1.0,39.07
1,317,4,2600.0,5.0,3.0,120.0
2,1179,3,1440.0,2.0,3.0,62.0
3,757,3,1521.0,3.0,1.0,95.0
4,716,2,1200.0,2.0,1.0,51.0
5,1252,2,1170.0,2.0,1.0,38.0
6,895,4,2732.0,4.0,1.0,204.0
7,976,4,3300.0,4.0,1.0,600.0
8,799,3,1310.0,3.0,1.0,63.25
9,434,6,1020.0,6.0,1.0,370.0


In [None]:
data.head(10)

Unnamed: 0,location,size,total_sqft,bath,balcony,price
0,419,2,1056.0,2.0,1.0,39.07
1,317,4,2600.0,5.0,3.0,120.0
2,1179,3,1440.0,2.0,3.0,62.0
3,757,3,1521.0,3.0,1.0,95.0
4,716,2,1200.0,2.0,1.0,51.0
5,1252,2,1170.0,2.0,1.0,38.0
6,895,4,2732.0,4.0,1.0,204.0
7,976,4,3300.0,4.0,1.0,600.0
8,799,3,1310.0,3.0,1.0,63.25
9,434,6,1020.0,6.0,1.0,370.0


**NORMALIZING AREA AND PRICE VALUES**

In [None]:
# Rescale the area column to the [0, 1] range.
# NOTE(review): the scaler is fit on all rows, before the train/test split
# performed later, so held-out rows influence the min/max.
area_scaler = MinMaxScaler().fit(data[['total_sqft']])
data['total_sqft'] = area_scaler.transform(data[['total_sqft']])

In [None]:
# Rescale the target column to the [0, 1] range; the fitted scaler is used
# later to map predictions back to the original price units.
# NOTE(review): fit on all rows, before the train/test split performed later.
price_scaler = MinMaxScaler().fit(data[['price']])
data['price'] = price_scaler.transform(data[['price']])

In [None]:
data.head(10)

Unnamed: 0,location,size,total_sqft,bath,balcony,price
0,419,2,0.020183,2.0,1.0,0.00865
1,317,4,0.049722,5.0,3.0,0.03118
2,1179,3,0.02753,2.0,3.0,0.015033
3,757,3,0.029079,3.0,1.0,0.02422
4,716,2,0.022938,2.0,1.0,0.011971
5,1252,2,0.022364,2.0,1.0,0.008352
6,895,4,0.052247,4.0,1.0,0.054566
7,976,4,0.063113,4.0,1.0,0.164811
8,799,3,0.025043,3.0,1.0,0.015381
9,434,6,0.019495,6.0,1.0,0.10078


**CONVERTING INPUT FEATURES TO ARRAYS**

In [None]:
# Feature matrix; the column order here is the order the model expects at
# prediction time.
X = data[['location', 'size', 'total_sqft', 'bath', 'balcony']].to_numpy()

In [None]:
X

array([[4.19000000e+02, 2.00000000e+00, 2.01832756e-02, 2.00000000e+00,
        1.00000000e+00],
       [3.17000000e+02, 4.00000000e+00, 4.97216430e-02, 5.00000000e+00,
        3.00000000e+00],
       [1.17900000e+03, 3.00000000e+00, 2.75296053e-02, 2.00000000e+00,
        3.00000000e+00],
       ...,
       [9.72000000e+02, 2.00000000e+00, 2.18094163e-02, 2.00000000e+00,
        1.00000000e+00],
       [9.07000000e+02, 4.00000000e+00, 8.96864418e-02, 4.00000000e+00,
        1.00000000e+00],
       [3.96000000e+02, 1.00000000e+00, 1.05029557e-02, 1.00000000e+00,
        1.00000000e+00]])

**EXTRACTING THE TARGET VALUES INTO AN ARRAY**

In [None]:
# Target vector: the min-max scaled price.
Y = data['price'].to_numpy()

In [None]:
Y

array([0.00864978, 0.0311804 , 0.01503341, ..., 0.01447661, 0.13363029,
       0.00250557])

In [None]:
# Sanity check: undoing the scaling on the first target should give back the
# original price of the first row.
print(price_scaler.inverse_transform(Y[:1].reshape(-1, 1)))

[[39.07]]


**IMPORTING MACHINE LEARNING MODEL AND OTHER REQUIREMENTS**

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


**INSTANTIATING THE MODEL WITH TUNED HYPERPARAMETERS**

In [None]:
# Gradient-boosted trees with hand-picked hyperparameters.
# max_features=0.8 makes each split consider a random subset of the features,
# so random_state is fixed to keep results reproducible across runs.
model = GradientBoostingRegressor(
    n_estimators=150,
    learning_rate=0.2,
    min_samples_split=2,
    min_samples_leaf=1,
    subsample=1.0,
    max_depth=6,
    max_features=0.8,
    random_state=42,
)

**SPLITTING DATA INTO TRAIN AND TEST SET**

In [None]:
# Hold out 10% of the rows for evaluation; the seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Show the shape of each split, in the order train/test features then targets.
for split_array in (x_train, x_test, y_train, y_test):
    print(split_array.shape)

(11988, 5)
(1332, 5)
(11988,)
(1332,)


**TRAINING THE MODEL**

In [None]:
# Fit on the training split only. Fitting on the full X/Y would let the model
# see the held-out rows, so any later evaluation on them would be optimistic.
model.fit(x_train, y_train)

**PREDICTING WITH THE MODEL**

In [None]:
y_pre=model.predict(x_train)

In [None]:
y_pre_inv=price_scaler.inverse_transform(y_pre.reshape(-1,1))
y_test_inv=price_scaler.inverse_transform(y_train.reshape(-1,1))

**VERIFYING THE PREDICTED VALUES AGAINST THE ACTUAL VALUES MANUALLY**

In [None]:
# Print the first 50 actual/predicted pairs side by side for a visual check.
print("y_test           y_pre")
for row in range(50):
    actual_price = round(y_test_inv[row][0])
    predicted_price = round(y_pre_inv[row][0], 2)
    print(actual_price, "       ", predicted_price)

y_test           y_pre
175         170.73
25         33.13
40         45.32
65         76.61
60         71.79
58         76.17
148         149.92
37         37.06
48         44.4
152         178.02
220         258.78
35         40.55
75         79.21
34         48.12
55         57.66
120         89.14
449         470.15
100         85.48
67         76.11
62         81.2
101         77.59
82         84.98
48         55.42
30         41.07
140         186.19
34         45.99
53         50.24
55         49.17
85         71.69
65         83.65
65         70.15
45         93.97
20         21.64
105         79.35
270         198.93
44         47.18
46         57.35
16         27.92
57         130.91
56         93.26
46         51.77
46         63.57
170         132.08
68         78.77
82         121.67
65         61.97
70         87.82
51         48.53
145         141.69
95         86.26


**FINDING THE DEVIATION BETWEEN ACTUAL AND PREDICTED VALUES**

In [None]:
# Mean squared error in original price units (both arrays are inverse-scaled).
mse = mean_squared_error(y_true=y_test_inv, y_pred=y_pre_inv)
print(mse)

956.5085294379579


**SAVING THE MODEL**

In [None]:
import joblib
from google.colab import files

# Every fitted object needed at prediction time, keyed by its output file name.
artifacts = {
    'location_encoder.joblib': location_encoder,
    'area_scaler.joblib': area_scaler,
    'price_scaler.joblib': price_scaler,
    'gradient_boosting_model.joblib': model,
}

# Serialise all artifacts into the current working directory first.
for artifact_path, fitted_object in artifacts.items():
    joblib.dump(fitted_object, artifact_path)

# Download the files
for artifact_path in artifacts:
    files.download(artifact_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
print(x_train[0])

[1.2610000e+03 4.0000000e+00 4.9721643e-02 4.0000000e+00 3.0000000e+00]


In [None]:
# Predict the (scaled) price for the first training row as a spot check.
res = model.predict(x_train[:1])
print(res)

[0.04530309]


In [None]:
# Convert the spot-check prediction back to original price units.
print(price_scaler.inverse_transform(res[:, None]))

[[170.72870713]]


In [None]:
# Actual price of the first training row, for comparison with the prediction.
print(price_scaler.inverse_transform(y_train[:1].reshape(-1, 1)))

[[175.]]


In [None]:
import joblib

In [None]:
# Reload the saved artifacts using the same relative file names they were
# dumped under, so this cell works from whatever working directory they were
# written to rather than only from /content.
# NOTE: joblib.load unpickles arbitrary objects — only load files you trust.
mymodel=joblib.load('gradient_boosting_model.joblib')
area_scaler=joblib.load('area_scaler.joblib')
location_encoder=joblib.load('location_encoder.joblib')
price_scaler=joblib.load('price_scaler.joblib')

In [None]:
def price_prediction(loc,size,area,bath,colony):
  """Predict the price of a property, print it and return it.

  The inputs are transformed the same way as the training features: the
  location name is encoded with ``location_encoder`` and the area is scaled
  with ``area_scaler``. The model output is mapped back to original price
  units with ``price_scaler``.

  Parameters
  ----------
  loc : str
      Location name; must be one of the labels the encoder was fitted on.
  size : int
      Number of rooms, as produced by ``extract_bhk``.
  area : float
      Area in the same units as the ``total_sqft`` training column.
  bath : float
      Number of bathrooms.
  colony : float
      Passed in the feature position that holds ``balcony`` in the training
      matrix. NOTE(review): the name looks like a typo for "balcony"; it is
      kept so existing keyword callers keep working.

  Returns
  -------
  float
      Predicted price in the units of the original ``price`` column
      (printed with an ``L`` suffix).

  Raises
  ------
  ValueError
      If ``loc`` is not a location known to the encoder.
  """
  # Fail with a readable message instead of the encoder's generic error.
  if loc not in location_encoder.classes_:
    raise ValueError(f"Unknown location: {loc!r}")
  loc_code=location_encoder.transform([loc])[0]
  area_scaled=area_scaler.transform([[area]])[0][0]
  scaled_price=mymodel.predict([[loc_code,size,area_scaled,bath,colony]])
  price=price_scaler.inverse_transform(scaled_price.reshape(-1,1))[0][0]
  print(f"{price}L")
  return price

# Assign the result so the cell shows only the printed line, as before.
predicted_price=price_prediction('Chikka Tirupathi',4,2500,5,3)


113.46446440247156L
