In [1]:
#======================= IMPORT PACKAGES ============================

import pandas as pd
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

#======================= DATA SELECTION =========================
# Read the listings CSV into `data` and preview its first ten rows.

print(
    "=======================================",
    "---------- Data Selection -------------",
    "=======================================",
    sep="\n",
)
data = pd.read_csv('Bengaluru_House_Data.csv')
# end="\n\n" emits the blank separator line after the preview.
print(data.head(10), end="\n\n")


---------- Data Selection -------------
             area_type   availability                  location       size  \
0  Super built-up Area         19-Dec  Electronic City Phase II      2 BHK   
1            Plot Area  Ready To Move          Chikka Tirupathi  4 Bedroom   
2        Built-up Area  Ready To Move               Uttarahalli      3 BHK   
3  Super built-up Area  Ready To Move        Lingadheeranahalli      3 BHK   
4  Super built-up Area  Ready To Move                  Kothanur      2 BHK   
5  Super built-up Area  Ready To Move                Whitefield      2 BHK   
6  Super built-up Area         18-May          Old Airport Road      4 BHK   
7  Super built-up Area  Ready To Move              Rajaji Nagar      4 BHK   
8  Super built-up Area  Ready To Move              Marathahalli      3 BHK   
9            Plot Area  Ready To Move              Gandhi Bazar  6 Bedroom   

   society total_sqft  bath  balcony   price  
0  Coomee        1056   2.0      1.0   39.07  
1  Thea

In [2]:
#==================== PREPROCESSING =======================================

# Report the per-column null counts, replace every null with 0, then report
# the counts again to confirm nothing is left missing.

print(
    "=====================================================",
    "--------- Before Checking missing values ------------",
    "=====================================================",
    sep="\n",
)
print(data.isna().sum(), end="\n\n")


# NOTE(review): the same 0 fill is applied to every column, text and numeric
# alike -- confirm this is the intended imputation.
data = data.fillna(value=0)

print(
    "=====================================================",
    "--------- After Checking missing values ------------",
    "=====================================================",
    sep="\n",
)
print(data.isna().sum(), end="\n\n")

--------- Before Checking missing values ------------
area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

--------- After Checking missing values ------------
area_type       0
availability    0
location        0
size            0
society         0
total_sqft      0
bath            0
balcony         0
price           0
dtype: int64



In [3]:
#==== LABEL ENCODING ====
# Replace each categorical column of `data` with integer codes and drop the
# `total_sqft` column so that only encoded/numeric columns remain.

from sklearn import preprocessing

# Columns that are converted to integer codes.
CATEGORICAL_COLUMNS = ['area_type', 'availability', 'location', 'size', 'society']

print("-----------------------------------------------------------")
print("================== Before label Encoding ==================")
print("-----------------------------------------------------------")
print()

print(data['area_type'].head(20))

# One encoder per column: re-fitting a single shared encoder would discard the
# fitted classes of every column except the last, leaving no way to map the
# codes back to their labels. `label_encoders[column].classes_` keeps them.
label_encoders = {}
for column in CATEGORICAL_COLUMNS:
    label_encoder = preprocessing.LabelEncoder()
    # astype(str) gives the encoder a single comparable type even where the
    # earlier fill put the number 0 into an otherwise textual column.
    data[column] = label_encoder.fit_transform(data[column].astype(str))
    label_encoders[column] = label_encoder
# After the loop `label_encoder` is the encoder fitted on 'society', the same
# column the previously shared encoder was last fitted on.

print("-----------------------------------------------------------")
print("================== After label Encoding ==================")
print("-----------------------------------------------------------")
print()

print(data['area_type'].head(20))

# `total_sqft` is excluded from the feature set. The earlier
# `.replace('-', '')` on this column was removed: without regex=True it only
# matches cells equal to '-' and the column is dropped straight afterwards,
# so it had no effect on the result.
data = data.drop(columns='total_sqft')



-----------------------------------------------------------
-----------------------------------------------------------

0     Super built-up Area
1               Plot Area
2           Built-up Area
3     Super built-up Area
4     Super built-up Area
5     Super built-up Area
6     Super built-up Area
7     Super built-up Area
8     Super built-up Area
9               Plot Area
10    Super built-up Area
11              Plot Area
12    Super built-up Area
13          Built-up Area
14              Plot Area
15    Super built-up Area
16    Super built-up Area
17    Super built-up Area
18    Super built-up Area
19    Super built-up Area
Name: area_type, dtype: object
-----------------------------------------------------------
-----------------------------------------------------------

0     3
1     2
2     0
3     3
4     3
5     3
6     3
7     3
8     3
9     2
10    3
11    2
12    3
13    0
14    2
15    3
16    3
17    3
18    3
19    3
Name: area_type, dtype: int32


In [4]:
#========================= DATA SPLITTING ============================

# Separate the target column from the features, then hold out 30% of the rows
# for testing with a fixed seed so the split is repeatable.

y = data['price']
x = data.drop(columns='price')
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=2,
)

print(
    "-----------------------------------------------------------",
    "======================= Data splitting ====================",
    "-----------------------------------------------------------",
    sep="\n",
    end="\n\n",
)
# Each count is followed by one blank line.
print("Total No Of data          :", len(data), end="\n\n")
print("Total No of Training data :", len(X_train), end="\n\n")
print("Total No of Testing data :", len(X_test), end="\n\n")

-----------------------------------------------------------
-----------------------------------------------------------

Total No Of data          : 13320

Total No of Training data : 9324

Total No of Testing data : 3996



In [5]:
#========================= REGRESSION ============================

from sklearn import metrics
from sklearn.linear_model import Ridge

#=== ridge regression ===

# Build the ridge model (alpha = 1) and fit it on the training split;
# fit() returns the estimator itself, so the two steps chain.
ridgeR = Ridge(alpha=1).fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred = ridgeR.predict(X_test)

print(
    "-----------------------------------------------------------",
    "======================= RIDGE REGRESSION ===================",
    "-----------------------------------------------------------",
    sep="\n",
    end="\n\n",
)

# Mean absolute error of the ridge predictions on the test split.
mae_ridge = metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)

print("1.Mean Absolute Error : ", mae_ridge)



-----------------------------------------------------------
-----------------------------------------------------------

1.Mean Absolute Error :  53.87566264620391


In [6]:
#===== Random Forest Regression Model =======
# Fit a 100-tree random forest on the training split and report its mean
# absolute error on the test split.

# `metrics` is imported here as well so this cell does not depend on the
# ridge cell having been run first.
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor

# create regressor object (fixed seed so the forest is repeatable)
regressor = RandomForestRegressor(n_estimators=100, random_state=0)

# fit the regressor with x and y data
regressor.fit(X_train, y_train)

y_pred_rf = regressor.predict(X_test)

print("-----------------------------------------------------------")
print("============== RANDOM FOREST REGRESSION ===================")
print("-----------------------------------------------------------")
print()

# Arguments follow the documented (y_true, y_pred) order, matching the ridge
# cell. MAE is symmetric, so the reported value is unchanged.
mae_rf = metrics.mean_absolute_error(y_test, y_pred_rf)

print("1.Mean Absolute Error :", mae_rf)
print()


-----------------------------------------------------------
-----------------------------------------------------------

1.Mean Absolute Error : 39.97624260715149



In [7]:
#========================= PREDICTION ============================
# Show the random forest's predictions for the first ten test rows.

print(
    "-----------------------------------------------------------",
    "======================= PREDICTION ========================",
    "-----------------------------------------------------------",
    sep="\n",
    end="\n\n",
)

for i in range(10):
    Results = y_pred_rf[i]
    # Separator, blank line, then the bracketed index with its prediction.
    print("------------------------------------------", end="\n\n")
    print([i], "The predicted house price is ", Results, end="\n\n")


-----------------------------------------------------------
-----------------------------------------------------------

------------------------------------------

[0] The predicted house price is  382.6025

------------------------------------------

[1] The predicted house price is  84.62495000000001

------------------------------------------

[2] The predicted house price is  1016.545

------------------------------------------

[3] The predicted house price is  199.20225974025976

------------------------------------------

[4] The predicted house price is  220.68635000000003

------------------------------------------

[5] The predicted house price is  83.23

------------------------------------------

[6] The predicted house price is  63.01521000000008

------------------------------------------

[7] The predicted house price is  64.8627

------------------------------------------

[8] The predicted house price is  116.00366666666665

------------------------------------------


In [10]:
#=============================== PREDICTION ===========================
# Predict the price of a single house with the fitted ridge model: first for
# a hard-coded sample, then for values typed in by the user.
#
# NOTE: the input() prompts below block a non-interactive "Run All".

import numpy as np
import pandas as pd

# Column order the models were fitted on. Every sample is labelled with these
# names so scikit-learn can check it against the training features instead of
# receiving an anonymous array.
FEATURE_COLUMNS = list(X_train.columns)


def predict_price(row):
    """Return the ridge model's prediction for one encoded sample.

    Parameters
    ----------
    row : array-like of shape (1, n_features)
        Encoded feature values in the same order as FEATURE_COLUMNS.

    Returns
    -------
    numpy.ndarray
        The output of ``ridgeR.predict`` for the sample.

    Raises
    ------
    ValueError
        Raised by the DataFrame constructor when the number of values does
        not match the number of training columns.
    """
    sample = pd.DataFrame(row, columns=FEATURE_COLUMNS)
    return ridgeR.predict(sample)


print()
print("-------------------------------------------------------------")
print()
print("======== Input data 1 =============")
print()
input_1 = np.array([2,80,671,23,0,7,0]).reshape(1, -1)
print()
print("The Actuall input data is : ",input_1)
predicted_data = predict_price(input_1)
print()
print("Thepredicted house price is : ", predicted_data)
print()

# Prompts are asked in training-column order; each answer must be the integer
# code of the value, not its original text label.
areatype, ava, loc, area, soc, bath, bal = (
    int(input(prompt))
    for prompt in (
        "Enter ARea type",
        "Enter availability",
        "Enter location",
        "Enter size",
        "Enter society",
        "Enter bath",
        "Enter balcony",
    )
)

input_1 = np.array([areatype,ava,loc,area,soc,bath,bal]).reshape(1, -1)

predicted_data = predict_price(input_1)
print()
print("Thepredicted house price is : ", predicted_data)
print()







-------------------------------------------------------------



The Actuall input data is :  [[  2  80 671  23   0   7   0]]

Thepredicted house price is :  [335.91666918]

Enter ARea type1
Enter availability2
Enter location3
Enter size3
Enter society4
Enter bath5
Enter balcony6

Thepredicted house price is :  [175.73472168]



In [None]:
# ===== Graphs =========
# Exploratory plots of the encoded data plus a comparison of the two models.
# Each figure is drawn on its own explicit Axes.

import matplotlib.pyplot as plt
import seaborn as sns

# --- Encoded area type against encoded location, coloured by price.
fig, ax = plt.subplots()
sns.scatterplot(x=data['area_type'], y=data['location'], hue=data['price'], ax=ax)
ax.set_title("Scatter Plot")
plt.show()

# --- Test-split mean absolute error of the two models, side by side.
fig, ax = plt.subplots()
sns.barplot(x=['RF', 'Ridge'], y=[mae_rf, mae_ridge], ax=ax)
ax.set_ylabel("Mean Absolute Error")
ax.set_title("Comparison")
plt.show()

# --- Distribution of the target column.
fig, ax = plt.subplots()
ax.hist(y)
ax.set_xlabel("price")
ax.set_ylabel("count")
ax.set_title("Histogram")
plt.show()

# --- Ring-style pie of rows per area type. The labels are the integer codes,
# because `area_type` has already been label-encoded at this point.
fig, ax = plt.subplots(figsize=(7, 7))
counts = data['area_type'].value_counts()
ax.pie(
    counts,
    labels=counts.index,
    startangle=90,
    counterclock=False,
    wedgeprops={'width': 0.6},
    autopct='%1.1f%%',
    pctdistance=0.55,
    textprops={'color': 'black', 'fontsize': 15},
    shadow=True,
    colors=sns.color_palette("Paired")[3:],
)
# The centre annotation shows the number of rows, so it is labelled as a
# record count rather than as a price.
ax.text(x=-0.35, y=0, s='Total Records: {}'.format(data.shape[0]))
ax.set_title('Area Type', fontsize=14)
# Save before show() so the file is written from the fully drawn figure.
fig.savefig("graph.png")
plt.show()

# --- Pairwise correlation of three encoded columns.
fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(data[["area_type", "location", "society"]].corr(), annot=True, ax=ax)
ax.set_title("Heat map")
plt.show()

