In [29]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [30]:
df = pd.read_csv('mumbai.csv')

In [31]:
df.columns

Index(['price', 'Address', 'area', 'latitude', 'longitude', 'Bedrooms',
       'Bathrooms', 'Balcony', 'Status', 'neworold', 'parking',
       'Furnished_status', 'Lift', 'Landmarks', 'type_of_building', 'desc',
       'Price_sqft'],
      dtype='object')

In [32]:
df = df[['price','Address','area','Bedrooms','Bathrooms']]
# df = df[['price','Address','area','Bedrooms','Bathrooms','Price_sqft']]

In [33]:
df.describe()

Unnamed: 0,price,area,Bedrooms,Bathrooms
count,6255.0,6255.0,6255.0,6255.0
mean,26841870.0,1185.62526,2.452278,2.42558
std,27888310.0,636.973259,0.749534,0.762604
min,1500000.0,503.0,2.0,0.0
25%,13000000.0,846.5,2.0,2.0
50%,19000000.0,1000.0,2.0,2.0
75%,30000000.0,1300.0,3.0,3.0
max,360000000.0,8000.0,10.0,10.0


In [34]:
regions = {
    "West": ['Andheri','Bandra','Borivali','Dahisar','Goregaon','Jogeshwari','Juhu','Kandivali','Khar','Malad','Mira Bhayandar','Santacruz','Vile Parle','Vasai Virar'],
    "East": ['Bhandup','Ghatkopar','Kanjurmarg','Kurla','Mulund','Nahur','Powai','Vidyavihar','Vikhroli'],
    "Harbour": ['Chembur','Wadala','Govandi','Mankhurd','Trombay'],
    "South": ['Antop Hill','Byculla','Colaba','Dadar','Fort','Girgaon','Kalbadevi','Kamathipura','Matunga','Parel','Tardeo']
}

def set_region(string):
    for region, region_list in regions.items():
        if any(region in string for region in region_list):
            return region
    return "Other"


In [35]:
df['Address'] = df['Address'].apply(set_region)

In [36]:
def remove_outliers(df,cols):
    new_df = None
    for col in cols:
        low,high = df[col].quantile([0.1,0.9])
        new_df = df[(df[col]>=low)&(df[col]<=high)]
        df=new_df
    return df

In [37]:
a = remove_outliers(df,['area','Bedrooms','Bathrooms'])
# a = remove_outliers(df,['area','Bedrooms','Bathrooms','Price_sqft'])
a.describe()

Unnamed: 0,price,area,Bedrooms,Bathrooms
count,4915.0,4915.0,4915.0,4915.0
mean,21313660.0,1049.492981,2.273652,2.226857
std,14187550.0,250.136548,0.445878,0.418842
min,1500000.0,700.0,2.0,2.0
25%,12700000.0,852.0,2.0,2.0
50%,18500000.0,1000.0,2.0,2.0
75%,27000000.0,1200.0,3.0,2.0
max,200000000.0,1800.0,3.0,3.0


In [38]:
b = pd.get_dummies(a,drop_first=True)
y = b['price']
x = b.drop(['price'],axis=1)
b

Unnamed: 0,price,area,Bedrooms,Bathrooms,Address_Harbour,Address_Other,Address_South,Address_West
1,35000000.0,974.0,3.0,2.0,True,False,False,False
2,31700000.0,968.0,3.0,3.0,False,False,False,False
4,13500000.0,1090.0,2.0,2.0,False,False,False,True
6,20700000.0,1188.0,2.0,2.0,False,False,False,True
7,22900000.0,968.0,3.0,3.0,False,False,False,True
...,...,...,...,...,...,...,...,...
6250,19500000.0,810.0,2.0,2.0,False,True,False,False
6251,22000000.0,1400.0,3.0,3.0,False,True,False,False
6252,20000000.0,750.0,2.0,2.0,False,False,False,True
6253,11000000.0,700.0,2.0,2.0,False,True,False,False


In [39]:
b.corr()['price']

price              1.000000
area               0.456931
Bedrooms           0.371585
Bathrooms          0.400952
Address_Harbour    0.027271
Address_Other     -0.181234
Address_South      0.201579
Address_West       0.120038
Name: price, dtype: float64

In [40]:
b.head()

Unnamed: 0,price,area,Bedrooms,Bathrooms,Address_Harbour,Address_Other,Address_South,Address_West
1,35000000.0,974.0,3.0,2.0,True,False,False,False
2,31700000.0,968.0,3.0,3.0,False,False,False,False
4,13500000.0,1090.0,2.0,2.0,False,False,False,True
6,20700000.0,1188.0,2.0,2.0,False,False,False,True
7,22900000.0,968.0,3.0,3.0,False,False,False,True


In [41]:
x.shape
y.shape

(4915,)

In [42]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=16)
model = RandomForestRegressor(n_estimators=30)
model.fit(x_train,y_train)
model.score(x_test,y_test)

0.2513473097309368

In [43]:
price = model.predict(x_test)
format_price(price)

'Rs. 3,14,21,241'

In [44]:
def format_price(temp):
    temp = str(price[0]).split(".")[0]
    ans = ""
    temp = temp[::-1]
    c =0
    for i in range(len(temp)):
        if temp[i]==',':
            break
        if c==3:
            ans+=","
        c+=1
        ans+=temp[i]
    left,ans = ans.split(",")[0],ans.split(",")[-1]
    t = ""
    c = 0
    for i in range(len(ans)):
        if c==2:
            t+=","
            c =0 
        c+=1
        t+=ans[i]
    t = t[::-1]
    finalans = "Rs. "+t+","+left
    return finalans

In [45]:
import pickle

In [46]:
with open('house_prediction.pkl','wb') as file:
    pickle.dump(model,file)

In [47]:
with open('house_df.pkl','wb') as file:
    pickle.dump(df,file)

In [48]:
with open('house_prediction.pkl','rb') as pkl_file:
    house_model = pickle.load(pkl_file)