In [29]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [30]:
df = pd.read_csv('mumbai.csv')

In [31]:
df.columns

Index(['price', 'Address', 'area', 'latitude', 'longitude', 'Bedrooms',
       'Bathrooms', 'Balcony', 'Status', 'neworold', 'parking',
       'Furnished_status', 'Lift', 'Landmarks', 'type_of_building', 'desc',
       'Price_sqft'],
      dtype='object')

In [32]:
df = df[['price','Address','area','Bedrooms','Bathrooms']]
# df = df[['price','Address','area','Bedrooms','Bathrooms','Price_sqft']]

In [33]:
df.describe()

Unnamed: 0,price,area,Bedrooms,Bathrooms
count,6255.0,6255.0,6255.0,6255.0
mean,26841870.0,1185.62526,2.452278,2.42558
std,27888310.0,636.973259,0.749534,0.762604
min,1500000.0,503.0,2.0,0.0
25%,13000000.0,846.5,2.0,2.0
50%,19000000.0,1000.0,2.0,2.0
75%,30000000.0,1300.0,3.0,3.0
max,360000000.0,8000.0,10.0,10.0


In [34]:
regions = {
    "West": ['Andheri','Bandra','Borivali','Dahisar','Goregaon','Jogeshwari','Juhu','Kandivali','Khar','Malad','Mira Bhayandar','Santacruz','Vile Parle','Vasai Virar'],
    "East": ['Bhandup','Ghatkopar','Kanjurmarg','Kurla','Mulund','Nahur','Powai','Vidyavihar','Vikhroli'],
    "Harbour": ['Chembur','Wadala','Govandi','Mankhurd','Trombay'],
    "South": ['Antop Hill','Byculla','Colaba','Dadar','Fort','Girgaon','Kalbadevi','Kamathipura','Matunga','Parel','Tardeo']
}

def set_region(string):
    for region, region_list in regions.items():
        if any(region in string for region in region_list):
            return region
    return "Other"


In [35]:
df['Address'] = df['Address'].apply(set_region)

In [36]:
def remove_outliers(df,cols):
    new_df = None
    for col in cols:
        low,high = df[col].quantile([0.1,0.9])
        new_df = df[(df[col]>=low)&(df[col]<=high)]
        df=new_df
    return df

In [37]:
a = remove_outliers(df,['area','Bedrooms','Bathrooms'])
# a = remove_outliers(df,['area','Bedrooms','Bathrooms','Price_sqft'])
a.describe()

Unnamed: 0,price,area,Bedrooms,Bathrooms
count,4915.0,4915.0,4915.0,4915.0
mean,21313660.0,1049.492981,2.273652,2.226857
std,14187550.0,250.136548,0.445878,0.418842
min,1500000.0,700.0,2.0,2.0
25%,12700000.0,852.0,2.0,2.0
50%,18500000.0,1000.0,2.0,2.0
75%,27000000.0,1200.0,3.0,2.0
max,200000000.0,1800.0,3.0,3.0


In [38]:
b = pd.get_dummies(a,drop_first=True)
y = b['price']
x = b.drop(['price'],axis=1)
b

Unnamed: 0,price,area,Bedrooms,Bathrooms,Address_Harbour,Address_Other,Address_South,Address_West
1,35000000.0,974.0,3.0,2.0,True,False,False,False
2,31700000.0,968.0,3.0,3.0,False,False,False,False
4,13500000.0,1090.0,2.0,2.0,False,False,False,True
6,20700000.0,1188.0,2.0,2.0,False,False,False,True
7,22900000.0,968.0,3.0,3.0,False,False,False,True
...,...,...,...,...,...,...,...,...
6250,19500000.0,810.0,2.0,2.0,False,True,False,False
6251,22000000.0,1400.0,3.0,3.0,False,True,False,False
6252,20000000.0,750.0,2.0,2.0,False,False,False,True
6253,11000000.0,700.0,2.0,2.0,False,True,False,False


In [39]:
b.corr()['price']

price              1.000000
area               0.456931
Bedrooms           0.371585
Bathrooms          0.400952
Address_Harbour    0.027271
Address_Other     -0.181234
Address_South      0.201579
Address_West       0.120038
Name: price, dtype: float64

In [40]:
b.head()

Unnamed: 0,price,area,Bedrooms,Bathrooms,Address_Harbour,Address_Other,Address_South,Address_West
1,35000000.0,974.0,3.0,2.0,True,False,False,False
2,31700000.0,968.0,3.0,3.0,False,False,False,False
4,13500000.0,1090.0,2.0,2.0,False,False,False,True
6,20700000.0,1188.0,2.0,2.0,False,False,False,True
7,22900000.0,968.0,3.0,3.0,False,False,False,True


In [41]:
x.shape
y.shape

(4915,)

In [42]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=16)
model = RandomForestRegressor(n_estimators=30)
model.fit(x_train,y_train)
model.score(x_test,y_test)

0.2513473097309368

In [43]:
price = model.predict(x_test)
format_price(price)

'Rs. 3,14,21,241'

In [44]:
def format_price(temp):
    temp = str(price[0]).split(".")[0]
    ans = ""
    temp = temp[::-1]
    c =0
    for i in range(len(temp)):
        if temp[i]==',':
            break
        if c==3:
            ans+=","
        c+=1
        ans+=temp[i]
    left,ans = ans.split(",")[0],ans.split(",")[-1]
    t = ""
    c = 0
    for i in range(len(ans)):
        if c==2:
            t+=","
            c =0 
        c+=1
        t+=ans[i]
    t = t[::-1]
    finalans = "Rs. "+t+","+left
    return finalans

In [45]:
import pickle

In [46]:
with open('house_prediction.pkl','wb') as file:
    pickle.dump(model,file)

In [47]:
with open('house_df.pkl','wb') as file:
    pickle.dump(df,file)

In [48]:
with open('house_prediction.pkl','rb') as pkl_file:
    house_model = pickle.load(pkl_file)

# Dataset 2

In [148]:
import pandas as pd

In [149]:
df = pd.read_csv('mumbai-house-prices.csv')
df.tail()
df.shape

(6347, 19)

In [150]:
df = df.drop(['Unnamed: 0'],axis=1)

In [151]:
counts = df['Location'].value_counts()
df = df[df.groupby('Location')['Location'].transform(lambda x: x.count()) > 10]

In [152]:
counts = df['Location'].value_counts()
df.loc[df['Location'].map(counts) < 10, 'Location'] = 'other'

In [153]:
df['Location'].value_counts()

Location
Kharghar          533
Thane West        418
Mira Road East    390
Ulwe              319
Borivali West     176
                 ... 
Ghatkopar          12
matunga east       12
Kalyan East        11
Kurla West         11
Palghar            11
Name: count, Length: 89, dtype: int64

In [154]:
df = pd.get_dummies(df,drop_first=True)
df

Unnamed: 0,Price,Area,No. of Bedrooms,New/Resale,Gymnasium,Lift Available,Car Parking,Maintenance Staff,24x7 Security,Children's Play Area,...,Location_Vashi,Location_Ville Parle East,Location_Virar,Location_Virar East,Location_Virar West,Location_Wadala,Location_Wadala East Wadala,Location_Worli,Location_matunga east,Location_mumbai
0,4850000,720,1,0,0,1,1,1,1,0,...,False,False,False,False,False,False,False,False,False,False
1,4500000,600,1,0,1,1,1,1,1,0,...,False,False,False,False,False,False,False,False,False,False
2,6700000,650,1,0,1,1,1,1,1,1,...,False,False,False,False,False,False,False,False,False,False
3,4500000,650,1,0,0,1,1,1,1,0,...,False,False,False,False,False,False,False,False,False,False
4,5000000,665,1,0,0,1,1,1,1,0,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6341,7000000,1020,2,1,1,1,0,0,0,1,...,False,False,False,False,False,False,False,False,False,False
6343,14500000,900,2,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False
6344,14500000,900,2,0,0,1,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False
6345,4100000,1380,3,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False


In [155]:
df.corr()['Price']

Price                          1.000000
Area                           0.711896
No. of Bedrooms                0.614398
New/Resale                     0.048967
Gymnasium                      0.129441
                                 ...   
Location_Wadala                0.066997
Location_Wadala East Wadala    0.029351
Location_Worli                 0.295892
Location_matunga east          0.058550
Location_mumbai               -0.004679
Name: Price, Length: 105, dtype: float64

In [156]:
x = df.drop(['Price'],axis=1)
y = df['Price']

In [157]:
from sklearn.preprocessing import StandardScaler

In [158]:
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

In [159]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

In [160]:
x_train,x_test,y_train,y_test = train_test_split(x_scaled,y,test_size=0.2,random_state=10)

In [161]:
# model = RandomForestRegressor(n_estimators=5)
# model.fit(x_train,y_train)
# model.score(x_test,y_test)
cross_val_score(RandomForestRegressor(n_estimators=5),x_scaled,y,cv=5).mean()

0.6953391325446312

In [162]:
model.predict(x_test)

array([ 2606666.66666667,  6205800.        ,  5500000.        , ...,
       19700000.        , 15100000.        ,  3065000.        ])