In [29]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [30]:
df = pd.read_csv('mumbai.csv')

In [31]:
df.columns

Index(['price', 'Address', 'area', 'latitude', 'longitude', 'Bedrooms',
       'Bathrooms', 'Balcony', 'Status', 'neworold', 'parking',
       'Furnished_status', 'Lift', 'Landmarks', 'type_of_building', 'desc',
       'Price_sqft'],
      dtype='object')

In [32]:
# Keep only the target and the features used below. The explicit .copy() gives an
# independent frame, so the later assignment to df['Address'] does not write into
# a slice of the originally loaded DataFrame (pandas' SettingWithCopyWarning).
df = df[['price','Address','area','Bedrooms','Bathrooms']].copy()

In [33]:
df.describe()

Unnamed: 0,price,area,Bedrooms,Bathrooms
count,6255.0,6255.0,6255.0,6255.0
mean,26841870.0,1185.62526,2.452278,2.42558
std,27888310.0,636.973259,0.749534,0.762604
min,1500000.0,503.0,2.0,0.0
25%,13000000.0,846.5,2.0,2.0
50%,19000000.0,1000.0,2.0,2.0
75%,30000000.0,1300.0,3.0,3.0
max,360000000.0,8000.0,10.0,10.0


In [34]:
# Locality names grouped under the region label assigned to them. set_region
# walks this mapping in insertion order, so the first region with a match wins.
regions = {
    "West": ['Andheri','Bandra','Borivali','Dahisar','Goregaon','Jogeshwari','Juhu','Kandivali','Khar','Malad','Mira Bhayandar','Santacruz','Vile Parle','Vasai Virar'],
    "East": ['Bhandup','Ghatkopar','Kanjurmarg','Kurla','Mulund','Nahur','Powai','Vidyavihar','Vikhroli'],
    "Harbour": ['Chembur','Wadala','Govandi','Mankhurd','Trombay'],
    "South": ['Antop Hill','Byculla','Colaba','Dadar','Fort','Girgaon','Kalbadevi','Kamathipura','Matunga','Parel','Tardeo']
}

def set_region(string):
    """Map a free-text address to a region label.

    Parameters
    ----------
    string : str
        Address text to search. Matching is a case-sensitive substring test
        against every locality name in ``regions``.

    Returns
    -------
    str
        The key of the first region in ``regions`` that has a locality
        occurring in ``string``; ``"Other"`` when nothing matches or when
        ``string`` is not a string (for example a NaN from a missing address).
    """
    # A missing address is a float NaN (or None); `in` on those raises TypeError.
    if not isinstance(string, str):
        return "Other"
    for region, localities in regions.items():
        if any(locality in string for locality in localities):
            return region
    return "Other"


In [35]:
df['Address'] = df['Address'].apply(set_region)

In [36]:
def remove_outliers(df, cols, lower=0.1, upper=0.9):
    """Drop rows whose values fall outside a quantile band, one column at a time.

    Columns are processed in the order given, and each column's quantiles are
    computed on the rows that survived the previous columns, so the result can
    depend on the order of ``cols``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    cols : iterable of str
        Names of the columns to filter on.
    lower, upper : float, optional
        Quantiles (between 0 and 1) bounding the values to keep, inclusive.
        The defaults 0.1 and 0.9 keep the 10th-90th percentile band.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the rows inside the band for every column.

    Raises
    ------
    ValueError
        If the bounds do not satisfy ``0 <= lower <= upper <= 1``.
    """
    if not 0 <= lower <= upper <= 1:
        raise ValueError("expected 0 <= lower <= upper <= 1")

    filtered = df
    for col in cols:
        low, high = filtered[col].quantile([lower, upper])
        filtered = filtered[(filtered[col] >= low) & (filtered[col] <= high)]
    # Return an independent copy so callers can assign into the result without
    # pandas treating it as a slice of the input frame.
    return filtered.copy()

In [37]:
a = remove_outliers(df,['area','Bedrooms','Bathrooms'])
# a = remove_outliers(df,['area','Bedrooms','Bathrooms','Price_sqft'])
a.describe()

Unnamed: 0,price,area,Bedrooms,Bathrooms
count,4915.0,4915.0,4915.0,4915.0
mean,21313660.0,1049.492981,2.273652,2.226857
std,14187550.0,250.136548,0.445878,0.418842
min,1500000.0,700.0,2.0,2.0
25%,12700000.0,852.0,2.0,2.0
50%,18500000.0,1000.0,2.0,2.0
75%,27000000.0,1200.0,3.0,2.0
max,200000000.0,1800.0,3.0,3.0


In [38]:
b = pd.get_dummies(a,drop_first=True)
y = b['price']
x = b.drop(['price'],axis=1)
b

Unnamed: 0,price,area,Bedrooms,Bathrooms,Address_Harbour,Address_Other,Address_South,Address_West
1,35000000.0,974.0,3.0,2.0,True,False,False,False
2,31700000.0,968.0,3.0,3.0,False,False,False,False
4,13500000.0,1090.0,2.0,2.0,False,False,False,True
6,20700000.0,1188.0,2.0,2.0,False,False,False,True
7,22900000.0,968.0,3.0,3.0,False,False,False,True
...,...,...,...,...,...,...,...,...
6250,19500000.0,810.0,2.0,2.0,False,True,False,False
6251,22000000.0,1400.0,3.0,3.0,False,True,False,False
6252,20000000.0,750.0,2.0,2.0,False,False,False,True
6253,11000000.0,700.0,2.0,2.0,False,True,False,False


In [39]:
b.corr()['price']

price              1.000000
area               0.456931
Bedrooms           0.371585
Bathrooms          0.400952
Address_Harbour    0.027271
Address_Other     -0.181234
Address_South      0.201579
Address_West       0.120038
Name: price, dtype: float64

In [40]:
b.head()

Unnamed: 0,price,area,Bedrooms,Bathrooms,Address_Harbour,Address_Other,Address_South,Address_West
1,35000000.0,974.0,3.0,2.0,True,False,False,False
2,31700000.0,968.0,3.0,3.0,False,False,False,False
4,13500000.0,1090.0,2.0,2.0,False,False,False,True
6,20700000.0,1188.0,2.0,2.0,False,False,False,True
7,22900000.0,968.0,3.0,3.0,False,False,False,True


In [41]:
x.shape
y.shape

(4915,)

In [42]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=16)
# Seed the forest as well: without random_state the bootstrap samples differ on
# every run, so the score displayed below would not be reproducible.
model = RandomForestRegressor(n_estimators=30,random_state=16)
model.fit(x_train,y_train)
# Coefficient of determination (R^2) on the held-out rows.
model.score(x_test,y_test)

0.2513473097309368

In [43]:
# NOTE: format_price is defined in a later cell, so on a fresh kernel this cell
# raises NameError unless that definition has been run first.
price = model.predict(x_test)
format_price(price)

'Rs. 3,14,21,241'

In [44]:
def format_price(temp):
    """Format a rupee amount using Indian digit grouping.

    The last three digits form one group and every group to their left has two
    digits, e.g. ``31421241`` becomes ``'Rs. 3,14,21,241'``.

    Parameters
    ----------
    temp : number, numeric string, or iterable of those
        The amount to format. Any fractional part is discarded (truncated, not
        rounded). When an iterable such as an array of predictions is passed,
        only its first element is formatted.

    Returns
    -------
    str
        The amount prefixed with ``'Rs. '``.

    Raises
    ------
    ValueError
        If an empty iterable is passed or the value cannot be converted to an
        integer (for example NaN).
    """
    # Accept a whole prediction array for convenience and format its first entry.
    if not isinstance(temp, str):
        try:
            temp = next(iter(temp))
        except TypeError:
            pass  # not iterable: already a scalar
        except StopIteration:
            raise ValueError("format_price() received an empty sequence") from None

    whole = int(float(temp)) if isinstance(temp, str) else int(temp)
    sign = "-" if whole < 0 else ""
    digits = str(abs(whole))

    # Indian system: the rightmost group has three digits, the rest have two.
    head, tail = digits[:-3], digits[-3:]
    groups = []
    while head:
        groups.append(head[-2:])
        head = head[:-2]
    groups.reverse()
    groups.append(tail)
    return "Rs. " + sign + ",".join(groups)

In [45]:
import pickle

In [46]:
with open('house_prediction.pkl','wb') as file:
    pickle.dump(model,file)

In [47]:
with open('house_df.pkl','wb') as file:
    pickle.dump(df,file)

In [48]:
with open('house_prediction.pkl','rb') as pkl_file:
    house_model = pickle.load(pkl_file)

# Dataset 2

In [541]:
import pandas as pd

In [542]:
df = pd.read_csv('mumbai-house-prices.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Price,Area,Location,No. of Bedrooms,New/Resale,Gymnasium,Lift Available,Car Parking,Maintenance Staff,24x7 Security,Children's Play Area,Clubhouse,Intercom,Landscaped Gardens,Indoor Games,Gas Connection,Jogging Track,Swimming Pool
0,0,4850000,720,Kharghar,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0
1,1,4500000,600,Kharghar,1,0,1,1,1,1,1,0,1,0,0,0,0,1,1
2,2,6700000,650,Kharghar,1,0,1,1,1,1,1,1,1,1,0,0,0,1,1
3,3,4500000,650,Kharghar,1,0,0,1,1,1,1,0,0,1,1,0,0,0,0
4,4,5000000,665,Kharghar,1,0,0,1,1,1,1,0,0,1,1,0,0,0,0


In [543]:
# errors='ignore' keeps this cell re-runnable: once the stray index column has
# been dropped, running the cell again is a no-op instead of a KeyError.
df = df.drop(columns=['Unnamed: 0'],errors='ignore')
df.shape

(6347, 18)

In [544]:
df_temp = df.drop('Location',axis=1)
df_temp.corr()['Price']

Price                   1.000000
Area                    0.722336
No. of Bedrooms         0.594865
New/Resale              0.032428
Gymnasium               0.098097
Lift Available          0.083656
Car Parking             0.055221
Maintenance Staff       0.027604
24x7 Security           0.045107
Children's Play Area    0.045126
Clubhouse               0.084775
Intercom                0.063060
Landscaped Gardens      0.082225
Indoor Games            0.122352
Gas Connection          0.118245
Jogging Track           0.042325
Swimming Pool           0.123902
Name: Price, dtype: float64

In [545]:
import matplotlib.pyplot as plt

In [546]:
df_temp.describe()

Unnamed: 0,Price,Area,No. of Bedrooms,New/Resale,Gymnasium,Lift Available,Car Parking,Maintenance Staff,24x7 Security,Children's Play Area,Clubhouse,Intercom,Landscaped Gardens,Indoor Games,Gas Connection,Jogging Track,Swimming Pool
count,6347.0,6347.0,6347.0,6347.0,6347.0,6347.0,6347.0,6347.0,6347.0,6347.0,6347.0,6347.0,6347.0,6347.0,6347.0,6347.0,6347.0
mean,15154010.0,1004.327084,1.910036,0.341736,0.581377,0.801481,0.562943,0.281393,0.562943,0.559319,0.496297,0.484796,0.360643,0.219631,0.243107,0.38144,0.458327
std,20159430.0,556.375703,0.863304,0.474329,0.493372,0.398916,0.496061,0.449714,0.496061,0.496508,0.500026,0.499808,0.480225,0.414029,0.428993,0.485778,0.4983
min,2000000.0,200.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5300000.0,650.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,9500000.0,905.0,2.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,17500000.0,1182.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
max,420000000.0,8511.0,7.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [547]:
# Using IQR
def remove_outlier(df, cols, factor=1.5):
    """Drop rows lying outside the IQR fences, one column at a time.

    For each column the kept range is ``[Q1 - factor*IQR, Q3 + factor*IQR]``
    (inclusive), where Q1/Q3 are the 25th/75th percentiles and IQR = Q3 - Q1.
    Columns are processed in the order given, and each column's quartiles are
    computed on the rows that survived the previous columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    cols : iterable of str
        Names of the columns to filter on.
    factor : float, optional
        Multiple of the IQR used for the fences. Defaults to 1.5.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the rows inside the fences for every column.

    Raises
    ------
    ValueError
        If ``factor`` is negative.
    """
    if factor < 0:
        raise ValueError("factor must be non-negative")

    filtered = df
    for col in cols:
        q1, q3 = filtered[col].quantile(0.25), filtered[col].quantile(0.75)
        iqr = q3 - q1
        low = q1 - factor * iqr
        high = q3 + factor * iqr
        filtered = filtered[(filtered[col] <= high) & (filtered[col] >= low)]
    # Return an independent copy so callers can assign into the result without
    # pandas treating it as a slice of the input frame.
    return filtered.copy()

In [548]:
# Model log(Price) rather than raw Price. This overwrites the column in place, so
# re-running the cell without reloading df would apply the log a second time.
df['Price'] = np.log(df['Price'])

In [549]:
df_outlier_removed = remove_outlier(df,['Area','No. of Bedrooms'])
df_outlier_removed.describe()

Unnamed: 0,Price,Area,No. of Bedrooms,New/Resale,Gymnasium,Lift Available,Car Parking,Maintenance Staff,24x7 Security,Children's Play Area,Clubhouse,Intercom,Landscaped Gardens,Indoor Games,Gas Connection,Jogging Track,Swimming Pool
count,6008.0,6008.0,6008.0,6008.0,6008.0,6008.0,6008.0,6008.0,6008.0,6008.0,6008.0,6008.0,6008.0,6008.0,6008.0,6008.0,6008.0
mean,16.026443,914.106525,1.799434,0.336385,0.570739,0.794274,0.555759,0.280126,0.557091,0.552597,0.484854,0.478029,0.351698,0.209554,0.238848,0.373668,0.44524
std,0.774861,351.789133,0.726134,0.472512,0.495012,0.404265,0.496923,0.449098,0.496771,0.497267,0.499812,0.499559,0.47754,0.407024,0.426415,0.483817,0.497034
min,14.508658,200.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,15.444751,640.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,16.012735,880.0,2.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,16.588099,1131.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
max,18.538464,1980.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [550]:
df_temp = df_outlier_removed.drop('Location',axis=1)
df_temp.corr()['Price']

Price                   1.000000
Area                    0.652071
No. of Bedrooms         0.672782
New/Resale              0.035134
Gymnasium               0.183355
Lift Available          0.078704
Car Parking             0.067717
Maintenance Staff       0.034300
24x7 Security           0.065756
Children's Play Area    0.048799
Clubhouse               0.140336
Intercom                0.139415
Landscaped Gardens      0.038982
Indoor Games            0.135262
Gas Connection          0.201469
Jogging Track           0.071917
Swimming Pool           0.201250
Name: Price, dtype: float64

In [551]:
df_outlier_removed.columns

Index(['Price', 'Area', 'Location', 'No. of Bedrooms', 'New/Resale',
       'Gymnasium', 'Lift Available', 'Car Parking', 'Maintenance Staff',
       '24x7 Security', 'Children's Play Area', 'Clubhouse', 'Intercom',
       'Landscaped Gardens', 'Indoor Games', 'Gas Connection', 'Jogging Track',
       'Swimming Pool'],
      dtype='object')

In [553]:
# Work on an independent copy so the .loc assignment below does not write into a
# frame that pandas tracks as a slice of df (SettingWithCopyWarning).
df_outlier_removed = df_outlier_removed.copy()

# Locations with fewer rows than this are pooled into a single 'Other' bucket
MIN_LOCATION_COUNT = 10

# Get the value counts of each location
location_counts = df_outlier_removed['Location'].value_counts()

# Identify locations with counts below the threshold
locations_less_than_10 = location_counts[location_counts < MIN_LOCATION_COUNT].index

# Replace the identified locations with 'Other'
df_outlier_removed.loc[df_outlier_removed['Location'].isin(locations_less_than_10), 'Location'] = 'Other'



In [554]:
df_outlier_removed = pd.get_dummies(df_outlier_removed,drop_first=True)
df_outlier_removed

Unnamed: 0,Price,Area,No. of Bedrooms,New/Resale,Gymnasium,Lift Available,Car Parking,Maintenance Staff,24x7 Security,Children's Play Area,...,Location_Vasai West,Location_Vasai east,Location_Ville Parle East,Location_Virar,Location_Virar East,Location_Virar West,Location_Wadala,Location_Wadala East Wadala,Location_matunga east,Location_mumbai
0,15.394489,720,1,0,0,1,1,1,1,0,...,False,False,False,False,False,False,False,False,False,False
1,15.319588,600,1,0,1,1,1,1,1,0,...,False,False,False,False,False,False,False,False,False,False
2,15.717618,650,1,0,1,1,1,1,1,1,...,False,False,False,False,False,False,False,False,False,False
3,15.319588,650,1,0,0,1,1,1,1,0,...,False,False,False,False,False,False,False,False,False,False
4,15.424948,665,1,0,0,1,1,1,1,0,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6342,14.725783,700,1,1,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False
6343,16.489659,900,2,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False
6344,16.489659,900,2,0,0,1,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False
6345,15.226498,1380,3,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False


In [555]:
df_outlier_removed.corr()['Price']

Price                          1.000000
Area                           0.652071
No. of Bedrooms                0.672782
New/Resale                     0.035134
Gymnasium                      0.183355
                                 ...   
Location_Virar West           -0.062020
Location_Wadala                0.077733
Location_Wadala East Wadala    0.041055
Location_matunga east          0.075668
Location_mumbai               -0.033460
Name: Price, Length: 106, dtype: float64

In [556]:
x = df_outlier_removed.drop(['Price', 'New/Resale','Gymnasium', 'Lift Available', 'Car Parking', 'Maintenance Staff',
                              '24x7 Security', "Children's Play Area", 'Clubhouse', 'Intercom','Landscaped Gardens', 
                              'Indoor Games', 'Gas Connection', 'Jogging Track','Swimming Pool'],axis=1)
y = df_outlier_removed['Price']
x

Unnamed: 0,Area,No. of Bedrooms,Location_Ambernath East,Location_Ambernath West,Location_Andheri,Location_Andheri East,Location_Andheri West,Location_Badlapur East,Location_Bandra East,Location_Bandra West,...,Location_Vasai West,Location_Vasai east,Location_Ville Parle East,Location_Virar,Location_Virar East,Location_Virar West,Location_Wadala,Location_Wadala East Wadala,Location_matunga east,Location_mumbai
0,720,1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,600,1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,650,1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,650,1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,665,1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6342,700,1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6343,900,2,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6344,900,2,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6345,1380,3,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [557]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso,Ridge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [558]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=12)

In [559]:
model = RandomForestRegressor(n_estimators=18,random_state=12)
model.fit(x_train,y_train)
model.score(x_test,y_test)

0.8506569980197715

In [560]:
y_pred = model.predict(x_test)
np.exp(y_pred)

array([ 8812590.88399312,  6259913.37793835, 18741561.85181666, ...,
       10670612.5037832 , 27184955.77199336,  8812590.88399312])