In [29]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [30]:
df = pd.read_csv('mumbai.csv')

In [31]:
df.columns

Index(['price', 'Address', 'area', 'latitude', 'longitude', 'Bedrooms',
       'Bathrooms', 'Balcony', 'Status', 'neworold', 'parking',
       'Furnished_status', 'Lift', 'Landmarks', 'type_of_building', 'desc',
       'Price_sqft'],
      dtype='object')

In [32]:
df = df[['price','Address','area','Bedrooms','Bathrooms']]
# df = df[['price','Address','area','Bedrooms','Bathrooms','Price_sqft']]

In [33]:
df.describe()

Unnamed: 0,price,area,Bedrooms,Bathrooms
count,6255.0,6255.0,6255.0,6255.0
mean,26841870.0,1185.62526,2.452278,2.42558
std,27888310.0,636.973259,0.749534,0.762604
min,1500000.0,503.0,2.0,0.0
25%,13000000.0,846.5,2.0,2.0
50%,19000000.0,1000.0,2.0,2.0
75%,30000000.0,1300.0,3.0,3.0
max,360000000.0,8000.0,10.0,10.0


In [34]:
regions = {
    "West": ['Andheri','Bandra','Borivali','Dahisar','Goregaon','Jogeshwari','Juhu','Kandivali','Khar','Malad','Mira Bhayandar','Santacruz','Vile Parle','Vasai Virar'],
    "East": ['Bhandup','Ghatkopar','Kanjurmarg','Kurla','Mulund','Nahur','Powai','Vidyavihar','Vikhroli'],
    "Harbour": ['Chembur','Wadala','Govandi','Mankhurd','Trombay'],
    "South": ['Antop Hill','Byculla','Colaba','Dadar','Fort','Girgaon','Kalbadevi','Kamathipura','Matunga','Parel','Tardeo']
}

def set_region(string):
    for region, region_list in regions.items():
        if any(region in string for region in region_list):
            return region
    return "Other"


In [35]:
df['Address'] = df['Address'].apply(set_region)

In [36]:
def remove_outliers(df,cols):
    new_df = None
    for col in cols:
        low,high = df[col].quantile([0.1,0.9])
        new_df = df[(df[col]>=low)&(df[col]<=high)]
        df=new_df
    return df

In [37]:
a = remove_outliers(df,['area','Bedrooms','Bathrooms'])
# a = remove_outliers(df,['area','Bedrooms','Bathrooms','Price_sqft'])
a.describe()

Unnamed: 0,price,area,Bedrooms,Bathrooms
count,4915.0,4915.0,4915.0,4915.0
mean,21313660.0,1049.492981,2.273652,2.226857
std,14187550.0,250.136548,0.445878,0.418842
min,1500000.0,700.0,2.0,2.0
25%,12700000.0,852.0,2.0,2.0
50%,18500000.0,1000.0,2.0,2.0
75%,27000000.0,1200.0,3.0,2.0
max,200000000.0,1800.0,3.0,3.0


In [38]:
b = pd.get_dummies(a,drop_first=True)
y = b['price']
x = b.drop(['price'],axis=1)
b

Unnamed: 0,price,area,Bedrooms,Bathrooms,Address_Harbour,Address_Other,Address_South,Address_West
1,35000000.0,974.0,3.0,2.0,True,False,False,False
2,31700000.0,968.0,3.0,3.0,False,False,False,False
4,13500000.0,1090.0,2.0,2.0,False,False,False,True
6,20700000.0,1188.0,2.0,2.0,False,False,False,True
7,22900000.0,968.0,3.0,3.0,False,False,False,True
...,...,...,...,...,...,...,...,...
6250,19500000.0,810.0,2.0,2.0,False,True,False,False
6251,22000000.0,1400.0,3.0,3.0,False,True,False,False
6252,20000000.0,750.0,2.0,2.0,False,False,False,True
6253,11000000.0,700.0,2.0,2.0,False,True,False,False


In [39]:
b.corr()['price']

price              1.000000
area               0.456931
Bedrooms           0.371585
Bathrooms          0.400952
Address_Harbour    0.027271
Address_Other     -0.181234
Address_South      0.201579
Address_West       0.120038
Name: price, dtype: float64

In [40]:
b.head()

Unnamed: 0,price,area,Bedrooms,Bathrooms,Address_Harbour,Address_Other,Address_South,Address_West
1,35000000.0,974.0,3.0,2.0,True,False,False,False
2,31700000.0,968.0,3.0,3.0,False,False,False,False
4,13500000.0,1090.0,2.0,2.0,False,False,False,True
6,20700000.0,1188.0,2.0,2.0,False,False,False,True
7,22900000.0,968.0,3.0,3.0,False,False,False,True


In [41]:
x.shape
y.shape

(4915,)

In [42]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=16)
model = RandomForestRegressor(n_estimators=30)
model.fit(x_train,y_train)
model.score(x_test,y_test)

0.2513473097309368

In [43]:
price = model.predict(x_test)
format_price(price)

'Rs. 3,14,21,241'

In [44]:
def format_price(temp):
    temp = str(price[0]).split(".")[0]
    ans = ""
    temp = temp[::-1]
    c =0
    for i in range(len(temp)):
        if temp[i]==',':
            break
        if c==3:
            ans+=","
        c+=1
        ans+=temp[i]
    left,ans = ans.split(",")[0],ans.split(",")[-1]
    t = ""
    c = 0
    for i in range(len(ans)):
        if c==2:
            t+=","
            c =0 
        c+=1
        t+=ans[i]
    t = t[::-1]
    finalans = "Rs. "+t+","+left
    return finalans

In [45]:
import pickle

In [46]:
with open('house_prediction.pkl','wb') as file:
    pickle.dump(model,file)

In [47]:
with open('house_df.pkl','wb') as file:
    pickle.dump(df,file)

In [48]:
with open('house_prediction.pkl','rb') as pkl_file:
    house_model = pickle.load(pkl_file)

# Dataset 2

In [210]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso,Ridge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import pickle as pkl

In [211]:
df = pd.read_csv('mumbai-house-prices.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Price,Area,Location,No. of Bedrooms,New/Resale,Gymnasium,Lift Available,Car Parking,Maintenance Staff,24x7 Security,Children's Play Area,Clubhouse,Intercom,Landscaped Gardens,Indoor Games,Gas Connection,Jogging Track,Swimming Pool
0,0,4850000,720,Kharghar,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0
1,1,4500000,600,Kharghar,1,0,1,1,1,1,1,0,1,0,0,0,0,1,1
2,2,6700000,650,Kharghar,1,0,1,1,1,1,1,1,1,1,0,0,0,1,1
3,3,4500000,650,Kharghar,1,0,0,1,1,1,1,0,0,1,1,0,0,0,0
4,4,5000000,665,Kharghar,1,0,0,1,1,1,1,0,0,1,1,0,0,0,0


In [212]:
df = df.drop(['Unnamed: 0','New/Resale','Gymnasium', 'Lift Available', 'Car Parking', 'Maintenance Staff','24x7 Security', 
               "Children's Play Area", 'Clubhouse', 'Intercom','Landscaped Gardens', 'Indoor Games', 'Gas Connection', 
               'Jogging Track','Swimming Pool'],axis=1)
df.head()
df.shape

(6347, 4)

In [213]:
df_temp = df.drop('Location',axis=1)
df_temp.corr()['Price']

Price              1.000000
Area               0.722336
No. of Bedrooms    0.594865
Name: Price, dtype: float64

In [214]:
df_temp.describe()

Unnamed: 0,Price,Area,No. of Bedrooms
count,6347.0,6347.0,6347.0
mean,15154010.0,1004.327084,1.910036
std,20159430.0,556.375703,0.863304
min,2000000.0,200.0,1.0
25%,5300000.0,650.0,1.0
50%,9500000.0,905.0,2.0
75%,17500000.0,1182.0,2.0
max,420000000.0,8511.0,7.0


In [215]:
# Using IQR
def remove_outlier(df,cols):
    newdf = None
    for col in cols:
        q1,q3 = df[col].quantile(0.25),df[col].quantile(0.75)
        iqr = q3-q1
        low = q1 - 1.5*iqr
        high = q3+1.5*iqr
        newdf = df[(df[col]<=high) & (df[col]>=low)]
        df = newdf
    return df

In [216]:
df['Price'] = np.log(df['Price'])

In [217]:
df_outlier_removed = remove_outlier(df,['Area','No. of Bedrooms'])
df_outlier_removed.describe()

Unnamed: 0,Price,Area,No. of Bedrooms
count,6008.0,6008.0,6008.0
mean,16.026443,914.106525,1.799434
std,0.774861,351.789133,0.726134
min,14.508658,200.0,1.0
25%,15.444751,640.0,1.0
50%,16.012735,880.0,2.0
75%,16.588099,1131.0,2.0
max,18.538464,1980.0,3.0


In [218]:
df_temp = df_outlier_removed.drop('Location',axis=1)
df_temp.corr()['Price']

Price              1.000000
Area               0.652071
No. of Bedrooms    0.672782
Name: Price, dtype: float64

In [219]:
df_outlier_removed.columns

Index(['Price', 'Area', 'Location', 'No. of Bedrooms'], dtype='object')

In [220]:
# Get the value counts of each location
location_counts = df_outlier_removed['Location'].value_counts()

# Identify locations with counts less than 10
locations_less_than_10 = location_counts[location_counts < 10].index

# Replace the identified locations with 'Other'
df_outlier_removed.loc[df_outlier_removed['Location'].isin(locations_less_than_10), 'Location'] = 'Other'

df_outlier_removed.reset_index(drop=True,inplace=True)

In [221]:
x = df_outlier_removed.drop(['Price'],axis=1)
y = df_outlier_removed['Price']
x,y

(      Area       Location  No. of Bedrooms
 0      720       Kharghar                1
 1      600       Kharghar                1
 2      650       Kharghar                1
 3      650       Kharghar                1
 4      665       Kharghar                1
 ...    ...            ...              ...
 6003   700          Other                1
 6004   900     Thane West                2
 6005   900     Thane West                2
 6006  1380         Boisar                3
 6007   700  Badlapur East                1
 
 [6008 rows x 3 columns],
 0       15.394489
 1       15.319588
 2       15.717618
 3       15.319588
 4       15.424948
           ...    
 6003    14.725783
 6004    16.489659
 6005    16.489659
 6006    15.226498
 6007    14.827111
 Name: Price, Length: 6008, dtype: float64)

In [258]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=12)

In [259]:
step1 = ColumnTransformer(transformers=[
    ('coln_transform', OneHotEncoder(sparse=False, drop='first'), [1])
], remainder='passthrough')

step2 = RandomForestRegressor(n_estimators=25,random_state=12)
pipe = Pipeline([
    ('step1',step1),
    ('step2',step2)
])
pipe.fit(x_train,y_train)
y_pred = pipe.predict(x_test)
r2_score(y_test,y_pred)



0.8506059776666055

In [260]:
df_outlier_removed

Unnamed: 0,Price,Area,Location,No. of Bedrooms
0,15.394489,720,Kharghar,1
1,15.319588,600,Kharghar,1
2,15.717618,650,Kharghar,1
3,15.319588,650,Kharghar,1
4,15.424948,665,Kharghar,1
...,...,...,...,...
6003,14.725783,700,Other,1
6004,16.489659,900,Thane West,2
6005,16.489659,900,Thane West,2
6006,15.226498,1380,Boisar,3


In [261]:
np.exp(pipe.predict([[650,"Bhandup West",2]])[0])



14239373.37334788

In [262]:
# with open('house_model.pkl','wb') as file:
#     pkl.dump(pipe,file)

In [263]:
# with open('dataframe.pkl','wb') as file:
#     pkl.dump(df_outlier_removed,file)

In [228]:
x_train

Unnamed: 0,Area,Location,No. of Bedrooms
1593,555,Kamothe,1
2112,1500,Ville Parle East,2
1317,1100,Ulwe,2
5644,950,Kalyan West,2
5519,580,Borivali West,1
...,...,...,...
3714,550,Panvel,1
3325,575,Nala Sopara,1
1414,940,Mira Road and Beyond,2
5787,1100,Koproli,2
