# Importing libraries

In [1]:
import pandas as pd 
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

dropping columns

In [3]:
data=pd.read_csv('Bengaluru_House_Data.csv')

In [4]:
data.drop(columns=['area_type','availability','society','balcony'],inplace=True)

In [5]:
data.describe()

Unnamed: 0,bath,price
count,13247.0,13320.0
mean,2.69261,112.565627
std,1.341458,148.971674
min,1.0,8.0
25%,2.0,50.0
50%,2.0,72.0
75%,3.0,120.0
max,40.0,3600.0


In [7]:
data['size']= data['size'].fillna('2 BHK')

In [8]:
data['bath']= data['bath'].fillna(data['bath'].median())

In [9]:
data['bhk']=data['size'].str.split().str.get(0).astype(int)

In [10]:
def convertRange(x):
    temp=x.split('_')
    if len(temp)==2:
        return(float(temp[0])+float(temp[1]))/2
    try:
        return float(x)
    except:
        return None

In [11]:
data['total_sqft']=data['total_sqft'].apply(convertRange)

price per square ft

In [13]:
data['price_per_sqft']=data['price']*100000/data['total_sqft']

In [14]:
data.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,13073.0,13320.0,13320.0,13320.0,13073.0
mean,1554.942029,2.688814,112.565627,2.802778,7949.6
std,1238.458773,1.338754,148.971674,1.294496,107244.0
min,1.0,1.0,8.0,1.0,267.8298
25%,1100.0,2.0,50.0,2.0,4265.734
50%,1275.0,2.0,72.0,3.0,5454.545
75%,1670.0,3.0,120.0,3.0,7338.057
max,52272.0,40.0,3600.0,43.0,12000000.0


In [15]:
data['location']=data['location'].apply(lambda x: x.strip())
location_count=data['location'].value_counts()

In [16]:
location_count_less_10=location_count[location_count<=10]
location_count_less_10

location
BTM 1st Stage                         10
Nagadevanahalli                       10
Basapura                              10
Sector 1 HSR Layout                   10
Dairy Circle                          10
                                      ..
1Channasandra                          1
Hosahalli                              1
Vijayabank bank layout                 1
near Ramanashree California resort     1
Abshot Layout                          1
Name: count, Length: 1054, dtype: int64

In [18]:
data['location']=data['location'].apply(lambda x:'other' if x in location_count_less_10 else x) 

# Outlier detection and removal

In [19]:
data=data[((data['total_sqft']/data['bhk'])>=300)]
data.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,12329.0,12329.0,12329.0,12329.0,12329.0
mean,1590.166773,2.561441,111.444236,2.651472,6322.476758
std,1261.827604,1.072551,152.759322,0.973754,4187.479096
min,300.0,1.0,8.44,1.0,267.829813
25%,1118.0,2.0,49.34,2.0,4207.119741
50%,1300.0,2.0,70.0,3.0,5300.0
75%,1700.0,3.0,115.0,3.0,6938.483548
max,52272.0,16.0,3600.0,16.0,176470.588235


In [22]:
def remove_outlier_sqft(df):
    df_output=pd.DataFrame()
    for key,subdf in df.groupby('location'):
        m=np.mean(subdf.price_per_sqft)
        st=np.std(subdf.price_per_sqft)
        gen_df=subdf[(subdf.price_per_sqft>(m-st)) & (subdf.price_per_sqft<=(m+st))]
        df_output=pd.concat([df_output,gen_df],ignore_index=True)
    return df_output
data=remove_outlier_sqft(data)
data.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,10153.0,10153.0,10153.0,10153.0,10153.0
mean,1502.725758,2.47188,91.104765,2.573525,5669.621675
std,873.997461,0.971938,86.241338,0.893533,2274.73975
min,300.0,1.0,10.0,1.0,1250.0
25%,1110.0,2.0,49.0,2.0,4242.424242
50%,1283.0,2.0,67.0,2.0,5183.823529
75%,1650.0,3.0,100.0,3.0,6451.612903
max,30400.0,16.0,2200.0,16.0,24509.803922


In [28]:
def bhk_outlier_remover(df):
    exclude_indices=np.array([])
    for location, location_df in df.groupby('location'):
        bhk_stats={}
        for bhk , bhk_df in location_df.groupby('bhk'):
            bhk_stats['bhk']={
                'mean':np.mean(bhk_df.price_per_sqft),
                'std':np.std(bhk_df.price_per_sqft),
                'count':bhk_df.shape[0]
            }
        for bhk , bhk_df in location_df.groupby('bhk'):
            stats=bhk_stats.get(bhk-1)
            if stats and stats['count']>5:
                exclude_indices=np.append(exclude_indices,bhk_df[bhk_df.price_per_sqft<(stats['mean'])].index.values)
    return df.drop(exclude_indices,axis='index')

In [29]:
data=bhk_outlier_remover(data)

In [30]:
data.drop(columns=['size','price_per_sqft'],inplace=True)

# Cleaned data

In [31]:
data.to_csv("Cleaned_data.csv")

In [32]:
X=data.drop(columns=['price'])
y=data['price']

In [37]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

In [38]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

In [39]:

print(X_test.shape)
print(X_train.shape)

(2031, 4)
(8122, 4)


# Applying Linear Regression


In [41]:
column_trans =make_column_transformer((OneHotEncoder(sparse=False),['location']),remainder='passthrough')

In [42]:
scaler=StandardScaler()

In [45]:
lr=LinearRegression()

In [46]:
pipe=make_pipeline(column_trans,scaler,lr)

In [47]:
pipe.fit(X_train,y_train)



In [48]:
y_pred_lr=pipe.predict(X_test)


In [49]:
r2_score(y_test,y_pred_lr)

0.8057092291739887

# Applying LASSO


In [50]:
lasso=Lasso()
pipe=make_pipeline(column_trans,scaler,lasso)
pipe.fit(X_train,y_train)
y_pred_lasso=pipe.predict(X_test)
r2_score(y_test,y_pred_lasso)



0.7971081825428273

# Applying RIDGE

In [51]:
ridge=Ridge()
pipe=make_pipeline(column_trans,scaler,ridge)
pipe.fit(X_train,y_train)



In [52]:
y_pred_ridge=pipe.predict(X_test)
r2_score(y_test,y_pred_ridge)

0.8057561111629195

In [54]:
print("No regularization:",r2_score(y_test,y_pred_lr))
print("Lasso:",r2_score(y_test,y_pred_lasso))
print("Ridge: ",r2_score(y_test,y_pred_ridge))

No regularization: 0.8057092291739887
Lasso: 0.7971081825428273
Ridge:  0.8057561111629195


In [55]:
import pickle


In [56]:
pickle.dump(pipe,open('RidgeModel.pkl','wb'))