# Importing Libraries

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score
import pickle

In [4]:
# Load the raw listings. A forward-slash relative path resolves on Windows,
# Linux and macOS alike (the previous backslash form only worked on Windows).
df=pd.read_csv("Dataset/Bengaluru_House_Data.csv")

In [5]:
df.head()

In [6]:
df.shape

In [7]:
df.info()

In [8]:
df.isnull().sum()

In [9]:
# Discard the columns that the rest of the notebook never reads.
df=df.drop(['area_type', 'availability', 'society', 'balcony'], axis=1)

In [10]:
df.head()

In [11]:
df.describe()

In [12]:
df.info()

In [13]:
df['location'].value_counts()

In [14]:
# Replace missing locations with a fixed label.
# NOTE(review): 'Sarjapur Road' is hard-coded; presumably picked from the
# value_counts() output above — confirm it is the intended default.
df['location']=df['location'].fillna('Sarjapur Road')

In [15]:
df['size'].value_counts()

In [16]:
# Replace missing sizes with a fixed label.
# NOTE(review): '2 BHK' is hard-coded; presumably the most common size in the
# value_counts() output above — confirm.
df['size']=df['size'].fillna('2 BHK')

In [17]:
# Keep known bathroom counts; substitute the column median where the value is missing.
df['bath']=df['bath'].where(df['bath'].notna(), df['bath'].median())

In [18]:
df.info()

In [19]:
# The first whitespace-separated token of `size` is the room count; store it as an integer.
df['bhk']=df['size'].str.split().str[0].astype(int)

In [20]:
df.head()

Detecting Outliers

In [21]:
# Show the listings that claim more than 20 rooms.
df.loc[df['bhk'].gt(20)]

In [22]:
df['total_sqft'].unique()

Some `total_sqft` values are ranges rather than single numbers, so we replace each range with its midpoint (the average of its two endpoints).

Function to do the above-mentioned task

In [23]:
def convertRange(x):
    """Convert a ``total_sqft`` entry to a single float.

    Parameters
    ----------
    x : str or number
        Either a plain number (``"1200"``), a range written as
        ``"low - high"``, or anything else.

    Returns
    -------
    float or None
        The midpoint for a two-part numeric range, the parsed number for a
        plain value, and ``None`` when the entry cannot be interpreted.
        This function never raises.
    """
    if isinstance(x, str):
        bounds=x.split('-')
        if len(bounds)==2:
            try:
                return (float(bounds[0])+float(bounds[1]))/2
            except ValueError:
                # Not a numeric range (e.g. "-5" or "1e-3" also contain one
                # hyphen); fall through and try to parse the whole value.
                pass
    try:
        return float(x)
    except (TypeError, ValueError):
        return None

In [24]:
# Normalise every total_sqft entry with convertRange: ranges collapse to their
# midpoint, and entries it cannot parse come back as None.
df['total_sqft']=df['total_sqft'].apply(convertRange)

In [25]:
df.head()

Price per square feet

In [26]:
# Scale price by 100000 before dividing by the area.
# NOTE(review): presumably `price` is quoted in lakhs (1 lakh = 100,000) — confirm.
df['price_per_sqft']=df['price'].mul(100000).div(df['total_sqft'])

In [27]:
df['price_per_sqft']

In [28]:
df.describe()

In [29]:
df['location'].value_counts()

We're going to relabel every location with <= 10 occurrences as 'other' to reduce the number of distinct locations, so that the model can learn efficiently.

In [30]:
# Trim surrounding whitespace so the same location is not counted under two spellings.
df['location']=df['location'].str.strip()
# Listings per location, used below to find the rarely seen ones.
location_counts=df['location'].value_counts()

In [31]:
# Locations with at most 10 listings (note: the threshold is inclusive).
location_count_less_than_10=location_counts.loc[location_counts.le(10)]
location_count_less_than_10

In [32]:
# Relabel every rare location as 'other'; the rare names are the index of
# location_count_less_than_10.
df['location']=df['location'].mask(df['location'].isin(location_count_less_than_10.index), 'other')

In [33]:
df['location'].value_counts()

In [34]:
df.describe()

How many Square Feet in 1 BHK?

In [35]:
# Summary statistics of the area available per room.
df['total_sqft'].div(df['bhk']).describe()

We're going to remove the flats that have less than 300 sqft per BHK.

In [36]:
# Keep only listings with at least 300 sqft per room; rows whose area is
# missing compare as False and are dropped too.
df=df.loc[df['total_sqft'].div(df['bhk']).ge(300)]

In [37]:
df.describe()

In [38]:
df.shape

In [39]:
df['price_per_sqft'].describe()

Max price for a single sqft is too high.

We're going to define a function to remove outliers in price_per_sqft

We're removing the prices that fall outside the range (mean - sd, mean + sd], computed separately for each location. Any point outside this range is considered an outlier.

In [40]:
def remove_outlier_sqft(df):
    """Drop price-per-sqft outliers separately within each location.

    For every ``location`` group, only rows whose ``price_per_sqft`` lies in
    ``(mean - sd, mean + sd]`` are kept, where ``sd`` is the population
    standard deviation (``ddof=0``) of that group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``location`` and ``price_per_sqft``.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, ordered by location group, with a fresh
        0..n-1 index. The input frame is not modified. If nothing survives
        (or the input is empty) an empty frame with the input's columns is
        returned.

    Notes
    -----
    The lower bound is exclusive, so a group whose prices are all identical
    (``sd == 0``), including a single-row group, is removed entirely.
    """
    kept=[]
    for _, subdf in df.groupby('location'):
        prices=subdf['price_per_sqft']
        m=prices.mean()
        sd=prices.std(ddof=0)

        gen_df=subdf[(prices > (m-sd)) & (prices <= (m+sd))]
        # Skip empty slices so that concat never has to reconcile dtypes
        # against frames that contribute no rows.
        if not gen_df.empty:
            kept.append(gen_df)
    if not kept:
        return df.iloc[0:0].reset_index(drop=True)
    # One concat at the end instead of re-copying the accumulated frame per group.
    return pd.concat(kept, ignore_index=True)
# Replace the working frame with the per-location filtered version (index is rebuilt).
df=remove_outlier_sqft(df)

In [41]:
df.describe()

Creating a function to remove outliers in bhk

In [42]:
def remove_outlier_bhk(df):
    """Drop listings priced below the average of the next-smaller flat size.

    Within each ``location``, a row with ``bhk == k`` is removed when its
    ``price_per_sqft`` is lower than the mean ``price_per_sqft`` of the
    ``bhk == k - 1`` rows of the same location, provided that smaller group
    has more than 5 rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``location``, ``bhk`` and
        ``price_per_sqft``. Rows are removed by index label.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` without the flagged rows; the original index labels
        of the surviving rows are kept.
    """
    exclude_labels=[]
    for _, location_df in df.groupby('location'):
        # Group once per location and reuse the pieces for both the
        # reference statistics and the comparison pass.
        by_bhk={bhk: bhk_df for bhk, bhk_df in location_df.groupby('bhk')}
        for bhk, bhk_df in by_bhk.items():
            smaller=by_bhk.get(bhk-1)
            if smaller is None or smaller.shape[0]<=5:
                continue
            baseline=smaller['price_per_sqft'].mean()
            # Collect labels in a plain list so they keep their own dtype
            # and are not re-copied on every group.
            exclude_labels.extend(bhk_df.index[bhk_df['price_per_sqft']<baseline])
    return df.drop(index=exclude_labels)
# Replace the working frame with the version that has the flagged rows removed.
df=remove_outlier_bhk(df)

In [43]:
df.shape

In [44]:
df.head()

We are removing the unnecessary columns: 'size' and 'price_per_sqft' (We used price_per_sqft just to remove the outliers)

In [45]:
# 'size' is now captured by 'bhk', and 'price_per_sqft' was only needed for
# the outlier filters, so neither is kept as a model input.
df=df.drop(['size', 'price_per_sqft'], axis=1)

# Cleaned Data

In [46]:
df

In [47]:
# Save the cleaned frame. The row index is written as the first column
# (spelled out here; it is also the pandas default).
df.to_csv("cleaned_data.csv", index=True)

In [48]:
# Target: the listing price. Features: every remaining column.
y=df['price']
X=df.drop('price', axis=1)

In [49]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test=train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [50]:
# Report the train and test feature-matrix shapes, one per line.
print(X_train.shape, X_test.shape, sep="\n")

# Linear Regression

In [51]:
# OneHotEncoder's `sparse` flag was renamed to `sparse_output` in scikit-learn
# 1.2 and removed in 1.4. Try the current name first and fall back to the old
# one so the notebook runs on either side of the rename.
try:
    location_encoder=OneHotEncoder(sparse_output=False)
except TypeError:
    location_encoder=OneHotEncoder(sparse=False)

# One-hot encode 'location' as a dense array; all other columns pass through unchanged.
column_trans=make_column_transformer((location_encoder, ['location']),
                                    remainder='passthrough')

In [52]:
# Standardising step placed after the column transformer in every pipeline below.
scaler=StandardScaler()

In [53]:
# Baseline model: plain linear regression with library-default settings.
lr=LinearRegression()

In [54]:
# Chain encoding -> scaling -> linear regression. `pipe` is rebound for each model below.
pipe=make_pipeline(column_trans, scaler, lr)

In [55]:
# Fit the linear-regression pipeline on the training split only.
pipe.fit(X_train, y_train)

In [56]:
# Predictions of the linear-regression pipeline on the held-out rows.
y_pred_lr=pipe.predict(X_test)

In [57]:
r2_score(y_test, y_pred_lr)

# Lasso

In [58]:
# L1-regularised linear model with library-default hyperparameters (no tuning here).
lasso=Lasso()

In [59]:
# Rebind `pipe` to encoding -> scaling -> Lasso, reusing the same column_trans and scaler objects.
pipe=make_pipeline(column_trans, scaler, lasso)

In [60]:
# Fit the Lasso pipeline on the training split only.
pipe.fit(X_train, y_train)

In [61]:
# Predictions of the Lasso pipeline on the held-out rows.
y_pred_lasso=pipe.predict(X_test)

In [62]:
r2_score(y_test, y_pred_lasso)

# Ridge

In [63]:
# L2-regularised linear model with library-default hyperparameters (no tuning here).
ridge=Ridge()

In [64]:
# Rebind `pipe` to encoding -> scaling -> Ridge. This is the last rebinding,
# so it is the pipeline that the final cell pickles.
pipe=make_pipeline(column_trans, scaler, ridge)

In [65]:
# Fit the Ridge pipeline on the training split only.
pipe.fit(X_train, y_train)

In [66]:
# Predictions of the Ridge pipeline on the held-out rows.
y_pred_ridge=pipe.predict(X_test)

In [67]:
r2_score(y_test, y_pred_ridge)

In [68]:
# Side-by-side R^2 on the held-out rows for the three fitted models.
for label, predictions in (
    ("No Regularization: ", y_pred_lr),
    ("Lasso: ", y_pred_lasso),
    ("Ridge: ", y_pred_ridge),
):
    print(label, r2_score(y_test, predictions))

Both Ridge and No Regularization give the same result, so we take the Ridge model.

In [69]:
# Persist the fitted pipeline currently bound to `pipe`. The context manager
# guarantees the file is flushed and closed even if pickling fails.
# Only unpickle this file from a trusted location: pickle.load can run arbitrary code.
with open('RidgeModel.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)