In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw Bengaluru housing listings into a DataFrame.
# NOTE(review): the absolute '/content/...' path looks like a Colab upload
# location — adjust it when running elsewhere (TODO confirm environment).
data=pd.read_csv('/content/Bengaluru_House_Data.csv')

In [None]:
data.head()

In [None]:
data.shape

In [None]:
data.info()

In [None]:
# Show how often each distinct value occurs, one column at a time,
# with a row of asterisks closing each column's listing.
separator = "*" * 20
for column_name in data.columns:
  frequencies = data[column_name].value_counts()
  print(frequencies)
  print(separator)

In [None]:
data.isna().sum()

In [None]:
data.drop(columns=['area_type','availability','society','balcony'],inplace=True)

In [None]:
data.describe()

In [None]:
data.info()

In [None]:
data['location'].value_counts()

In [None]:
data['location'] = data['location'].fillna('Sarjapur Road')

In [None]:
data['size'].value_counts()

In [None]:
data['size'] = data['size'].fillna('2 BHK')

In [None]:
data['bath'] = data['bath'].fillna(data['bath'].median())

In [None]:
data.info()

In [None]:
# Derive the bedroom count from the leading whitespace-separated token of
# `size` (e.g. the "2" in the '2 BHK' fill value used earlier) and store it
# as an integer column. astype(int) raises if any leading token is not numeric.
data['bhk'] = data['size'].str.split().str.get(0).astype(int)

In [None]:
data[data.bhk > 20]

In [None]:
data['total_sqft'].unique()

In [None]:
def convertRange(x):
  """Convert a raw ``total_sqft`` entry into a single float.

  Parameters
  ----------
  x : str or number
      Either a plain number (``'1200'``), a range written as
      ``'<low> - <high>'`` (``'1133 - 1384'``), or anything else.

  Returns
  -------
  float or None
      The number itself, the midpoint of a numeric range, or ``None`` when
      the entry cannot be interpreted as either (including NaN / missing
      values and text with unit suffixes such as ``'34.46Sq. Meter'``).
  """
  if not isinstance(x, str):
    # Non-string cells (NaN, None, already-numeric values) have no .split();
    # handle them explicitly instead of raising AttributeError.
    try:
      value = float(x)
    except (TypeError, ValueError):
      return None
    # NaN is the only float that is not equal to itself.
    return None if value != value else value

  parts = x.split('-')
  if len(parts) == 2:
    try:
      return (float(parts[0]) + float(parts[1])) / 2
    except ValueError:
      # Not a numeric range (e.g. a negative number or a unit suffix);
      # fall through to the plain-number parse below.
      pass

  try:
    return float(x)
  except ValueError:
    return None

In [None]:
data['total_sqft'] = data['total_sqft'].apply(convertRange)

In [None]:
data.head()

**Price Per Square Foot**

In [None]:
# Price per square foot = price * 100000 / total_sqft.
# NOTE(review): the 100000 factor presumably converts `price` from lakhs to
# rupees — confirm the dataset's units. Rows whose total_sqft could not be
# parsed (NaN) yield NaN here.
data['price_per_sqft'] = data['price'] * 100000 / data['total_sqft']

In [None]:
data['price_per_sqft']

In [None]:
data.describe()

In [None]:
data['location'].value_counts()

In [None]:
# Trim surrounding whitespace from every location name so that spelling
# variants collapse together, then tally the listings per location.
data['location'] = data['location'].map(str.strip)
location_counts = data['location'].value_counts()

In [None]:
location_counts

In [None]:
location_counts_less_10 = location_counts[location_counts<=10]
location_counts_less_10

In [None]:
# Replace every location that appears in the rare-location tally (its index
# holds the location names) with the single bucket label 'other'.
data['location'] = data['location'].where(
    ~data['location'].isin(location_counts_less_10.index),
    'other',
)

In [None]:
data['location'].value_counts()

**Outlier detection and removal**

In [None]:
data.describe()

In [None]:
(data['total_sqft']/data['bhk']).describe()

In [None]:
# Keep only listings with at least 300 sqft per bedroom; anything smaller is
# treated as an outlier. Rows with a NaN total_sqft fail the comparison and
# are dropped as well.
data = data[((data['total_sqft']/data['bhk']) >= 300)]
data.describe()

In [None]:
data.shape

In [None]:
data.price_per_sqft.describe()

In [None]:
def remove_outliers_sqft(df):
  """Drop rows whose price per sqft is far from their location's mean.

  For every ``location`` group, rows are kept only when ``price_per_sqft``
  lies in the half-open band ``(mean - std, mean + std]`` computed with
  ``np.mean`` / ``np.std`` on that group. A group with a single row has a
  zero-width band and is therefore dropped entirely.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain ``location`` and ``price_per_sqft`` columns.

  Returns
  -------
  pandas.DataFrame
      The surviving rows, grouped by location, with a fresh 0..n-1 index.
      If ``df`` has no groups, an empty frame with ``df``'s columns.
  """
  kept_groups = []
  for _, location_df in df.groupby('location'):
    prices = location_df.price_per_sqft
    mean_price = np.mean(prices)
    std_price = np.std(prices)
    within_band = (prices > (mean_price - std_price)) & (prices <= (mean_price + std_price))
    kept_groups.append(location_df[within_band])

  if not kept_groups:
    # pd.concat rejects an empty list; hand back an empty frame that still
    # carries the input's columns.
    return df.iloc[0:0].reset_index(drop=True)

  # Concatenate once at the end: growing a frame inside the loop is quadratic
  # and seeding it with an empty DataFrame triggers pandas' empty-entry
  # concat deprecation.
  return pd.concat(kept_groups, ignore_index=True)
data = remove_outliers_sqft(data)
data.describe()

In [None]:
def bhk_outlier_remover(df):
  """Print per-location, per-BHK price statistics and return a copy of ``df``.

  For each ``location`` group the mean, standard deviation and row count of
  ``price_per_sqft`` are computed per ``bhk`` value and printed.

  NOTE(review): the pass that would collect row labels to exclude is
  commented out, so the exclusion array stays empty and no rows are dropped
  despite the function's name. Confirm whether that pass should be
  re-enabled.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain ``location``, ``bhk`` and ``price_per_sqft`` columns.

  Returns
  -------
  pandas.DataFrame
      ``df.drop`` applied with an empty label array, i.e. the same rows.
  """
  exclude_indices = np.array([])
  for location, location_df in df.groupby('location'):
    bhk_stats = {
        bhk: {
            'mean': np.mean(bhk_df.price_per_sqft),
            'std': np.std(bhk_df.price_per_sqft),
            'count': bhk_df.shape[0],
        }
        for bhk, bhk_df in location_df.groupby('bhk')
    }
    print(location, bhk_stats)
    # Disabled exclusion pass, kept for reference:
    #   for bhk, bhk_df in location_df.groupby('bhk'):
    #     stats = bhk_stats.get(bhk-1)
    #     if stats and stats['count']>5:
    #       exclude_indices = np.append(exclude_indices, bhk_df[bhk_df.price_per_sqft<(stats['mean'])].index.values)
  return df.drop(exclude_indices, axis='index')

In [None]:
data=bhk_outlier_remover(data)

In [None]:
data.shape

(10301, 7)

In [None]:
data

In [None]:
data.drop(columns=['size','price_per_sqft'],inplace=True)

**Cleaned Data**

In [None]:
data.head()

Unnamed: 0,location,total_sqft,bath,price,bhk
0,1st Block Jayanagar,2850.0,4.0,428.0,4
1,1st Block Jayanagar,1630.0,3.0,194.0,3
2,1st Block Jayanagar,1875.0,2.0,235.0,3
3,1st Block Jayanagar,1200.0,2.0,130.0,3
4,1st Block Jayanagar,1235.0,2.0,148.0,2


In [None]:
# Persist the cleaned table. index=False is not passed, so the DataFrame
# index is written as the first (unnamed) CSV column.
# NOTE(review): pass index=False if readers of this file should not see that
# extra column — confirm with its consumers first.
data.to_csv("Cleaned_data.csv")

In [None]:
X=data.drop(columns=['price'])
y=data['price']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.2, random_state=0)

In [None]:
print (X_train.shape)
print (X_test.shape)

(8240, 4)
(2061, 4)


**Applying Linear Regression**

In [None]:
# One-hot encode `location` and pass the remaining columns through unchanged.
# OneHotEncoder's `sparse` argument was renamed to `sparse_output` in
# scikit-learn 1.2 and later removed, so dense output is requested with
# ColumnTransformer's `sparse_threshold=0`, which works on old and new
# releases alike. handle_unknown='ignore' encodes a location never seen
# during fit as all zeros instead of raising at predict time, which matters
# once the fitted pipeline is pickled and reused.
column_trans = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), ['location']),
    remainder='passthrough',
    sparse_threshold=0,
)

In [None]:
scaler = StandardScaler()

In [None]:
# `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2, so
# passing it raises TypeError on current releases. Feature scaling is already
# handled by the StandardScaler step placed before this estimator.
lr = LinearRegression()

In [None]:
pipe = make_pipeline(column_trans,scaler, lr)

In [None]:
pipe.fit(X_train,y_train)

In [None]:
y_pred_lr = pipe.predict(X_test)

In [None]:
r2_score(y_test,y_pred_lr)

0.8284353255504224

**Applying Lasso**

In [None]:
lasso = Lasso()

In [None]:
pipe = make_pipeline(column_trans,scaler, lasso)

In [None]:
pipe.fit(X_train,y_train)

In [None]:
y_pred_lasso = pipe.predict(X_test)
r2_score(y_test, y_pred_lasso)

**Applying Ridge**

In [None]:
ridge = Ridge()

In [None]:
pipe = make_pipeline(column_trans,scaler, ridge)

In [None]:
pipe.fit(X_train,y_train)

In [None]:
y_pred_ridge = pipe.predict(X_test)
r2_score(y_test,y_pred_ridge)

In [None]:
# Report the held-out R^2 of each model from its own predictions
# (previously all three lines printed the Lasso score).
print("No regularization: ",r2_score(y_test,y_pred_lr))
print("Lasso: ",r2_score(y_test,y_pred_lasso))
print("Ridge: ",r2_score(y_test,y_pred_ridge))

In [None]:
import pickle

In [None]:
# Serialize the most recently fitted pipeline (`pipe` was last bound to the
# Ridge pipeline). The context manager guarantees the file is flushed and
# closed even if pickling fails.
with open('RidgeModel.pkl', 'wb') as model_file:
  pickle.dump(pipe, model_file)