In [None]:
#importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter("ignore", category=RuntimeWarning)

In [None]:
#loading dataset
df = pd.read_csv('/kaggle/input/bengaluru-house-price-data/Bengaluru_House_Data.csv')

In [None]:
df.sample(5)

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
#percentage missing in cols
(df.isnull().sum() / df.shape[0]) * 100

In [None]:
# 'society' is missing in about 41% of the rows, so this feature is dropped.
# Reassigning (instead of inplace=True) with errors='ignore' keeps this cell
# safe to re-run after the column is already gone.
df = df.drop(columns=['society'], errors='ignore')

In [None]:
#checking unique values in all columns
df.nunique()

In [None]:
# Print, for every column, the share (in percent of all rows) of each distinct value.
row_count = df.shape[0]
for column_name in df.columns:
    value_share = df[column_name].value_counts() / row_count
    print(value_share * 100)

<h1>Analysis of each column separately:</h1>

<h2>Analysis of area_type column:</h2>

In [None]:
df['area_type'].value_counts().plot.pie(autopct='%.2f',shadow=True,startangle=90,wedgeprops={'edgecolor':'black'},rotatelabels=True)
plt.show()

In [None]:
sns.displot(df, x="price", col="area_type", kde=True, col_wrap=4)
plt.show()

In [None]:
# area_type has 4 categories, one of which covers about 66% of the rows, and the
# probability density of price is almost the same for all four, so the feature
# is dropped. Reassigning with errors='ignore' keeps this cell safe to re-run.
df = df.drop(columns=['area_type'], errors='ignore')

<h2>Analysis of availability column:</h2>

In [None]:
(df['availability'].value_counts()/df.shape[0])*100

In [None]:
# availability has 81 categories, but a single value covers around 80% of the
# rows, so this feature is not useful and is dropped.
# Reassigning with errors='ignore' keeps this cell safe to re-run.
df = df.drop(columns=['availability'], errors='ignore')

In [None]:
df.sample(3)

<h2>Analysis of location column:</h2>

In [None]:
df['location'].value_counts()

In [None]:
df['location'].isna().sum()

In [None]:
# Fill missing locations with the most frequent location (the mode), computed
# from the data instead of being hard-coded.
# NOTE(review): the previous version hard-coded 'Whitefield'; confirm that it is
# indeed the mode of this dataset so results are unchanged.
location_mode = df['location'].mode()[0]
df['location'] = df['location'].fillna(location_mode)

In [None]:
df['location'].nunique()

In [None]:
location_counts = df['location'].value_counts()

In [None]:
len(location_counts[location_counts<=10])

In [None]:
len(location_counts[location_counts>10])

In [None]:
# There are 1064 locations with 10 or fewer occurrences.
# To reduce the number of unique categories, these locations will be grouped under a new category named "other".
# Mapping through location_counts (instead of indexing it once per row) is
# vectorized and tolerates labels that are absent from location_counts, such as
# "other" itself when this cell is re-run: absent labels map to NaN, fail the
# comparison, and stay/become "other".
is_frequent_location = df['location'].map(location_counts) > 10
df['location'] = df['location'].where(is_frequent_location, 'other')

In [None]:
df['location'].nunique()

Now there are only 242 categories.

In [None]:
df.sample(3)

<h2>Analysis of size column:</h2>

In [None]:
# now let's work on size column
df['size'].isnull().sum()

In [None]:
df['size'].value_counts()

In [None]:
# Fill missing sizes with the most frequent size (the mode), computed from the
# data instead of being hard-coded.
# NOTE(review): the previous version hard-coded '2 BHK'; confirm that it is
# indeed the mode of this dataset so results are unchanged.
size_mode = df['size'].mode()[0]
df['size'] = df['size'].fillna(size_mode)

In [None]:
df['size'].value_counts()

The 'size' column contains two types of categories: 'Bedroom' and 'BHK'.<br>
To ensure consistency, this needs to be standardized.<br>
A new numeric column, BHK, is created from the leading number of each value.

In [None]:
# Keep the text before the first space of each size value and cast it to int.
df['BHK'] = df['size'].str.split(' ').str[0].astype(int)

In [None]:
df['BHK'].value_counts()

In [None]:
sns.boxplot(x=df['BHK'],y=df['price'])
plt.show()

There are many outliers that need to be fixed; they will be handled later.

In [None]:
# Removing the size column; its information now lives in the BHK column.
# Reassigning with errors='ignore' keeps this cell safe to re-run.
df = df.drop(columns=['size'], errors='ignore')

In [None]:
df.sample(5)

<h2> Analysis of total_sqft column:</h2>

In [None]:
df['total_sqft'].nunique()

In [None]:
df['total_sqft'].unique()

In [None]:
# there are few values in ranges 
def convert_ranges_to_sqft(x):
    """Convert a total_sqft entry to a single float.

    Parameters
    ----------
    x : str or number
        Either a plain number ("1200"), a range written as "low - high"
        ("2100 - 2850"), or anything else.

    Returns
    -------
    float or None
        The number itself, the midpoint of the range, or None when the value
        cannot be interpreted as either (e.g. "34.46Sq. Meter" or None).
    """
    if not isinstance(x, str):
        # Already numeric (or missing): only strings have .split().
        try:
            return float(x)
        except (TypeError, ValueError):
            return None

    parts = x.split('-')
    if len(parts) == 2:
        try:
            return (float(parts[0]) + float(parts[1])) / 2
        except ValueError:
            # Not a numeric range (e.g. "-5" or "1e-3"); try it as a plain number.
            pass

    try:
        return float(x)
    except ValueError:
        return None

In [None]:
df['total_sqft'] = df['total_sqft'].apply(convert_ranges_to_sqft)

In [None]:
df.sample(3)

<h4>Creating a new column, price per sqft, because it is a very important feature in real estate:</h4>

In [None]:
# Price per square foot: price scaled by 100000, divided by the area.
# NOTE(review): presumably price is quoted in lakhs (1 lakh = 100,000 rupees) — confirm against the dataset description.
df['price_per_sqft'] = df['price']*100000/df['total_sqft']

In [None]:
df['price_per_sqft']

In [None]:
df.describe()

In [None]:
df.sample(3)

<h2>Analysis of bath column:</h2>

In [None]:
df['bath'].nunique()

In [None]:
df['bath'].value_counts()

In [None]:
df['bath'].unique()

In [None]:
df['bath'].isnull().sum()

In [None]:
#filling missing value with median
df['bath'] = df['bath'].fillna(df['bath'].median())

In [None]:
sns.countplot(x=df['bath'])

This column contains outliers, which will be handled later.

In [None]:
df.sample(3)

<h2>Analysis of balcony column:</h2>

In [None]:
df['balcony'].nunique()

In [None]:
df['balcony'].value_counts()

In [None]:
df['balcony'].isnull().sum()

In [None]:
df['balcony'].value_counts().plot.pie(autopct='%.2f',shadow=True,startangle=90,wedgeprops={'edgecolor':'black'},rotatelabels=True)
plt.show()

In [None]:
sns.displot(df, x="price", col="balcony", kde=True, col_wrap=4)
plt.show()

Since there are only four categories and the price distribution (PDF) is almost the same for all of them, this feature does not affect price much, so it will be dropped.

In [None]:
# Drop the balcony feature. Reassigning (instead of inplace=True) with
# errors='ignore' keeps this cell safe to re-run.
df = df.drop(columns=['balcony'], errors='ignore')

In [None]:
df.sample(3)

<h4>The analysis of all columns is complete. Next, the focus will be on detecting and removing outliers.
</h4>

<h1>Outlier detection and removal</h1>

In [None]:
df.sample(3)

In [None]:
df.describe()

<h2>Outliers of the total_sqft column:</h2>

Let's look at the area per BHK:

In [None]:
(df['total_sqft']/df['BHK']).describe()

In [None]:
# Each BHK is expected to account for at least 300 sqft, so rows whose area per
# BHK is below 300 are removed. Rows whose total_sqft is missing also fail the
# comparison and are removed. .copy() matches the later filtering cells and
# makes the result an independent frame rather than a slice of the old one.
df = df[(df['total_sqft']/df['BHK']) >= 300].copy()

In [None]:
df.shape

In [None]:
df.describe()

In [None]:
sns.displot(x=df['total_sqft'])

In [None]:
df.sample(3)

<h2>Outliers of price_per_sqft:</h2>

In [None]:
df['price_per_sqft'].describe()

In [None]:
#seeing price_per_sqft location wise
df.groupby('location')['price_per_sqft'].describe()

For each location group, price_per_sqft will be kept within the range [mean - standard deviation, mean + standard deviation].

In [None]:
#function to kept price_per_sqft in range
def remove_price_per_sqft_outliers(df, n_std=1.0):
    """Drop rows whose price_per_sqft is far from the mean of their location.

    For every location group, only rows with price_per_sqft inside
    [mean - n_std * std, mean + n_std * std] are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the 'location' and 'price_per_sqft' columns.
    n_std : float, default 1.0
        Half-width of the accepted band, in standard deviations of the group.

    Returns
    -------
    pandas.DataFrame
        The kept rows, ordered group by group (groupby order), with a fresh
        0..n-1 index. Rows with a NaN price_per_sqft, and groups whose
        standard deviation is NaN (e.g. a single-row group), are dropped
        because comparisons against NaN are False. An empty input yields an
        empty frame that still has the input's columns.
    """
    kept_groups = []
    for _, sub_df in df.groupby('location'):
        prices = sub_df['price_per_sqft']
        mean = prices.mean()
        band = n_std * prices.std()
        kept = sub_df[(prices >= mean - band) & (prices <= mean + band)]
        if not kept.empty:
            kept_groups.append(kept)

    if not kept_groups:
        # pd.concat([]) raises; return an empty frame with the same columns.
        return df.iloc[0:0].reset_index(drop=True)

    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(kept_groups, ignore_index=True)
df = remove_price_per_sqft_outliers(df)

In [None]:
df['price_per_sqft'].describe()

In [None]:
df.sample(2)

<h2>Outliers of the BHK column:</h2>

In [None]:
sns.countplot(x=df['BHK'])

In [None]:
sns.boxplot(x=df['BHK'])

Houses with more than 7 BHK are uncommon, so they will be removed from the dataset.

In [None]:
df = df[df['BHK'] <= 7].copy()

In [None]:
sns.boxplot(x=df['BHK'])
plt.show()

In [None]:
df.sample(2)

<h2>Outliers of the bath column:</h2>

In [None]:
sns.countplot(x=df['bath'])
plt.show()

In [None]:
# Houses with more than 6 bathrooms are uncommon, so they will be removed from the dataset.
df = df[df['bath'] <= 6].copy()

In [None]:
# Ensuring that the number of bathrooms does not exceed BHK + 2
df = df[df['bath'] <= df['BHK'] + 2].copy()

In [None]:
sns.boxplot(x=df['bath'])
plt.show()

In [None]:
df.describe()

In [None]:
df.sample(2)

In [None]:
# Dropping the price_per_sqft column as it will no longer be used in further analysis.
# Reassigning (instead of inplace=True) with errors='ignore' keeps this cell safe to re-run.
df = df.drop(columns=['price_per_sqft'], errors='ignore')

In [None]:
df.sample(3)

<h3>The dataset is now cleaned.</h3>

<h1>Model Building:</h1>

In [None]:
#importing libraries
from sklearn.base import clone
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [None]:
# Separate the target ('price') from the features and hold out 20% of the rows
# for testing; random_state fixes the shuffle so the split is repeatable.
X_train,X_test,y_train,y_test = train_test_split(df.drop(columns=['price']),df['price'],test_size=.2,random_state=2)

In [None]:
X_train.sample(3)

In [None]:
# One-hot encode the 'location' column and pass every other column through unchanged.
# drop='first' omits one indicator column per encoded feature; handle_unknown='ignore'
# asks the encoder not to raise on categories that were not seen during fit.
# NOTE(review): with both options set, an unseen location presumably encodes as all
# zeros, i.e. the same as the dropped first category — confirm against the scikit-learn docs.
one_hot_transformer = ColumnTransformer([
    ('one_hot_encoder',OneHotEncoder(drop='first',sparse_output=False,dtype=int,handle_unknown='ignore'),['location'])
],remainder='passthrough')

In [None]:
#scaling
scaler = StandardScaler()

<h2>Applying Linear Regression:</h2>

In [None]:
lr = LinearRegression()

In [None]:
# clone() gives this pipeline its own unfitted copies of the encoder and scaler,
# so fitting another pipeline built from the same objects cannot refit (and
# silently change) this pipeline's preprocessing steps.
pipe_lr = make_pipeline(clone(one_hot_transformer),clone(scaler),lr)

In [None]:
pipe_lr.fit(X_train,y_train)

In [None]:
y_pred_lr = pipe_lr.predict(X_test)

In [None]:
r2_score(y_test,y_pred_lr)

<h2>Applying Lasso:</h2>

In [None]:
lasso = Lasso()

In [None]:
# clone() gives this pipeline its own unfitted copies of the encoder and scaler,
# so fitting another pipeline built from the same objects cannot refit (and
# silently change) this pipeline's preprocessing steps.
pipe_lasso = make_pipeline(clone(one_hot_transformer),clone(scaler),lasso)

In [None]:
pipe_lasso.fit(X_train,y_train)

In [None]:
y_pred_lasso = pipe_lasso.predict(X_test)
r2_score(y_test,y_pred_lasso)

<h2>Applying Ridge:</h2>

In [None]:
ridge = Ridge()

In [None]:
# clone() gives this pipeline its own unfitted copies of the encoder and scaler,
# so fitting another pipeline built from the same objects cannot refit (and
# silently change) this pipeline's preprocessing steps.
pipe_ridge = make_pipeline(clone(one_hot_transformer),clone(scaler),ridge)

In [None]:
pipe_ridge.fit(X_train,y_train)

In [None]:
y_pred_ridge = pipe_ridge.predict(X_test)
r2_score(y_test,y_pred_ridge)

In [None]:
# Side-by-side R^2 on the held-out test set for the three fitted models.
model_predictions = (
    ("LinearRegression", y_pred_lr),
    ("LassoRegression", y_pred_lasso),
    ("RidgeRegression", y_pred_ridge),
)
for model_label, predictions in model_predictions:
    print(f"{model_label} r2_score: ", r2_score(y_test, predictions))