In [None]:
# import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# load the dataset
df = pd.read_csv("house_price.csv")
# preview only the first rows rather than rendering the whole dataframe
df.head()

In [None]:
# inspect the dataframe's structure with pandas' info() summary before any cleaning
df.info()

In [None]:
# display pandas' describe() summary statistics for the dataframe
df.describe()

## Outlier detection and removal using mean and standard deviation

In [None]:
# average price per square foot: the centre of the mean +/- threshold*std outlier band
mean_price_per_sqft = df["price_per_sqft"].mean()
mean_price_per_sqft

In [None]:
# spread of price_per_sqft (pandas' .std()); sizes the outlier band around the mean
std_dev_price_per_sqft = df["price_per_sqft"].std()
std_dev_price_per_sqft

In [None]:
# how many standard deviations away from the mean a value may lie before it is flagged
threshold = 3


In [None]:
# flag rows whose price_per_sqft falls outside mean +/- threshold * std
outliers_mean = df[
    df["price_per_sqft"].lt(mean_price_per_sqft - threshold * std_dev_price_per_sqft)
    | df["price_per_sqft"].gt(mean_price_per_sqft + threshold * std_dev_price_per_sqft)
]

In [None]:
# keep only the rows whose price_per_sqft is not one of the flagged outlier values
df_cleaned_mean = df.loc[
    ~df["price_per_sqft"].isin(outliers_mean["price_per_sqft"])
]

In [None]:
# report how many rows the mean-based rule flagged
print(
    "Number of outliers detected using mean function : ",
    len(outliers_mean.index),
)

## Outlier detection and removal using percentile method

In [None]:
# 5th percentile of price_per_sqft: values below it are treated as outliers
lower_percentile = df["price_per_sqft"].quantile(q=0.05)
lower_percentile


In [None]:
# 95th percentile of price_per_sqft: values above it are treated as outliers
upper_percentile = df["price_per_sqft"].quantile(q=0.95)
upper_percentile

In [None]:
# flag rows lying below the lower percentile or above the upper percentile
outliers_percentile = df.loc[
    df["price_per_sqft"].lt(lower_percentile)
    | df["price_per_sqft"].gt(upper_percentile)
]

In [None]:
# keep only the rows whose price_per_sqft is not one of the flagged outlier values
df_cleaned_percentile = df.loc[
    ~df["price_per_sqft"].isin(outliers_percentile["price_per_sqft"])
]

In [None]:
# report how many rows the percentile rule flagged
print(
    "Number of outliers detected using percentile method : ",
    len(outliers_percentile.index),
)

## Outlier detection and removal using IQR method

In [None]:
# first quartile (25th percentile) of price_per_sqft
q1 = df["price_per_sqft"].quantile(q=0.25)
q1

In [None]:
# third quartile (75th percentile) of price_per_sqft
q3 = df["price_per_sqft"].quantile(q=0.75)
q3

In [None]:
# interquartile range: distance between the third and first quartiles
IQR = q3 - q1
IQR

In [None]:
# lower fence for outlier detection: 1.5 IQRs below the first quartile
lower_bound = q1 - IQR * 1.5
lower_bound

In [None]:
# upper fence for outlier detection: 1.5 IQRs above the third quartile
upper_bound = q3 + IQR * 1.5
upper_bound

In [None]:
# flag rows whose price_per_sqft lies outside the [lower_bound, upper_bound] fences
outliers_IQR = df.query(
    "price_per_sqft < @lower_bound or price_per_sqft > @upper_bound"
)

In [None]:
# keep only the rows whose price_per_sqft is not one of the flagged outlier values
df_cleaned_IQR = df.loc[
    ~df["price_per_sqft"].isin(outliers_IQR["price_per_sqft"])
]

In [None]:
# report how many rows the IQR rule flagged
print(
    "Number of outliers detected by IQR method : ",
    len(outliers_IQR.index),
)

## Outlier detection and removal using zscore method


In [None]:
from scipy.stats import zscore


In [None]:
# standardise price_per_sqft and store the scores in a new "z_score" column of df
df["z_score"] = zscore(df["price_per_sqft"])
df.head()



In [None]:
# cut-off value used by the z-score outlier detection that follows
threshold = 3

In [None]:
# outlier detection
# Compare the standardised "z_score" column against the threshold. Comparing
# the raw price_per_sqft values (as before) flagged every row whose price
# exceeded 3, which is unrelated to how many standard deviations it lies
# from the mean.
outliers_zscore = df[df["z_score"].abs() > threshold]

In [None]:
# keep only the rows whose price_per_sqft is not one of the flagged outlier values
df_cleaned_zscore = df.loc[
    ~df["price_per_sqft"].isin(outliers_zscore["price_per_sqft"])
]

In [None]:
# report how many rows the z-score rule flagged
print(
    "Number of outliers detected using Zscore method : ",
    len(outliers_zscore.index),
)

## Boxplot for all numeric columns

In [None]:
# Box plot of the dataframe's numeric columns.
# The helper "z_score" column created in the z-score section is only a
# standardised copy of price_per_sqft, so it is excluded from the plot;
# errors="ignore" keeps this cell working when that column is absent.
df.drop(columns="z_score", errors="ignore").boxplot(figsize=(12, 8))
plt.title('Boxplot for Numeric Columns')
plt.xticks(rotation=45)
plt.show()

## Histplot for price_per_sqft column

In [None]:
# Distribution of the "price per sqft" column: histogram with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(df['price_per_sqft'], kde=True, ax=ax)
ax.set_title('Histogram of Price per Sqft')
ax.set_xlabel('Price per Sqft')
ax.set_ylabel('Frequency')
plt.show()

## Check correlation between all numeric columns and plot heatmap

In [None]:
# Select only numeric columns.
# The helper "z_score" column created in the z-score section is a linear
# rescaling of price_per_sqft, so it would only add a duplicate row/column
# with correlation 1.00; drop it (errors="ignore" tolerates its absence).
numeric_columns = (
    df.select_dtypes(include=['number'])
    .drop(columns="z_score", errors="ignore")
)

# Calculate the pairwise correlation matrix of the remaining columns
correlation_matrix = numeric_columns.corr()

# Plot the heatmap, annotating each cell with the coefficient to two decimals
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", square=True)
plt.title('Correlation Heatmap of Numerical Columns')
plt.show()

## Scatterplot between variables to check correlation between them

In [None]:
# Create a pairplot (scatter plots off the diagonal, KDE curves on it).
# The helper "z_score" column created in the z-score section is a rescaled
# copy of price_per_sqft and would add a redundant row/column of panels, so
# it is excluded; errors="ignore" tolerates the column being absent.
sns.pairplot(df.drop(columns="z_score", errors="ignore"), diag_kind='kde')
plt.suptitle('Pairplot of Numerical Columns')
plt.show()