In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split,GridSearchCV
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the house-sales CSV from the Kaggle input directory and preview
# the first ten rows.
csv_path = "/kaggle/input/housesalesprediction/kc_house_data.csv"
dataframe = pd.read_csv(csv_path)
dataframe.head(10)

# Exploratory Data Analysis

In [None]:
# Shape of the dataframe as a (rows, columns) tuple
dataframe.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
dataframe.describe()

In [None]:
# Display the raw values of the 'date' column before it is converted
dataframe["date"]

In [None]:
# Parse 'date' as datetimes and keep only the calendar year, overwriting
# the original column; then display the converted column.
parsed_dates = pd.to_datetime(dataframe['date'])
dataframe['date'] = parsed_dates.dt.year
dataframe['date']

In [None]:
# Scatter plot of price against the (year-valued) date column
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(dataframe["date"], dataframe["price"])
ax.set_title("Price vs. Date", fontsize=14)
ax.set_xlabel("Date")
ax.set_ylabel("Price")
plt.show()

In [None]:
# Remove the 'id' and 'date' columns in place, then preview the frame.
dataframe.drop(columns=["id", "date"], inplace=True)
dataframe.head(10)

In [None]:
# Column names, dtypes and non-null counts
dataframe.info()

In [None]:
# Number of missing values in each column
dataframe.isna().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row
dataframe.duplicated().sum()

In [None]:
# Remove the duplicated rows in place
dataframe.drop_duplicates(inplace=True)

In [None]:
# Pairwise correlation between the columns; displayed here and reused
# by the heatmap cell below
corelation=dataframe.corr()
corelation

In [None]:
# Annotated heatmap of the correlation matrix (values shown to 2 decimals)
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corelation, annot=True, fmt='.2f', cmap='Wistia', ax=ax)
plt.show()

# Detecting Outliers 

In [None]:
# Box plots of price grouped by four discrete features: shows the median and
# percentile spread per group and makes the outliers visible.
fig = plt.figure(figsize=(18, 10))

# (feature column, box colour, panel title) for each cell of the 2x2 grid
panels = [
    ("bedrooms", "r", "Price vs bedrooms "),
    ("floors", "b", "Price vs floors"),
    ("bathrooms", "c", "Price vs bathrooms"),
    ("grade", "y", "Price vs grade"),
]

for slot, (feature, shade, heading) in enumerate(panels, start=1):
    ax = fig.add_subplot(2, 2, slot)
    sns.boxplot(data=dataframe, x=dataframe[feature], y=dataframe["price"],
                hue=None, color=shade, ax=ax)
    ax.set_title(heading)

plt.show()

# Train RandomForestRegressor before removing outliers

In [None]:
# Hold out 20% of the rows for testing. 'price' is the target and every
# other column is a feature; the split is seeded for repeatability.
features = dataframe.drop(columns="price")
target = dataframe["price"]
x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

In [None]:
# Data preprocessing: standardise the features. The scaler is fitted on the
# training split only and then applied to both splits.
std = StandardScaler().fit(x_train)
x_train = std.transform(x_train)
x_test = std.transform(x_test)

In [None]:
# Models to tune, keyed by display name: [estimator, GridSearchCV parameter grid].
# The forest is seeded so that the RMSEs of the successive experiments differ
# because of the data changes, not because of random tree construction.
model_parameters = {
    "RandomForestregressor": [
        RandomForestRegressor(random_state=42),
        {
            'n_estimators': [100, 200],
            'max_depth': [6, 8],
            'min_samples_split': [2, 4],
            'criterion': ['squared_error'],
        },
    ]
}

In [None]:
# Grid-search every configured model on the training split (2-fold CV,
# scored by R^2) and record its RMSE on the held-out test split.
result = {}

for model_name, (estimator, param_grid) in model_parameters.items():
    regressor = GridSearchCV(estimator, param_grid, cv=2, scoring="r2", n_jobs=-1)
    regressor.fit(x_train, y_train)
    regressor.best_params_  # bare expression: nothing is displayed from inside a loop
    y_pred = regressor.predict(x_test)
    result[model_name] = np.sqrt(mean_squared_error(y_test, y_pred))



In [None]:
# Show the test-set RMSE recorded for each model
print(result)

# Removing Outliers with the Z-Score Method

In [None]:
# Flag every row in which at least one column lies more than `z_threshold`
# standard deviations from that column's mean.
#
# The z-scores are computed for the whole frame at once instead of element by
# element with .iloc, which is far faster on a frame of this size. It also
# avoids rebinding the name `std`, which holds the fitted StandardScaler.
columns = dataframe.columns.to_list()
z_threshold = 3

column_means = dataframe[columns].mean()
# ddof=0 gives the population standard deviation, matching np.std
column_stds = dataframe[columns].std(ddof=0)
z_scores = (dataframe[columns] - column_means) / column_stds

# True for rows with at least one |z| above the threshold
outlier_mask = (z_scores.abs() > z_threshold).any(axis=1)

# Positional (iloc-style) row numbers, de-duplicated by construction; the drop
# cell maps them back to index labels with dataframe.index[outliers].
outliers = np.flatnonzero(outlier_mask.to_numpy()).tolist()

# Ratio of outliers present in dataset

In [None]:
# Fraction of the rows that were flagged as outliers
ratio= len(outliers)/len(dataframe)
ratio

In [None]:
# Translate the positional outlier row numbers into index labels and
# remove those rows in place.
dataframe.drop(index=dataframe.index[outliers], inplace=True)

In [None]:
# Shape after the outlier rows have been dropped
dataframe.shape

# Boxplot after removing the outliers

In [None]:
# Same four box plots as before, now drawn from the dataframe with the
# outlier rows removed, to compare the median/percentile spread.
fig = plt.figure(figsize=(18, 10))

# (feature column, box colour, panel title) for each cell of the 2x2 grid
panels = [
    ("bedrooms", "r", "Price vs bedrooms "),
    ("floors", "b", "Price vs floors"),
    ("bathrooms", "c", "Price vs bathrooms"),
    ("grade", "y", "Price vs grade"),
]

for slot, (feature, shade, heading) in enumerate(panels, start=1):
    ax = fig.add_subplot(2, 2, slot)
    sns.boxplot(data=dataframe, x=dataframe[feature], y=dataframe["price"],
                hue=None, color=shade, ax=ax)
    ax.set_title(heading)

plt.show()

# Train the model again after removing the outliers

In [None]:
# Re-split and re-scale from the *current* dataframe (outlier rows removed).
# The split made earlier was taken before the outliers were dropped, so
# reusing it would simply retrain on the unfiltered data and the comparison
# with `result` would be meaningless.
# Note: the test split is drawn from the filtered data as well, so this RMSE
# is measured on a different test set than the baseline.
x_train_no_out, x_test_no_out, y_train_no_out, y_test_no_out = train_test_split(
    dataframe.drop("price", axis=1),
    dataframe["price"],
    test_size=0.2,
    random_state=42,
)

# Fresh scaler, fitted on the new training split only
scaler_no_out = StandardScaler()
x_train_no_out = scaler_no_out.fit_transform(x_train_no_out)
x_test_no_out = scaler_no_out.transform(x_test_no_out)

# Test-set RMSE per model after outlier removal
result1 = {}

for key, value in model_parameters.items():
    regressor = GridSearchCV(value[0], value[1], cv=2, scoring="r2", n_jobs=-1).fit(
        x_train_no_out, y_train_no_out
    )
    y_pred = regressor.predict(x_test_no_out)
    mse = mean_squared_error(y_test_no_out, y_pred)
    rmse = np.sqrt(mse)
    result1[key] = rmse



In [None]:
# RMSE per model recorded by the second training run
result1

# Remove the column sqft_above 

In [None]:
# Remove the 'sqft_above' column in place
dataframe.drop(columns="sqft_above", inplace=True)

In [None]:
# Shape after the 'sqft_above' column has been dropped
dataframe.shape

In [None]:
# Re-split and re-scale from the *current* dataframe, which no longer has the
# 'sqft_above' column. The arrays from the earlier split still contain that
# feature, so reusing them would not measure the effect of removing it.
x_train_reduced, x_test_reduced, y_train_reduced, y_test_reduced = train_test_split(
    dataframe.drop("price", axis=1),
    dataframe["price"],
    test_size=0.2,
    random_state=42,
)

# Fresh scaler, fitted on the new training split only
scaler_reduced = StandardScaler()
x_train_reduced = scaler_reduced.fit_transform(x_train_reduced)
x_test_reduced = scaler_reduced.transform(x_test_reduced)

# Test-set RMSE per model after dropping 'sqft_above'
result2 = {}

for key, value in model_parameters.items():
    regressor = GridSearchCV(value[0], value[1], cv=2, scoring="r2", n_jobs=-1).fit(
        x_train_reduced, y_train_reduced
    )
    y_pred = regressor.predict(x_test_reduced)
    mse = mean_squared_error(y_test_reduced, y_pred)
    rmse = np.sqrt(mse)
    result2[key] = rmse



In [None]:
# RMSE per model recorded by the third training run
result2

In [None]:
# Collect the three RMSE dictionaries into one table: one row per
# experiment, one column per model.
result_df = pd.DataFrame(
    {
        'Result_With_Outliers': result,
        'Result_without_outliers': result1,
        'Result_after_Collineraity': result2,
    }
).T
result_df

In [None]:
# Bar chart of the RMSE per experiment; the legend is anchored at the
# top-right corner of the axes.
ax = result_df.plot(kind="bar", figsize=(10, 7))
ax.legend(bbox_to_anchor=(1.0, 1.0));