# House Price Prediction

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found there.
input_walk = os.walk('/kaggle/input')
for folder, _, names in input_walk:
    for leaf in names:
        print(os.path.join(folder, leaf))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

##  sklearn imports

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Machine Learning Algorithms
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

## 1. Data importing 

In [None]:
# Load the housing dataset from the Kaggle input mount into a DataFrame.
data = pd.read_csv('/kaggle/input/housing-price-prediction/Housing.csv')

In [None]:
# Preview the first 5 records of the dataset
data.head(5)

## 2. Column description
- There are 13 columns in our dataset.
1. `Price`: The price of the house.
2. `Area`: The total area of the house in square feet.
3. `Bedrooms`: The number of bedrooms in the house.
4. `Bathrooms`: The number of bathrooms in the house.
5. `Stories`: The number of stories in the house.
6. `Mainroad`: Whether the house is connected to the main road (Yes/No).
7. `Guestroom`: Whether the house has a guest room (Yes/No).
8. `Basement`: Whether the house has a basement (Yes/No).
9. `Hot water heating`: Whether the house has a hot water heating system (Yes/No).
10. `Airconditioning`: Whether the house has an air conditioning system (Yes/No).
11. `Parking`: The number of parking spaces available within the house.
12. `Prefarea`: Whether the house is located in a preferred area (Yes/No).
13. `Furnishing status`: The furnishing status of the house (Fully Furnished, Semi-Furnished, Unfurnished).

## 3. Basic Programmatic Analysis

In [None]:
# Show the top rows again as a reference for the basic checks in this section.
data.head()

In [None]:
# Report the dataset's dimensions: the full shape, then rows and columns separately
n_rows, n_columns = data.shape
print('Shape of original dataset: ', data.shape)
print('Number of rows in dataset: ', n_rows)
print('Number of columns in dataset: ', n_columns)

In [None]:
# Summarise the DataFrame: column dtypes and non-null counts.
data.info()

##  conclusion 1: Basic info about dataset and  its column count
- The dataset has 13 columns in total.
- 6 columns hold numerical data (`int64`).
- 7 columns hold categorical data.
- The dataset has 545 rows.

In [None]:
# Per-column count of missing values (isna is the canonical name; isnull is its alias)
data.isna().sum()

In [None]:
# Count the rows that are exact duplicates of an earlier row
data.duplicated().sum()

## Conclusion 2: about missing and duplicated data
- The dataset has no missing values and no duplicated rows.

#  Quick Summary about data

In [None]:
# Numerical columns summary, rounded to 2 decimals
data.describe().round(2)

In [None]:
# Categorical columns summary: include='O' restricts describe() to object-dtype columns
data.describe(include='O')

# 4. Exploratory Data Analysis

## EDA-1. Column Type

In [None]:
# Split the column names by type. describe() summarises the numeric columns by
# default; include='O' restricts it to the object-dtype (categorical) columns.
num_cols = data.describe().columns
cat_cols = data.describe(include='O').columns

# end='\n\n' leaves one blank line between the two listings
print('Numerical columns are: ', num_cols, end='\n\n')
print('Categorical columns are: ', cat_cols)

1. `Numerical Column`: price, area, bedrooms, bathrooms, stories, parking
2. `Categorical Columns`: mainroad, guestroom, basement, hotwaterheating, airconditioning, prefarea, furnishingstatus
3. our target column is `price`

## EDA-2. Univariate Analysis

### 1. Numerical columns univariate analysis 

In [None]:
# Display the numerical column names that the univariate analysis iterates over.
num_cols

In [None]:
# Descriptive statistics of the numerical columns, rounded to 2 decimals
data[num_cols].describe().round(2)

In [None]:
# Helper for the univariate analysis of a numerical column
def univariate_num(col):
    """Plot the distribution of ``data[col]`` and report its skewness.

    Draws a 30-bin histogram (with KDE) next to a boxplot, shows the figure,
    then prints the sample skewness with a label for its sign.

    Parameters
    ----------
    col : str
        Name of a numerical column of the module-level ``data`` frame.
    """
    series = data[col]
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(10,5))

    # left panel: histogram
    sns.histplot(x=series, bins=30, stat='frequency', ax=hist_ax, kde=True)
    hist_ax.set_title(f'Histogram for {col} distribution.')

    # right panel: boxplot
    sns.boxplot(x=series, ax=box_ax)
    box_ax.set_title(f'Boxplot for {col} column.')
    plt.show()

    # Label the sign of the skewness; anything neither negative nor positive
    # falls through to the last label.
    skewness = series.skew()
    if skewness < 0:
        verdict = 'negative skewed'
    elif skewness > 0:
        verdict = 'positively skewed '
    else:
        verdict = 'normaly distributed.'
    print(f'skewness for {col} column data is {skewness} --> {verdict}')
        
   
    


In [None]:
# Numbered univariate report for every numerical column
for a, col in enumerate(num_cols, start=1):
    print(f'{a}. Univariate analysis for {col} column:')
    univariate_num(col)
    print('=='*40)

## Conclusion 3: about Univariate analysis on numerical columns
- Two columns show notable outliers: `price` and `area`.
- The other columns also show some outliers, but they will not affect our analysis.
- The `area` column is positively skewed; we will apply a logarithmic transform to bring its distribution closer to normal.


### 2. Univariate analysis on categorical  columns

In [None]:
# Helper for the univariate analysis of a categorical column
def univariate_cat(col):
    """Print the category counts of ``data[col]`` and plot them two ways.

    The left panel is a seaborn count plot; the right panel is a pie chart of
    the same counts, labelled with percentages.

    Parameters
    ----------
    col : str
        Name of a categorical column of the module-level ``data`` frame.
    """
    count = data[col].value_counts()
    print(count)

    fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(10,5))

    # left panel: countplot
    sns.countplot(x=data[col], ax=bar_ax)
    bar_ax.set_title(f'Countplot for {col} distribution.')

    # right panel: pie chart of the same counts
    pie_ax.pie(count, labels=count.index, autopct='%1.1f%%', shadow=True)
    pie_ax.set_title(f'Piechart for {col} column.')
    plt.show()

In [None]:
# Numbered univariate report for every categorical column
for a, col in enumerate(cat_cols, start=1):
    print(f"{a}. Univariate analysis for categorical '{col}' column:")
    univariate_cat(col)
    print('=='*40)

## EDA-3. Bivariate Analysis
- We are doing bivariate analysis between our target column 'price' and  other each column of dataset.

### 1. Bivariate Analysis between each Numerical column with 'price' column


In [None]:
# create function for bivariate analysis for numerical columns
def bivariate_num(col):
    """Plot the relationship between a numerical column and ``price``.

    Draws three panels on a 2x2 grid (the fourth is switched off): a scatter
    plot, a regression plot and a heatmap of the 2x2 correlation matrix.

    Parameters
    ----------
    col : str
        Name of a numerical column of the module-level ``data`` frame.
    """
    fig, ax = plt.subplots(2, 2, figsize=(12, 10))

    # Scatter Plot (the title now names the plotted column; it used to be a
    # hard-coded caption left over from a different dataset)
    sns.scatterplot(x=data[col], y=data['price'], ax=ax[0, 0])
    ax[0, 0].set_title(f'Scatter Plot: {col} vs Price')

    # Regression Plot
    sns.regplot(x=data[col], y=data['price'], ax=ax[0, 1])
    ax[0, 1].set_title(f'Regression Plot: {col} vs Price')

    # Heatmap of the pairwise correlation between the column and price
    heatmap_data = data[[col, 'price']].corr()
    sns.heatmap(heatmap_data, annot=True, cmap='coolwarm', ax=ax[1, 0])
    ax[1, 0].set_title(f'Correlation Heatmap: {col} vs Price')

    # Only three plots are drawn; hide the unused bottom-right axes
    ax[1, 1].axis('off')

    plt.tight_layout()
    plt.show()
    
    

In [None]:
# Display the numerical column names; the loop in the next cell skips the first entry.
num_cols

In [None]:
# Numbered bivariate report for every numerical column after the first entry of
# num_cols (presumably 'price', the target itself - confirm the column order).
for a, col in enumerate(num_cols[1:], start=1):
    print(f"{a}. Bivariate analysis between 'price' and '{col}' columns.")
    bivariate_num(col)
    print('=='*40)

### 2. Bivariate Analysis between each categorical column with 'price' column

In [None]:
# Helper for the bivariate analysis between a categorical column and 'price'
def bivariate_cat(col):
    """Plot ``price`` against the categories of ``col``.

    Draws three panels on a 2x2 grid (the fourth is switched off): a bar plot,
    a box plot and a point plot, each with ``col`` on the x axis and ``price``
    on the y axis.

    Parameters
    ----------
    col : str
        Name of a categorical column of the module-level ``data`` frame.
    """
    fig, ((bar_ax, box_ax), (point_ax, spare_ax)) = plt.subplots(2, 2, figsize=(10, 8))

    # Bar plot
    sns.barplot(x=col, y='price', data=data, ax=bar_ax)
    bar_ax.set_title(f'Average Price by {col}')

    # Box plot
    sns.boxplot(x=col, y='price', data=data, ax=box_ax)
    box_ax.set_title(f'Price Distribution by {col}')

    # Point plot
    sns.pointplot(x=col, y='price', data=data, ax=point_ax)
    point_ax.set_title(f'Average Price by {col}')

    # Only three plots are drawn; hide the unused bottom-right axes
    spare_ax.axis('off')

    plt.tight_layout()
    plt.show()

In [None]:
# Numbered bivariate report for every categorical column
for a, col in enumerate(cat_cols, start=1):
    print(f"{a}. Bivariate analysis between 'price' and '{col}' columns.")
    bivariate_cat(col)
    print('=='*40)

## Conclusion 4: based on bivariate analysis between 'price' and categorical columns
1. The average price of houses beside a main road is greater than that of houses that are not beside a main road.
2. The average price of houses with a guest room is greater than that of houses without one.
3. Similarly, houses with a basement, a hot water heating system, air conditioning, or a parking area have a greater average price.
4. The average price of houses in a preferred area is greater than that of houses in a non-preferred area.
5. The average price of furnished houses is greater than that of semi-furnished and unfurnished houses.


In [None]:
### Helper for outlier removal
def remove_outlier(data, column):
    """Return the rows of ``data`` whose ``column`` value lies within the 1.5*IQR fences.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter; it is not modified.
    column : str
        Name of the numeric column the fences are computed on.

    Returns
    -------
    pandas.DataFrame
        The rows with ``Q1 - 1.5*IQR <= value <= Q3 + 1.5*IQR`` (both ends
        inclusive), keeping their original index labels.
    """
    values = data[column]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    spread = q3 - q1

    low_fence = q1 - 1.5 * spread
    high_fence = q3 + 1.5 * spread

    # between() is inclusive on both ends by default, matching >= and <=
    inside = values.between(low_fence, high_fence)
    return data[inside]


In [None]:
# Work on an independent (deep) copy so the outlier removal leaves `data` untouched
data1 = data.copy(deep=True)

In [None]:
# Drop the rows whose 'area' lies outside the 1.5*IQR fences
data1 = remove_outlier(data=data1, column='area')

In [None]:
# Box plot of 'area' in the filtered frame, to inspect the result of the outlier removal.
sns.boxplot(x=data1['area'])

In [None]:
# Skewness of 'area' in the filtered frame.
data1['area'].skew()

In [None]:
# removing outliers from 'price' column
data1 = remove_outlier(data1, 'price')

In [None]:
# Box plot of 'price' in the filtered frame, to inspect the result of the outlier removal.
sns.boxplot(x=data1['price'])

## EDA-4. Multivariate Analysis

In [None]:
# Pairplot of `data`, i.e. the frame as loaded; the outlier-filtered copy is `data1`.
sns.pairplot(data)

In [None]:
# Pairwise correlation of the numeric columns of `data` (the frame as loaded, not `data1`)
data.corr(numeric_only=True)

In [None]:
# Annotated heatmap of the correlation matrix of the numeric columns
corr_matrix = data.corr(numeric_only=True)
fig, ax = plt.subplots(figsize=(9, 7))
sns.heatmap(corr_matrix, annot=True, ax=ax)
ax.set_title('Correlation between all numerical columns')
plt.show()

In [None]:
### Store the cleaned data to a csv file
# index=False keeps the DataFrame's row labels (which have gaps after the outlier
# removal) out of the file, so reloading it with pd.read_csv does not produce a
# spurious unnamed first column.
data1.to_csv('House_price_prediction_cleaned_data.csv', index=False)

# split data as dependent and independent columns

In [None]:
# Target vector, then the feature matrix made of every remaining column
y = data1['price']
X = data1.drop(columns='price')

In [None]:
# Hold out 25% of the rows for testing. The fixed seed makes the split - and every
# metric computed from it further down - reproducible across "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

print('Shape of X_train: ',X_train.shape)
print('Shape of X_test: ',X_test.shape)
print('Shape of y_train: ',y_train.shape)
print('Shape of y_test: ',y_test.shape)

# 5. Feature Encoding

## Encoding Categorical columns


In [None]:
# Display the categorical column names that are one-hot encoded in the next cell.
cat_cols

In [None]:
# One-hot encode the categorical features; drop='first' omits the first category
# of each feature.
ohe = OneHotEncoder(drop='first')

# Learn the categories from the training split only, then encode both splits as
# dense arrays.
ohe_train = ohe.fit_transform(X_train[cat_cols]).toarray()
ohe_test = ohe.transform(X_test[cat_cols]).toarray()

# Names of the generated dummy columns
encoded_column = ohe.get_feature_names_out(cat_cols)

# Wrap the encoded arrays in DataFrames. No index is passed, so both frames get a
# fresh 0..n-1 row index rather than the row labels of X_train / X_test.
X_train_ohe, X_test_ohe = (
    pd.DataFrame(encoded, columns=encoded_column) for encoded in (ohe_train, ohe_test)
)

In [None]:
# Inspect the one-hot encoded training features.
X_train_ohe

# 6. Feature Scaling

### 1. Standard Scaling

In [None]:
# Numeric predictors: every entry of num_cols after the first
numeric_features = num_cols[1:]

# Fit the scaler on the training split only, then apply it to both splits
std_scaler = StandardScaler().fit(X_train[numeric_features])
X_train_num = std_scaler.transform(X_train[numeric_features])
X_test_num = std_scaler.transform(X_test[numeric_features])

# Standard-scaled DataFrames (fresh 0..n-1 row index)
X_train_std = pd.DataFrame(X_train_num, columns=numeric_features)
X_test_std = pd.DataFrame(X_test_num, columns=numeric_features)

In [None]:
# Before applying the standard scaler
X_train[num_cols[1:]].describe().round(2)

In [None]:
# After applying the standard scaler ---> mean = 0 and std = 1
X_train_std.describe().round(2)

### 2. MinMax Scaler

In [None]:
# Numeric predictors: every entry of num_cols after the first
minmax_features = num_cols[1:]

# Fit the min-max scaler on the training split only, then apply it to both splits
minmax = MinMaxScaler().fit(X_train[minmax_features])
X_train_num1 = minmax.transform(X_train[minmax_features])
X_test_num1 = minmax.transform(X_test[minmax_features])

# Min-max scaled DataFrames (fresh 0..n-1 row index)
X_train_minmax = pd.DataFrame(X_train_num1, columns=minmax_features)
X_test_minmax = pd.DataFrame(X_test_num1, columns=minmax_features)

In [None]:
# After applying the min-max scaler ---> min = 0 and max = 1
X_train_minmax.describe().round(2)

# 7. Concat both encoded and scaled data


### 1. Encoded and Standard Scaled Data

In [None]:
# enocded and standard scaled data
X_train_ohe_std = pd.concat([X_train_ohe, X_train_std], axis=1)
X_test_ohe_std = pd.concat([X_test_ohe, X_test_std], axis=1)

X_train_ohe_std.head()

In [None]:
# Sanity check: per-column count of missing values in the concatenated training frame.
X_train_ohe_std.isnull().sum()

### 2. Encoded and minmax scaled data


In [None]:
# enocded and Minmax scaled data
X_train_ohe_minmax = pd.concat([X_train_ohe, X_train_minmax], axis=1)
X_test_ohe_minmax = pd.concat([X_test_ohe, X_test_minmax], axis=1)

X_train_ohe_minmax.head()

# 8. Feature Selection

In [None]:
from sklearn.feature_selection import mutual_info_regression

# Score every numeric predictor by its mutual information with the target.
mi_features = data1[num_cols].drop(columns='price')
mi_target = data1['price']

# The estimator is randomised, so a fixed seed keeps the scores - and therefore
# the ranking in the chart - identical from run to run.
mi_scores = pd.Series(
    mutual_info_regression(mi_features, mi_target, random_state=42),
    index=mi_features.columns,
)

# Horizontal bar chart, with the most informative feature at the top
mi_scores.sort_values(ascending=True).plot(kind='barh')

In [None]:
from sklearn.ensemble import RandomForestClassifier  # for classification tasks
from sklearn.ensemble import RandomForestRegressor  # for regression tasks
from sklearn.feature_selection import SelectFromModel

# `price` is a continuous target, so the feature importances must come from a
# regressor. A classifier would treat every distinct price as its own class.
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Keep the 8 most important features: threshold=-np.inf disables the importance
# cut-off, so max_features alone decides how many columns survive.
# SelectFromModel (prefit=False) fits its own clone of `model`, so no separate
# model.fit call is needed beforehand.
sfm = SelectFromModel(model, threshold=-np.inf, max_features=8)
sfm.fit(X_train_ohe_std, y_train)

X_train_selected = sfm.transform(X_train_ohe_std)
X_test_selected = sfm.transform(X_test_ohe_std)

# Both splits share the same selected columns, so the names are resolved once
selected_columns = sfm.get_feature_names_out(X_train_ohe_std.columns)
X_train_selected_df = pd.DataFrame(X_train_selected, columns=selected_columns)
X_test_selected_df = pd.DataFrame(X_test_selected, columns=selected_columns)

In [None]:
# Inspect the training features kept by the feature selection.
X_train_selected_df

In [None]:
# Inspect the test features kept by the feature selection.
X_test_selected_df

In [None]:
# Random Forest regression on the selected features
from sklearn.ensemble import RandomForestRegressor
# Evaluation
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score

# Fixed seed so the metrics reported below are reproducible from run to run
model = RandomForestRegressor(random_state=42)
model.fit(X_train_selected_df, y_train)
y_pred = model.predict(X_test_selected_df)

# mean square error
mse = mean_squared_error(y_test,y_pred)
print("mean squared error of our ml model is :" , mse, "\n")

# root mean square error
rmse = np.sqrt(mse)
print("Root mean square : ",rmse, '\n')

# mean absolute error
mae = mean_absolute_error(y_test, y_pred)
print("mean absolute error of our ml model is : ",mae,'\n' )

# r2  score
r2_value = r2_score(y_test, y_pred)
print("r2_score is : ", r2_value)

# 9. Training Machine Learning Models

In [None]:
# Baseline: linear regression on the one-hot encoded + standard-scaled features
from sklearn.linear_model import LinearRegression
# Evaluation
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score

model = LinearRegression()
model.fit(X_train_ohe_std, y_train)
y_pred = model.predict(X_test_ohe_std)

# Compute every metric on the held-out split first ...
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2_value = r2_score(y_test, y_pred)

# ... then report them in one place
print("mean squared error of our ml model is :" , mse, "\n")
print("Root mean square : ",rmse, '\n')
print("mean absolute error of our ml model is : ",mae,'\n' )
print("r2_score is : ", r2_value)

### lets try other regression machine learning models

In [None]:
# Candidate regressors to compare. The tree-based models involve randomness, so
# they get a fixed seed to keep the comparison tables reproducible.
lr = LinearRegression()
svr = SVR()
knn_r = KNeighborsRegressor()
dtr = DecisionTreeRegressor(random_state=42)
rfr = RandomForestRegressor(random_state=42)

# Display name -> estimator; the names become the 'Models' column of the
# performance tables.
regressors = {'Linear Regression':lr,
        'Support Vector Regressor':svr,
        'K-Nearest Neighbors Regressor':knn_r,
        'Decision Tree Regressor':dtr,
        'Random Forest Regressor':rfr,
        }

def train_model(model,X_train,y_train,X_test,y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
    X_train, y_train : training features and target
    X_test, y_test : held-out features and target

    Returns
    -------
    tuple
        ``(mse, r2)`` - the mean squared error and the R2 score of the
        predictions on the test split. ``model`` is fitted in place.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    scores = (mean_squared_error(y_test, predictions), r2_score(y_test, predictions))
    return scores

## 1. Performance of few ML model with Standard Scaled data

In [None]:
# Train and score every candidate on the encoded + standard-scaled features
std_results = [
    train_model(model, X_train_ohe_std, y_train, X_test_ohe_std, y_test)
    for model in regressors.values()
]
mse_scores = [mse_value for mse_value, _ in std_results]
rmse_scores = [np.sqrt(mse_value) for mse_value in mse_scores]
r2_scores = [r2_value_ for _, r2_value_ in std_results]

# One row per model, best R2 first
performance_with_standard_scaler = pd.DataFrame(
    {
        'Models': list(regressors),
        'Mean Squared Error': mse_scores,
        'Root mse': rmse_scores,
        'R2_Scores': r2_scores,
    }
).sort_values('R2_Scores', ascending=False)

performance_with_standard_scaler

## 2. Performance of few ML model with MinMax Scaled data

In [None]:
# Train and score every candidate on the encoded + min-max scaled features
minmax_results = [
    train_model(model, X_train_ohe_minmax, y_train, X_test_ohe_minmax, y_test)
    for model in regressors.values()
]
mse_scores = [mse_value for mse_value, _ in minmax_results]
rmse_scores = [np.sqrt(mse_value) for mse_value in mse_scores]
r2_scores = [r2_value_ for _, r2_value_ in minmax_results]

# One row per model, best R2 first
performance_with_minmax_scaler = pd.DataFrame(
    {
        'Models': list(regressors),
        'Mean Squared Error': mse_scores,
        'Root mse': rmse_scores,
        'R2_Scores': r2_scores,
    }
).sort_values('R2_Scores', ascending=False)

performance_with_minmax_scaler

# We need to retrain our models because the current performance is not good. Please comment with suggestions on what I should change in this notebook to improve model performance. Thank you for visiting this notebook.