# Import Required Libraries
Import the necessary libraries, including pandas, matplotlib, seaborn, and others.

In [None]:
# Import Required Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import boxcox

# Load Dataset
Load the dataset using pandas.

In [None]:
# Load Dataset
# Read the real-estate valuation workbook into the DataFrame used by every
# later cell.
df = pd.read_excel(io='/content/Real_estate_valuation_data_set.xlsx')

# Preview the top five rows (the default for head) as the cell output.
df.head(n=5)

# Correlation Heatmap
Create a heatmap to visualize the correlation between different features.

In [None]:
# Correlation Heatmap
# Pairwise correlation between every pair of columns in the dataset.
correlation_matrix = df.corr()

# Draw the matrix as an annotated heatmap so each coefficient is readable
# directly from its cell.
plt.figure(figsize=(12, 8))
sns.heatmap(
    data=correlation_matrix,
    annot=True,
    cmap='coolwarm',
    linewidths=0.5,
)
plt.gca().set_title('Correlation Heatmap')
plt.show()

# Pair Plot
Create a pair plot to visualize the relationships between pairs of features.

In [None]:
# Pair Plot
# sns.pairplot creates its own figure containing a grid of subplots, one per
# pair of columns in df.
sns.pairplot(df)

# plt.title() targets only the current axes, i.e. a single cell of the grid,
# so the heading is attached to the figure as a whole instead. y > 1 lifts
# the title just above the top row so it does not overlap the subplots.
plt.suptitle('Pair Plot of Features', y=1.02)
plt.show()

# Distribution of Target Variable
Plot a histogram of the target variable, with a kernel density estimate overlaid, to examine its shape and spread.

In [None]:
# Distribution of Target Variable
# Histogram of the target column in 30 bins with a KDE curve overlaid.
plt.figure(figsize=(10, 6))
sns.histplot(data=df, x='Y house price of unit area', bins=30, kde=True)

# Replace the automatic axis labels with readable ones and add a title.
plt.gca().set(
    title='Distribution of Target Variable: House Price of Unit Area',
    xlabel='House Price of Unit Area',
    ylabel='Frequency',
)
plt.show()

# Box Plots for Numerical Features
Create box plots for numerical features to visualize their distributions and identify outliers.

In [None]:
# Box Plots for Numerical Features
# One box plot per numeric column, to show each column's spread and outliers.
numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()

# Derive the grid height from the number of features instead of assuming at
# most nine: plt.subplot raises ValueError when the index exceeds rows * cols.
n_cols = 3
n_rows = max(1, -(-len(numerical_features) // n_cols))  # ceiling division

plt.figure(figsize=(15, 10))
for i, feature in enumerate(numerical_features):
    plt.subplot(n_rows, n_cols, i + 1)
    sns.boxplot(y=df[feature])
    plt.title(f'Box Plot of {feature}')

# Adjust spacing once, after every subplot exists, rather than recomputing
# the layout on each loop iteration.
plt.tight_layout()
plt.show()

# Scatter Plots for Feature Relationships
Create scatter plots to visualize relationships between the target variable and other features.

In [None]:
# Scatter Plots for Feature Relationships

# Define the target variable
target_variable = 'Y house price of unit area'

# Every column except the target is plotted against it.
# list.remove raises ValueError if the target column is missing from df.
features = df.columns.tolist()
features.remove(target_variable)

# Derive the grid height from the number of features instead of assuming at
# most nine: plt.subplot raises ValueError when the index exceeds rows * cols.
n_cols = 3
n_rows = max(1, -(-len(features) // n_cols))  # ceiling division

# Create scatter plots, one subplot per feature.
plt.figure(figsize=(20, 15))
for i, feature in enumerate(features):
    plt.subplot(n_rows, n_cols, i + 1)
    sns.scatterplot(x=df[feature], y=df[target_variable])
    plt.title(f'Scatter Plot: {feature} vs {target_variable}')
    plt.xlabel(feature)
    plt.ylabel(target_variable)

# Adjust spacing once, after every subplot exists, rather than recomputing
# the layout on each loop iteration.
plt.tight_layout()
plt.show()

# PCA Explained Variance Plot
Standardize the features, fit PCA, and plot the proportion of variance explained by each principal component.

In [None]:
# PCA Explained Variance Plot

# Scale every column except the target with StandardScaler before PCA.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df.drop('Y house price of unit area', axis=1))

# Fit PCA with its default settings; fit() returns the fitted estimator.
pca = PCA().fit(scaled_data)

# Fraction of the total variance captured by each component.
explained_variance = pca.explained_variance_ratio_

# Line plot of the ratios, with components numbered from 1 on the x-axis.
plt.figure(figsize=(10, 6))
plt.plot(
    range(1, len(explained_variance) + 1),
    explained_variance,
    marker='o',
    linestyle='--',
)
plt.gca().set(
    title='Explained Variance by Each Principal Component',
    xlabel='Principal Component',
    ylabel='Explained Variance Ratio',
)
plt.xticks(range(1, len(explained_variance) + 1))
plt.grid(True)
plt.show()