In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import  train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import boxcox



# Read the dataset file
df = pd.read_excel('Real estate valuation data set.xlsx')

# Perform analysis on the dataset
# ...

In [None]:
df.head()

In [None]:
df.info()

In [None]:
print(df.isnull().sum())

In [None]:
print(df.describe())

In [None]:
df.hist(bins=30, figsize=(12,8))
plt.show()

In [None]:
plt.figure(figsize=(12,6))
sns.boxplot(data=df, width=0.5)
plt.xticks(rotation=45)
plt.show()

In [None]:
correlation_matrix = df.corr()

plt.figure(figsize=(10,8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm',fmt='.2f')
plt.title("Correlation matrix")
plt.show()

In [None]:
target_column = 'Y house price of unit area'

features = ["X2 house age", "X3 distance to the nearest MRT station", "X4 number of convenience stores"]
for feature in features:
    plt.figure(figsize=(10,6))
    sns.scatterplot(data=df, x=feature, y=target_column)
    plt.title(f"{feature} vs {target_column}")
    plt.show()

In [None]:
df.to_csv('real_estate.csv', index=False)

In [None]:
df = df.drop(columns=['No', 'X1 transaction date'])

In [None]:
X = df.drop(columns=['Y house price of unit area'])
y = df['Y house price of unit area']

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Wrap the scaled arrays back into DataFrames with the original column names.
# The row index of the split is passed through explicitly: without `index=`,
# pandas assigns a fresh 0..n-1 index and the frames stop lining up with
# y_train / y_test, which keep the index produced by the split.
X_train = pd.DataFrame(X_train_scaled, columns=X.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X.columns, index=X_test.index)

In [None]:
# Histplot of X3 distance to the nearest MRT station
import seaborn as sns
import matplotlib.pyplot as plt

sns.histplot(df["X3 distance to the nearest MRT station"], kde=True, bins=30)
plt.title("Distribution of X3 Distance to MRT")
plt.show()


In [None]:


df["X3_log"] = np.log1p(df["X3 distance to the nearest MRT station"])  # log(1+x) to handle zero values


In [None]:


sns.histplot(df["X3_log"], kde=True, bins=30)
plt.title("Log-Transformed Distribution of X3 Distance to MRT")
plt.show()


In [None]:
df[['X3_log', 'Y house price of unit area']].corr()


In [None]:
# The strong negative correlation between X3_log and Y house price of unit area indicates that the price rises as the distance to the nearest MRT station falls.

In [None]:
# Log-distance against price: translucent points plus a red fitted line.
plt.figure(figsize=(8, 6))
sns.regplot(
    x=df["X3_log"],
    y=df["Y house price of unit area"],
    line_kws=dict(color="red"),
    scatter_kws=dict(alpha=0.5),
)

# Axis labels and title (the ylabel call stays last, so its return value is the cell output).
plt.xlabel("Log-Transformed Distance to MRT (X3_log)")
plt.title("Relationship between Log-Transformed Distance to MRT and House Price")
plt.ylabel("House Price per Unit Area (Y)")


In [None]:
# Raw (untransformed) distance against price, with a fitted regression line,
# for comparison with the log-transformed version of the same plot.
plt.figure(figsize=(8, 6))
sns.regplot(x=df["X3 distance to the nearest MRT station"], y=df["Y house price of unit area"], scatter_kws={"alpha": 0.5}, line_kws={"color": "red"})

# Titles and labels
plt.title("Relationship between X3 distance to the nearest MRT station and House Price")
# The x axis shows the raw distance, so the label must not mention X3_log.
plt.xlabel("X3 Distance to MRT")
plt.ylabel("House Price per Unit Area (Y)")


In [None]:
# Log-distance against price with a fitted line.
# NOTE(review): this cell draws the same figure as the earlier X3_log regplot cell.
plt.figure(figsize=(8, 6))
sns.regplot(
    x=df["X3_log"],
    y=df["Y house price of unit area"],
    line_kws=dict(color="red"),
    scatter_kws=dict(alpha=0.5),
)

# Axis labels and title (the ylabel call stays last, so its return value is the cell output).
plt.xlabel("Log-Transformed Distance to MRT (X3_log)")
plt.title("Relationship between Log-Transformed Distance to MRT and House Price")
plt.ylabel("House Price per Unit Area (Y)")


In [None]:


# Features whose distribution and outliers are inspected
features = ['X2 house age', 'X4 number of convenience stores']

# For each feature draw a histogram (left) and a boxplot (right) on one figure.
# The columns are read from `df`, the notebook's working frame; the name `data`
# used previously is not defined anywhere in the notebook and raised NameError.
for feature in features:
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))

    # Histogram with KDE
    sns.histplot(df[feature], kde=True, bins=30, ax=axes[0])
    axes[0].set_title(f"Distribution of {feature}")

    # Boxplot
    sns.boxplot(x=df[feature], ax=axes[1])
    axes[1].set_title(f"Boxplot of {feature}")

    plt.show()


In [None]:
df[['X4 number of convenience stores', 'Y house price of unit area']].corr()


In [None]:
# Scatter plot with regression line
plt.figure(figsize=(8, 6))
sns.regplot(x=df["X4 number of convenience stores"], y=df["Y house price of unit area"], scatter_kws={"alpha": 0.5}, line_kws={"color": "red"})

# Titles and labels
plt.title("Relationship between X4 number of convenience stores and House Price")
plt.xlabel("X4 number of convenience stores")
plt.ylabel("House Price per Unit Area (Y)")


In [None]:
# Multicollineaity check
# Compute correlation matrix
corr_matrix = X.corr()

# Plot the heatmap
plt.figure(figsize=(8,6))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", linewidths=0.5)
plt.title("Feature Correlation Matrix")
plt.show()


In [None]:


# Create a dataframe for VIF
vif_data = pd.DataFrame()
vif_data["Feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

print(vif_data)


In [None]:
X_location = df[['X5 latitude', 'X6 longitude']]

In [None]:
pca = PCA(n_components=1)  # Reduce to 1 component
df['PCA_Location'] = pca.fit_transform(X_location)

In [None]:
df = df.drop(['X5 latitude', 'X6 longitude'], axis=1)


In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

X_new = df.drop(columns=['Y house price of unit area'])  # Exclude target variable

vif_data = pd.DataFrame()
vif_data["Feature"] = X_new.columns
vif_data["VIF"] = [variance_inflation_factor(X_new.values, i) for i in range(X_new.shape[1])]

print(vif_data)


In [None]:
# Split the decimal-year transaction date into year, month and quarter columns.
# NOTE(review): an earlier cell runs df.drop(columns=['No', 'X1 transaction date']),
# so on a fresh top-to-bottom run this column lookup would presumably raise
# KeyError unless df was reloaded in between — confirm the intended execution order.
df['X1_year'] = df['X1 transaction date'].astype(int)  # Integer part of the value, taken as the year
df['X1_fraction'] = df['X1 transaction date'] - df['X1_year']  # Remaining fractional part of the year

# Convert fraction to months and assign quarters
df['X1_month'] = (df['X1_fraction'] * 12).round().astype(int)  # Fraction scaled to 12 and rounded to a whole number
# NOTE(review): a rounded month of 0 (fraction below roughly 1/24) gives quarter 0,
# because (0 - 1) // 3 + 1 == 0 — check whether such dates occur in the data.
df['X1_quarter'] = ((df['X1_month'] - 1) // 3 + 1)  # Months 1-3 -> 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4

# Mutates df in place: the source column and the helper fraction are removed.
df.drop(columns=['X1 transaction date', 'X1_fraction'], inplace=True)  # Drop unnecessary columns


In [None]:
df.info()

In [None]:
X_new = df.drop(columns=['Y house price of unit area'])  # Exclude target variable

vif_data = pd.DataFrame()
vif_data["Feature"] = X_new.columns
vif_data["VIF"] = [variance_inflation_factor(X_new.values, i) for i in range(X_new.shape[1])]

print(vif_data)


In [None]:
df.drop(columns=['X1_month', 'X1_quarter'], inplace=True)


In [None]:
X_new = df.drop(columns=['Y house price of unit area'])  # Exclude target variable

vif_data = pd.DataFrame()
vif_data["Feature"] = X_new.columns
vif_data["VIF"] = [variance_inflation_factor(X_new.values, i) for i in range(X_new.shape[1])]

print(vif_data)

In [None]:
df.info()

In [None]:

# Select latitude & longitude columns
location_features = df[['X5 latitude', 'X6 longitude']]

# Perform PCA (reduce to 1 component)
pca = PCA(n_components=1)
df['PCA_Location'] = pca.fit_transform(location_features)


In [None]:
df.info()

In [None]:
df['X1_year'] = df['X1 transaction date'].astype(str).str[:4].astype(int)


In [None]:
df.info()

In [None]:

# Linear regression that includes the transaction year, scored on a hold-out set.
# Define features and target
X = df[['X2 house age', 'X3 distance to the nearest MRT station', 'X4 number of convenience stores', 'PCA_Location', 'X1_year']]
y = df['Y house price of unit area']

# Split dataset (80/20, fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Performance Metrics
r2 = r2_score(y_test, y_pred)
# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed,
# so relying on it fails on current releases.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"With X1_year -> R²: {r2:.4f}, RMSE: {rmse:.4f}")


In [None]:
# Baseline linear regression without the transaction year, for comparison
# with the model that includes X1_year.
# Define features excluding X1_year
X_baseline = df[['X2 house age', 'X3 distance to the nearest MRT station', 'X4 number of convenience stores', 'PCA_Location']]
y = df['Y house price of unit area']

# Split dataset (same test_size and seed as the X1_year model)
# Note: this overwrites X_train / X_test / y_train / y_test from the previous cell.
X_train, X_test, y_train, y_test = train_test_split(X_baseline, y, test_size=0.2, random_state=42)

# Train model
model_baseline = LinearRegression()
model_baseline.fit(X_train, y_train)

# Evaluate
y_pred_baseline = model_baseline.predict(X_test)
r2_baseline = r2_score(y_test, y_pred_baseline)
# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed,
# so relying on it fails on current releases.
rmse_baseline = np.sqrt(mean_squared_error(y_test, y_pred_baseline))

print(f"Without X1_year -> R²: {r2_baseline:.4f}, RMSE: {rmse_baseline:.4f}")


In [None]:
feature_names = X.columns
coefficients = model.coef_
for name, coef in zip(feature_names, coefficients):
    print(f"{name}: {coef:.4f}")


In [None]:

residuals = y_test - y_pred
sns.histplot(residuals, kde=True)
plt.xlabel("Residuals")
plt.title("Residuals Distribution")
plt.show()


In [None]:

# Select numerical columns for visualization
selected_features = ['X2 house age', 'X3 distance to the nearest MRT station', 
                     'X4 number of convenience stores', 'PCA_Location', 'X1_year', 'Y house price of unit area']

# Pairplot to see feature-target relationships
sns.pairplot(df[selected_features])
plt.show()


In [None]:


# Compute correlation matrix
corr_matrix = df[selected_features].corr()

# Plot heatmap
plt.figure(figsize=(8,6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Feature-Target Correlation Heatmap")
plt.show()


In [None]:
df.info()


In [None]:
df["X3_log"] = np.log1p(df["X3 distance to the nearest MRT station"])
df.to_csv("modified_data.csv", index=False)  # Save to verify changes persist


In [None]:
df = pd.read_csv("modified_data.csv")
df.info()


In [None]:
# Select numerical columns for visualization
selected_features = ['X2 house age', 'X3 distance to the nearest MRT station', 
                     'X4 number of convenience stores', 'PCA_Location', 'X1_year', 'X3_log','Y house price of unit area']

# Pairplot to see feature-target relationships
sns.pairplot(df[selected_features])
plt.show()


In [None]:
# Compute correlation matrix
corr_matrix = df[selected_features].corr()

# Plot heatmap
plt.figure(figsize=(8,6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Feature-Target Correlation Heatmap")
plt.show()


In [None]:


# Selecting relevant numerical predictors
X = df[["X2 house age", "X3 distance to the nearest MRT station", "X4 number of convenience stores", 
        "PCA_Location", "X1_year", "X3_log"]]

# Compute VIF for each feature
vif_data = pd.DataFrame()
vif_data["Feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]

print(vif_data)


In [None]:

# Drop X3 distance to the MRT station
df_vif = df.drop(columns=["X3 distance to the nearest MRT station"])


# Recalculate VIF
vif_data = pd.DataFrame()
vif_data["Feature"] = df_vif.columns
vif_data["VIF"] = [variance_inflation_factor(df_vif.values, i) for i in range(len(df_vif.columns))]

print(vif_data)


In [None]:
# Drop X3 distance to the MRT station
df_vif = df.drop(columns=["X5 latitude", "X6 longitude","X1_year","X1 transaction date","X3_log"])

# Recalculate VIF
vif_data = pd.DataFrame()
vif_data["Feature"] = df_vif.columns
vif_data["VIF"] = [variance_inflation_factor(df_vif.values, i) for i in range(len(df_vif.columns))]

print(vif_data)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Select relevant features for pairplot
selected_features = ["X2 house age", "X3 distance to the nearest MRT station", 
                     "X4 number of convenience stores", "PCA_Location", 
                     "Y house price of unit area"]

# Create pairplot
sns.pairplot(df[selected_features], diag_kind='kde')
plt.show()


In [None]:
df['PCA_Location'] = pca.transform(df[['X5 latitude', 'X6 longitude']])


In [None]:

from scipy.stats import boxcox

# Assuming your dataset is named 'df' and 'X3' is the original variable

# Apply different transformations
df['X3_sqrt'] = np.sqrt(df['X3 distance to the nearest MRT station'])   # Square Root Transformation
df['X3_inv'] = 1 / (df['X3 distance to the nearest MRT station'] + 1)   # Inverse Transformation (Avoid division by zero)

# Box-Cox transformation (only if X3 is strictly positive)
df['X3_boxcox'], _ = boxcox(df['X3 distance to the nearest MRT station'] + 1)

# Select features for VIF calculation (update according to your dataset)
features = ['X2 house age', 'X3_sqrt', 'X3_inv', 'X3_boxcox', 'X4 number of convenience stores', 'PCA_Location', 'Y house price of unit area']

# Create a new DataFrame with selected features
df_vif = df[features]

# Compute VIF
vif_data = pd.DataFrame()
vif_data["Feature"] = df_vif.columns
vif_data["VIF"] = [variance_inflation_factor(df_vif.values, i) for i in range(df_vif.shape[1])]

# Display the VIF values
print(vif_data)


In [None]:
df.columns

In [None]:
df_vif = df.drop(columns=["X3_boxcox"])

# Recalculate VIF
vif_data = pd.DataFrame()
vif_data["Feature"] = df_vif.columns
vif_data["VIF"] = [variance_inflation_factor(df_vif.values, i) for i in range(len(df_vif.columns))]

print(vif_data)

In [None]:
from scipy.stats import boxcox

# Assuming your dataset is named 'df' and 'X3' is the original variable

# Apply different transformations
df['X3_sqrt'] = np.sqrt(df['X3 distance to the nearest MRT station'])   # Square Root Transformation
df['X3_inv'] = 1 / (df['X3 distance to the nearest MRT station'] + 1)   # Inverse Transformation (Avoid division by zero)

# Box-Cox transformation (only if X3 is strictly positive)
df['X3_boxcox'], _ = boxcox(df['X3 distance to the nearest MRT station'] + 1)

# Select features for VIF calculation (update according to your dataset)
features = ['X2 house age', 'X3_sqrt', 'X3_inv', 'X4 number of convenience stores', 'PCA_Location', 'Y house price of unit area']

# Create a new DataFrame with selected features
df_vif = df[features]

# Compute VIF
vif_data = pd.DataFrame()
vif_data["Feature"] = df_vif.columns
vif_data["VIF"] = [variance_inflation_factor(df_vif.values, i) for i in range(df_vif.shape[1])]

# Display the VIF values
print(vif_data)


In [None]:
# Compute correlation matrix
corr_matrix = df[features].corr()

# Plot heatmap
plt.figure(figsize=(8,6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Feature-Target Correlation Heatmap")
plt.show()


In [None]:
df_refined = df_vif.drop(columns=['X3_sqrt'])


In [None]:
df_refined.info()

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Define the features for VIF calculation
features = ['X2 house age', 'X3_inv', 'X4 number of convenience stores', 'PCA_Location', 'Y house price of unit area']
df_vif = df_refined[features]

# Compute VIF
vif_data = pd.DataFrame()
vif_data['Feature'] = df_vif.columns
vif_data['VIF'] = [variance_inflation_factor(df_vif.values, i) for i in range(df_vif.shape[1])]

print(vif_data)
