In [134]:
library(tidyverse)
library(janitor)
library(leaps)

# STAT 306 Group C3 Project

# The Data & Goal Analysis

The data being explored is the [Real Estate Valuation Data Set](https://archive.ics.uci.edu/dataset/477/real+estate+valuation+data+set) that explores real-estate prices in Sindian Dist., New Taipei City, Taiwan. This data consists of n=414 observations that contain various numerical and time-related features. This data was obtained through UC Irvine Machine Learning Repository.

The variables we have are: 
- X1 =  The transaction date in numerical units. For instance, 2013.250 equals March 2013, where the month is depicted as a fraction of the year (3/12 = 0.250) 
- X2 = The house age in years
- X3 = Distance to the nearest MRT station in metres (MRTs are metro transit systems) 
- X4 = Number of convenience stores in the living circle on foot by count (integer)
- X5 = Latitude in degree
- X6 = Longitude in degree

**The primary objective of this analysis is to determine how the real estate price is influenced by various factors such as house age, proximity to transportation (MRT), convenience store availability, and geographical location.**

# Reading In the Data

In [135]:
real_estate_data <- clean_names(real_estate_data)
head(real_estate_data)

no,x1_transaction_date,x2_house_age,x3_distance_to_the_nearest_mrt_station,x4_number_of_convenience_stores,x5_latitude,x6_longitude,y_house_price_of_unit_area
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,2012.917,32.0,84.87882,10,24.98298,121.5402,37.9
2,2012.917,19.5,306.5947,9,24.98034,121.5395,42.2
3,2013.583,13.3,561.9845,5,24.98746,121.5439,47.3
4,2013.5,13.3,561.9845,5,24.98746,121.5439,54.8
5,2012.833,5.0,390.5684,5,24.97937,121.5425,43.1
6,2012.667,7.1,2175.03,3,24.96305,121.5125,32.1


# Feature Engineering & Model Development

In [147]:
df <- real_estate_data

# Time based features
df <- df %>%
  mutate(year = as.integer(df$x1_transaction_date)) %>%
  mutate(month = as.integer(round((x1_transaction_date - year) * 12), 1))

# Create location-based features
df$neighborhood <- kmeans(df[, c('x5_latitude', 'x6_longitude')], centers = 3)$cluster # Adjust centers as needed

# Proximity-based features
df$distance_to_mrt_category <- cut(df$x3_distance_to_the_nearest_mrt_station,
                            breaks = c(-Inf, 250, 500, 750, 1000, Inf),
                            labels = c("under_250m", "250m_500m", "500m_750m", "750m_1000m", "over_1000m"))

model_df <- select(df, -c(no))
model <- lm(y_house_price_of_unit_area ~ ., data = model_df)
summary(model)


Call:
lm(formula = y_house_price_of_unit_area ~ ., data = model_df)

Residuals:
    Min      1Q  Median      3Q     Max 
-34.949  -3.985  -0.520   2.874  70.946 

Coefficients:
                                         Estimate Std. Error t value Pr(>|t|)
(Intercept)                            -3.659e+04  7.557e+03  -4.842 1.84e-06
x1_transaction_date                    -2.857e+03  1.471e+03  -1.943 0.052757
x2_house_age                           -2.689e-01  3.579e-02  -7.513 3.83e-13
x3_distance_to_the_nearest_mrt_station -5.308e-04  7.652e-04  -0.694 0.488254
x4_number_of_convenience_stores         3.363e-01  1.931e-01   1.742 0.082344
x5_latitude                             2.082e+02  4.654e+01   4.474 1.00e-05
x6_longitude                            1.613e+02  5.810e+01   2.776 0.005768
year                                    2.863e+03  1.470e+03   1.947 0.052263
month                                   2.386e+02  1.226e+02   1.947 0.052260
neighborhood                            3.

## Model Selection

In [148]:
s <- regsubsets(y_house_price_of_unit_area ~., data = model_df, method = "exhaustive", nvmax = 100)
ss <- summary(s)

rss <- ss$rss
adjr2 <- ss$adjr2
bic <- ss$bic

variables <- data.frame((ss$which))
variables$model <- rownames(variables)
rownames(variables) <- 1:nrow(variables)

rss <- ss$rss
adjr2 <- ss$adjr2
bic <- ss$bic

output <- cbind(variables, RSS=rss, AdjR2=adjr2, BIC=bic)
output

Unnamed: 0_level_0,X.Intercept.,x1_transaction_date,x2_house_age,x3_distance_to_the_nearest_mrt_station,x4_number_of_convenience_stores,x5_latitude,x6_longitude,year,month,neighborhood,distance_to_mrt_category250m_500m,distance_to_mrt_category500m_750m,distance_to_mrt_category750m_1000m,distance_to_mrt_categoryover_1000m,model,RSS,AdjR2,BIC
Unnamed: 0_level_1,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<chr>,<dbl>,<dbl>,<dbl>
1,True,False,False,False,False,False,False,False,False,False,False,False,False,True,1,39574.66,0.4811666,-260.6073
2,True,False,True,False,False,False,False,False,False,False,False,False,False,True,2,34638.91,0.5447706,-309.731
3,True,False,True,False,False,True,False,False,False,False,False,False,False,True,3,30909.09,0.5927977,-350.871
4,True,False,True,False,False,True,False,False,False,False,False,False,True,True,4,28862.63,0.6188284,-373.2052
5,True,True,True,False,False,True,False,False,False,False,False,False,True,True,5,27938.95,0.6301225,-380.645
6,True,True,True,False,True,True,False,False,False,False,False,False,True,True,6,27223.58,0.6387077,-385.3577
7,True,True,True,False,False,True,False,False,False,False,True,True,True,True,7,26379.54,0.6490469,-392.3707
8,True,True,True,False,False,True,True,False,False,False,True,True,True,True,8,26090.41,0.6520364,-390.9074
9,True,True,True,False,False,True,True,False,False,True,True,True,True,True,9,25438.99,0.6598845,-395.3495
10,True,True,True,False,True,True,True,False,False,True,True,True,True,True,10,25285.77,0.6610942,-391.8247


In [149]:
best_adjr2 <- which.max(output$AdjR2) 
print(best_adjr2)
output %>% slice(best_adjr2)

[1] 12


X.Intercept.,x1_transaction_date,x2_house_age,x3_distance_to_the_nearest_mrt_station,x4_number_of_convenience_stores,x5_latitude,x6_longitude,year,month,neighborhood,distance_to_mrt_category250m_500m,distance_to_mrt_category500m_750m,distance_to_mrt_category750m_1000m,distance_to_mrt_categoryover_1000m,model,RSS,AdjR2,BIC
<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<chr>,<dbl>,<dbl>,<dbl>
True,True,True,False,True,True,True,True,True,True,True,True,True,True,12,25048.42,0.6626009,-383.6774


## Interpretation Of The Results