In [None]:
# This R environment comes with many helpful analytics packages installed
# It is defined by the kaggle/rstats Docker image: https://github.com/kaggle/docker-rstats
# For example, here's a helpful package to load

library(tidyverse) # metapackage of all tidyverse packages

# The read-only "../input/" directory holds the input data files.
# Running this cell (click run or press Shift+Enter) lists its contents.

dir("../input")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Load the competition data.
# stringsAsFactors = TRUE is set explicitly because the preprocessing cells
# below operate on factor levels (e.g. levels(train$Alley)). Since R 4.0.0,
# read.csv() otherwise returns text columns as character vectors, for which
# levels() is NULL.
train <- read.csv(
  "../input/house-prices-advanced-regression-techniques/train.csv",
  stringsAsFactors = TRUE
)
test <- read.csv(
  "../input/house-prices-advanced-regression-techniques/test.csv",
  stringsAsFactors = TRUE
)

In [3]:
# Data Exploration:
# Size of the training and test data, each reported as
# (number of instances, number of features).
c(nrow(train), ncol(train))
c(nrow(test), ncol(test))

In [4]:
# Names of the features (columns) in the training data.
colnames(train)

In [None]:
# Names of the features (columns) in the test data.
colnames(test)

In [None]:
# Compact overview of the training data: column types and first values.
str(object = train)

In [None]:
# Compact overview of the test data: column types and first values.
str(object = test)

In [None]:
# Show the first 6 rows of the training data.
head(train, n = 6L)

In [None]:
# Draw 10 distinct row positions of the training data at random
# (no seed is set, so the selection differs between runs).
idx <- sample(seq_len(nrow(train)), size = 10)
idx

In [None]:
# Display the randomly selected training rows (all columns).
train[idx, , drop = FALSE]

In [None]:
# Remove Id from the training and test data since it is of no use (currently disabled).
# train$Id = NULL
# train$SalePrice = NULL
# test$Id = NULL

In [None]:
# Per-feature distribution summary of the training data.
summary(object = train)

In [None]:
# Count the missing values (NA) in every feature of the training data.
colSums(is.na(train))

In [None]:
# Report how many training rows are exact repeats of an earlier row.
cat("The number of duplicated rows are", sum(duplicated(train)))

In [None]:
# Exploring the target feature "SalePrice".
# Summary statistics of "SalePrice".
summary(train[["SalePrice"]])

In [None]:
# Quartiles (0%, 25%, 50%, 75%, 100%) of the target feature "SalePrice".
quantile(train[["SalePrice"]], probs = seq(0, 1, 0.25))

In [None]:
# Sample variance of the target feature "SalePrice".
var(train[["SalePrice"]])

In [None]:
# Number of missing values in the target feature "SalePrice".
sum(is.na(train[["SalePrice"]]))

In [None]:
# Histogram of the target feature "SalePrice".
# The plotted expression is kept as train$SalePrice so the default title and
# axis label stay the same.
hist(x = train$SalePrice)

In [None]:
# Kernel density estimate of the target feature "SalePrice", then its plot.
sale_price_density <- density(train$SalePrice)
plot(sale_price_density)

In [None]:
# Create a new training dataset without missing data.
# new_Train <- na.omit(train)
# dim(new_Train)

In [None]:
# Data Preprocessing:
# "MSSubClass" is stored as an integer code; treat it as a categorical
# feature in both data sets and show the resulting level counts.
train[["MSSubClass"]] <- as.factor(train[["MSSubClass"]])
test[["MSSubClass"]] <- as.factor(test[["MSSubClass"]])
summary(train[["MSSubClass"]])
summary(test[["MSSubClass"]])

In [None]:
# Plot "SalePrice" against "MSSubClass" in the training data.
plot(x = train$MSSubClass, y = train$SalePrice)

In [None]:
# "OverallQual" is stored as an integer code; treat it as a categorical
# feature in both data sets and show the resulting level counts.
train[["OverallQual"]] <- as.factor(train[["OverallQual"]])
test[["OverallQual"]] <- as.factor(test[["OverallQual"]])
summary(train[["OverallQual"]])
summary(test[["OverallQual"]])

In [None]:
# "OverallCond" is stored as an integer code; treat it as a categorical
# feature in both data sets and show the resulting level counts.
train[["OverallCond"]] <- as.factor(train[["OverallCond"]])
test[["OverallCond"]] <- as.factor(test[["OverallCond"]])
summary(train[["OverallCond"]])
summary(test[["OverallCond"]])

In [None]:
# Impute missing "LotFrontage" values with the mean of the observed values.
# Training and test data are each filled with their own column mean.
train$LotFrontage[is.na(train$LotFrontage)] <-
  mean(train$LotFrontage, na.rm = TRUE)
summary(train$LotFrontage)
test$LotFrontage[is.na(test$LotFrontage)] <-
  mean(test$LotFrontage, na.rm = TRUE)
summary(test$LotFrontage)

In [None]:
# Replace missing "Alley" values with an explicit "No_alley_access" level.
# levels(factor(...)) yields the observed categories whether the column is a
# factor or a character vector; union() avoids a duplicated level when the
# cell is run more than once.
alley_levels <- union(levels(factor(train$Alley)), "No_alley_access")
train$Alley <- factor(train$Alley, levels = alley_levels)
train$Alley[is.na(train$Alley)] <- "No_alley_access"
summary(train$Alley)

# Test data: same treatment, based on the categories observed in test.
alley_levels <- union(levels(factor(test$Alley)), "No_alley_access")
test$Alley <- factor(test$Alley, levels = alley_levels)
test$Alley[is.na(test$Alley)] <- "No_alley_access"
summary(test$Alley)

In [None]:
# Replace missing "MasVnrType" values with the level "CBlock".
# levels(factor(...)) yields the observed categories whether the column is a
# factor or a character vector; union() avoids a duplicated level when the
# cell is run more than once. The helper variable is no longer called
# `levels`, so it does not shadow base::levels().
mas_vnr_levels <- union(levels(factor(train$MasVnrType)), "CBlock")
train$MasVnrType <- factor(train$MasVnrType, levels = mas_vnr_levels)
train$MasVnrType[is.na(train$MasVnrType)] <- "CBlock"
summary(train$MasVnrType)

# Test data: same treatment, based on the categories observed in test.
mas_vnr_levels <- union(levels(factor(test$MasVnrType)), "CBlock")
test$MasVnrType <- factor(test$MasVnrType, levels = mas_vnr_levels)
test$MasVnrType[is.na(test$MasVnrType)] <- "CBlock"
summary(test$MasVnrType)

In [None]:
# Impute missing "MasVnrArea" values with the mean of the observed values.
# Training and test data are each filled with their own column mean.
train$MasVnrArea[is.na(train$MasVnrArea)] <-
  mean(train$MasVnrArea, na.rm = TRUE)
summary(train$MasVnrArea)
test$MasVnrArea[is.na(test$MasVnrArea)] <-
  mean(test$MasVnrArea, na.rm = TRUE)
summary(test$MasVnrArea)

In [None]:
# Replace missing "BsmtQual" values with an explicit "No_Basement" level.
# levels(factor(...)) yields the observed categories whether the column is a
# factor or a character vector; union() avoids a duplicated level when the
# cell is run more than once. The helper variable is no longer called
# `levels`, so it does not shadow base::levels().
bsmt_qual_levels <- union(levels(factor(train$BsmtQual)), "No_Basement")
train$BsmtQual <- factor(train$BsmtQual, levels = bsmt_qual_levels)
train$BsmtQual[is.na(train$BsmtQual)] <- "No_Basement"
summary(train$BsmtQual)

# Test data: same treatment, based on the categories observed in test.
bsmt_qual_levels <- union(levels(factor(test$BsmtQual)), "No_Basement")
test$BsmtQual <- factor(test$BsmtQual, levels = bsmt_qual_levels)
test$BsmtQual[is.na(test$BsmtQual)] <- "No_Basement"
summary(test$BsmtQual)

In [None]:
# Replace missing "BsmtCond" values with an explicit "No_Basement" level.
# levels(factor(...)) yields the observed categories whether the column is a
# factor or a character vector; union() avoids a duplicated level when the
# cell is run more than once. The helper variable is no longer called
# `levels`, so it does not shadow base::levels().
bsmt_cond_levels <- union(levels(factor(train$BsmtCond)), "No_Basement")
train$BsmtCond <- factor(train$BsmtCond, levels = bsmt_cond_levels)
train$BsmtCond[is.na(train$BsmtCond)] <- "No_Basement"
summary(train$BsmtCond)

# Test data: same treatment, based on the categories observed in test.
bsmt_cond_levels <- union(levels(factor(test$BsmtCond)), "No_Basement")
test$BsmtCond <- factor(test$BsmtCond, levels = bsmt_cond_levels)
test$BsmtCond[is.na(test$BsmtCond)] <- "No_Basement"
# Summarise the column that was just imputed (previously showed BsmtQual).
summary(test$BsmtCond)

In [None]:
# Replace missing "BsmtExposure" values with an explicit "No_Basement" level.
# levels(factor(...)) yields the observed categories whether the column is a
# factor or a character vector; union() avoids a duplicated level when the
# cell is run more than once. The helper variable is no longer called
# `levels`, so it does not shadow base::levels().
bsmt_exposure_levels <- union(
  levels(factor(train$BsmtExposure)),
  "No_Basement"
)
train$BsmtExposure <- factor(train$BsmtExposure, levels = bsmt_exposure_levels)
train$BsmtExposure[is.na(train$BsmtExposure)] <- "No_Basement"
summary(train$BsmtExposure)

# Test data: same treatment, based on the categories observed in test.
bsmt_exposure_levels <- union(
  levels(factor(test$BsmtExposure)),
  "No_Basement"
)
test$BsmtExposure <- factor(test$BsmtExposure, levels = bsmt_exposure_levels)
test$BsmtExposure[is.na(test$BsmtExposure)] <- "No_Basement"
summary(test$BsmtExposure)

In [None]:
# Replace missing "BsmtFinType1" values with an explicit "No_Basement" level.
# levels(factor(...)) yields the observed categories whether the column is a
# factor or a character vector; union() avoids a duplicated level when the
# cell is run more than once. The helper variable is no longer called
# `levels`, so it does not shadow base::levels().
bsmt_fin1_levels <- union(levels(factor(train$BsmtFinType1)), "No_Basement")
train$BsmtFinType1 <- factor(train$BsmtFinType1, levels = bsmt_fin1_levels)
train$BsmtFinType1[is.na(train$BsmtFinType1)] <- "No_Basement"
summary(train$BsmtFinType1)

# Test data: same treatment, based on the categories observed in test.
bsmt_fin1_levels <- union(levels(factor(test$BsmtFinType1)), "No_Basement")
test$BsmtFinType1 <- factor(test$BsmtFinType1, levels = bsmt_fin1_levels)
test$BsmtFinType1[is.na(test$BsmtFinType1)] <- "No_Basement"
summary(test$BsmtFinType1)

In [None]:
# Replace missing "BsmtFinType2" values with an explicit "No_Basement" level.
# levels(factor(...)) yields the observed categories whether the column is a
# factor or a character vector; union() avoids a duplicated level when the
# cell is run more than once.
bsmt_fin2_levels <- union(levels(factor(train$BsmtFinType2)), "No_Basement")
train$BsmtFinType2 <- factor(train$BsmtFinType2, levels = bsmt_fin2_levels)
train$BsmtFinType2[is.na(train$BsmtFinType2)] <- "No_Basement"
summary(train$BsmtFinType2)

# Test data: same treatment, based on the categories observed in test.
bsmt_fin2_levels <- union(levels(factor(test$BsmtFinType2)), "No_Basement")
test$BsmtFinType2 <- factor(test$BsmtFinType2, levels = bsmt_fin2_levels)
test$BsmtFinType2[is.na(test$BsmtFinType2)] <- "No_Basement"
summary(test$BsmtFinType2)

In [None]:
# Fill missing "Electrical" values in the training data with "Mix",
# then show the resulting distribution.
train$Electrical <- replace(train$Electrical, is.na(train$Electrical), "Mix")
summary(train$Electrical)

In [None]:
# Replace missing "FireplaceQu" values with an explicit "No_Fireplace" level.
# levels(factor(...)) yields the observed categories whether the column is a
# factor or a character vector; union() avoids a duplicated level when the
# cell is run more than once.
fireplace_levels <- union(levels(factor(train$FireplaceQu)), "No_Fireplace")
train$FireplaceQu <- factor(train$FireplaceQu, levels = fireplace_levels)
train$FireplaceQu[is.na(train$FireplaceQu)] <- "No_Fireplace"
summary(train$FireplaceQu)

# Test data: same treatment, based on the categories observed in test.
fireplace_levels <- union(levels(factor(test$FireplaceQu)), "No_Fireplace")
test$FireplaceQu <- factor(test$FireplaceQu, levels = fireplace_levels)
test$FireplaceQu[is.na(test$FireplaceQu)] <- "No_Fireplace"
summary(test$FireplaceQu)

In [None]:
# Replace missing "GarageType" values with an explicit "No_Garage" level.
# levels(factor(...)) yields the observed categories whether the column is a
# factor or a character vector; union() avoids a duplicated level when the
# cell is run more than once.
garage_type_levels <- union(levels(factor(train$GarageType)), "No_Garage")
train$GarageType <- factor(train$GarageType, levels = garage_type_levels)
train$GarageType[is.na(train$GarageType)] <- "No_Garage"
summary(train$GarageType)

# Test data: same treatment, based on the categories observed in test.
garage_type_levels <- union(levels(factor(test$GarageType)), "No_Garage")
test$GarageType <- factor(test$GarageType, levels = garage_type_levels)
test$GarageType[is.na(test$GarageType)] <- "No_Garage"
summary(test$GarageType)

In [None]:
# Impute missing "GarageYrBlt" values with the mean of the observed values.
# Training and test data are each filled with their own column mean.
train$GarageYrBlt[is.na(train$GarageYrBlt)] <-
  mean(train$GarageYrBlt, na.rm = TRUE)
summary(train$GarageYrBlt)
test$GarageYrBlt[is.na(test$GarageYrBlt)] <-
  mean(test$GarageYrBlt, na.rm = TRUE)
summary(test$GarageYrBlt)

In [None]:
# Replace missing "GarageFinish" values with an explicit "No_Garage" level.
# levels(factor(...)) yields the observed categories whether the column is a
# factor or a character vector; union() avoids a duplicated level when the
# cell is run more than once.
garage_finish_levels <- union(levels(factor(train$GarageFinish)), "No_Garage")
train$GarageFinish <- factor(train$GarageFinish, levels = garage_finish_levels)
train$GarageFinish[is.na(train$GarageFinish)] <- "No_Garage"
summary(train$GarageFinish)

# Test data: same treatment, based on the categories observed in test.
garage_finish_levels <- union(levels(factor(test$GarageFinish)), "No_Garage")
test$GarageFinish <- factor(test$GarageFinish, levels = garage_finish_levels)
test$GarageFinish[is.na(test$GarageFinish)] <- "No_Garage"
summary(test$GarageFinish)

In [None]:
# Replace missing "GarageQual" values with an explicit "No_Garage" level.
# levels(factor(...)) yields the observed categories whether the column is a
# factor or a character vector; union() avoids a duplicated level when the
# cell is run more than once.
garage_qual_levels <- union(levels(factor(train$GarageQual)), "No_Garage")
train$GarageQual <- factor(train$GarageQual, levels = garage_qual_levels)
train$GarageQual[is.na(train$GarageQual)] <- "No_Garage"
summary(train$GarageQual)

# Test data: same treatment, based on the categories observed in test.
garage_qual_levels <- union(levels(factor(test$GarageQual)), "No_Garage")
test$GarageQual <- factor(test$GarageQual, levels = garage_qual_levels)
test$GarageQual[is.na(test$GarageQual)] <- "No_Garage"
summary(test$GarageQual)

In [None]:
# Replace missing "GarageCond" values with an explicit "No_Garage" level.
# levels(factor(...)) yields the observed categories whether the column is a
# factor or a character vector; union() avoids a duplicated level when the
# cell is run more than once.
garage_cond_levels <- union(levels(factor(train$GarageCond)), "No_Garage")
train$GarageCond <- factor(train$GarageCond, levels = garage_cond_levels)
train$GarageCond[is.na(train$GarageCond)] <- "No_Garage"
summary(train$GarageCond)

# Test data: same treatment, based on the categories observed in test.
garage_cond_levels <- union(levels(factor(test$GarageCond)), "No_Garage")
test$GarageCond <- factor(test$GarageCond, levels = garage_cond_levels)
test$GarageCond[is.na(test$GarageCond)] <- "No_Garage"
summary(test$GarageCond)

In [None]:
# Replace missing "PoolQC" values with an explicit "No_Pool" level.
# levels(factor(...)) yields the observed categories whether the column is a
# factor or a character vector; union() avoids a duplicated level when the
# cell is run more than once.
pool_qc_levels <- union(levels(factor(train$PoolQC)), "No_Pool")
train$PoolQC <- factor(train$PoolQC, levels = pool_qc_levels)
train$PoolQC[is.na(train$PoolQC)] <- "No_Pool"
summary(train$PoolQC)

# Test data: same treatment, based on the categories observed in test.
pool_qc_levels <- union(levels(factor(test$PoolQC)), "No_Pool")
test$PoolQC <- factor(test$PoolQC, levels = pool_qc_levels)
test$PoolQC[is.na(test$PoolQC)] <- "No_Pool"
summary(test$PoolQC)

In [None]:
# Replace missing "Fence" values with an explicit "No_Fence" level.
# levels(factor(...)) yields the observed categories whether the column is a
# factor or a character vector; union() avoids a duplicated level when the
# cell is run more than once.
fence_levels <- union(levels(factor(train$Fence)), "No_Fence")
train$Fence <- factor(train$Fence, levels = fence_levels)
train$Fence[is.na(train$Fence)] <- "No_Fence"
summary(train$Fence)

# Test data: same treatment, based on the categories observed in test.
fence_levels <- union(levels(factor(test$Fence)), "No_Fence")
test$Fence <- factor(test$Fence, levels = fence_levels)
test$Fence[is.na(test$Fence)] <- "No_Fence"
summary(test$Fence)

In [None]:
# Replace missing "MiscFeature" values with an explicit "None" level.
# levels(factor(...)) yields the observed categories whether the column is a
# factor or a character vector; union() avoids a duplicated level when the
# cell is run more than once.
misc_feature_levels <- union(levels(factor(train$MiscFeature)), "None")
train$MiscFeature <- factor(train$MiscFeature, levels = misc_feature_levels)
train$MiscFeature[is.na(train$MiscFeature)] <- "None"
summary(train$MiscFeature)

# Test data: same treatment, based on the categories observed in test.
misc_feature_levels <- union(levels(factor(test$MiscFeature)), "None")
test$MiscFeature <- factor(test$MiscFeature, levels = misc_feature_levels)
test$MiscFeature[is.na(test$MiscFeature)] <- "None"
summary(test$MiscFeature)

In [None]:
# Missing-value count per feature of the training data after preprocessing.
# Every count is expected to be zero at this point (author's observation).
colSums(is.na(train))

In [None]:
# Feature selection:
# Fit a regression tree on all features and use its variable importance
# to pick features.
# NOTE(review): the cell that drops "Id" is commented out, so "Id" is still
# among the predictors covered by `SalePrice ~ .` - confirm this is intended.
library(rpart)
dt <- rpart(
  formula = SalePrice ~ .,
  data = train,
  control = rpart.control(minsplit = 10)
)
#plot(dt)
#text(dt)
# Importance scores of the features used by the fitted tree.
# Author's note: 31 of the original 79 features are selected (not verified).
dt[["variable.importance"]]

In [None]:
# "KitchenQual": show the training distribution, fill missing test values
# with "TA", then show the resulting test distribution.
# test$KitchenQual <- factor(test$KitchenQual, levels=levels(train$KitchenQual))
summary(train$KitchenQual)
test$KitchenQual <- replace(test$KitchenQual, is.na(test$KitchenQual), "TA")
summary(test$KitchenQual)

In [None]:
# "GarageArea": show the training distribution, then impute missing test
# values with the mean of the observed test values.
summary(train$GarageArea)
test$GarageArea[is.na(test$GarageArea)] <-
  mean(test$GarageArea, na.rm = TRUE)
summary(test$GarageArea)

In [None]:
# "TotalBsmtSF": show the training distribution, then impute missing test
# values with the mean of the observed test values.
summary(train$TotalBsmtSF)
test$TotalBsmtSF[is.na(test$TotalBsmtSF)] <-
  mean(test$TotalBsmtSF, na.rm = TRUE)
summary(test$TotalBsmtSF)