In [None]:
## Importing packages

# This R environment comes with all of CRAN and many other helpful packages preinstalled.
# You can see which packages are installed by checking out the kaggle/rstats docker image: 
# https://github.com/kaggle/docker-rstats

# plyr is attached first on purpose: tidyverse attaches dplyr, and attaching
# plyr after dplyr makes plyr mask dplyr verbs such as summarise() and
# mutate() (plyr itself warns about this load order).
library(plyr)
library(tidyverse) # metapackage with lots of helpful functions
library(ggplot2)
library(dplyr)
library(corrplot)
library(kernlab)
## Running code

# In a notebook, you can run a single code cell by clicking in the cell and then hitting 
# the blue arrow to the left, or by clicking in the cell and pressing Shift+Enter. In a script, 
# you can run code by highlighting the code you want to run and then clicking the blue arrow
# at the bottom of this window.

## Reading in files

# You can access files from datasets you've added to this kernel in the "../input/" directory.
# You can see the files added to this kernel by running the code below. 

# Show the files available in the kernel's input directory.
list.files("../input")

## Saving data

# If you save any files or images, these will be put in the "output" directory. You 
# can see the output directory by committing and running your kernel (using the 
# Commit & Run button) and then checking out the compiled version of your kernel.

In [None]:
# Load the training and test sets, keeping text columns as character vectors.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas FALSE is a reserved word.
train <- read.csv("../input/train.csv", stringsAsFactors = FALSE)
test <- read.csv("../input/test.csv", stringsAsFactors = FALSE)

In [None]:
# Dimensions (rows, columns) of each data set.
dim(train)
dim(test)

# Row counts on their own.
nrow(train)
nrow(test)

# Preview the last rows of the training set and the first rows of the test set.
tail(train, n = 6L)
head(test, n = 6L)


In [None]:
# Keep the test-set identifiers; they are needed later for the submission file.
test_ids <- test[["Id"]]
head(test_ids)
str(test_ids)

# Drop the Id column from both sets so it is not treated as a variable.
test[["Id"]] <- NULL
train[["Id"]] <- NULL

# The test set has no sale price; add an NA placeholder so both frames
# have the same columns before they are stacked.
test[["SalePrice"]] <- NA

# Combined frame: test rows first, followed by the training rows.
all <- rbind(test, train)
dim(all)
head(all)
nrow(all)

In [None]:
# Summarise missing values instead of printing the full logical matrix:
# number of NAs per column, showing only columns that have at least one.
na_per_column <- colSums(is.na(all))
sort(na_per_column[na_per_column > 0], decreasing = TRUE)

# Number of rows with no missing value in any column. The original call to
# na.omit(all) only printed those rows and discarded the result.
sum(complete.cases(all))

In [None]:
# Avoid scientific notation on the price axis.
options(scipen = 999)

# freq = FALSE draws a density histogram (bar areas sum to 1), so the y axis
# is labelled "Density" rather than "Frequency".
hist(
  all$SalePrice,
  freq = FALSE,
  col = "blue",
  main = "Histogram of Sale Price",
  xlab = "Sale Price",
  ylab = "Density"
)

In [None]:
# Five-number summary, mean and NA count of the sale price.
summary(all[["SalePrice"]])

| Min.  | 1st Qu. | Median | Mean   | 3rd Qu. | Max.   | NA's |
|-------|---------|--------|--------|---------|--------|------|
| 34900 | 129975  | 163000 | 180921 | 214000  | 755000 | 1459 |

Correlations with SalePrice

In [None]:
# Keep only the numeric columns of the combined data and record their names.
NumvericVar <- Filter(is.numeric, all)
NumVar <- names(NumvericVar)
cat("There are", length(NumVar), "numeric variables")

In [None]:
# Correlation matrix of the numeric columns; each pair of columns uses the
# rows where both values are present.
cor_NumericVar <- cor(NumvericVar, use = "pairwise.complete.obs")

# Correlations with SalePrice, largest first, stored as a one-column matrix.
cor_sorted <- as.matrix(sort(cor_NumericVar[, "SalePrice"], decreasing = TRUE))

# Restrict the matrix to variables whose absolute correlation with SalePrice
# is above 0.5.
cor_high <- rownames(cor_sorted)[abs(cor_sorted[, 1]) > 0.5]
cor_NumericVar <- cor_NumericVar[cor_high, cor_high]

corrplot.mixed(cor_NumericVar, tl.col = "black", tl.pos = "lt")


Overall Quality:

Overall Quality has the maximum correlation with SalePrice among the numeric variables (0.79). It rates the overall material and finish of the house on a scale from 1 (very poor) to 10 (very excellent).

In [None]:
# Box plot of sale price for each overall-quality level, using only the rows
# that have a sale price.
ggplot(
  data = subset(all, !is.na(SalePrice)),
  mapping = aes(x = factor(OverallQual), y = SalePrice)
) +
  geom_boxplot(outlier.colour = "red", outlier.shape = 8, outlier.size = 4) +
  labs(
    title = "Plot of Sale Price vs Overall Quality",
    x = "Overall Quality",
    y = "Sale Price"
  ) +
  theme_classic()

Above Grade (Ground) Living Area (square feet):

The numeric variable with the second-highest correlation with SalePrice is the Above Grade Living Area. This makes sense, because bigger houses are generally more expensive.

In [None]:
# Scatter plot of sale price against above-grade living area, with a fitted
# straight line and no confidence band, using only rows that have a sale price.
ggplot(
  data = subset(all, !is.na(SalePrice)),
  mapping = aes(x = GrLivArea, y = SalePrice)
) +
  geom_point(col = "red") +
  geom_smooth(method = "glm", se = FALSE, color = "black") +
  labs(
    title = "Plot of Sale Price vs Above Grade (Ground) Living Area ",
    x = "Above Grade (Ground) Living Area",
    y = "Sale Price"
  ) +
  theme_classic()

In [None]:
# R-squared of a linear regression of sale price on overall quality.
# Note that Model1 ends up holding the R-squared value, not the fitted model.
Model1 <- summary(lm(SalePrice ~ OverallQual, data = all))$r.squared
Model1
# 62.56%




In [None]:
# R-squared after adding above-grade living area as a second predictor.
# Note that Model2 ends up holding the R-squared value, not the fitted model.
Model2 <- summary(
  lm(SalePrice ~ OverallQual + GrLivArea, data = all)
)$r.squared
Model2
# 71.41%



In [None]:
# R-squared after adding garage capacity (GarageCars) as a third predictor.
# Note that Model3 ends up holding the R-squared value, not the fitted model.
Model3 <- summary(
  lm(SalePrice ~ OverallQual + GrLivArea + GarageCars, data = all)
)$r.squared
Model3
# 73.90%



In [None]:
# R-squared after adding garage area as a fourth predictor.
# Note that Model4 ends up holding the R-squared value, not the fitted model.
Model4 <- summary(
  lm(
    SalePrice ~ OverallQual + GrLivArea + GarageCars + GarageArea,
    data = all
  )
)$r.squared
Model4
# 74.17%

In [None]:
# Linear regression of sale price on ten numeric predictors. Unlike the
# earlier cells, the fitted model is kept in Model5 and the R-squared is
# stored separately in Model5S.
Model5 <- lm(
  formula = SalePrice ~ OverallQual + GrLivArea + GarageCars + GarageArea +
    TotalBsmtSF + X1stFlrSF + FullBath + TotRmsAbvGrd + YearBuilt +
    YearRemodAdd,
  data = all
)

# Compute the model summary once; display it and take the R-squared from it.
model5_summary <- summary(Model5)
model5_summary
Model5S <- model5_summary$r.squared
Model5S

# 77.36% - the highest R-squared of the five models

**R-squared = 77.36%, the highest of the five models (maximum dependency)**

In [None]:
# Plot the sale prices fitted by Model5 against the actual sale prices.
# The previous x aesthetic added the raw predictors together (quality scores,
# square feet, years, ...), which is not the model's prediction and has no
# meaningful unit. fitted() and model.frame() both cover exactly the rows
# that lm() used, so the two columns line up.
model5_fit <- data.frame(
  Fitted = fitted(Model5),
  SalePrice = model.frame(Model5)$SalePrice
)

ggplot(model5_fit, aes(x = Fitted, y = SalePrice)) +
  geom_point() +
  stat_smooth(method = "lm", col = "red") +
  labs(x = "Sale Price fitted by Model 5", y = "Sale Price")

Applying KSVM on all data

In [None]:
# Fit a kernel support vector machine with the same predictors as Model 5
# and display the error stored on the fitted object.
# NOTE(review): kernlab documents the `error` slot as the training error;
# confirm whether it can be read as a root mean square error.
Housing.ksvm <- ksvm(
  SalePrice ~ OverallQual + GrLivArea + GarageCars + GarageArea +
    TotalBsmtSF + X1stFlrSF + FullBath + TotRmsAbvGrd + YearBuilt +
    YearRemodAdd,
  data = all
)
Housing.ksvm@error

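**Training error reported by `ksvm` (`Housing.ksvm@error`) = 0.11**. This value is computed on the data the model was fitted to, not on held-out data.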

In [None]:
# Predict sale prices for the test set with Model5.
#
# predict() for lm objects has no `na.rm` argument; the value was silently
# ignored, so any test row with a missing predictor produced an NA prediction.
# Missing predictor values are instead filled with the median of the rows
# Model5 was fitted on, working on a copy so `test` itself is left untouched.
model5_predictors <- all.vars(delete.response(terms(Model5)))
model5_training <- model.frame(Model5)

test_filled <- test
for (predictor in model5_predictors) {
  missing_rows <- is.na(test_filled[[predictor]])
  if (any(missing_rows)) {
    test_filled[missing_rows, predictor] <- median(model5_training[[predictor]])
  }
}

predicted_prices <- predict(Model5, newdata = test_filled)

# Every test row must receive a price for the submission file.
stopifnot(!anyNA(predicted_prices))
head(predicted_prices)


In [None]:
# Build the submission table: one predicted sale price per test-set Id.
# tibble() replaces data_frame(), which is deprecated in the tibble package.
# The explicit length check stops a mismatched prediction vector from being
# written out.
stopifnot(length(test_ids) == length(predicted_prices))
my_submission <- tibble(Id = test_ids, SalePrice = predicted_prices)

# Save the submission file.
write_csv(my_submission, "submission.csv")

