In [None]:
## Importing packages

# This R environment comes with all of CRAN and many other helpful packages preinstalled.
# You can see which packages are installed by checking out the kaggle/rstats docker image: 
# https://github.com/kaggle/docker-rstats

library(tidyverse) # metapackage with lots of helpful functions
library("ggplot2")
library(tidyverse)
library(formattable)
library(gridExtra)
library(moments) # for skewness() func
library(corrplot)
library(glmnet)
## Running code

# In a notebook, you can run a single code cell by clicking in the cell and then hitting 
# the blue arrow to the left, or by clicking in the cell and pressing Shift+Enter. In a script, 
# you can run code by highlighting the code you want to run and then clicking the blue arrow
# at the bottom of this window.

## Reading in files

# You can access files from datasets you've added to this kernel in the "../input/" directory.
# You can see the files added to this kernel by running the code below. 

# Show which dataset files are attached to this kernel.
list.files("../input")

## Saving data

# If you save any files or images, these will be put in the "output" directory. You 
# can see the output directory by committing and running your kernel (using the 
# Commit & Run button) and then checking out the compiled version of your kernel.

In [None]:
# Load the listings table from the kernel's input directory.
data <- read.csv("../input/reale.csv", sep = ",")

# Rename the two price columns to shorter names.
colnames(data)[colnames(data) == "sale.price.in..100000"] <- "sale_price"
colnames(data)[colnames(data) == "list.price.in..100000"] <- "list_price"


In [None]:
# Display the class of the object returned by the import step.
class(data)

In [None]:
# Draw a scatter plot of `y` against `x`, styled by a two-level grouping, and
# add a legend.
#
# Arguments:
#   x, y          Coordinates of the points.
#   col           Grouping variable: a factor, a character vector, or numeric
#                 codes. Group 1 is drawn as blue filled circles (pch 16) and
#                 group 2 as red plus signs (pch 3), matching the legend
#                 entries "Location: O" and "Location: X".
#   xaxis, yaxis  Axis labels.
#   title         Main plot title.
#   lgd_x, lgd_y  Legend position, passed straight to legend() as x and y.
#
# Returns (invisibly) the value of the legend() call.
scatter_plot <- function(x, y, col, xaxis, yaxis, title, lgd_x, lgd_y) {
  # Since R 4.0 read.csv() keeps strings as character vectors, and
  # as.numeric() on a character label gives NA (so no point would be drawn).
  # Character labels are therefore mapped to 1, 2, ... through factor(), whose
  # levels sort alphabetically ("O" -> 1, "X" -> 2). Factors and numeric codes
  # are converted exactly as before.
  group <- if (is.character(col)) as.integer(factor(col)) else as.numeric(col)

  plot(x, y,
    col = c("blue", "red")[group],
    pch = c(16, 3)[group],
    xlab = xaxis,
    ylab = yaxis,
    main = title
  )

  legend(lgd_x, lgd_y,
    legend = c("Location: O", "Location: X"),
    col = c("blue", "red"), lty = 1, cex = 0.8,
    title = "Locations:", text.font = 4,
    pch = c(16, 3)
  )
}
# Raw data: sale price for every case, styled by location.
scatter_plot(
  x = data$Case_ID,
  y = data$sale_price,
  col = data$location,
  xaxis = "Case ID",
  yaxis = "Sale Price ($100,000)",
  title = " Scatter Plot of Sale Price (8350)",
  lgd_x = c(250, 190),
  lgd_y = c(90, 75)
)


Here we can clearly see two outliers: one with a sale price above 80 and one with a sale price near 0. We now need to remove these outliers to clean the data.

In [None]:
## Removing the outliers
# Keep only the rows whose sale price lies strictly between 5 and 80.
datav1 <- data[which(data$sale_price > 5 & data$sale_price < 80), ]

# Same plot as before, now on the cleaned data.
scatter_plot(
  x = datav1$Case_ID,
  y = datav1$sale_price,
  col = datav1$location,
  xaxis = "Case ID",
  yaxis = "Sale Price ($100,000)",
  title = " Scatter Plot of Sale Price (8350)",
  lgd_x = c(200, 250),
  lgd_y = c(33, 28)
)


In [None]:
## Plot of sale price vs list price
scatter_plot(
  x = datav1$list_price,
  y = datav1$sale_price,
  col = datav1$location,
  xaxis = "List Price ($100,000)",
  yaxis = "Sale Price ($100,000)",
  title = " Scatter Plot of Sale Price vs List Price (8350)",
  lgd_x = c(27, 34),
  lgd_y = c(11, 7)
)

## Plot of sale price vs taxes
scatter_plot(
  x = datav1$taxes,
  y = datav1$sale_price,
  col = datav1$location,
  xaxis = "Taxes",
  yaxis = "Sale Price ($100,000)",
  title = " Scatter Plot of Sale Price vs Taxes (8350)",
  lgd_x = c(17000, 22000),
  lgd_y = c(11, 7)
)

In [None]:
# Draw a normal Q-Q plot of `x` with a reference line.
#
# Arguments:
#   x    Numeric sample to compare against the normal distribution.
#   col  Colour used for the sample points.
#
# The points are drawn as filled circles without a surrounding frame; the
# reference line from qqline() is drawn in "steelblue" with double width.
qq_plot <- function(x, col) {
  qqnorm(x, pch = 16, frame = FALSE, col = col)
  qqline(x, lwd = 2, col = "steelblue")
}

In [None]:

# Normal Q-Q plots of the sale price on several scales.

# Untransformed
qq_plot(x = datav1$sale_price, col = "red")

# Natural log
qq_plot(x = log(datav1$sale_price), col = "green")

# Square root
qq_plot(x = sqrt(datav1$sale_price), col = 100)

# Reciprocal
qq_plot(x = datav1$sale_price^(-1), col = 30)
# hist(datav1$sale_price^(-1))


In [None]:
# Split the cleaned data into one data frame per location.
data_X <- datav1[which(datav1$location == "X"), , drop = FALSE]
data_O <- datav1[which(datav1$location == "O"), , drop = FALSE]

# Number of rows and columns in each subset.
dim(data_X)
dim(data_O)

# Sale price against list price, one plot per location.
plot(x = data_X$list_price, y = data_X$sale_price)
plot(x = data_O$list_price, y = data_O$sale_price)

In [None]:
# Simple linear regression of sale price on list price, all locations pooled.
sale_price <- datav1$sale_price
list_price <- datav1$list_price
model <- lm(sale_price ~ list_price)

summary(model)$r.squared           # R^2
coef(model)                        # intercept and slope
summary(model)$sigma^2             # estimated error variance
summary(model)$coefficients[2, 4]  # p-value for the list_price slope
confint(model)                     # confidence intervals at the default level

# The model predicts sale_price from list_price, so list_price must be on the
# x-axis; otherwise abline(model) is drawn in the wrong coordinate system.
plot(datav1$list_price, datav1$sale_price,
  xlab = "List Price ($100,000)",
  ylab = "Sale Price ($100,000)"
)
abline(model)
summary(model)

In [None]:
# Simple linear regression of sale price on list price, location X only.
sale_price <- data_X$sale_price
list_price <- data_X$list_price
model_X <- lm(sale_price ~ list_price)

summary(model_X)$r.squared           # R^2
coef(model_X)                        # intercept and slope
summary(model_X)$sigma^2             # estimated error variance
summary(model_X)$sigma               # residual standard error
summary(model_X)$coefficients[2, 4]  # p-value for the list_price slope
confint(model_X)                     # confidence intervals at the default level

# The model predicts sale_price from list_price, so list_price must be on the
# x-axis; otherwise abline(model_X) is drawn in the wrong coordinate system.
plot(data_X$list_price, data_X$sale_price,
  xlab = "List Price ($100,000)",
  ylab = "Sale Price ($100,000)"
)
abline(model_X)
summary(model_X)

In [None]:
# Simple linear regression of sale price on list price, location O only.
model_Y <- lm(data_O$sale_price ~ data_O$list_price)

summary(model_Y)$r.squared           # R^2
coef(model_Y)                        # intercept and slope
summary(model_Y)$sigma^2             # estimated error variance
summary(model_Y)$sigma               # residual standard error
summary(model_Y)$coefficients[2, 4]  # p-value for the list_price slope
confint(model_Y)                     # confidence intervals at the default level

# The model predicts sale_price from list_price, so list_price must be on the
# x-axis; otherwise abline(model_Y) is drawn in the wrong coordinate system.
plot(data_O$list_price, data_O$sale_price,
  xlab = "List Price ($100,000)",
  ylab = "Sale Price ($100,000)"
)
abline(model_Y)
summary(model_Y)

In [None]:
# Two-sample t-test of sale price against list price, using t.test() defaults
# (unpaired, unequal variances).
# NOTE(review): both prices appear to come from the same rows of datav1, so a
# paired test may be what is wanted -- confirm the intended hypothesis.
t.test(x = datav1$sale_price, y = datav1$list_price)

In [None]:
# Show the lm diagnostic plots for the pooled model in a 2 x 2 grid, then
# restore the previous graphics parameters so later plots are not forced into
# the same grid.
old_par <- par(mfrow = c(2, 2))
plot(model)
par(old_par)

In [None]:
# anova() only gives a valid comparison for nested models fitted to the same
# data, so model_X and model_Y (fitted to different location subsets) cannot
# be compared with it. Presumably the aim is to test whether location changes
# the sale-price / list-price relationship -- TODO confirm. That is tested
# here by comparing a pooled fit against a fit with a location-specific
# intercept and slope, both on datav1.
pooled_fit <- lm(sale_price ~ list_price, data = datav1)
by_location_fit <- lm(sale_price ~ list_price * location, data = datav1)
anova(pooled_fit, by_location_fit)