# Regularization

In [3]:
library(dplyr)
library(caTools)
library(glmnet)
library(modelr)

"package 'dplyr' was built under R version 3.6.1"
Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

"package 'caTools' was built under R version 3.6.1"Loading required package: Matrix
Loading required package: foreach
Loaded glmnet 2.0-16



In [4]:
df = read.csv("house.csv")

In [5]:
head(df)

id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
7129300520,20141013T000000,221900,3,1.0,1180,5650,1,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
6414100192,20141209T000000,538000,3,2.25,2570,7242,2,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
5631500400,20150225T000000,180000,2,1.0,770,10000,1,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
2487200875,20141209T000000,604000,4,3.0,1960,5000,1,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
1954400510,20150218T000000,510000,3,2.0,1680,8080,1,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503
7237550310,20140512T000000,1225000,4,4.5,5420,101930,1,0,0,...,11,3890,1530,2001,0,98053,47.6561,-122.005,4760,101930


In [6]:
df <- select(df,-c(id,date))

In [7]:
head(df)

price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
221900,3,1.0,1180,5650,1,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
538000,3,2.25,2570,7242,2,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
180000,2,1.0,770,10000,1,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
604000,4,3.0,1960,5000,1,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
510000,3,2.0,1680,8080,1,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503
1225000,4,4.5,5420,101930,1,0,0,3,11,3890,1530,2001,0,98053,47.6561,-122.005,4760,101930


In [8]:
str(df)

'data.frame':	21613 obs. of  19 variables:
 $ price        : num  221900 538000 180000 604000 510000 ...
 $ bedrooms     : int  3 3 2 4 3 4 3 3 3 3 ...
 $ bathrooms    : num  1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
 $ sqft_living  : int  1180 2570 770 1960 1680 5420 1715 1060 1780 1890 ...
 $ sqft_lot     : int  5650 7242 10000 5000 8080 101930 6819 9711 7470 6560 ...
 $ floors       : num  1 2 1 1 1 1 2 1 1 2 ...
 $ waterfront   : int  0 0 0 0 0 0 0 0 0 0 ...
 $ view         : int  0 0 0 0 0 0 0 0 0 0 ...
 $ condition    : int  3 3 3 5 3 3 3 3 3 3 ...
 $ grade        : int  7 7 6 7 8 11 7 7 7 7 ...
 $ sqft_above   : int  1180 2170 770 1050 1680 3890 1715 1060 1050 1890 ...
 $ sqft_basement: int  0 400 0 910 0 1530 0 0 730 0 ...
 $ yr_built     : int  1955 1951 1933 1965 1987 2001 1995 1963 1960 2003 ...
 $ yr_renovated : int  0 1991 0 0 0 0 0 0 0 0 ...
 $ zipcode      : int  98178 98125 98028 98136 98074 98053 98003 98198 98146 98038 ...
 $ lat          : num  47.5 47.7 47.7 47.5 47.6 ...

In [9]:
set.seed(101)

sample <- sample.split(df, SplitRatio = 0.70) # SplitRatio = percent of sample==TRUE

# Training Data
train = subset(df, sample == TRUE)

# Testing Data
test = subset(df, sample == FALSE)

In [10]:
model1 <- lm(price ~., data=train)

In [11]:
rmse(model1,test)

"prediction from a rank-deficient fit may be misleading"

In [12]:
rsquare(model1,test)

"prediction from a rank-deficient fit may be misleading"

In [13]:
coef(model1)

In [14]:
y_train = select(train,price)
x_train = select(train,-price)

In [15]:
y_test = select(test,price)
x_test = select(test,-price)

In [16]:
regularize <- function(alpha,lambda){
    
  model <- glmnet(as.matrix(x_train),as.matrix(y_train), alpha = alpha, lambda  = lambda)
  prediction <- predict(model,as.matrix(x_test))

  RSS = sum((y_test - prediction)^2)
  y_mean = mean(y_test$price)  
     
  TSS = sum((y_test - y_mean)^2)
 
  r2 = (TSS - RSS)/TSS  
  
  print(r2)  
 
}

## Lasso Regularization

In [21]:
for(lambda in 1:50){
    regularize(1,lambda)
}

[1] 0.6969922
[1] 0.6969915
[1] 0.6969908
[1] 0.6969902
[1] 0.6969895
[1] 0.6969888
[1] 0.6969882
[1] 0.6969875
[1] 0.6969868
[1] 0.6969861
[1] 0.6969855
[1] 0.6969848
[1] 0.6969841
[1] 0.6969835
[1] 0.6969828
[1] 0.6969821
[1] 0.6969814
[1] 0.6969808
[1] 0.6969801
[1] 0.6969794
[1] 0.6969787
[1] 0.696978
[1] 0.6969774
[1] 0.6969767
[1] 0.696976
[1] 0.6969753
[1] 0.6969746
[1] 0.6969739
[1] 0.6969733
[1] 0.6969726
[1] 0.6969719
[1] 0.6969712
[1] 0.6969705
[1] 0.6969698
[1] 0.6969692
[1] 0.6969685
[1] 0.6969678
[1] 0.6969671
[1] 0.6969664
[1] 0.6969657
[1] 0.696965
[1] 0.6969643
[1] 0.6969636
[1] 0.6969629
[1] 0.6969622
[1] 0.6969615
[1] 0.6969609
[1] 0.6969602
[1] 0.6969595
[1] 0.6969588


## Ridge Regularization

In [22]:
for(lambda in 1:50){
    regularize(0,lambda)
}

[1] 0.6969927
[1] 0.6969926
[1] 0.6969925
[1] 0.6969924
[1] 0.6969923
[1] 0.6969922
[1] 0.6969921
[1] 0.696992
[1] 0.6969919
[1] 0.6969918
[1] 0.6969917
[1] 0.6969916
[1] 0.6969915
[1] 0.6969914
[1] 0.6969913
[1] 0.6969912
[1] 0.6969911
[1] 0.696991
[1] 0.6969909
[1] 0.6969908
[1] 0.6969907
[1] 0.6969906
[1] 0.6969905
[1] 0.6969904
[1] 0.6969903
[1] 0.6969902
[1] 0.6969901
[1] 0.69699
[1] 0.6969899
[1] 0.6969898
[1] 0.6969896
[1] 0.6969895
[1] 0.6969894
[1] 0.6969893
[1] 0.6969892
[1] 0.6969891
[1] 0.696989
[1] 0.6969889
[1] 0.6969888
[1] 0.6969887
[1] 0.6969886
[1] 0.6969885
[1] 0.6969884
[1] 0.6969883
[1] 0.6969882
[1] 0.6969881
[1] 0.696988
[1] 0.6969879
[1] 0.6969878
[1] 0.6969877


## Gradient descent

In [19]:
gradient_descent <- function(x,y){
    
    
    n = dim(x)[1]
    learning_rate = 0.001
    intercept = slope = 0
    iterations = 500
    
    for (i in 1:iterations){
        
        y_pred = x*slope + intercept
        
        mse = sum((y_pred - y)^2)
        
        md = -(2/n)*sum(x*(y-y_pred))
        bd = -(2/n)*sum(y-y_pred)

        slope = slope - (learning_rate * md)
        intercept = intercept - (learning_rate * bd)
        
        print(i)
        print(mse)
        print(slope)
        print(intercept)
        print("-------------------------------------------------------------")
    }
    
}

In [20]:
x = select(df,sqft_living)
y = select(df,price)
gradient_descent(x,y)

[1] 1
[1] 9.217325e+15
[1] 2720067
[1] 1080.176
[1] "-------------------------------------------------------------"
[1] 2
[1] 8.264891e+23
[1] -28117218384
[1] -11312776
[1] "-------------------------------------------------------------"
[1] 3
[1] 8.832966e+31
[1] 2.906746e+14
[1] 116950701130
[1] "-------------------------------------------------------------"
[1] 4
[1] 9.440087e+39
[1] -3.004981e+18
[1] -1.209031e+15
[1] "-------------------------------------------------------------"
[1] 5
[1] 1.008894e+48
[1] 3.106537e+22
[1] 1.249891e+19
[1] "-------------------------------------------------------------"
[1] 6
[1] 1.078239e+56
[1] -3.211524e+26
[1] -1.292132e+23
[1] "-------------------------------------------------------------"
[1] 7
[1] 1.15235e+64
[1] 3.32006e+30
[1] 1.335801e+27
[1] "-------------------------------------------------------------"
[1] 8
[1] 1.231555e+72
[1] -3.432264e+34
[1] -1.380945e+31
[1] "-------------------------------------------------------------"
[1] 9
[1