In [10]:
rm(list=ls())
library(Matrix)
library(glmnet)
library(MASS)

In [11]:
# Read the training data and take a first look at its size and leading rows.
dat <- read.csv("../input/allstatetrain/train.csv")
dim(dat)
dat[1:2, ]

# Drop the first column, then re-check the dimensions and preview the result.
dat <- dat[, -1]
dim(dat)
head(dat)

In [12]:
# Extract the response and look at its distribution, raw and on the log scale.
loss <- dat[["loss"]]
hist(loss)
hist(log(loss))

# Normal Q-Q plot of log(loss) with a reference line drawn in blue.
qqnorm(log(loss))
qqline(log(loss), col = "blue", lwd = 3)

In [13]:
# Default quantiles of loss, followed by every percentile from 0% to 100%.
quantile(loss)
# Spell out `probs` instead of relying on partial matching of `p`.
quantile(loss, probs = seq(0, 1, by = 0.01))

# Count the values of loss below 100 and list them.
sum(loss < 100)
loss[loss < 100]

In [14]:
# Keep only the rows whose loss is at least 100.
dat <- dat[dat$loss >= 100, ]
dim(dat)

# Refresh the response vector and repeat the log-scale diagnostics on the
# filtered data.
loss <- dat[["loss"]]
hist(log(loss))
qqnorm(log(loss))
qqline(log(loss), col = "blue", lwd = 3)

In [15]:
# Reproducible random split: each row gets a uniform draw and is flagged TRUE
# when the draw is below 0.2.
set.seed(652)
trn <- runif(nrow(dat)) < 0.2
table(trn)

# Flagged rows form the training set; all remaining rows form the test set.
train <- dat[trn, ]
test <- dat[!trn, ]
dim(train)
dim(test)

In [16]:
# Linear regression
# Ordinary least squares of loss on every other column of train.
r0 <- lm(loss ~ ., data = train)
# summary(r0)

# Observed responses that the RMSE helpers below compare against.
Y <- train$loss
Y.tst <- test$loss

# Root-mean-squared error of predictions `yhat` against the training (Y) or
# test (Y.tst) responses. Both helpers read those vectors from the enclosing
# environment, so `yhat` must line up with the corresponding data set.
do.RMSE.trn <- function(yhat) sqrt(mean((Y - yhat)^2))
do.RMSE.tst <- function(yhat) sqrt(mean((Y.tst - yhat)^2))

# predict.lm() takes new observations through `newdata`. An argument named
# `data` is silently ignored and the training fitted values are returned,
# which made the "test" RMSE meaningless.
# NOTE(review): predict() stops if test contains a factor level that never
# occurs in train -- confirm this cannot happen with the current split.
yhat_ols <- predict(r0, newdata = test)

RMSE.trn_OLS <- do.RMSE.trn(predict(r0, newdata = train))
RMSE.tst_OLS <- do.RMSE.tst(yhat_ols)
RMSE.trn_OLS; RMSE.tst_OLS

In [17]:
# Lasso regression
# glmnet needs numeric matrices. data.matrix() turns non-numeric columns into
# integer codes, and for character columns those codes depend on the values
# present in the data frame being converted. Encoding train and test
# separately can therefore give the same category different codes, so the
# two sets are stacked, encoded once, and split again.
is_train_row <- rep(c(TRUE, FALSE), times = c(nrow(train), nrow(test)))
X_all <- data.matrix(rbind(train, test))
X_all <- X_all[, colnames(X_all) != "loss", drop = FALSE]

X_train <- X_all[is_train_row, , drop = FALSE]
Y_train <- train$loss
X_test <- X_all[!is_train_row, , drop = FALSE]
Y_test <- test$loss

# Lasso path (alpha = 1) on standardized predictors, requesting 10 lambdas.
lasso_mod <- glmnet(X_train, Y_train, family = "gaussian", alpha = 1,
                    standardize = TRUE, nlambda = 10)
plot(lasso_mod, lwd = 3, xvar = "lambda")
coef(lasso_mod)

In [18]:
# Mean squared error for every lambda on the fitted path, on train and test.
# predict() returns one column per lambda; subtracting it from the response
# vector recycles the response down each column. This works for however many
# lambdas glmnet actually returned, which can be fewer than the number
# requested.
mse_train <- colMeans((Y_train - predict(lasso_mod, X_train))^2)
mse_test <- colMeans((Y_test - predict(lasso_mod, X_test))^2)

# Train curve in blue, test curve in red; ylim spans both so neither is
# clipped.
plot(mse_train, type = "o", lwd = 3, col = "blue", xlab = "model complexity",
     ylim = range(mse_train, mse_test))
lines(mse_test, type = "o", lwd = 3, col = "red")

In [19]:
# Cross-validate the lasso penalty with 5 folds. The number of folds is set
# with `nfolds`; cv.glmnet() has no `k` argument.
# NOTE(review): the previous `k = 5` appears to have partial-matched `keep`,
# leaving the default number of folds in effect -- confirm against the
# cv.glmnet() documentation.
cv_lasso <- cv.glmnet(X_train, Y_train, alpha = 1, family = "gaussian",
                      nfolds = 5)
plot(cv_lasso)

# Lambda with the smallest cross-validated error.
lambda_lasso <- cv_lasso$lambda.min
lambda_lasso

In [20]:
# Refit the lasso at the single cross-validated lambda.
lasso_best <- glmnet(
  X_train, Y_train,
  family = "gaussian",
  alpha = 1,
  lambda = lambda_lasso,
  standardize = TRUE
)

# Squared correlation between test-set predictions and observed responses.
cor(predict(lasso_best, X_test), Y_test)^2

In [21]:
# RMSE of the refitted lasso on the training and test sets, predicting at the
# cross-validated lambda.
RMSE.trn_Lasso <- do.RMSE.trn(
  predict(lasso_best, newx = X_train, s = lambda_lasso, type = "response")
)
RMSE.tst_Lasso <- do.RMSE.tst(
  predict(lasso_best, newx = X_test, s = lambda_lasso, type = "response")
)
RMSE.trn_Lasso
RMSE.tst_Lasso
