In [1]:
# Pkg.add("LowRankModels")
# Pkg.update()
# Pkg.add("SCS")

In [2]:
# Pkg.add("Gadfly")
using Convex
using DataFrames
using PyPlot
using SCS

INFO: Recompiling stale cache file /home/juser/.julia/lib/v0.5/DataStructures.ji for module DataStructures.
INFO: Recompiling stale cache file /home/juser/.julia/lib/v0.5/ColorTypes.ji for module ColorTypes.
INFO: Recompiling stale cache file /home/juser/.julia/lib/v0.5/Colors.ji for module Colors.
INFO: Recompiling stale cache file /home/juser/.julia/lib/v0.5/PyPlot.ji for module PyPlot.


In [3]:
include("proxgrad.jl")

proxgrad_const (generic function with 1 method)

In [4]:
# Load the cleaned listings table from CSV.
listings = readtable("listings_cleaned.csv")

# Number of rows in the full table.
n = size(listings)[1]

# Code borrowed from http://blog.yhat.com/posts/julia-neural-networks.html
# Generate train/test split: `1:n .> floor(n * .25)` is false for the first
# floor(n * .25) positions and true for the rest; shuffling that mask sends a random
# ~25% of the rows to the test set and the remainder to the training set.
# NOTE(review): no RNG seed is set in the visible code, so the split presumably differs
# between runs -- confirm whether reproducibility is needed.
is_train = shuffle(1:n .> floor(n * .25))
listings_train, listings_test = listings[is_train, :], listings[!is_train, :]
# From here on `listings` refers to the training rows only.
listings = listings_train;

In [5]:
# Generate train/validate split based on a percent (1-100) for the validation set
"""
    generateValidationSplit(listings, percent)

Randomly split the rows of `listings` into a training part and a validation part.

`percent` (1-100) is the share of rows placed in the validation part; more precisely,
`floor(n * percent / 100)` of the `n` rows are held out. Returns the tuple
`(training_rows, validation_rows)`.
"""
function generateValidationSplit(listings, percent)
    nrows = size(listings)[1]
    holdout_count = floor(nrows * (percent / 100.))
    # Mask is `true` for rows kept for training; shuffling randomises which rows those are.
    keep_for_training = shuffle(1:nrows .> holdout_count)
    training_rows = listings[keep_for_training, :]
    validation_rows = listings[!keep_for_training, :]
    return training_rows, validation_rows
end

generateValidationSplit (generic function with 1 method)

In [6]:
# Various loss and error functions

"""
    lasso(X, y; λ=1)

Fit an L1-regularised least-squares model with Convex.jl.

Minimises `sumsquares(X*w - y) + λ*norm(w, 1)` over a coefficient variable `w` with one
entry per column of `X`, and returns the solved value of `w`.

# Keywords
- `λ`: weight of the L1 penalty (default `1`).
"""
function lasso(X, y; λ=1)
    # No `@show` here: the debug prints of d, w and the whole problem object were noise
    # and made this function behave differently from ridge_regression / nnls.
    d = size(X, 2)
    w = Variable(d)
    p = minimize(sumsquares(X*w - y) + λ*norm(w, 1))
    solve!(p)
    return w.value
end

"""
    ridge_regression(X, y; λ=1)

Fit a squared-L2-regularised least-squares model with Convex.jl.

Minimises `sumsquares(X*w - y) + λ*sumsquares(w)` over a coefficient variable with one
entry per column of `X`, and returns the solved value of that variable.
"""
function ridge_regression(X, y; λ=1)
    coeffs = Variable(size(X, 2))
    data_fit = sumsquares(X*coeffs - y)
    penalty = λ*sumsquares(coeffs)
    problem = minimize(data_fit + penalty)
    solve!(problem)
    return coeffs.value
end

"""
    nnls(X, y)

Fit a least-squares model with Convex.jl under the constraint that every coefficient is
non-negative.

Minimises `sumsquares(X*w - y)` subject to `w >= 0` and returns the solved value of `w`.
"""
function nnls(X, y)
    coeffs = Variable(size(X, 2))
    objective = sumsquares(X*coeffs - y)
    nonnegative = coeffs >= 0
    problem = minimize(objective, nonnegative)
    solve!(problem)
    return coeffs.value
end

"""
    ols(x, y)

Return the coefficients obtained by solving `x * w = y` with the backslash operator
(`x \\ y`).
"""
ols(x, y) = x \ y

"""
    RMSE(w, x, y)

Return the root-mean-square error of the linear predictions `x * w` against the
targets `y`.

# Arguments
- `w`: coefficients, one per column of `x` (flattened with `vec`, so a single-column
  matrix works as well as a vector).
- `x`: feature matrix with one example per row.
- `y`: targets, one per row of `x`.

# Throws
- `DimensionMismatch` if the number of rows of `x` differs from `length(y)`. Without
  this check, surplus targets would still be counted in the divisor and skew the mean.
"""
function RMSE(w, x, y)
    n = length(y)
    if size(x, 1) != n
        throw(DimensionMismatch("x has $(size(x, 1)) rows but y has $n entries"))
    end
    # View x as rows × columns so a plain vector of single-feature examples also works.
    features = reshape(x, size(x, 1), size(x, 2))
    residuals = vec(y) - features * vec(w)
    return sqrt(sum(abs2, residuals) / n)
end

"""
    squared_error(w, x, y)

Return the sum of squared residuals between the targets `y` and the linear
predictions `x * w`.

# Arguments
- `w`: coefficients, one per column of `x` (flattened with `vec`).
- `x`: feature matrix with one example per row.
- `y`: targets, one per row of `x`.

# Throws
- `DimensionMismatch` if the number of rows of `x` differs from `length(y)`. Without
  this check, targets beyond the last row of `x` would be silently ignored.
"""
function squared_error(w, x, y)
    if size(x, 1) != length(y)
        throw(DimensionMismatch(
            "x has $(size(x, 1)) rows but y has $(length(y)) entries"))
    end
    # View x as rows × columns so a plain vector of single-feature examples also works.
    features = reshape(x, size(x, 1), size(x, 2))
    residuals = vec(y) - features * vec(w)
    return sum(abs2, residuals)
end

"""
    abs_error(w, x, y)

Return the sum of absolute residuals between the targets `y` and the linear
predictions `x * w`.

# Arguments
- `w`: coefficients, one per column of `x` (flattened with `vec`).
- `x`: feature matrix with one example per row.
- `y`: targets, one per row of `x`.

# Throws
- `DimensionMismatch` if the number of rows of `x` differs from `length(y)`. Without
  this check, targets beyond the last row of `x` would be silently ignored.
"""
function abs_error(w, x, y)
    if size(x, 1) != length(y)
        throw(DimensionMismatch(
            "x has $(size(x, 1)) rows but y has $(length(y)) entries"))
    end
    # View x as rows × columns so a plain vector of single-feature examples also works.
    features = reshape(x, size(x, 1), size(x, 2))
    residuals = vec(y) - features * vec(w)
    return sum(abs, residuals)
end

abs_error (generic function with 1 method)

In [22]:
# Takes in training and validation vectors, and returns the error for a specific method
"""
    calculateError(x_train, y_train, x_validate, y_validate, lossFunction, regFunction)

Fit coefficients with `proxgrad` on the training data and return their RMSE on the
validation data.

# Arguments
- `x_train`, `y_train`: features and targets used for fitting.
- `x_validate`, `y_validate`: features and targets used for scoring.
- `lossFunction`: loss object; it is scaled by `1/n`, where `n` is the number of
  training targets, before being handed to `proxgrad`.
- `regFunction`: regulariser object passed to `proxgrad` unchanged.

`proxgrad` is run with `maxiters=200`.
"""
function calculateError(x_train, y_train, x_validate, y_validate, lossFunction, regFunction)
    # Use the size of the training set passed in, not the global `y`: the scaling must
    # follow whatever data this call is actually fitting.
    n = length(y_train)
    w = proxgrad((1.0 / n) * lossFunction, regFunction, x_train, y_train, maxiters=200)

    return RMSE(w, x_validate, y_validate)
end

"""
    calculateErrorOLS(x_train, y_train, x_validate, y_validate)

Fit coefficients with `ols` on the training data and return their RMSE on the
validation data.
"""
function calculateErrorOLS(x_train, y_train, x_validate, y_validate)
    coefficients = ols(x_train, y_train)
    return RMSE(coefficients, x_validate, y_validate)
end



calculateErrorOLS (generic function with 1 method)

In [8]:
# Create training vectors

# Work on a copy so the added column does not modify `listings`.
x = copy(listings)
# Constant column of ones; it is included in every feature subset below as the
# intercept term.
x[:offset] = 1;
# smalldata = x[1:10, [:accommodates, :bathrooms, :beds, :amenities, :number_of_reviews, :review_scores_rating]]
# x0 uses every column of x. NOTE(review): that includes :price, which is also the
# target `y` below -- confirm this is intended.
x0 = convert(Array{Float64}, x)
# x1..x6: feature matrices built from progressively different column subsets.
x1 = convert(Array{Float64}, x[:, [:accommodates, :beds, :amenities, :review_scores_rating, :offset]])
x2 = convert(Array{Float64}, x[:, [:accommodates, :beds, :offset]])
x3 = convert(Array{Float64}, x[:, [:accommodates, :beds, :room_type_entire_home_apt, :offset]])
x4 = convert(Array{Float64}, x[:, [:accommodates, :beds, :room_type_entire_home_apt, :reviews_per_month, :offset]])
x5 = convert(Array{Float64}, x[:, [:accommodates, :beds, :bed_type_real_bed, :room_type_entire_home_apt, :reviews_per_month, :property_type_apartment, :offset]])
x6 = convert(Array{Float64}, x[:, [:accommodates, :beds, :bed_type_real_bed, :room_type_entire_home_apt, :room_type_private_room
, :room_type_shared_room, :reviews_per_month, :property_type_apartment, :offset]])

# Target: the :price column, converted to a matrix and then reduced to its first
# (only) column so that `y` is a plain vector.
y = convert(Array{Float64}, x[:, [:price]])
y = y[:,1];

In [9]:
# Create test vectors

# Work on a copy so the added column does not modify `listings_test`.
x_test = copy(listings_test)
# Constant column of ones, matching the :offset column added to the training data.
x_test[:offset] = 1
@show size(x_test)

# Same column subsets as the training matrices x0..x6, taken from the test rows.
# NOTE(review): x0_test uses every column of x_test, including :price (the target
# below) -- confirm this is intended.
x0_test = convert(Array{Float64}, x_test)
x1_test = convert(Array{Float64}, x_test[:, [:accommodates, :beds, :amenities, :review_scores_rating, :offset]])
x2_test = convert(Array{Float64}, x_test[:, [:accommodates, :beds, :offset]])
x3_test = convert(Array{Float64}, x_test[:, [:accommodates, :beds, :room_type_entire_home_apt, :offset]])
x4_test = convert(Array{Float64}, x_test[:, [:accommodates, :beds, :room_type_entire_home_apt, :reviews_per_month, :offset]])
x5_test = convert(Array{Float64}, x_test[:, [:accommodates, :beds, :bed_type_real_bed, :room_type_entire_home_apt, :reviews_per_month, :property_type_apartment, :offset]])
x6_test = convert(Array{Float64}, x_test[:, [:accommodates, :beds, :bed_type_real_bed, :room_type_entire_home_apt, :room_type_private_room
, :room_type_shared_room, :reviews_per_month, :property_type_apartment, :offset]])

# Test target: the :price column reduced to a plain vector.
y_test = convert(Array{Float64}, x_test[:, [:price]])
y_test = y_test[:,1];

size(x_test) = (6519,80)


In [30]:
# Number of training targets.
n = length(y)

# Each call below fits on (xK, y) and reports RMSE on (xK_test, y_test).
# The loss/regulariser constructors are presumably provided by proxgrad.jl -- TODO confirm.

# huber regression

@show calculateError(x1, y, x1_test, y_test, HuberLoss(), ZeroReg())
@show calculateError(x2, y, x2_test, y_test, HuberLoss(), ZeroReg())
@show calculateError(x3, y, x3_test, y_test, HuberLoss(), ZeroReg())
@show calculateError(x4, y, x4_test, y_test, HuberLoss(), ZeroReg())
@show calculateError(x5, y, x5_test, y_test, HuberLoss(), ZeroReg())

# quadratic loss: unregularised (ZeroReg) on x0..x6, plus x5 with OneReg(lambda)
# and QuadReg(lambda) regularisation
lambda = 1
@show calculateError(x0, y, x0_test, y_test, QuadLoss(), ZeroReg())
@show calculateError(x1, y, x1_test, y_test, QuadLoss(), ZeroReg())
@show calculateError(x2, y, x2_test, y_test, QuadLoss(), ZeroReg())
@show calculateError(x3, y, x3_test, y_test, QuadLoss(), ZeroReg())
@show calculateError(x4, y, x4_test, y_test, QuadLoss(), ZeroReg())
@show calculateError(x5, y, x5_test, y_test, QuadLoss(), ZeroReg())
@show calculateError(x5, y, x5_test, y_test, QuadLoss(), OneReg(lambda))
@show calculateError(x5, y, x5_test, y_test, QuadLoss(), QuadReg(lambda))
@show calculateError(x6, y, x6_test, y_test, QuadLoss(), ZeroReg())


# l1 loss regression
@show calculateError(x1, y, x1_test, y_test, L1Loss(), ZeroReg())
@show calculateError(x2, y, x2_test, y_test, L1Loss(), ZeroReg())
@show calculateError(x3, y, x3_test, y_test, L1Loss(), ZeroReg())
@show calculateError(x4, y, x4_test, y_test, L1Loss(), ZeroReg())
@show calculateError(x5, y, x5_test, y_test, L1Loss(), ZeroReg())

# quantile regression (quantile = .4) on feature set x5
@show calculateError(x5, y, x5_test, y_test, QuantileLoss(quantile=.4), ZeroReg())

calculateError(x1,y,x1_test,y_test,HuberLoss(),ZeroReg()) = 337.04394458737266
calculateError(x2,y,x2_test,y_test,HuberLoss(),ZeroReg()) = 305.6622585556549
calculateError(x3,y,x3_test,y_test,HuberLoss(),ZeroReg()) = 306.7621409659282
calculateError(x4,y,x4_test,y_test,HuberLoss(),ZeroReg()) = 305.17195429999606
calculateError(x5,y,x5_test,y_test,HuberLoss(),ZeroReg()) = 305.72391289211293
calculateError(x0,y,x0_test,y_test,QuadLoss(),ZeroReg()) = 382.0981459515079
calculateError(x1,y,x1_test,y_test,QuadLoss(),ZeroReg()) = 312.95558882657406
calculateError(x2,y,x2_test,y_test,QuadLoss(),ZeroReg()) = 292.26639508702135
calculateError(x3,y,x3_test,y_test,QuadLoss(),ZeroReg()) = 292.13859966021613
calculateError(x4,y,x4_test,y_test,QuadLoss(),ZeroReg()) = 289.8394215223547
calculateError(x5,y,x5_test,y_test,QuadLoss(),ZeroReg()) = 288.91739005209655
calculateError(x5,y,x5_test,y_test,QuadLoss(),OneReg(lambda)) = 288.9380576359464
calculateError(x5,y,x5_test,y_test,QuadLoss(),QuadReg(lambd

In [27]:
# ordinary least squares

# Fit with `ols` (backslash solve) on each training feature set x1..x5 and report the
# RMSE on the matching test set.
@show calculateErrorOLS(x1, y, x1_test, y_test)
@show calculateErrorOLS(x2, y, x2_test, y_test)
@show calculateErrorOLS(x3, y, x3_test, y_test)
@show calculateErrorOLS(x4, y, x4_test, y_test)
@show calculateErrorOLS(x5, y, x5_test, y_test)

calculateErrorOLS(x1,y,x1_test,y_test) = 290.3259308915268
calculateErrorOLS(x2,y,x2_test,y_test) = 292.26774128398966
calculateErrorOLS(x3,y,x3_test,y_test) = 292.1302273403702
calculateErrorOLS(x4,y,x4_test,y_test) = 289.82864013708115
calculateErrorOLS(x5,y,x5_test,y_test) = 288.9090491920301


In [None]:
# Fit ridge regression on feature set x1 (default λ = 1) and show its test-set RMSE.
w_ridge1 = ridge_regression(x1, y)
@show err_ridge1 = RMSE(w_ridge1, x1_test, y_test)


In [None]:
# Fit the lasso on feature set x1 (default λ = 1) and show its test-set RMSE.
w_lasso1 = lasso(x1, y)
@show err_lasso1 = RMSE(w_lasso1, x1_test, y_test)


In [None]:
# w_nonneg1 = nnls(x1, y)
# @show err_nonneg1 = test(w_nonneg1, x1, y)


In [None]:
# Histogram of the fitted ridge coefficients.
plt[:hist](w_ridge1)

In [None]:
# Histogram of the fitted lasso coefficients, using 50 bins.
plt[:hist](w_lasso1, bins=50)