In [None]:
# Pkg.add("LowRankModels")
# Pkg.update()
# Pkg.add("SCS")

In [None]:
# Pkg.add("Gadfly")
using Convex
using DataFrames
using PyPlot
using SCS

In [None]:
include("proxgrad.jl")

In [None]:
# Fraction of the rows that splitData sets aside as the test set.
PERCENT_TEST = 0.2
# Fraction of the remaining rows that splitData sets aside for validation.
PERCENT_VALIDATE = 0.25
# Lone semicolon: presumably here to suppress the notebook's output for this cell.
;

In [None]:
# Randomly partition the rows of `listings` into two pieces.
#
# `percent` (between 0 and 1) is the share of rows that goes to the SECOND
# returned piece; the first piece receives everything else.
# Returns the tuple (larger_piece, held_out_piece).
function splitData(listings, percent)
    row_count = size(listings, 1)
    held_out_count = floor(row_count * percent)
    # Code borrowed from http://blog.yhat.com/posts/julia-neural-networks.html
    # Build a mask that is false for the first `held_out_count` positions and
    # true for the rest, then shuffle it so the held-out rows are random.
    keep_mask = shuffle(1:row_count .> held_out_count)
    kept_rows = listings[keep_mask, :]
    held_out_rows = listings[!keep_mask, :]
    return kept_rows, held_out_rows
end

In [None]:
# Load the cleaned listings table from disk.
listings = readtable("listings_cleaned.csv")
# Add a constant column of ones. :offset is listed in every columns_N feature
# set, where its weight acts as the intercept term.
listings[:offset] = 1

# Generate train/test split: listings_test receives the PERCENT_TEST share of
# the rows, listings_train receives the rest.
listings_train, listings_test = splitData(listings, PERCENT_TEST)
# From here on `listings` refers only to the non-test rows.
listings = listings_train;

In [None]:
# Generate train/validate split from the non-test rows: listings_validate
# receives the PERCENT_VALIDATE share, listings_train receives the rest.
listings_train, listings_validate = splitData(listings, PERCENT_VALIDATE)
# `listings` is narrowed again, to the training rows only; the cross-validation
# helpers read this global.
listings = listings_train;

In [None]:
# Various loss and error functions

# Least-squares fit with an L1 penalty, solved with Convex.
#
# Minimizes sumsquares(X*w - y) + λ*norm(w,1) over w, where w has one entry per
# column of X. λ (default 1) weights the penalty.
# Returns the optimal value of w after solve!.
function lasso(X,y; λ=1)
    # The original echoed d, w and the problem with @show; that debug output
    # is dropped so this behaves like ridge_regression and nnls.
    d = size(X,2)
    w = Variable(d)
    p = minimize(sumsquares(X*w - y) + λ*norm(w,1))
    solve!(p)
    return w.value
end

# Least-squares fit with a squared-L2 penalty, solved with Convex.
#
# Minimizes sumsquares(X*w - y) + λ*sumsquares(w) over w, where w has one entry
# per column of X. λ (default 1) weights the penalty.
# Returns the optimal value of w after solve!.
function ridge_regression(X,y; λ=1)
    coeffs = Variable(size(X,2))
    data_fit = sumsquares(X*coeffs - y)
    problem = minimize(data_fit + λ*sumsquares(coeffs))
    solve!(problem)
    return coeffs.value
end

# Non-negative least squares, solved with Convex.
#
# Minimizes sumsquares(X*w - y) subject to w >= 0, where w has one entry per
# column of X. Returns the optimal value of w after solve!.
function nnls(X,y)
    coeffs = Variable(size(X,2))
    data_fit = sumsquares(X*coeffs - y)
    problem = minimize(data_fit, coeffs>=0)
    solve!(problem)
    return coeffs.value
end

# Ordinary least squares via the backslash solver: returns x \ y.
function ols(x, y)
    coefficients = x \ y
    return coefficients
end

# Root-mean-square error of the linear predictor `w` on (x, y).
#
# For every row i of x the prediction is vecdot(w, x[i,:]). The squared
# differences from y[i] are summed, divided by length(y), and square-rooted.
function RMSE(w, x, y)
    squared_sum = 0
    for row = 1:size(x,1)
        residual = y[row] - vecdot(w, x[row,:])
        squared_sum += residual^2
    end
    return sqrt(squared_sum / length(y))
end

# Sum of squared residuals of the linear predictor `w` on (x, y).
#
# For every row i of x the prediction is vecdot(w, x[i,:]); the result is the
# sum over rows of (y[i] - prediction)^2. No averaging or square root.
function squared_error(w, x, y)
    squared_sum = 0
    for row = 1:size(x,1)
        residual = y[row] - vecdot(w, x[row,:])
        squared_sum += residual^2
    end
    return squared_sum
end

# Sum of absolute residuals of the linear predictor `w` on (x, y).
#
# For every row i of x the prediction is vecdot(w, x[i,:]); the result is the
# sum over rows of abs(y[i] - prediction).
function abs_error(w, x, y)
    abs_sum = 0
    for row = 1:size(x,1)
        residual = y[row] - vecdot(w, x[row,:])
        abs_sum += abs(residual)
    end
    return abs_sum
end

In [None]:
# Fit weights on the training data and score them on the validation data.
#
# The loss object is scaled by 1/length(y_train) and handed, together with the
# regularizer, to proxgrad (from proxgrad.jl) with at most 200 iterations.
# Returns (validation RMSE, w).
function calculateError(x_train, y_train, x_validate, y_validate, lossFunction, regFunction)
    scaled_loss = (1/length(y_train))*lossFunction
    w = proxgrad(scaled_loss, regFunction, x_train, y_train, maxiters=200)
    return RMSE(w, x_validate, y_validate), w
end

# Same contract as calculateError, but the weights come from ols (x_train \ y_train)
# instead of proxgrad. Returns (validation RMSE, w).
function calculateErrorOLS(x_train, y_train, x_validate, y_validate)
    weights = ols(x_train, y_train)
    return RMSE(weights, x_validate, y_validate), weights
end

In [None]:
# Number of random train/validation splits tried by the cross-validation helpers.
k = 5

# Fit a model on k random splits of the global `listings` and keep the best one.
#
# columnList   - column symbols used as features
# lossFunction - loss object forwarded to calculateError
# regFunction  - regularizer object forwarded to calculateError
#
# Each round re-splits `listings` with splitData(listings, PERCENT_VALIDATE),
# fits on the larger piece, and scores RMSE on the held-out piece.
# Returns (w, err) for the round with the lowest validation RMSE.
function crossValidate(columnList, lossFunction, regFunction)
    minW = nothing
    # Start from Inf so that every finite error beats the initial value. The
    # previous sentinel, typemax(Int32), ignored any error of 2147483647 or more.
    minErr = Inf
    for i in 1:k
        fit_rows, holdout_rows = splitData(listings, PERCENT_VALIDATE)

        x = convert(Array{Float64}, fit_rows[:, columnList])
        # Selecting [:price] yields a one-column table; [:,1] flattens the
        # converted n-by-1 array to a plain vector.
        y = convert(Array{Float64}, fit_rows[:, [:price]])[:,1]

        x_validate = convert(Array{Float64}, holdout_rows[:, columnList])
        y_validate = convert(Array{Float64}, holdout_rows[:, [:price]])[:,1]

        err, w = calculateError(x, y, x_validate, y_validate, lossFunction, regFunction)
        if err < minErr
            minW = w
            minErr = err
        end

        #TODO: Use average not min? (in sample error)
    end
    return minW, minErr
end

In [None]:
# Fit a model on k random splits of the global `listings` and average the results.
#
# columnList   - column symbols used as features
# lossFunction - loss object forwarded to calculateError
# regFunction  - regularizer object forwarded to calculateError
#
# Each round re-splits `listings` with splitData(listings, PERCENT_VALIDATE),
# fits on the larger piece, and scores RMSE on the held-out piece.
# Returns (mean of the k weight vectors, mean of the k validation RMSEs).
# The second value is the average of the per-round errors, not the error of
# the averaged weights.
function crossValidateAverage(columnList, lossFunction, regFunction)
    k > 0 || error("crossValidateAverage needs k >= 1 splits, got k = $k")
    # The running sum starts from the first weight vector, not the integer 0.
    sumW = nothing
    sumErr = 0.0
    for i in 1:k
        fit_rows, holdout_rows = splitData(listings, PERCENT_VALIDATE)

        x = convert(Array{Float64}, fit_rows[:, columnList])
        # Selecting [:price] yields a one-column table; [:,1] flattens the
        # converted n-by-1 array to a plain vector.
        y = convert(Array{Float64}, fit_rows[:, [:price]])[:,1]

        x_validate = convert(Array{Float64}, holdout_rows[:, columnList])
        y_validate = convert(Array{Float64}, holdout_rows[:, [:price]])[:,1]

        err, w = calculateError(x, y, x_validate, y_validate, lossFunction, regFunction)
        sumW = (sumW === nothing) ? w : sumW + w
        sumErr += err
    end
    # The original returned `minErr`, which was never defined in this function,
    # so every call ended in an UndefVarError.
    return (sumW/k), (sumErr/k)
end

In [None]:
"""function to plot the above data"""
function plotdata(x,y; margin=.05)
    # Keep every series on the same axes.
    hold(true)
    # One scatter series ("o" markers) per row of y, all against the same x.
    for row in 1:size(y,1)
        plot(x, y[row,:], "o")
    end
    xlabel("x")
    ylabel("y")
    # Pad each axis by `margin` times the data range on both sides.
    y_low, y_high = minimum(y), maximum(y)
    x_low, x_high = minimum(x), maximum(x)
    y_span = y_high - y_low
    x_span = x_high - x_low
    ylim([y_low-margin*y_span, y_high+margin*y_span])
    xlim([x_low-margin*x_span, x_high+margin*x_span])
end

In [None]:
# Candidate feature sets. Each is a list of column symbols used to select the
# feature columns from the listings table; every set ends with :offset, the
# constant column of ones.

# accommodates/beds plus amenities and review score
columns_1 = [:accommodates, :beds, :amenities, :review_scores_rating, :offset]
# smallest set: accommodates and beds only
columns_2 = [:accommodates, :beds, :offset]
# adds the entire-home room-type column
columns_3 = [:accommodates, :beds, :room_type_entire_home_apt, :offset]
# adds reviews per month
columns_4 = [:accommodates, :beds, :room_type_entire_home_apt, :reviews_per_month, :offset]
# adds the real-bed and apartment columns
columns_5 = [:accommodates, :beds, :bed_type_real_bed, :room_type_entire_home_apt, 
    :reviews_per_month, :property_type_apartment, :offset]
# adds the remaining two room-type columns
columns_6 = [:accommodates, :beds, :bed_type_real_bed, :room_type_entire_home_apt, 
    :room_type_private_room, :room_type_shared_room, 
    :reviews_per_month, :property_type_apartment, :offset]
# Lone semicolon: presumably here to suppress the notebook's output for this cell.
;

In [None]:

# Huber loss experiments. For each feature set columns_N, crossValidate returns
# the best weights and their validation RMSE.
# Naming: w_/err_ = ZeroReg, w1_/err1_ = OneReg, w2_/err2_ = QuadReg; the
# trailing digit is the feature-set number.

# Huber loss, no regularization
w_huber1, err_huber1 = crossValidate(columns_1, HuberLoss(), ZeroReg())
w_huber2, err_huber2 = crossValidate(columns_2, HuberLoss(), ZeroReg())
w_huber3, err_huber3 = crossValidate(columns_3, HuberLoss(), ZeroReg())
w_huber4, err_huber4 = crossValidate(columns_4, HuberLoss(), ZeroReg())
w_huber5, err_huber5 = crossValidate(columns_5, HuberLoss(), ZeroReg())
w_huber6, err_huber6 = crossValidate(columns_6, HuberLoss(), ZeroReg())

# Print the validation errors for comparison.
@show err_huber1
@show err_huber2
@show err_huber3
@show err_huber4
@show err_huber5
@show err_huber6

# Huber loss with OneReg (L1 regularization)
w1_huber1, err1_huber1 = crossValidate(columns_1, HuberLoss(), OneReg())
w1_huber2, err1_huber2 = crossValidate(columns_2, HuberLoss(), OneReg())
w1_huber3, err1_huber3 = crossValidate(columns_3, HuberLoss(), OneReg())
w1_huber4, err1_huber4 = crossValidate(columns_4, HuberLoss(), OneReg())
w1_huber5, err1_huber5 = crossValidate(columns_5, HuberLoss(), OneReg())
w1_huber6, err1_huber6 = crossValidate(columns_6, HuberLoss(), OneReg())

@show err1_huber1
@show err1_huber2
@show err1_huber3
@show err1_huber4
@show err1_huber5
@show err1_huber6

# Huber loss with QuadReg (L2 regularization)
w2_huber1, err2_huber1 = crossValidate(columns_1, HuberLoss(), QuadReg())
w2_huber2, err2_huber2 = crossValidate(columns_2, HuberLoss(), QuadReg())
w2_huber3, err2_huber3 = crossValidate(columns_3, HuberLoss(), QuadReg())
w2_huber4, err2_huber4 = crossValidate(columns_4, HuberLoss(), QuadReg())
w2_huber5, err2_huber5 = crossValidate(columns_5, HuberLoss(), QuadReg())
w2_huber6, err2_huber6 = crossValidate(columns_6, HuberLoss(), QuadReg())

@show err2_huber1
@show err2_huber2
@show err2_huber3
@show err2_huber4
@show err2_huber5
@show err2_huber6

In [None]:
# Print the weights fitted with Huber loss / ZeroReg on columns_6.
@show w_huber6
# For reference, the feature order used to build x for this fit
# (presumably the weight entries follow the same order; confirm against proxgrad):
# columns_6 = [
#     :accommodates,
#     :beds,
#     :bed_type_real_bed,
#     :room_type_entire_home_apt,
#     :room_type_private_room,
#     :room_type_shared_room,
#     :reviews_per_month,
#     :property_type_apartment,
#     :offset]

In [None]:
# Quadratic loss experiments. Same layout and naming as the Huber cell:
# w_/err_ = ZeroReg, w1_/err1_ = OneReg, w2_/err2_ = QuadReg.

# NOTE(review): `lambda` is not referenced by any call in the visible cells;
# confirm whether it is still needed.
lambda = 1
# Quad loss, no regularization
w_quad1, err_quad1 = crossValidate(columns_1, QuadLoss(), ZeroReg())
w_quad2, err_quad2 = crossValidate(columns_2, QuadLoss(), ZeroReg())
w_quad3, err_quad3 = crossValidate(columns_3, QuadLoss(), ZeroReg())
w_quad4, err_quad4 = crossValidate(columns_4, QuadLoss(), ZeroReg())
w_quad5, err_quad5 = crossValidate(columns_5, QuadLoss(), ZeroReg())
w_quad6, err_quad6 = crossValidate(columns_6, QuadLoss(), ZeroReg())
@show err_quad1
@show err_quad2
@show err_quad3
@show err_quad4
@show err_quad5
@show err_quad6

# Quad loss with OneReg (L1 regularization)
w1_quad1, err1_quad1 = crossValidate(columns_1, QuadLoss(), OneReg())
w1_quad2, err1_quad2 = crossValidate(columns_2, QuadLoss(), OneReg())
w1_quad3, err1_quad3 = crossValidate(columns_3, QuadLoss(), OneReg())
w1_quad4, err1_quad4 = crossValidate(columns_4, QuadLoss(), OneReg())
w1_quad5, err1_quad5 = crossValidate(columns_5, QuadLoss(), OneReg())
w1_quad6, err1_quad6 = crossValidate(columns_6, QuadLoss(), OneReg())
@show err1_quad1
@show err1_quad2
@show err1_quad3
@show err1_quad4
@show err1_quad5
@show err1_quad6

# Quad loss with QuadReg (L2 regularization)
w2_quad1, err2_quad1 = crossValidate(columns_1, QuadLoss(), QuadReg())
w2_quad2, err2_quad2 = crossValidate(columns_2, QuadLoss(), QuadReg())
w2_quad3, err2_quad3 = crossValidate(columns_3, QuadLoss(), QuadReg())
w2_quad4, err2_quad4 = crossValidate(columns_4, QuadLoss(), QuadReg())
w2_quad5, err2_quad5 = crossValidate(columns_5, QuadLoss(), QuadReg())
w2_quad6, err2_quad6 = crossValidate(columns_6, QuadLoss(), QuadReg())
@show err2_quad1
@show err2_quad2
@show err2_quad3
@show err2_quad4
@show err2_quad5
@show err2_quad6

In [None]:
# Print the weights fitted with quadratic loss / OneReg on columns_6.
@show w1_quad6
# For reference, the feature order used to build x for this fit
# (presumably the weight entries follow the same order; confirm against proxgrad):
# columns_6 = [
#     :accommodates,
#     :beds,
#     :bed_type_real_bed,
#     :room_type_entire_home_apt,
#     :room_type_private_room,
#     :room_type_shared_room,
#     :reviews_per_month,
#     :property_type_apartment,
#     :offset]

In [None]:
# L1 loss experiments. Same layout and naming as the Huber and Quad cells:
# w_/err_ = ZeroReg, w1_/err1_ = OneReg, w2_/err2_ = QuadReg. In names such as
# w_l13 the final digit is the feature-set number ("l1" + set 3).

# L1 loss, no regularization
w_l11, err_l11 = crossValidate(columns_1, L1Loss(), ZeroReg())
w_l12, err_l12 = crossValidate(columns_2, L1Loss(), ZeroReg())
w_l13, err_l13 = crossValidate(columns_3, L1Loss(), ZeroReg())
w_l14, err_l14 = crossValidate(columns_4, L1Loss(), ZeroReg())
w_l15, err_l15 = crossValidate(columns_5, L1Loss(), ZeroReg())
w_l16, err_l16 = crossValidate(columns_6, L1Loss(), ZeroReg())

@show err_l11
@show err_l12
@show err_l13
@show err_l14
@show err_l15
@show err_l16

# L1 loss with OneReg (L1 regularization)
w1_l11, err1_l11 = crossValidate(columns_1, L1Loss(), OneReg())
w1_l12, err1_l12 = crossValidate(columns_2, L1Loss(), OneReg())
w1_l13, err1_l13 = crossValidate(columns_3, L1Loss(), OneReg())
w1_l14, err1_l14 = crossValidate(columns_4, L1Loss(), OneReg())
w1_l15, err1_l15 = crossValidate(columns_5, L1Loss(), OneReg())
w1_l16, err1_l16 = crossValidate(columns_6, L1Loss(), OneReg())

@show err1_l11
@show err1_l12
@show err1_l13
@show err1_l14
@show err1_l15
@show err1_l16

# L1 loss with QuadReg (L2 regularization)
w2_l11, err2_l11 = crossValidate(columns_1, L1Loss(), QuadReg())
w2_l12, err2_l12 = crossValidate(columns_2, L1Loss(), QuadReg())
w2_l13, err2_l13 = crossValidate(columns_3, L1Loss(), QuadReg())
w2_l14, err2_l14 = crossValidate(columns_4, L1Loss(), QuadReg())
w2_l15, err2_l15 = crossValidate(columns_5, L1Loss(), QuadReg())
w2_l16, err2_l16 = crossValidate(columns_6, L1Loss(), QuadReg())

@show err2_l11
@show err2_l12
@show err2_l13
@show err2_l14
@show err2_l15
@show err2_l16

# Quantile loss (quantile = .4), no regularization, on feature set columns_5 only
w_quantile1, err_quantile1 = crossValidate(columns_5, QuantileLoss(quantile=.4), ZeroReg())
@show err_quantile1

In [None]:
# Test our w's

# RMSE of the weights `w` on a held-out table.
#
# listings_test - table holding the feature columns and a :price column
# columnList    - column symbols used as features, in the order `w` expects
# Returns RMSE(w, features, prices).
function calculateTestError(listings_test, columnList, w)
    features = convert(Array{Float64}, listings_test[:, columnList])
    # Left as the converted one-column array; RMSE only reads y[i] and length(y).
    prices = convert(Array{Float64}, listings_test[:, [:price]])
    return RMSE(w, features, prices)
end

# Print the RMSE on listings_test (the rows held out in the first split) for
# every set of weights fitted above. Each call pairs a weight vector with the
# feature set it was trained on.

# Huber loss, ZeroReg
@show calculateTestError(listings_test, columns_1, w_huber1)
@show calculateTestError(listings_test, columns_2, w_huber2)
@show calculateTestError(listings_test, columns_3, w_huber3)
@show calculateTestError(listings_test, columns_4, w_huber4)
@show calculateTestError(listings_test, columns_5, w_huber5)
@show calculateTestError(listings_test, columns_6, w_huber6)

# Huber loss, OneReg
@show calculateTestError(listings_test, columns_1, w1_huber1)
@show calculateTestError(listings_test, columns_2, w1_huber2)
@show calculateTestError(listings_test, columns_3, w1_huber3)
@show calculateTestError(listings_test, columns_4, w1_huber4)
@show calculateTestError(listings_test, columns_5, w1_huber5)
@show calculateTestError(listings_test, columns_6, w1_huber6)

# Huber loss, QuadReg
@show calculateTestError(listings_test, columns_1, w2_huber1)
@show calculateTestError(listings_test, columns_2, w2_huber2)
@show calculateTestError(listings_test, columns_3, w2_huber3)
@show calculateTestError(listings_test, columns_4, w2_huber4)
@show calculateTestError(listings_test, columns_5, w2_huber5)
@show calculateTestError(listings_test, columns_6, w2_huber6)

# Quad loss, ZeroReg
@show calculateTestError(listings_test, columns_1, w_quad1)
@show calculateTestError(listings_test, columns_2, w_quad2)
@show calculateTestError(listings_test, columns_3, w_quad3)
@show calculateTestError(listings_test, columns_4, w_quad4)
@show calculateTestError(listings_test, columns_5, w_quad5)
@show calculateTestError(listings_test, columns_6, w_quad6)

# Quad loss, OneReg
@show calculateTestError(listings_test, columns_1, w1_quad1)
@show calculateTestError(listings_test, columns_2, w1_quad2)
@show calculateTestError(listings_test, columns_3, w1_quad3)
@show calculateTestError(listings_test, columns_4, w1_quad4)
@show calculateTestError(listings_test, columns_5, w1_quad5)
@show calculateTestError(listings_test, columns_6, w1_quad6)

# Quad loss, QuadReg
@show calculateTestError(listings_test, columns_1, w2_quad1)
@show calculateTestError(listings_test, columns_2, w2_quad2)
@show calculateTestError(listings_test, columns_3, w2_quad3)
@show calculateTestError(listings_test, columns_4, w2_quad4)
@show calculateTestError(listings_test, columns_5, w2_quad5)
@show calculateTestError(listings_test, columns_6, w2_quad6)

# L1 loss, ZeroReg
@show calculateTestError(listings_test, columns_1, w_l11)
@show calculateTestError(listings_test, columns_2, w_l12)
@show calculateTestError(listings_test, columns_3, w_l13)
@show calculateTestError(listings_test, columns_4, w_l14)
@show calculateTestError(listings_test, columns_5, w_l15)
@show calculateTestError(listings_test, columns_6, w_l16)

# L1 loss, OneReg
@show calculateTestError(listings_test, columns_1, w1_l11)
@show calculateTestError(listings_test, columns_2, w1_l12)
@show calculateTestError(listings_test, columns_3, w1_l13)
@show calculateTestError(listings_test, columns_4, w1_l14)
@show calculateTestError(listings_test, columns_5, w1_l15)
@show calculateTestError(listings_test, columns_6, w1_l16)

# L1 loss, QuadReg
@show calculateTestError(listings_test, columns_1, w2_l11)
@show calculateTestError(listings_test, columns_2, w2_l12)
@show calculateTestError(listings_test, columns_3, w2_l13)
@show calculateTestError(listings_test, columns_4, w2_l14)
@show calculateTestError(listings_test, columns_5, w2_l15)
@show calculateTestError(listings_test, columns_6, w2_l16)

# Quantile loss (trained on columns_5)
@show calculateTestError(listings_test, columns_5, w_quantile1)