In [1]:
using DataFrames, CSV
using PyPlot
using ScikitLearn
using ScikitLearn.CrossValidation: train_test_split



In [91]:
# define error functions

"""
    mean_abs_err(y_output, y_true)

Return the mean absolute error between the predictions `y_output` and the
targets `y_true`.

Entries are compared element by element over the first dimension of
`y_output`. An empty input yields `NaN`.

Throws `DimensionMismatch` if the two arguments differ in their first dimension.
"""
function mean_abs_err(y_output, y_true)
    n = size(y_output, 1)
    if size(y_true, 1) != n
        throw(DimensionMismatch(
            "y_output has $n entries but y_true has $(size(y_true, 1))"))
    end
    # Float64 accumulator: keeps the loop type-stable and avoids shadowing `sum`.
    total = 0.0
    for i in 1:n
        total += abs(y_output[i] - y_true[i])
    end
    return total / n
end

"""
    mean_abs_percent(y_output, y_true)

Return the mean absolute error of `y_output` relative to `y_true`, expressed
as a percentage (the mean of `abs((y_output[i] - y_true[i]) / y_true[i])`
multiplied by 100).

Entries of `y_true` equal to zero produce `Inf` or `NaN` terms; no special
handling is applied. An empty input yields `NaN`.

Throws `DimensionMismatch` if the two arguments differ in their first dimension.
"""
function mean_abs_percent(y_output, y_true)
    n = size(y_output, 1)
    if size(y_true, 1) != n
        throw(DimensionMismatch(
            "y_output has $n entries but y_true has $(size(y_true, 1))"))
    end
    # Float64 accumulator: keeps the loop type-stable and avoids shadowing `sum`.
    total = 0.0
    for i in 1:n
        total += abs((y_output[i] - y_true[i]) / y_true[i])
    end
    return 100 * total / n
end

mean_abs_percent (generic function with 1 method)

In [3]:
# Load the training table from TAP_train.csv.
# NOTE(review): `readtable` is deprecated in newer DataFrames releases in favour
# of CSV.jl — confirm which package version this notebook targets.
tap_train = readtable("TAP_train.csv");

In [4]:
# Columns 1-42 become the Float64 feature matrix; column 43 is used as the target.
X_nontest = convert(Array{Float64},tap_train[:,1:42]);
y_nontest = convert(Array,tap_train[:,43]);

In [5]:
# Hold out 25% of the rows as a validation split; random_state pins the split
# so reruns give the same partition.
X_train, X_validate, y_train, y_validate = train_test_split(X_nontest, y_nontest, test_size=0.25, random_state=42);



## model 1: linear regression with quadratic loss, no regularizer

In [7]:
@sk_import linear_model: LinearRegression

# Fit the unregularized linear model on the training split, then predict the
# validation split for scoring below.
regr_1 = LinearRegression();
ScikitLearn.fit!(regr_1, X_train, y_train);
output_regr1 = ScikitLearn.predict(regr_1, X_validate);

In [62]:
# Show fitted coefficients 1-15 of model 1 (the 42 coefficients are shown in three slices).
regr_1[:coef_][1:15]

15-element Array{Float64,1}:
    31.2125   
  1280.1      
   340.463    
     5.79992  
  1780.41     
    -0.0630094
   253.482    
   308.467    
   357.934    
   205.021    
 -1124.9      
    74.956    
  -409.045    
   189.195    
  -195.569    

In [63]:
# Show fitted coefficients 16-30 of model 1.
regr_1[:coef_][16:30]

15-element Array{Float64,1}:
   57.9742
   45.1896
  183.083 
  -40.6903
   94.9061
   11.2028
  -11.3755
  -17.2555
  -13.5783
   31.0065
  308.607 
 -414.631 
  106.024 
  592.066 
  106.024 

In [64]:
# Show fitted coefficients 31-42 of model 1.
regr_1[:coef_][31:42]

12-element Array{Float64,1}:
  846.251
 1299.74 
 -512.468
 -665.592
 -422.293
 -133.944
   32.081
  318.299
 -180.057
 -829.093
 -451.01 
    0.0  

In [9]:
# Score model 1 on the held-out validation split: MAE from mean_abs_err and
# MAPD (a percentage) from mean_abs_percent.
MAE_regr1 = mean_abs_err(output_regr1, y_validate);
MAPD_regr1 = mean_abs_percent(output_regr1, y_validate);
@printf("For model 1 on the validation set, the MAE is %f and the MAPD is %f",
        MAE_regr1, MAPD_regr1)

For model 1 on the validation set, the MAE is 510.644185 and the MAPD is 62.607305

## model 2: linear regression with quadratic loss, l2 regularizer (ridge)

In [12]:
# ridge regression; `lambda` is the l2 regularization strength
lambda = 0.25
@sk_import linear_model: Ridge

# Pass `lambda` through instead of repeating the literal, so the strength is
# set in exactly one place.
regr_2 = Ridge(alpha = lambda)
ScikitLearn.fit!(regr_2, X_train, y_train);
output_regr2 = ScikitLearn.predict(regr_2, X_validate);

In [13]:
# Show the fitted ridge coefficients (the display elides the middle entries).
regr_2[:coef_]

42-element Array{Float64,1}:
    31.2138   
  1228.44     
   340.434    
     5.79988  
  1781.89     
    -0.0630091
   270.116    
   325.104    
   374.565    
   136.334    
 -1106.12     
    74.9615   
  -409.024    
     ⋮        
   837.655    
  1291.14     
  -507.895    
  -660.747    
  -428.663    
  -141.01     
    24.9893   
   311.201    
  -184.582    
  -804.034    
  -424.406    
     0.0      

In [14]:
# Score model 2 on the held-out validation split: MAE from mean_abs_err and
# MAPD (a percentage) from mean_abs_percent.
MAE_regr2 = mean_abs_err(output_regr2, y_validate);
MAPD_regr2 = mean_abs_percent(output_regr2, y_validate);
@printf("For model 2 on the validation set, the MAE is %f and the MAPD is %f",
        MAE_regr2, MAPD_regr2)

For model 2 on the validation set, the MAE is 510.645113 and the MAPD is 62.606834

## model 3: linear regression with quadratic loss, l1 regularizer (lasso)

In [15]:
@sk_import linear_model: LassoCV

PyObject <class 'sklearn.linear_model.coordinate_descent.LassoCV'>

In [31]:
# Fit LassoCV (with normalize = true) on the training split; the penalty it
# selects is read back from `alpha_` in the next cell.
lasso_cv = ScikitLearn.fit!(LassoCV(normalize = true), X_train, y_train);
output_lasso_cv = ScikitLearn.predict(lasso_cv, X_validate);

In [61]:
# Rebind `lambda` (previously the ridge strength) to the penalty selected by LassoCV.
lambda = lasso_cv[:alpha_]

0.003281307194525865

In [33]:
@sk_import linear_model: Lasso

# Refit a plain Lasso at the penalty chosen by LassoCV and predict the validation split.
# NOTE(review): `lambda` was selected by LassoCV(normalize = true), but this
# Lasso is built without `normalize`, so the penalty may not be on the same
# feature scale — confirm whether normalize = true was intended here.
regr_3 = Lasso(alpha = lambda, max_iter = 8000)
ScikitLearn.fit!(regr_3, X_train, y_train);
output_regr3 = ScikitLearn.predict(regr_3, X_validate);



In [189]:
# Show fitted coefficients 1-15 of model 3 (the 42 coefficients are shown in three slices).
regr_3[:coef_][1:15]

15-element Array{Float64,1}:
    31.2155   
  1921.85     
   371.811    
     5.79522  
  3004.04     
    -0.0630092
    -0.0      
    55.008    
   104.435    
   -48.7011   
 -1375.44     
    48.3283   
  -435.588    
   162.565    
  -222.142    

In [190]:
# Show fitted coefficients 16-30 of model 3.
regr_3[:coef_][16:30]

15-element Array{Float64,1}:
    0.0    
  -12.7414 
  124.893  
  -98.5106 
   36.4145 
   22.5821 
   -0.0    
   -5.87228
   -2.18208
   42.3783 
  995.251  
 -338.458  
   60.8438 
 -209.477  
   36.1935 

In [191]:
# Show fitted coefficients 31-42 of model 3.
regr_3[:coef_][31:42] # coefficients shown as zero in the recorded output (indices over all 42): 7, 16, 22, 35, 41, 42

12-element Array{Float64,1}:
   44.6745
  498.175 
  -56.0027
 -216.412 
    0.0   
 -324.832 
 -158.669 
  127.167 
  845.048 
 -988.487 
   -0.0   
    0.0   

In [35]:
# Score model 3 on the held-out validation split: MAE from mean_abs_err and
# MAPD (a percentage) from mean_abs_percent.
MAE_regr3 = mean_abs_err(output_regr3, y_validate);
MAPD_regr3 = mean_abs_percent(output_regr3, y_validate);
@printf("For model 3 on the validation set, the MAE is %f and the MAPD is %f",
        MAE_regr3, MAPD_regr3)

For model 3 on the validation set, the MAE is 510.649891 and the MAPD is 62.607691

## model 4: polynomial

In [178]:
# Load the table used for model 4 from TAP_train_poly.csv.
# NOTE(review): `readtable` is deprecated in newer DataFrames releases in favour
# of CSV.jl — confirm which package version this notebook targets.
tap_train_poly = readtable("TAP_train_poly.csv");

In [179]:
# Columns 1-43 become the Float64 feature matrix; column 44 is used as the target.
X_nontest_poly = convert(Array{Float64},tap_train_poly[:,1:43]);
y_nontest_poly = convert(Array,tap_train_poly[:,44]);

In [180]:
# Same 25% hold-out and random_state as the split used for models 1-3.
X_train_poly, X_validate_poly, y_train_poly, y_validate_poly = train_test_split(X_nontest_poly, y_nontest_poly, test_size=0.25, random_state=42);

In [181]:
# Solve for the weight vector directly with the backslash operator (a
# least-squares solve for a non-square system); no scikit-learn estimator and
# no separate intercept term are used here.
w = X_train_poly\y_train_poly

43-element Array{Float64,1}:
    0.937997  
  835.496     
  304.811     
    3.70493   
  239.491     
   -0.120824  
  200.976     
  264.665     
  269.311     
 -834.432     
   96.785     
   83.4556    
 -384.386     
    ⋮         
 1024.26      
  -11.5503    
  -14.5651    
 -192.545     
 -385.369     
 -596.487     
  156.697     
  -23.5264    
 -990.618     
  152.426     
    8.16142e-7
   -2.69559   

In [182]:
"""
    mean_abs_err(X, y, w)

Return the mean absolute error of the linear predictions `X * w` against the
targets `y`, where each row of `X` is one sample.

An empty `X` yields `NaN`.

Throws `DimensionMismatch` if `y` does not have one entry per row of `X`, or
if `w` does not match the number of columns of `X`.
"""
function mean_abs_err(X, y, w)
    n = size(X, 1)
    if size(y, 1) != n
        throw(DimensionMismatch("X has $n rows but y has $(size(y, 1)) entries"))
    end
    # One matrix-vector product instead of copying a row of X on every iteration.
    y_hat = X * w
    # Float64 accumulator: keeps the loop type-stable and avoids shadowing `sum`.
    total = 0.0
    for i in 1:n
        total += abs(y[i] - y_hat[i])
    end
    return total / n
end

mean_abs_err (generic function with 2 methods)

In [183]:
"""
    mean_abs_percent(X, y, w)

Return the mean absolute error of the linear predictions `X * w` relative to
the targets `y`, expressed as a percentage, where each row of `X` is one sample.

Entries of `y` equal to zero produce `Inf` or `NaN` terms; no special handling
is applied. An empty `X` yields `NaN`.

Throws `DimensionMismatch` if `y` does not have one entry per row of `X`, or
if `w` does not match the number of columns of `X`.
"""
function mean_abs_percent(X, y, w)
    n = size(X, 1)
    if size(y, 1) != n
        throw(DimensionMismatch("X has $n rows but y has $(size(y, 1)) entries"))
    end
    # One matrix-vector product instead of copying a row of X on every iteration.
    y_hat = X * w
    # Float64 accumulator: keeps the loop type-stable and avoids shadowing `sum`.
    total = 0.0
    for i in 1:n
        total += abs((y[i] - y_hat[i]) / y[i])
    end
    return 100 * total / n
end

mean_abs_percent (generic function with 2 methods)

In [184]:
# Training-split MAE of model 4 (compare with the validation value below).
MAE_train_poly = mean_abs_err(X_train_poly, y_train_poly, w)

426.52151358576197

In [185]:
# Training-split MAPD (a percentage) of model 4.
MAPD_train_poly = mean_abs_percent(X_train_poly, y_train_poly, w)

49.861689236013405

In [186]:
# Validation-split MAE of model 4, using the weights fitted on the training split.
MAE_validate_poly = mean_abs_err(X_validate_poly, y_validate_poly, w)

425.5776193022963

In [187]:
# Validation-split MAPD (a percentage) of model 4, using the weights fitted on the training split.
MAPD_validate_poly = mean_abs_percent(X_validate_poly, y_validate_poly, w)

49.32059919790348