# Preliminary Models, Julia 0.6.0

In [1]:
# bring packages into main namespace
using DataFrames, Gadfly
using ScikitLearn
using ScikitLearn.CrossValidation: train_test_split



In [52]:
# Read both CSV files into tables, then remove the `x` column from the second
# table in place.
# NOTE(review): judging by the file names, TAP_data_numeric.csv is a numerically
# encoded version of TAP_data.csv and `x` is a leftover row-index column — confirm.
tap_raw = readtable("TAP_data.csv");
tap_clean = readtable("TAP_data_numeric.csv");
delete!(tap_clean, :x);

In [53]:
# (rows, columns) of the table after dropping `x`
size(tap_clean)

(190964, 42)

In [54]:
# Convert every row and column of the table into a plain array so it can be
# indexed and used in matrix operations below.
data = convert(Array, tap_clean[:,:]);

In [55]:
# swap columns 6 and 42 so that award per FTE ends up in the last column, which is used as the output y
"""
    swapCols!(X, i, j)

Exchange columns `i` and `j` of the matrix `X` in place, one row at a time.
Returns `nothing`; `X` itself is modified.
"""
function swapCols!(X, i, j)
    nrows = size(X, 1)
    row = 1
    while row <= nrows
        # hold the old value of column i while it is overwritten
        held = X[row, i]
        X[row, i] = X[row, j]
        X[row, j] = held
        row += 1
    end
    return nothing
end

# exchange columns 6 and 42 of `data` in place
swapCols!(data, 6, 42)

In [56]:
# Feature matrix: every column of `data` except the last, followed by a column
# of ones that acts as the intercept term. The slice and the concatenation each
# allocate a new array, so no extra `copy` is needed.
X = [data[:, 1:end-1] ones(size(data, 1))];
# Target vector: the last column of `data`.
y = data[:, end];

In [57]:
# split data into training, validation, test sets
# First hold out 20% of all rows as the test set; then hold out 25% of the
# remaining rows as the validation set. Overall this is a 60/20/20
# train/validation/test split. `random_state` is fixed in both calls.
X_nontest, X_test, y_nontest, y_test = train_test_split(X, y, test_size=0.20, random_state=24);
X_train, X_validate, y_train, y_validate = train_test_split(X_nontest, y_nontest, test_size=0.25, random_state=42);

## Linear model fitting

In [64]:
# Fit the linear model: for a non-square X_train, `\` returns a least-squares
# solution of X_train*w ≈ y_train.
w = X_train\y_train;

In [95]:
"""
    mean_abs_err(X, y, w)

Return the mean absolute error of the linear predictions `X*w` against the targets `y`.

# Arguments
- `X`: matrix with one observation per row.
- `y`: vector of targets, one per row of `X`.
- `w`: coefficient vector, one entry per column of `X`.

# Returns
The average of `abs(y[i] - (X*w)[i])` over all rows, as a floating-point number.
For numeric inputs with zero rows the result is `NaN`.

# Throws
- `DimensionMismatch` if `length(y)` differs from the number of rows of `X`.
"""
function mean_abs_err(X, y, w)
    n = size(X, 1)
    # Checked explicitly: a length-1 `y` would otherwise broadcast silently.
    length(y) == n ||
        throw(DimensionMismatch("X has $n rows but y has $(length(y)) entries"))
    # One matrix-vector product instead of allocating a row slice per iteration.
    residuals = y .- X * w
    return sum(abs, residuals) / n
end

mean_abs_err (generic function with 1 method)

In [132]:
"""
    mean_abs_percent(X, y, w)

Return the mean absolute percentage deviation of the linear predictions `X*w`
from the targets `y`.

# Arguments
- `X`: matrix with one observation per row.
- `y`: vector of targets, one per row of `X`.
- `w`: coefficient vector, one entry per column of `X`.

# Returns
`100` times the average of `abs((y[i] - (X*w)[i]) / y[i])` over all rows, as a
floating-point number. Each residual is divided by its own target, so a zero
entry in `y` makes the result `Inf` or `NaN`.

# Throws
- `DimensionMismatch` if `length(y)` differs from the number of rows of `X`.
"""
function mean_abs_percent(X, y, w)
    n = size(X, 1)
    # Checked explicitly: a length-1 `y` would otherwise broadcast silently.
    length(y) == n ||
        throw(DimensionMismatch("X has $n rows but y has $(length(y)) entries"))
    # One matrix-vector product instead of allocating a row slice per iteration.
    relative = (y .- X * w) ./ y
    return 100 * sum(abs, relative) / n
end

mean_abs_percent (generic function with 1 method)

In [115]:
# mean absolute error of the fitted weights `w` on each of the three splits
MAE_train = mean_abs_err(X_train, y_train, w);
MAE_validate = mean_abs_err(X_validate, y_validate, w);
MAE_test = mean_abs_err(X_test, y_test, w);

In [131]:
# report the mean absolute error for each split, one per line
print("Training MAE: $MAE_train \n")
print("Validation MAE: $MAE_validate \n")
print("Test MAE: $MAE_test \n")

Training MAE: 510.8487592507097 
Validation MAE: 510.64418530827277 
Test MAE: 513.049488689495 


In [133]:
# mean absolute percentage deviation of the fitted weights `w` on each split
MAPD_train = mean_abs_percent(X_train, y_train, w);
MAPD_validate = mean_abs_percent(X_validate, y_validate, w);
MAPD_test = mean_abs_percent(X_test, y_test, w);

In [134]:
# report the mean absolute percentage deviation for each split, one per line
print("Training MAPD: $MAPD_train \n")
print("Validation MAPD: $MAPD_validate \n")
print("Test MAPD: $MAPD_test \n")

Training MAPD: 62.70902925577334 
Validation MAPD: 62.607305105814014 
Test MAPD: 62.17103944422087 


In [69]:
# mean squared error on the training split: squared norm of the residual
# vector divided by the number of training targets
train_err = (norm(y_train - X_train*w)^2)/size(y_train,1)

387817.5053848939

In [70]:
# mean squared error on the validation split, computed the same way as train_err
valid_err = (norm(y_validate - X_validate*w)^2)/size(y_validate,1)

388896.69734040124

In [118]:
# inspect fitted coefficients 1 through 15
w[1:15]

15-element Array{Float64,1}:
    31.2125   
 -8189.49     
 -3857.88     
     5.79992  
 -4983.59     
    -0.0630094
 -4616.6      
 -4561.61     
 -4512.14     
 -4665.06     
 -5994.98     
 -4963.06     
 -5447.06     
 -4848.82     
 -5233.58     

In [119]:
# inspect fitted coefficients 16 through 30
w[16:30]

15-element Array{Float64,1}:
   -781.694
   -794.479
   -656.586
   -880.359
   -744.762
  -4858.87 
  -4881.45 
  -4887.33 
  -4883.66 
  -4839.07 
 -10513.8  
  -9884.23 
  -3952.37 
   3297.66 
  -3952.37 

In [120]:
# inspect fitted coefficients 31 through 42; entry 42 multiplies the column of
# ones appended to X, i.e. it is the intercept
w[31:42]

12-element Array{Float64,1}:
   3551.85
   4005.33
  -4570.87
  -4723.99
  -4480.69
   1218.85
   1384.88
   1671.1 
  -5591.25
  -7593.09
  -8567.81
 -24350.4 

In [21]:
@sk_import linear_model: LinearRegression;

In [33]:
# Fit scikit-learn's ordinary least-squares model on the training split.
# NOTE(review): X_train already carries the column of ones appended when X was
# built, so fit_intercept=true adds a second intercept term on top of it.
# Consider fit_intercept=false if the coefficients are to be compared with `w`.
model = LinearRegression(fit_intercept=true)
ScikitLearn.fit!(model, X_train, y_train);

PyObject LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)