In [1]:
using CSV, DataFrames, Statistics, GLMNet, Random, Gurobi, JuMP, LinearAlgebra
Random.seed!(15095)

MersenneTwister(15095)

# Data Cleaning

In [2]:
# Read "clean_data.csv" (path relative to the working directory) into a DataFrame.
# Presumably this file is the already-cleaned dataset, per its name — TODO confirm.
df = DataFrame(CSV.File("clean_data.csv"));

In [3]:
# Columns 28 through 107 of the loaded table are treated as one group (the
# "liwc" variables): copy them into their own frame and drop them from `df`.
liwc_vars = names(df)[28:107];
liwc_df = select(df, liwc_vars);
df = df[:, Not(liwc_vars)];

In [4]:
# Remove low base rate variables.
# NOTE(review): an earlier note on this step gave the cutoff as "< .05", but the
# cutoff actually applied is |column mean| <= .5. The applied value is kept
# here — confirm which one is intended.
base_rates = mean.(eachcol(liwc_df));
# Positions (within liwc_df) of the columns at or below the cutoff.
low_base_rates = findall(rate -> abs(rate) <= .5, base_rates);
liwc_df = select(liwc_df, Not(low_base_rates));
# Re-attach the surviving columns to the right of the remaining variables.
df = hcat(df, liwc_df);

In [5]:
# Train/Test split: the first 80% (rounded up) of a random row permutation is
# used for training, the remainder for testing.
n = nrow(df);
split_point = ceil(Int, 0.8*n);
shuffled_ind = randperm(n);
train_ind = shuffled_ind[1:split_point];
test_ind = shuffled_ind[split_point+1:end];
# :plebe_cqpa is the response; every other column stays on the predictor side.
Xtrain = df[train_ind, Not(:plebe_cqpa)];
ytrain = df[train_ind, :plebe_cqpa];
Xtest = df[test_ind, Not(:plebe_cqpa)];
ytest = df[test_ind, :plebe_cqpa];

In [6]:
"""
    compute_r2(X, y, beta, beta_zero)

Return the coefficient of determination of the linear predictions
`X*beta .+ beta_zero` against `y`: one minus the residual sum of squares
divided by the total sum of squares of `y` about its own mean.
"""
function compute_r2(X,y,beta,beta_zero)
    residuals = y .- X*beta .- beta_zero
    centered = y .- Statistics.mean(y)
    return 1 - sum(abs2, residuals)/sum(abs2, centered)
end

compute_r2 (generic function with 1 method)

In [7]:
"""
    compute_mse(X, y, beta, beta_zero)

Return the mean squared error of the linear predictions `X*beta .+ beta_zero`
against `y`, averaged over the rows of `X`.
"""
function compute_mse(X,y,beta,beta_zero)
    nobs, _ = size(X)
    errors = y .- X*beta .- beta_zero
    return sum(abs2, errors)/nobs
end

compute_mse (generic function with 1 method)

# Old Model

In [8]:
# Z-score the response for the old model.
# NOTE(review): the test response is scaled with its own mean/std rather than
# the training ones — confirm this is intended.
ytrain_old = let μ = mean(ytrain), σ = std(ytrain)
    (ytrain .- μ) ./ σ
end;
ytest_old = let μ = mean(ytest), σ = std(ytest)
    (ytest .- μ) ./ σ
end;

In [9]:
# Design matrices for the old model: its five predictors only.
Xtrain_old = Matrix(Xtrain[:,[:average_WC,:verb,:informal,:compare,:faculty_app_scr]]);
Xtest_old = Matrix(Xtest[:,[:average_WC,:verb,:informal,:compare,:faculty_app_scr]]);

# Zero-fill missing entries element-wise BEFORE scaling. The broadcast dot is
# required: `coalesce(X, 0)` on a whole matrix just returns the matrix, and a
# single `missing` would otherwise turn that column's mean/std (and hence the
# whole scaled column) into `missing`. 0.0 keeps the element type Float64.
Xtrain_old = coalesce.(Xtrain_old,0.0);
Xtest_old = coalesce.(Xtest_old,0.0);

# Column-wise z-scores.
# NOTE(review): the test matrix is scaled with its own column statistics, not
# the training ones — confirm this is intended.
Xtrain_old = (Xtrain_old .- mean(Xtrain_old,dims=1))./std(Xtrain_old,dims=1);
Xtest_old = (Xtest_old .- mean(Xtest_old,dims=1))./std(Xtest_old,dims=1);

In [10]:
# Display labels for the five old-model predictors, listed in the same order as
# the columns of Xtrain_old (used only when printing the selected coefficients).
feature_names = [
    "Average WC",
    "Verb",
    "Informal",
    "Compare",
    "FAS",
];

In [11]:
# Final model: cross-validated lasso on the old-model predictors.
lasso_cv = glmnetcv(Xtrain_old,ytrain_old)
# Position on the regularization path with the lowest mean CV loss.
best = argmin(lasso_cv.meanloss)
beta_lasso = lasso_cv.path.betas[:,best]
beta_zero_lasso = lasso_cv.path.a0[best]
# Positions of the coefficients that are not exactly zero.
indices_lasso = findall(!iszero, beta_lasso)

lasso_lambda = lambdamin(lasso_cv)
println("Hyperparameter: $(lasso_lambda) \n")

foreach(indices_lasso) do idx
    println("Selected $(feature_names[idx]) with value $(beta_lasso[idx])")
end

Hyperparameter: 0.001144532333395605 

Selected Average WC with value 0.185812596343394
Selected Verb with value -0.20239721606158242
Selected Informal with value -0.06795163321405868
Selected Compare with value 0.08104164196587725
Selected FAS with value 0.3025071524449968


In [12]:
# Report r2 and MSE of the lasso fit on both splits, then the number of
# selected coefficients.
for (metric, score) in (("r2", compute_r2), ("MSE", compute_mse))
    println("Train $(metric) = $(score(Xtrain_old,ytrain_old,beta_lasso,beta_zero_lasso))")
    println("Test $(metric) = $(score(Xtest_old,ytest_old,beta_lasso,beta_zero_lasso))")
    println()
end
println("Sparsity: $(length(indices_lasso))")
println()

Train r2 = 0.192664382735646
Test r2 = 0.17785414053677728

Train MSE = 0.8072133122170485
Test MSE = 0.8216475892453662

Sparsity: 5



# New Model

In [13]:
# Z-score each response vector with its own mean and standard deviation.
ytrain_new, ytest_new = map(y -> (y .- mean(y)) ./ std(y), (ytrain, ytest));

In [14]:
# Candidate predictors for the new model, picked by column position in `df`:
# column 12, columns 20-25 and columns 28-81.
vars = names(df)[[12; 20:25; 28:81]];

In [15]:
# Design matrices for the new model: take the candidate columns, replace
# missing entries by 0, then z-score every column with that split's own
# column mean and standard deviation.
Xtrain_new, Xtest_new = map((Xtrain, Xtest)) do frame
    filled = coalesce.(Matrix(frame[:, vars]), 0)
    (filled .- mean(filled, dims=1)) ./ std(filled, dims=1)
end;

In [16]:
# Feature names - for interpretability purposes.
# Each variable contributes four consecutive labels (raw, squared, sqrt, log),
# the same order in which compute_nonlinear_transformations lays out its columns.
features = vars
p = length(features)
feature_names = [name * suffix for name in features
                 for suffix in ("", "_squared", "_sqrt", "_log")];

In [17]:
# Holistic regression auxiliary functions

"""
    compute_nonlinear_transformations(X::Matrix{Float64})

Return a matrix with four columns per column `x` of `X`, laid out
consecutively as `x`, `x.^2`, `abs.(x).^0.5` and `log.(abs.(x) .+ 1)`.
The result has the same number of rows as `X` and four times as many columns.
"""
function compute_nonlinear_transformations(X::Matrix{Float64})
    n, p0 = size(X)
    X_ext = zeros(n, 4*p0)

    for (j0, x) in enumerate(eachcol(X))
        offset = 4*(j0 - 1)                       # columns already filled
        X_ext[:, offset+1] .= x
        X_ext[:, offset+2] .= x.^2
        X_ext[:, offset+3] .= abs.(x).^0.5
        X_ext[:, offset+4] .= log.(abs.(x).+1)
    end

    return X_ext
end

"""
    compute_HC(X::Matrix{Float64}, ρ_max::Float64)

Return every column pair `(i, j)` with `i < j` whose Pearson correlation
exceeds `ρ_max` in absolute value, ordered by `i` and then by `j`.
"""
function compute_HC(X::Matrix{Float64},ρ_max::Float64)
    ncols = size(X, 2)
    flagged = Tuple{Int,Int}[]
    for i in 1:ncols, j in i+1:ncols
        if abs(cor(X[:,i], X[:,j])) > ρ_max
            push!(flagged, (i, j))
        end
    end
    return flagged
end

compute_HC (generic function with 1 method)

In [18]:
"""
    holistic(X, Y, Γ, k, ρ_max; M=30, time_limit=60.0)

Fit one sparse regression as a mixed-integer quadratic program solved with
Gurobi (solver output silenced).

The columns of `X` are first expanded with `compute_nonlinear_transformations`
(four consecutive columns per original feature). The model minimizes

    0.5 * Σᵢ (Y[i] - x_ext[i]'β - β0)^2 + Γ * Σⱼ w[j],    w[j] ≥ |β[j]|

subject to:
- at most `k` selected coefficients (binary `z`, with `-M*z[j] ≤ β[j] ≤ M*z[j]`);
- for every column pair returned by `compute_HC(X_extended, ρ_max)`, at most
  one of the two is selected;
- at most one of the four columns derived from each original feature is selected.

# Keywords
- `M`: big-M bound on the absolute value of any coefficient (default `30`).
  Select it by inspecting the absolute magnitude of the lasso coefficients.
- `time_limit`: solver time limit in seconds (default `60.0`).

# Returns
`(β, β0)`: the coefficient values over the expanded columns and the intercept.

Throws an error if the solver stops without any solution to read values from.
"""
function holistic(X::Matrix{Float64},Y::Vector{Float64},Γ::Float64,k::Int,ρ_max::Float64;
                  M::Real=30, time_limit::Real=60.0)

    p0 = size(X, 2)

    X_extended = compute_nonlinear_transformations(X)
    n,p = size(X_extended)

    HC = compute_HC(X_extended,ρ_max)

    m = Model(Gurobi.Optimizer)
    set_optimizer_attribute(m,"OutputFlag",0)
    set_time_limit_sec(m, Float64(time_limit))

    @variable(m, β0)
    @variable(m, β[1:p])
    @variable(m, w[1:p])
    @variable(m, z[1:p], Bin)

    # Linearize l1 norm
    @constraint(m, [i=1:p], w[i] >= β[i])
    @constraint(m, [i=1:p], w[i] >= -β[i])

    # Big M and sparsity
    @constraint(m, sum(z) <= k)
    @constraint(m, [i=1:p], β[i] >= -M*z[i])
    @constraint(m, [i=1:p], β[i] <= M*z[i])

    # Pairwise correlation
    for (i,j) in HC
        @constraint(m, z[i] + z[j] <= 1)
    end

    # Transformations: columns 4*(j0-1)+1 .. 4*j0 all derive from feature j0
    for j0=1:p0
        first_col = 4*(j0-1) + 1
        @constraint(m, sum(z[first_col+j_transf] for j_transf=0:3) <= 1)
    end

    # Objective (w robustness)
    res = 0.5*sum((Y[i]-dot(X_extended[i,:],β)-β0)^2 for i=1:n)
    reg = sum(w[i] for i=1:p)
    @objective(m,Min,res + Γ*reg)

    optimize!(m)

    # The solver may stop (e.g. on the time limit) without any solution; fail
    # with a clear message instead of an opaque error from `value`.
    if !has_values(m)
        error("holistic: no solution available (termination status: $(termination_status(m)))")
    end

    return value.(β), value(β0)
end

holistic (generic function with 1 method)

In [19]:
"""
    holistic_cv(X, Y, Γ_vals, k_vals, ρ_max_vals)

Pick `(Γ, k, ρ_max)` for `holistic` by a single random 80/20 hold-out split:
every combination is fitted on the 80% part and scored with `compute_r2` on
the held-out part; the combination with the highest score is refitted on all
of `X`, `Y`.

Returns `(β, β0, Γ)`: the refitted coefficients, the intercept and the chosen Γ.
"""
function holistic_cv(X::Matrix{Float64}, Y::Vector{Float64}, Γ_vals, k_vals, ρ_max_vals)

    nobs = size(X, 1)

    # Hold out the tail of a random row permutation for validation
    cut = ceil(Int, 0.8*nobs)
    perm = randperm(nobs)
    fit_rows, held_rows = perm[1:cut], perm[cut+1:end]
    fit_X, fit_y = X[fit_rows,:], Y[fit_rows]
    held_y = Y[held_rows]
    held_X_ex = compute_nonlinear_transformations(X[held_rows,:])

    cv_path = DataFrame(k=Int[], Γ=Float64[], ρ_max=Float64[], valid_score=Float64[])

    for ρ_max in ρ_max_vals, Γ in Γ_vals, k in k_vals
        coefs, intercept = holistic(fit_X, fit_y, Γ, k, ρ_max)
        push!(cv_path, [k, Γ, ρ_max, compute_r2(held_X_ex, held_y, coefs, intercept)])
    end

    # Refit on the full data with the best-scoring combination
    winner = argmax(cv_path.valid_score)
    Γ_best = cv_path.Γ[winner]
    k_best = cv_path.k[winner]
    ρ_best = cv_path.ρ_max[winner]
    β,β0 = holistic(X, Y, Γ_best, k_best, ρ_best)

    return β,β0,Γ_best
end

holistic_cv (generic function with 1 method)

In [20]:
# Hyperparameter grid handed to holistic_cv
Γ_vals = [.001,.005,.01,.03,.1,.2,.3,.5,1.0]    # regularization weights
k_vals = [5]                                    # sparsity budgets
ρ_max_vals = [0.7]                              # pairwise-correlation cutoffs

β,β0,lambda = holistic_cv(Xtrain_new,ytrain_new,Γ_vals,k_vals,ρ_max_vals);
# Coefficients whose magnitude exceeds the numerical-zero tolerance
indices_holistic = findall(b -> abs(b) > .00001, β);
println("\nOptimal lambda: $lambda")

Academic license - for non-commercial use only - expires 2022-09-12
Academic license - for non-commercial use only - expires 2022-09-12
Academic license - for non-commercial use only - expires 2022-09-12
Academic license - for non-commercial use only - expires 2022-09-12
Academic license - for non-commercial use only - expires 2022-09-12
Academic license - for non-commercial use only - expires 2022-09-12
Academic license - for non-commercial use only - expires 2022-09-12
Academic license - for non-commercial use only - expires 2022-09-12
Academic license - for non-commercial use only - expires 2022-09-12
Academic license - for non-commercial use only - expires 2022-09-12

Optimal lambda: 1.0


In [21]:
# List every selected feature together with its fitted coefficient.
foreach(indices_holistic) do idx
    println("Selected $(feature_names[idx]) with value $(β[idx])")
end

Selected faculty_app_scr with value 0.3150534387471893
Selected average_WC with value 0.2530560652291454
Selected compare with value 0.11018430156657338
Selected social with value -0.24064918513000652
Selected insight_squared with value 0.06342273973248637


In [22]:
# Expand both splits exactly as `holistic` does internally, then report r2 and
# MSE on each split and the number of selected coefficients.
Xtrain_new_ex = compute_nonlinear_transformations(Xtrain_new)
Xtest_new_ex = compute_nonlinear_transformations(Xtest_new)

for (metric, score) in (("r2", compute_r2), ("MSE", compute_mse))
    println("Train $(metric) = $(score(Xtrain_new_ex,ytrain_new,β,β0))")
    println("Test $(metric) = $(score(Xtest_new_ex,ytest_new,β,β0))")
    println()
end
println("Sparsity: $(length(indices_holistic))")
println()

Train r2 = 0.20207641727241854
Test r2 = 0.18932714911810922

Train MSE = 0.7978027035300771
Test MSE = 0.8101815340025684

Sparsity: 5



# Mean Imputation

In [18]:
# Z-score each response vector with its own mean and standard deviation.
ytrain_mean, ytest_mean = [(y .- mean(y)) ./ std(y) for y in (ytrain, ytest)];

In [19]:
# Candidate predictors, picked by column position in `df`:
# column 12, columns 20-25 and columns 28-81.
vars = names(df)[[12; 20:25; 28:81]];

# `[:, vars]` already returns fresh copies of the selected columns, so no
# deepcopy of the whole frame is needed first. Missing entries become 0.
Xtrain_mean = coalesce.(Xtrain[:,vars],0);
Xtest_mean = coalesce.(Xtest[:,vars],0);

# Store these five columns as Float64 in both splits.
# Presumably they load as integers and must be able to hold a fractional
# imputed value later — TODO confirm.
for frame in (Xtrain_mean, Xtest_mean),
    col in (:faculty_app_scr, :min_WC, :max_WC, :sentiment, :WC)
    frame[!, col] = convert.(Float64, frame[!, col])
end

In [20]:
# Turn every entry equal to 0.0 into `missing` (this includes entries that
# were zero-filled earlier as well as any value that was 0 to begin with).
allowmissing!(Xtrain_mean)
for col in eachcol(Xtrain_mean), i in eachindex(col)
    col[i] == 0.0 && (col[i] = missing)
end

In [21]:
# Same treatment for the test split: any entry equal to 0.0 becomes `missing`.
allowmissing!(Xtest_mean)
foreach(eachcol(Xtest_mean)) do col
    for i in eachindex(col)
        if col[i] == 0.0
            col[i] = missing
        end
    end
end

In [22]:
# Mean imputation: fill each column's missing entries with the mean of that
# column's observed entries.
foreach(eachcol(Xtrain_mean)) do col
    gaps = ismissing.(col)
    col[gaps] .= mean(col[.!gaps])
end

In [23]:
# Mean imputation on the test split, using the test split's own column means.
# NOTE(review): the training means are not reused here — confirm this is intended.
for col in eachcol(Xtest_mean)
    let holes = ismissing.(col)
        col[holes] .= mean(col[.!holes])
    end
end

In [24]:
# Feature names - for interpretability purposes.
# Each column contributes four consecutive labels (raw, squared, sqrt, log),
# the same order in which compute_nonlinear_transformations lays out its columns.
features = names(Xtrain_mean)
p = length(features)
feature_names = [name * suffix for name in features
                 for suffix in ("", "_squared", "_sqrt", "_log")];

In [25]:
# Convert both imputed frames to matrices and z-score every column with that
# split's own column mean and standard deviation.
Xtrain_mean, Xtest_mean = map((Xtrain_mean, Xtest_mean)) do frame
    values = Matrix(frame)
    (values .- mean(values, dims=1)) ./ std(values, dims=1)
end;

In [26]:
# Hyperparameter grid handed to holistic_cv
Γ_vals = [.001,.005,.01,.03,.1,.2,.3,.5,1.0]    # regularization weights
k_vals = [5]                                    # sparsity budgets
ρ_max_vals = [0.7]                              # pairwise-correlation cutoffs

β,β0,lambda = holistic_cv(Xtrain_mean,ytrain_mean,Γ_vals,k_vals,ρ_max_vals);
# Positions of the coefficients above the numerical-zero tolerance
indices_holistic = filter(i -> abs(β[i]) > .00001, eachindex(β));
println("\nOptimal lambda: $lambda")

LoadError: UndefVarError: holistic_cv not defined

In [27]:
# List every selected feature together with its fitted coefficient.
foreach(indices_holistic) do pos
    println("Selected $(feature_names[pos]) with value $(β[pos])")
end

LoadError: UndefVarError: indices_holistic not defined

In [28]:
# Expand both splits, then report r2 and MSE on each split and the number of
# selected coefficients.
Xtrain_mean_ex = compute_nonlinear_transformations(Xtrain_mean)
Xtest_mean_ex = compute_nonlinear_transformations(Xtest_mean)

foreach((("r2", compute_r2), ("MSE", compute_mse))) do (metric, score)
    println("Train $(metric) = $(score(Xtrain_mean_ex,ytrain_mean,β,β0))")
    println("Test $(metric) = $(score(Xtest_mean_ex,ytest_mean,β,β0))")
    println()
end
println("Sparsity: $(length(indices_holistic))")
println()

LoadError: UndefVarError: β not defined

# Optimal Imputation

In [29]:
# Z-score the response; each split is scaled with its own mean and standard
# deviation.
ytrain_opt = let center = mean(ytrain), spread = std(ytrain)
    (ytrain .- center) ./ spread
end;
ytest_opt = let center = mean(ytest), spread = std(ytest)
    (ytest .- center) ./ spread
end;

In [30]:
# Candidate predictors, picked by column position in `df`:
# column 12, columns 20-25 and columns 28-81.
vars = names(df)[[12; 20:25; 28:81]];

# `[:, vars]` already returns fresh copies of the selected columns, so no
# deepcopy of the whole frame is needed first. Missing entries become 0.
Xtrain_opt = coalesce.(Xtrain[:,vars],0);
Xtest_opt = coalesce.(Xtest[:,vars],0);

# Store these five columns as Float64 in both splits.
# Presumably they load as integers and must be able to hold a fractional
# imputed value later — TODO confirm.
for frame in (Xtrain_opt, Xtest_opt),
    col in (:faculty_app_scr, :min_WC, :max_WC, :sentiment, :WC)
    frame[!, col] = convert.(Float64, frame[!, col])
end

In [31]:
# Turn every entry equal to 0.0 into `missing` so the imputation step sees it
# as a gap (this also affects values that were 0 to begin with).
allowmissing!(Xtrain_opt)
foreach(eachcol(Xtrain_opt)) do col
    col[findall(v -> v == 0.0, col)] .= missing
end

In [32]:
# Same treatment for the test split: any entry equal to 0.0 becomes `missing`.
allowmissing!(Xtest_opt)
for col in eachcol(Xtest_opt), i in eachindex(col)
    col[i] == 0.0 && (col[i] = missing)
end

In [33]:
# Fill the missing entries of each frame with `IAI.impute`, called with its
# default settings.
# NOTE(review): train and test are imputed independently of each other here —
# confirm that is intended rather than fitting on train and applying to test.
# NOTE(review): no `using` statement for `IAI` is visible in this file;
# presumably the kernel/system image preloads it — verify.
Xtrain_opt = IAI.impute(Xtrain_opt);
Xtest_opt = IAI.impute(Xtest_opt);

│  - To prevent this behaviour, do `ProgressMeter.ijulia_behavior(:append)`. 
└ @ ProgressMeter /Users/iai/builds/InterpretableAI/SystemImage/SysImgBuilder/.julia/packages/ProgressMeter/Vf8un/src/ProgressMeter.jl:620
[32mTrying different warm starts...    100%|████████████████| Time: 0:00:31[39m
[A4m  Warmstart:  rand[39m



In [34]:
# Feature names - for interpretability purposes.
# Each column contributes four consecutive labels (raw, squared, sqrt, log),
# the same order in which compute_nonlinear_transformations lays out its columns.
features = names(Xtrain_opt)
p = length(features)
feature_names = [name * suffix for name in features
                 for suffix in ("", "_squared", "_sqrt", "_log")];

In [35]:
# Convert both imputed frames to matrices and z-score every column with that
# split's own column mean and standard deviation.
Xtrain_opt, Xtest_opt = map((Xtrain_opt, Xtest_opt)) do frame
    values = Matrix(frame)
    (values .- mean(values, dims=1)) ./ std(values, dims=1)
end;

In [36]:
# Hyperparameter grid handed to holistic_cv
Γ_vals = [.001,.005,.01,.03,.1,.2,.3,.5,1.0]    # regularization weights
k_vals = [5]                                    # sparsity budgets
ρ_max_vals = [0.7]                              # pairwise-correlation cutoffs

β,β0,lambda = holistic_cv(Xtrain_opt,ytrain_opt,Γ_vals,k_vals,ρ_max_vals);
# Positions of the coefficients above the numerical-zero tolerance
indices_holistic = [pos for (pos, b) in enumerate(β) if abs(b) > .00001];
println("\nOptimal lambda: $lambda")

LoadError: UndefVarError: holistic_cv not defined

In [37]:
# List every selected feature together with its fitted coefficient.
foreach(indices_holistic) do sel
    println("Selected $(feature_names[sel]) with value $(β[sel])")
end

LoadError: UndefVarError: indices_holistic not defined

In [38]:
# Expand both splits, then report r2 and MSE on each split and the number of
# selected coefficients.
Xtrain_opt_ex = compute_nonlinear_transformations(Xtrain_opt)
Xtest_opt_ex = compute_nonlinear_transformations(Xtest_opt)

for (metric, score) in (("r2", compute_r2), ("MSE", compute_mse))
    println("Train $(metric) = $(score(Xtrain_opt_ex,ytrain_opt,β,β0))")
    println("Test $(metric) = $(score(Xtest_opt_ex,ytest_opt,β,β0))")
    println()
end
println("Sparsity: $(length(indices_holistic))")
println()

LoadError: UndefVarError: β not defined

In [None]:
# Part 2

In [None]:
# Imputation learner configured with the :opt_knn method. The method is passed
# as a plain Symbol; the earlier `:=opt_knn` spelling was malformed.
lnr = IAI.ImputationLearner(method=:opt_knn)