# Simple Exercise on Overfitting

In [100]:
# If necessary, install the required packages
# import Pkg; Pkg.add("GLM")
# import Pkg; Pkg.add("DataFrames")

In [99]:
# Import functions
using LinearAlgebra, GLM, DataFrames

## 1. First, set p ≈ n (here p = n - 1)


In [82]:
# Sample size and number of regressors; p = n - 1 leaves a single residual degree of freedom
n = 1000
p = n - 1

999

In [83]:
# Create an n×p (1000x999) matrix of standard Gaussians (the regressors)
X = randn(n, p)

# Create a length-n (1000) vector of standard Gaussians (the response)
Y = randn(n)

1000-element Vector{Float64}:
  1.1181680993031338
  0.9048884366065771
 -1.3907433741150474
  0.6304368151706362
 -1.8429202258286534
 -0.31672072366583975
 -0.2444852981482702
 -1.0950179837695753
  0.07684324028140456
  1.4468569125635145
  0.41008022440563224
 -0.2190566706006362
  0.10351041047812766
  ⋮
  1.3776973853800771
 -1.262104099651985
  0.7308010821897911
  1.171561645515982
 -0.6987243667433993
  0.685404752898965
 -0.28880845835416247
 -0.11666045232052512
 -2.4916696416406925
 -1.2316720906345198
  1.808686123320547
 -0.6892723287637772

In [79]:
"""
    OLSestimator(y, x)

Return the ordinary least squares coefficients from regressing `y` on `x`.

Only the coefficient estimates are returned (no standard errors); the result is used to
cross-check the coefficients reported by `lm` from GLM further below.

The normal equations `(x'x) b = x'y` are solved with `\\` instead of forming `inv(x'x)`
explicitly: a linear solve is cheaper and loses less precision when `x'x` is
ill-conditioned, which is the case when the number of columns approaches the number of rows.
"""
function OLSestimator(y, x)
    return (x' * x) \ (x' * y)
end

OLSestimator (generic function with 1 method)

In [85]:
# Coefficients from the hand-written estimator, regressing Y on X
estimates = OLSestimator(Y,X)

999-element Vector{Float64}:
 -0.7189579848069982
  0.429788535125601
  0.007963276281123921
  0.5681157230305767
  0.0902094552993905
  1.5746904772433006
 -0.5692232031484568
 -1.325160399623563
  0.11405135260807314
 -0.8225118485867293
 -0.6545464547785147
  1.5439264314872472
 -0.17103304033514322
  ⋮
 -0.12512384818944922
  0.7643468943488685
 -0.4792404797641948
 -0.14616144859326363
 -0.846770186887849
 -0.023457019655847455
  1.0206298792099369
 -0.40479224955707926
 -0.27527486596903844
  0.7209193043286936
 -0.8361814948108592
 -0.17418011103928333

In [86]:
# Here we give `y` a linear form: y = X_lin * β + Gaussian noise, and fit it with `lm`.
# The design matrix gets its own name so that the pure-noise `X` drawn above (the one passed
# to OLSestimator and to the regression of `Y` below) is not overwritten.
# Notice that these coefficients estimate β, so they are not comparable to `estimates`.
X_lin = randn(n, p)
β = randn(p)
y = X_lin * β .+ randn(n)
fitted = lm(X_lin, y)

LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, CholeskyPivoted{Float64, Matrix{Float64}}}}:

Coefficients:
──────────────────────────────────────────────────────────────────────────
            Coef.  Std. Error       t  Pr(>|t|)     Lower 95%    Upper 95%
──────────────────────────────────────────────────────────────────────────
x1     0.717086     0.0447572   16.02    0.0397   0.148392      1.28578
x2    -1.01929      0.0706965  -14.42    0.0441  -1.91757      -0.121002
x3     2.573        0.12843     20.03    0.0318   0.941151      4.20486
x4    -2.85703      0.063601   -44.92    0.0142  -3.66516      -2.0489
x5    -0.178015     0.0561149   -3.17    0.1944  -0.891022      0.534992
x6     1.28839      0.104777    12.30    0.0517  -0.0429225     2.61971
x7    -0.966944     0.064687   -14.95    0.0425  -1.78887      -0.145018
x8     0.213872     0.0457784    4.67    0.1342  -0.367798      0.795542
x9     0.736667     0.083501     8.82    0.0719  -0.324314      1.79

In [84]:
# Fit the linear regression of Y on X with GLM (the matrix is used as given; no intercept row appears in the output)
fitted = lm(X,Y)

LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, CholeskyPivoted{Float64, Matrix{Float64}}}}:

Coefficients:
─────────────────────────────────────────────────────────────────────
             Coef.  Std. Error      t  Pr(>|t|)  Lower 95%  Upper 95%
─────────────────────────────────────────────────────────────────────
x1    -0.718958       1.4064    -0.51    0.6992  -18.589     17.1511
x2     0.429789       0.982064   0.44    0.7374  -12.0485    12.9081
x3     0.00796328     0.636769   0.01    0.9920   -8.08295    8.09888
x4     0.568116       0.647737   0.88    0.5416   -7.66217    8.7984
x5     0.0902095      0.819931   0.11    0.9302  -10.328     10.5084
x6     1.57469        2.11167    0.75    0.5921  -25.2566    28.406
x7    -0.569223       0.553791  -1.03    0.4913   -7.60581    6.46736
x8    -1.32516        1.21231   -1.09    0.4717  -16.729     14.0787
x9     0.114051       0.962351   0.12    0.9249  -12.1138    12.3419
x10   -0.822512       1.04623   -0.79   

In [87]:
# Show the ratio of regressors to observations, label and value on separate lines
println("p/n is\n", p / n)

p/n is
0.999


In [90]:
# In-sample R² of the most recently fitted model; the value is shown as the cell's result
print("R2 is")
r2(fitted)

R2 is

0.9999999932773954

In [92]:
# Adjusted R² of the most recently fitted model; the value is shown as the cell's result
print("Adjusted R2 is")
adjr2(fitted)

Adjusted R2 is

0.9999932841180205

## 2. Second, set p=n/2.

In [127]:
# `n` and `p` are used as array dimensions in `randn(n, p)`, so both must be integers.
# Integer division keeps `p` an Int directly (n/2 would be the Float64 500.0) and, unlike
# Int(n/2), does not throw an InexactError when n is odd.
n = 1000
p = n ÷ 2

500

In [128]:
# Check that n is an integer type
typeof(n)

Int64

In [129]:
# Check that p is an integer type
typeof(p)

Int64

In [130]:
# Create an n×p matrix of standard Gaussians (the regressors)
X = randn(n, p)

# Create a length-n vector of standard Gaussians (the response)
Y = randn(n)

1000-element Vector{Float64}:
 -0.4147864900021671
  0.3311570590308964
 -1.2832447089277261
  0.5662484898621389
 -0.6896809477152834
  0.8010642790434982
 -0.009942763720650068
  0.8893131364667182
  1.3879502863956992
 -0.973433151332768
  0.06584540515040915
 -0.8090902312377154
  0.4075859901632228
  ⋮
 -0.9256796152462494
 -1.2202670508866358
  0.4841200188613278
 -0.5102700416357255
 -0.05881428204342041
 -0.5393999609439742
  1.2837899014898704
  0.791331629976505
  1.7327515733232632
  0.483880523511758
  0.59248121424177
  0.06280982950753938

In [131]:
# Fit the linear regression of Y on X with GLM
fitted = lm(X,Y)

LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, CholeskyPivoted{Float64, Matrix{Float64}}}}:

Coefficients:
─────────────────────────────────────────────────────────────────────────
             Coef.  Std. Error      t  Pr(>|t|)    Lower 95%    Upper 95%
─────────────────────────────────────────────────────────────────────────
x1    -0.0483005     0.0460182  -1.05    0.2944  -0.138713     0.0421124
x2     0.0397686     0.0477505   0.83    0.4053  -0.0540478    0.133585
x3    -0.0363974     0.0497049  -0.73    0.4643  -0.134054     0.0612589
x4     0.0512461     0.0460902   1.11    0.2667  -0.0393082    0.1418
x5     0.0273769     0.0500945   0.55    0.5850  -0.0710447    0.125799
x6    -0.0371949     0.0484782  -0.77    0.4433  -0.132441     0.0580512
x7    -0.00550442    0.045634   -0.12    0.9040  -0.0951624    0.0841536
x8    -0.0456675     0.0444918  -1.03    0.3052  -0.133081     0.0417464
x9     0.0241762     0.045045    0.54    0.5917  -0.0643247    0.112677

In [132]:
# Show the ratio of regressors to observations, label and value on separate lines
println("p/n is\n", p / n)

p/n is
0.5


In [133]:
# In-sample R² of the most recently fitted model; the value is shown as the cell's result
print("R2 is")
r2(fitted)

R2 is

0.47572152313490135

In [134]:
# Adjusted R² of the most recently fitted model; the value is shown as the cell's result
print("Adjusted R2 is")
adjr2(fitted)

Adjusted R2 is

-0.04750839677646712

## 3. Third, set p/n = 0.05

In [135]:
# `n` and `p` are used as array dimensions in `randn(n, p)`, so both must be integers.
# 0.05 * n is a Float64; round(Int, ...) converts it to an Int and, unlike Int(0.05*n),
# does not throw an InexactError when the product is not exactly integral.
n = 1000
p = round(Int, 0.05 * n)

50

In [136]:
# Check that n is an integer type
typeof(n)

Int64

In [137]:
# Check that p is an integer type
typeof(p)

Int64

In [138]:
# Create an n×p matrix of standard Gaussians (the regressors)
X = randn(n, p)

# Create a length-n vector of standard Gaussians (the response)
Y = randn(n)

1000-element Vector{Float64}:
 -1.1256362958678077
 -0.28443850070764587
  0.7623924742104705
  1.326775230464077
 -0.7668409491865783
  0.07517496555779557
  1.168395633313302
 -0.5298326940035766
  0.9888701338331589
  1.5555689312957381
  0.08282118655469965
  0.10102911492089463
 -0.09096446480587435
  ⋮
 -0.8062443249678559
  0.4904453322142418
 -0.6286379739153596
 -1.8053341081077878
  0.8157910797034916
  1.0281506232674964
  1.0653568869658934
  0.2697922075204841
  0.8368075230948664
 -0.5936812390595293
 -0.822541134575298
 -0.7825693760704974

In [139]:
# Fit the linear regression of Y on X with GLM
fitted = lm(X,Y)

LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, CholeskyPivoted{Float64, Matrix{Float64}}}}:

Coefficients:
────────────────────────────────────────────────────────────────────────
            Coef.  Std. Error      t  Pr(>|t|)   Lower 95%     Upper 95%
────────────────────────────────────────────────────────────────────────
x1    0.0051283     0.03234     0.16    0.8740  -0.0583378   0.0685944
x2   -0.00377035    0.0322134  -0.12    0.9069  -0.066988    0.0594473
x3   -0.0636958     0.0334902  -1.90    0.0575  -0.129419    0.00202762
x4   -0.0655549     0.0328955  -1.99    0.0466  -0.130111   -0.000998551
x5    0.0294538     0.0327108   0.90    0.3681  -0.0347399   0.0936476
x6    0.00159376    0.0328855   0.05    0.9614  -0.0629429   0.0661304
x7   -0.0410104     0.0319855  -1.28    0.2001  -0.103781    0.02176
x8    0.000525336   0.0331416   0.02    0.9874  -0.0645138   0.0655645
x9   -0.0415254     0.0320503  -1.30    0.1954  -0.104423    0.0213722
x10  -0.01980

In [140]:
# Show the ratio of regressors to observations, label and value on separate lines
println("p/n is\n", p / n)

p/n is
0.05


In [141]:
# In-sample R² of the most recently fitted model; the value is shown as the cell's result
print("R2 is")
r2(fitted)

R2 is

0.044226247866030355

In [142]:
# Adjusted R² of the most recently fitted model; the value is shown as the cell's result
print("Adjusted R2 is")
adjr2(fitted)

Adjusted R2 is

-0.00507155619140609