In [1]:
import pandas as pd
import numpy as np
from linear_regression import *

In [2]:
# Load the GSS extract, keep only the rows whose `yrint` equals 2010, and drop
# every row that has a missing value in one of the four model columns.
gss = pd.read_csv('../Datasets/gss.csv', low_memory=False)
is_2010 = gss["yrint"] == 2010
gss_2010 = gss.loc[is_2010].dropna(
    subset=['paeduc', 'maeduc', 'age', 'educ']
)

# Regressors (X) and outcome (y) taken from the filtered frame.
X = gss_2010[['paeduc', 'maeduc', 'age']]
y = gss_2010['educ']

In [None]:
# Plain-NumPy version of the design matrix: a leading column of ones followed
# by the three regressors, everything cast to float.
features_float = np.asarray(X, dtype=float)
ones_column = np.ones((features_float.shape[0], 1))
X_numpy = np.hstack([ones_column, features_float])
y_numpy = np.asarray(y, dtype=float)

# NumPy arrays carry no labels, so column and target names are passed explicitly.
np_model = LinearRegressionOLS().fit(
    X_numpy,
    y_numpy,
    feature_names=['const', 'paeduc', 'maeduc', 'age'],
    target_name="Education",
)

print(X_numpy, y_numpy)
print(np_model)

[[ 1.  8.  3. 31.]
 [ 1. 18.  6. 23.]
 [ 1.  2.  0. 82.]
 ...
 [ 1. 12. 14. 54.]
 [ 1.  6. 12. 57.]
 [ 1. 12. 12. 30.]] [16. 16. 10. ... 12. 16. 14.]

OLS Regression Results
Dependent:                Education
-----------------------------------
 
const                     7.3256***
                           (0.3684)
 
paeduc                    0.2144***
                           (0.0241)
 
maeduc                    0.2569***
                           (0.0271)
 
age                       0.0241***
                           (0.0043)

-----------------------------------
R-squared                     0.276
Adjusted R-squared            0.274
F Statistic                 177.548
Observations               1402.000
Log Likelihood            -3359.107
AIC                        6726.213
BIC                        6747.196
*p<0.1; **p<0.05; ***p<0.01



In [None]:
# DataFrame version of the fit: work on a copy of X and place a column of ones
# named 'const' in front of the regressors. No feature or target names are
# passed to fit() here.
X_const = X.copy()
X_const.insert(loc=0, column='const', value=np.ones(len(X)))

ols = LinearRegressionOLS()
model = ols.fit(X_const, y)
print(model)


OLS Regression Results
Dependent:                     educ
-----------------------------------
 
const                     7.3256***
                           (0.3684)
 
paeduc                    0.2144***
                           (0.0241)
 
maeduc                    0.2569***
                           (0.0271)
 
age                       0.0241***
                           (0.0043)

-----------------------------------
R-squared                     0.276
Adjusted R-squared            0.274
F Statistic                 177.548
Observations               1402.000
Log Likelihood            -3359.107
AIC                        6726.213
BIC                        6747.196
*p<0.1; **p<0.05; ***p<0.01



In [5]:
# Side-by-side summary of the DataFrame-based fit and the NumPy-based fit.
# NOTE(review): 15 and 20 are positional arguments defined in linear_regression;
# presumably layout widths -- confirm against RegressionOutput's signature.
comparison = RegressionOutput([model, np_model], 15, 20)
print(comparison)


OLS Regression Results
Dependent:                     educ      Education
--------------------------------------------------
 
const                     7.3256***      7.3256***
                           (0.3684)       (0.3684)
 
paeduc                    0.2144***      0.2144***
                           (0.0241)       (0.0241)
 
maeduc                    0.2569***      0.2569***
                           (0.0271)       (0.0271)
 
age                       0.0241***      0.0241***
                           (0.0043)       (0.0043)

--------------------------------------------------
R-squared                     0.276          0.276
Adjusted R-squared            0.274          0.274
F Statistic                 177.548        177.548
Observations               1402.000       1402.000
Log Likelihood            -3359.107      -3359.107
AIC                        6726.213       6726.213
BIC                        6747.196       6747.196
*p<0.1; **p<0.05; ***p<0.01



In [6]:
# Per-feature summary of the DataFrame-based fit, rendered as a table.
coefficient_table = pd.DataFrame(model.feature_summary())
coefficient_table

Unnamed: 0,feature,coefficient,se,t_statistic,p_>_abs_t,conf_interval__0.05
0,const,7.3256,0.3684,19.887,0.0,"[6.603, 8.048]"
1,paeduc,0.2144,0.0241,8.8796,0.0,"[0.167, 0.262]"
2,maeduc,0.2569,0.0271,9.4725,0.0,"[0.204, 0.31]"
3,age,0.0241,0.0043,5.5789,0.0,"[0.016, 0.033]"


In [7]:
# Variance inflation factors reported by the fitted model, rendered as a table.
vif_table = pd.DataFrame(model.VarianceInflationFactor())
vif_table

Unnamed: 0,feature,VIF
0,paeduc,2.0233
1,maeduc,2.0285
2,age,1.0971


In [8]:
# Robust standard errors requested with type "HC3", rendered as a table.
robust_table = pd.DataFrame(model.RobustStandardError(type="HC3"))
robust_table

Unnamed: 0,feature,robust_se,robust_t,robust_p
0,const,0.43446,16.861504,0.0
1,paeduc,0.023602,9.084741,0.0
2,maeduc,0.029439,8.727734,0.0
3,age,0.004221,5.71529,1.33669e-08


In [9]:
# Prediction for a single row with all three regressors set to zero.
# The row has three columns: no constant term is supplied by the caller.
all_zero_row = np.array([[0, 0, 0]])
model.predict(all_zero_row)

array([7.32564767])

In [10]:
# Scenario rows to score, each a 1x3 array in the order (paeduc, maeduc, age).
paeduc_mean = X['paeduc'].mean()
maeduc_mean = X['maeduc'].mean()
age_mean = X['age'].mean()

prediction_set = [
    np.array([[0, 0, 0]]),
    np.array([[paeduc_mean, maeduc_mean, age_mean]]),
    np.array([[paeduc_mean, 14, age_mean]]),
    # NOTE(review): these literals look like the sample means written out by
    # hand (this row's output matches the means row) -- confirm intent.
    np.array([[11.624822, 11.659058, 48.225392]]),
    np.array([[8, 14, age_mean]]),
]

# One prediction table per scenario, stacked into a single frame.
scenario_tables = [
    pd.DataFrame(model.predict(feature_row, return_table=True))
    for feature_row in prediction_set
]
predictions = pd.concat(scenario_tables, ignore_index=True)
predictions

Unnamed: 0,features,prediction,std_error,t_statistic,P>|t|,ci_low_0.05,ci_high_0.05
0,"{'paeduc': '0.00', 'maeduc': '0.00', 'age': '0...",7.3256,0.3684,19.887,0.0,6.603,8.0483
1,"{'paeduc': '11.62', 'maeduc': '11.66', 'age': ...",13.9772,0.071,196.7284,0.0,13.8378,14.1165
2,"{'paeduc': '11.62', 'maeduc': '14.00', 'age': ...",14.5786,0.0953,152.9974,0.0,14.3917,14.7656
3,"{'paeduc': '11.62', 'maeduc': '11.66', 'age': ...",13.9772,0.071,196.7284,0.0,13.8378,14.1165
4,"{'paeduc': '8.00', 'maeduc': '14.00', 'age': '...",13.8014,0.156,88.4816,0.0,13.4954,14.1074


In [11]:
# Sweep paeduc over every integer from its observed minimum to its observed
# maximum while holding maeduc and age at their sample means.
maeduc_mean = X['maeduc'].mean()
age_mean = X['age'].mean()
paeduc_levels = range(int(X['paeduc'].min()), int(X['paeduc'].max()) + 1)

prediction_set = [
    np.array([[paeduc_level, maeduc_mean, age_mean]])
    for paeduc_level in paeduc_levels
]

# One prediction table per paeduc level, stacked into a single frame.
sweep_tables = [
    pd.DataFrame(model.predict(feature_row, return_table=True))
    for feature_row in prediction_set
]
predictions = pd.concat(sweep_tables, ignore_index=True)
predictions

Unnamed: 0,features,prediction,std_error,t_statistic,P>|t|,ci_low_0.05,ci_high_0.05
0,"{'paeduc': '0.00', 'maeduc': '11.66', 'age': '...",11.4846,0.2896,39.662,0.0,10.9166,12.0526
1,"{'paeduc': '1.00', 'maeduc': '11.66', 'age': '...",11.699,0.2662,43.9452,0.0,11.1768,12.2212
2,"{'paeduc': '2.00', 'maeduc': '11.66', 'age': '...",11.9134,0.243,49.0201,0.0,11.4367,12.3902
3,"{'paeduc': '3.00', 'maeduc': '11.66', 'age': '...",12.1278,0.2201,55.1134,0.0,11.6962,12.5595
4,"{'paeduc': '4.00', 'maeduc': '11.66', 'age': '...",12.3423,0.1974,62.5393,0.0,11.9551,12.7294
5,"{'paeduc': '5.00', 'maeduc': '11.66', 'age': '...",12.5567,0.175,71.7361,0.0,12.2133,12.9001
6,"{'paeduc': '6.00', 'maeduc': '11.66', 'age': '...",12.7711,0.1533,83.3162,0.0,12.4704,13.0718
7,"{'paeduc': '7.00', 'maeduc': '11.66', 'age': '...",12.9855,0.1324,98.1061,0.0,12.7259,13.2452
8,"{'paeduc': '8.00', 'maeduc': '11.66', 'age': '...",13.1999,0.1127,117.0875,0.0,12.9788,13.4211
9,"{'paeduc': '9.00', 'maeduc': '11.66', 'age': '...",13.4144,0.0952,140.8904,0.0,13.2276,13.6011


In [12]:
# Test the prediction for an all-zero feature row against the prediction for
# an all-one feature row. Both rows list only the three regressors: do not
# include the intercept column.
tested_row = np.array([[0, 0, 0]])
hypothesis_row = np.array([[1, 1, 1]])
results = model.HypothesisTesting(test=tested_row, hyp=hypothesis_row)

print(results['summary'])
pd.DataFrame(results['table']).T

Significance Analysis (p > |t|)
1.96 > |-1.3451| == True

Fail to reject the null hypothesis: 7.3256 is not statistically different from 7.8211 at 5.0% level

Conclude that outcome of {'paeduc': '0.00', 'maeduc': '0.00', 'age': '0.00'}
does not differ from {'paeduc': '1.00', 'maeduc': '1.00', 'age': '1.00'}


Unnamed: 0,0
feature_labels,"{'paeduc': '0.00', 'maeduc': '0.00', 'age': '0..."
hypothesis_labels,"{'paeduc': '1.00', 'maeduc': '1.00', 'age': '1..."
prediction,7.325648
hypothesis,7.821124
t-statistic,-1.345073
P>|t|,0.17882


In [13]:
# Hypothesis test of one feature row against a fixed scalar value instead of
# a second feature row.
# NOTE(review): 7.325648 looks like the model's own all-zero prediction rounded
# to six decimals, which would make this a near-zero t-statistic by
# construction -- confirm this is the intended demonstration.
tested_row = np.array([[0, 0, 0]])
fixed_value = 7.325648
results = model.HypothesisTesting(test=tested_row, hyp=fixed_value)

print(results['summary'])
pd.DataFrame(results['table']).T

Significance Analysis (p > |t|)
1.96 > |-0.0000| == True

Fail to reject the null hypothesis: 7.3256 is not statistically different from 7.3256 at 5.0% level

Conclude that outcome of {'paeduc': '0.00', 'maeduc': '0.00', 'age': '0.00'}
does not differ from {'educ': '7.325648'}


Unnamed: 0,0
feature_labels,"{'paeduc': '0.00', 'maeduc': '0.00', 'age': '0..."
hypothesis_labels,{'educ': '7.325648'}
prediction,7.325648
hypothesis,7.325648
t-statistic,-0.000001
P>|t|,0.999999
