# First, reset the Python namespace and load pandas and statsmodels

In [1]:
# Clear all user-defined names from the interactive namespace; -f skips the
# confirmation prompt, so the notebook starts from a clean state.
%reset -f

In [2]:
import pandas          as pd
import statsmodels.api as sm

# Question 2

## load data into Python

In [3]:
# Read the econmath data set and keep only the rows that have no missing
# value in any column.
data = pd.read_csv('econmath.csv').dropna()

## check the first 5 observations of the data

In [4]:
# Show the first five rows as a quick sanity check of the loaded data.
data.head(n=5)

Unnamed: 0,age,work,study,econhs,colgpa,hsgpa,acteng,actmth,act,mathscr,male,calculus,attexc,attgood,fathcoll,mothcoll,score
0,23,15.0,10.0,0,3.4909,3.355,24.0,26.0,27.0,10,1,1,0,0,1,1,84.43
1,23,0.0,22.5,1,2.1,3.219,23.0,20.0,24.0,9,1,0,0,0,0,1,57.380001
2,21,25.0,12.0,0,3.0851,3.306,21.0,24.0,21.0,8,1,1,1,0,0,1,66.389999
3,22,30.0,40.0,0,2.6805,3.977,31.0,28.0,31.0,10,0,1,0,1,1,1,81.150002
4,22,25.0,15.0,1,3.7454,3.89,28.0,31.0,32.0,8,1,1,0,1,0,1,95.900002


## list all the variable names of the data

In [5]:
# Column labels of the data set as a plain Python list.
data.columns.tolist()

['age',
 'work',
 'study',
 'econhs',
 'colgpa',
 'hsgpa',
 'acteng',
 'actmth',
 'act',
 'mathscr',
 'male',
 'calculus',
 'attexc',
 'attgood',
 'fathcoll',
 'mothcoll',
 'score']

## compute the min, max, mean and sd of 3 specific variables

### specify which variable you want to compute the summary statistics on

In [6]:
# Columns for which the summary statistics below are computed.
var_name = [
    'actmth',
    'acteng',
    'score',
]

### mean

In [7]:
# Column-wise mean of the selected variables.
data.loc[:, var_name].mean()

actmth    23.211302
acteng    22.594595
score     72.608734
dtype: float64

### sd

In [8]:
# Column-wise sample standard deviation (ddof=1 is the pandas default, spelled out here).
data[var_name].std(ddof=1)

actmth     3.773354
acteng     3.788735
score     13.304494
dtype: float64

### min

In [9]:
# Column-wise minimum of the selected variables.
data.loc[:, var_name].min()

actmth    12.000000
acteng    12.000000
score     20.309999
dtype: float64

### max

In [10]:
# Column-wise maximum of the selected variables.
data.loc[:, var_name].max()

actmth    36.000000
acteng    34.000000
score     98.440002
dtype: float64

## the mean, max and min of *score*

In [11]:
# Mean of the score column.
data.loc[:, 'score'].mean()

72.60873437837839

In [12]:
# Largest value of the score column.
data.loc[:, 'score'].max()

98.440002

In [13]:
# Smallest value of the score column.
data.loc[:, 'score'].min()

20.309999

## estimate the regression model 

## $$ score = \beta_0 + \beta_1 \cdot colgpa + \beta_2 \cdot actmth + \beta_3 \cdot acteng + u$$

In [14]:
# Regressor matrix: the three explanatory variables plus an intercept column.
X = sm.add_constant(data[['colgpa', 'actmth', 'acteng']])

# Dependent variable, kept as a one-column DataFrame.
Y = data[['score']]

  x = pd.concat(x[::order], 1)


In [15]:
# Set up the OLS regression of Y on X, fit it, and print the regression table.
OLS_model  = sm.OLS(endog=Y, exog=X)
OLS_result = OLS_model.fit()

print(OLS_result.summary())

                            OLS Regression Results                            
Dep. Variable:                  score   R-squared:                       0.397
Model:                            OLS   Adj. R-squared:                  0.395
Method:                 Least Squares   F-statistic:                     177.9
Date:                Sat, 21 Aug 2021   Prob (F-statistic):           1.31e-88
Time:                        21:16:23   Log-Likelihood:                -3055.2
No. Observations:                 814   AIC:                             6118.
Df Residuals:                     810   BIC:                             6137.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         16.1740      2.800      5.776      0.0

### standardize each variable

$$ \mathbf{E}_{emp} \left[ \mathbf{x} \right] = \overline{\mathbf{x}}$$
$$ \implies \mathbf{E}_{emp} \left[ \mathbf{x} -\overline{\mathbf{x}} \right] = 0 $$

$$ \mathbf{var}_{emp} \left[ \mathbf{x} \right] = \sigma^2$$
$$ \implies $$ 
$$ \mathbf{var}_{emp} \left[ \mathbf{x} / \sigma \right]$$
$$ = 1/\sigma^2 \cdot \mathbf{var}_{emp} \left[ \mathbf{x} \right]$$
$$ = 1 $$

In [16]:
# z-score the regressors column by column, then add the intercept column
# afterwards so that the constant itself is not standardized.
X_st = (
    data[['colgpa', 'actmth', 'acteng']]
    .pipe(lambda cols: (cols - cols.mean()) / cols.std())
    .pipe(sm.add_constant)
)

# z-score the dependent variable in the same way.
Y_st = data[['score']].pipe(lambda col: (col - col.mean()) / col.std())

  x = pd.concat(x[::order], 1)


### check the mean 

In [17]:
# Column means: expected to be ~0 for the standardized regressors
# (up to floating-point error) and 1 for the constant.
X_st.mean(axis=0)

const     1.000000e+00
colgpa    7.419672e-16
actmth    5.237416e-17
acteng    4.146287e-16
dtype: float64

In [18]:
# Mean of the standardized dependent variable: expected to be ~0.
Y_st.mean(axis=0)

score   -6.110318e-16
dtype: float64

### check the sd

In [19]:
# Column standard deviations: expected to be 1 for the standardized regressors
# and 0 for the constant.
X_st.std(ddof=1)

const     0.0
colgpa    1.0
actmth    1.0
acteng    1.0
dtype: float64

In [20]:
# Standard deviation of the standardized dependent variable: expected to be 1.
Y_st.std(ddof=1)

score    1.0
dtype: float64

### re-estimate the model after standardization

In [21]:
# Same regression as before, now on the standardized variables.
OLS_st        = sm.OLS(endog=Y_st, exog=X_st)
OLS_st_result = OLS_st.fit()

print(OLS_st_result.summary())

                            OLS Regression Results                            
Dep. Variable:                  score   R-squared:                       0.397
Model:                            OLS   Adj. R-squared:                  0.395
Method:                 Least Squares   F-statistic:                     177.9
Date:                Sat, 21 Aug 2021   Prob (F-statistic):           1.31e-88
Time:                        21:16:23   Log-Likelihood:                -948.48
No. Observations:                 814   AIC:                             1905.
Df Residuals:                     810   BIC:                             1924.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -6.151e-16      0.027  -2.26e-14      1.0

# Question 3

## load data into Python

In [22]:
# Load the house-price data set used in Question 3.
# NOTE(review): no dropna() is applied here, unlike the Question 2 load — confirm
# that the columns used below contain no missing values.
data2 = pd.read_csv('hprice1.csv')

## compute the min, max, mean and sd of specific variables

In [23]:
# Columns of the house-price data for which summary statistics are computed.
var_name2 = [
    'price',
    'sqrft',
    'bdrms',
]

In [24]:
# Column-wise mean of the selected house-price variables.
data2.loc[:, var_name2].mean()

price     293.546034
sqrft    2013.693182
bdrms       3.568182
dtype: float64

In [25]:
# Column-wise sample standard deviation (ddof=1 is the pandas default, spelled out here).
data2[var_name2].std(ddof=1)

price    102.713445
sqrft    577.191583
bdrms      0.841393
dtype: float64

In [26]:
# Column-wise minimum of the selected house-price variables.
data2.loc[:, var_name2].min()

price     111.0
sqrft    1171.0
bdrms       2.0
dtype: float64

In [27]:
# Column-wise maximum of the selected house-price variables.
data2.loc[:, var_name2].max()

price     725.0
sqrft    3880.0
bdrms       7.0
dtype: float64

## regression $$price = \beta_0 + \beta_1 \cdot sqrft + \beta_2 \cdot bdrms + u$$

In [28]:
# Regressor matrix: sqrft and bdrms plus an intercept column.
X2 = sm.add_constant(data2[['sqrft', 'bdrms']])

# Dependent variable, kept as a one-column DataFrame.
Y2 = data2[['price']]

  x = pd.concat(x[::order], 1)


In [29]:
# Fit the OLS regression of Y2 on X2. OLS2 ends up holding the fitted results,
# which later cells use for prediction.
OLS2 = sm.OLS(endog=Y2, exog=X2).fit()

print(OLS2.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.632
Model:                            OLS   Adj. R-squared:                  0.623
Method:                 Least Squares   F-statistic:                     72.96
Date:                Sat, 21 Aug 2021   Prob (F-statistic):           3.57e-19
Time:                        21:16:23   Log-Likelihood:                -488.00
No. Observations:                  88   AIC:                             982.0
Df Residuals:                      85   BIC:                             989.4
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -19.3150     31.047     -0.622      0.5

## standardize all variables and re-estimate the model

In [30]:
# z-score the regressors column by column, then add the intercept column
# afterwards so that the constant itself is not standardized.
X2_st = (
    data2[['sqrft', 'bdrms']]
    .pipe(lambda cols: (cols - cols.mean()) / cols.std())
    .pipe(sm.add_constant)
)

# z-score the dependent variable in the same way.
Y2_st = data2[['price']].pipe(lambda col: (col - col.mean()) / col.std())

  x = pd.concat(x[::order], 1)


In [31]:
# Fit the regression on the standardized house-price variables.
# OLS2_st ends up holding the fitted results.
OLS2_st = sm.OLS(endog=Y2_st, exog=X2_st).fit()

print(OLS2_st.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.632
Model:                            OLS   Adj. R-squared:                  0.623
Method:                 Least Squares   F-statistic:                     72.96
Date:                Sat, 21 Aug 2021   Prob (F-statistic):           3.57e-19
Time:                        21:16:23   Log-Likelihood:                -80.388
No. Observations:                  88   AIC:                             166.8
Df Residuals:                      85   BIC:                             174.2
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       1.527e-16      0.065   2.33e-15      1.0

## predict the price of a house

In [32]:
# Predicted price for one house. The values follow the column order of X2,
# [const, sqrft, bdrms]: intercept term 1, sqrft = 2438, bdrms = 4.
OLS2.predict([1, 2438, 4])

array([354.60524888])

## if the actual price is 300000 (entered as 300 in the units of `price` used in the code), compute the prediction error for the house

In [33]:
# Prediction error = actual price minus predicted price. The stated actual
# price of 300000 is entered as 300, i.e. in the units of `price`
# (presumably thousands — confirm against the data description).
300 - OLS2.predict([1, 2438, 4])

array([-54.60524888])

In [35]:
# Export this notebook to HTML. The previous export is removed first; -f keeps
# rm quiet when the file does not exist yet, and no recursive flag is needed
# because the target is a single file.
# NOTE: nbconvert converts the notebook file as saved on disk, so save the
# notebook before running this cell.
!rm -f W2_Python.html
!jupyter nbconvert --to html W2_Python.ipynb

[NbConvertApp] Converting notebook W2_Python.ipynb to html
[NbConvertApp] Writing 615867 bytes to W2_Python.html
