#  Dummy variable

### Econometrics A (ØkA)

Wooldridge (Ch. 7)

Bertel Schjerning

Department of Economics, University of Copenhagen


### Enable autoreload and read libraries

In [1]:
# Ensures that all imported Python files are reloaded whenever an import statement runs.
# Needed for changes to take effect without restarting the Python kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import mymlr as mlr # see mymlr.py

# Part 1: Lønforskelle mellem mænd og kvinder

### Lønregression: Timeløn, uddannelse og erfaring
Vi starter med en simpel lineær model for log-timeløn med kun en dummy for køn (uddannelse og erfaring tilføjes som kontrolvariable i regressionerne nedenfor):

$$
				\log (wage_i) = \beta_0 + \alpha \text{kvinde}_i + u_i,
$$

hvor $\text{kvinde}_{i}$ er en **dummy variabel for kvinde**
- $\text{kvinde}_{i}=1$, hvis person $i$ er en kvinde
- $\text{kvinde}_{i}=0$, ellers



### Indlæs data og dan dummy for kvinde

In [3]:
# Load the wage data set from the Stata file
df = pd.read_stata(r"../data/wage.dta")

# Female dummy: 1 when sex is coded 2, otherwise 0
df['kvinde'] = df['sex'].eq(2).astype(int)

# Frequency table to check the dummy against the original sex coding
print('Frekvenstabel:')
print(df[['sex', 'kvinde']].value_counts())

# Remaining variables: constant term, log of the wage and experience squared
df = df.assign(
    const=1,
    lwage=lambda d: np.log(d['wage']),
    experience2=lambda d: d['experience'] ** 2,
)
display(df.describe())

Frekvenstabel:
sex  kvinde
1.0  0         561
2.0  1         517
dtype: int64


Unnamed: 0,sex,age,wage,educ,experience,single,kvinde,const,lwage,experience2
count,1078.0,1078.0,1078.0,1078.0,1078.0,1078.0,1078.0,1078.0,1078.0,1078.0
mean,1.479592,39.302412,139.19295,11.449907,14.626375,0.258813,0.479592,1.0,4.883426,287.220284
std,0.499815,11.06801,47.357094,3.311553,8.564899,0.438186,0.499815,0.0,0.321398,280.487155
min,1.0,20.0,44.0,0.0,0.139,0.0,0.0,1.0,3.78419,0.019321
25%,1.0,30.0,110.0,10.0,7.83375,0.0,0.0,1.0,4.70048,61.367738
50%,1.0,39.0,129.0,12.0,13.8235,0.0,0.0,1.0,4.859812,191.089146
75%,2.0,48.0,159.0,13.0,20.9135,1.0,1.0,1.0,5.068904,437.374512
max,2.0,68.0,300.0,18.0,31.0,1.0,1.0,1.0,5.703782,961.0


### Simpel lønregression med kvinde dummy

In [4]:
# Nested specifications: each model adds one regressor to the previous one
wage_specs = [
    ['const', 'kvinde'],
    ['const', 'kvinde', 'educ'],
    ['const', 'kvinde', 'educ', 'experience'],
    ['const', 'kvinde', 'educ', 'experience', 'experience2'],
]

# Estimate every specification with mlr.ols, always using lwage as dependent variable
mlr1, mlr2, mlr3, mlr4 = [mlr.ols(df[cols], df['lwage']) for cols in wage_specs]

# Show the four models side by side with mlr.summary
mlr.summary([mlr1, mlr2, mlr3, mlr4])

                    Model 1  Model 2  Model 3  Model 4
Dependent variable    lwage    lwage    lwage    lwage
             const   4.9873   4.6647   4.4979   4.4169
                   (0.0128) (0.0327) (0.0353) (0.0387)
            kvinde  -0.2166  -0.2164  -0.1787  -0.1888
                   (0.0185) (0.0176) (0.0172) (0.0171)
              educ            0.0282   0.0281   0.0271
                            (0.0027) (0.0025) (0.0025)
        experience                     0.0102   0.0279
                                     (0.0010) (0.0038)
       experience2                             -0.0006
                                              (0.0001)
         R_squared   0.1135   0.1977   0.2685   0.2843
               TSS 111.2507 111.2507 111.2507 111.2507
               RSS  98.6226  89.2533  81.3842  79.6240
               ESS  12.6281  21.9974  29.8665  31.6267
                 n     1078     1078     1078     1078


### Dummy for kvinder, mænd og "the dummy variable trap"

In [15]:
# Male dummy: 1 when sex is coded 1, otherwise 0
df['mand'] = df['sex'].eq(1).astype(int)

# Three parameterisations of the same model:
mlr1 = mlr.ols(df[['const', 'kvinde']], df['lwage'])  # constant + female dummy
mlr2 = mlr.ols(df[['const', 'mand']], df['lwage'])    # constant + male dummy
mlr3 = mlr.ols(df[['kvinde', 'mand']], df['lwage'])   # both dummies, no constant

# Why can we not estimate a model with both dummies and a constant?
# mlr4 = mlr.ols(df[['const', 'kvinde', 'mand']], df['lwage'])

# Show the three models side by side with mlr.summary
mlr.summary([mlr1, mlr2, mlr3])

                    Model 1  Model 2  Model 3
Dependent variable    lwage    lwage    lwage
             const   4.9873   4.7707         
                   (0.0128) (0.0133)         
            kvinde  -0.2166            4.7707
                   (0.0185)          (0.0133)
              mand            0.2166   4.9873
                            (0.0185) (0.0128)
         R_squared   0.1135   0.1135   0.1135
               TSS 111.2507 111.2507 111.2507
               RSS  98.6226  98.6226  98.6226
               ESS  12.6281  12.6281  12.6281
                 n     1078     1078     1078


# Part 2: Lønforskelle på tværs af uddannelseskategorier

### Uddannelsesdummier

In [6]:
# Four mutually exclusive dummies based on years of education (educ)
df['min_udd'] = df['educ'].lt(10).astype(int)                         # educ < 10
df['klasse10'] = df['educ'].eq(10).astype(int)                        # educ == 10
df['ung_udd'] = (df['educ'].gt(10) & df['educ'].le(13)).astype(int)   # 10 < educ <= 13
df['videre_udd'] = df['educ'].gt(13).astype(int)                      # educ > 13

# Sample share in each category; the shares should add up to one
tab = df[['min_udd', 'klasse10', 'ung_udd', 'videre_udd']].mean()
print('Fordeling af udannelse\n', tab.to_string())
print('Check sum  : ', tab.sum())

Fordeling af udannelse
 min_udd       0.202226
klasse10      0.119666
ung_udd       0.464750
videre_udd    0.213358
Check sum  :  1.0


### Uddannelseskategorier

In [7]:
# Collapse the four education dummies into a single category code:
# 1 = min_udd, 2 = klasse10, 3 = ung_udd, 4 = videre_udd
df['udd_kat'] = sum(
    code * df[col]
    for code, col in enumerate(['min_udd', 'klasse10', 'ung_udd', 'videre_udd'], start=1)
)

# Cross-tabulate the category code against years of education and show the table
tabulation = pd.crosstab(df['udd_kat'], df['educ'])
display(tabulation)

educ,0.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0,14.0,15.0,16.0,17.0,18.0
udd_kat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,32,116,21,49,0,0,0,0,0,0,0,0,0
2,0,0,0,0,129,0,0,0,0,0,0,0,0
3,0,0,0,0,0,79,283,139,0,0,0,0,0
4,0,0,0,0,0,0,0,0,64,54,63,36,13


### Lønregression med uddannelsesdummier

In [19]:
# Common regressors, and the education dummies with 'min_udd' left out as reference category
Xvar = ['const', 'kvinde', 'experience', 'experience2']
Dvar = ['klasse10', 'ung_udd', 'videre_udd']
# Try changing the reference category. Which coefficients change?

# Model 1: education enters as years (educ); Model 2: education enters as dummies
mlr1 = mlr.ols(df[[*Xvar, 'educ']], df['lwage'])
mlr2 = mlr.ols(df[[*Xvar, *Dvar]], df['lwage'])
mlr.summary([mlr1, mlr2])

                    Model 1  Model 2
Dependent variable    lwage    lwage
             const   4.4169   4.6405
                   (0.0387) (0.0305)
            kvinde  -0.1888  -0.1836
                   (0.0171) (0.0164)
        experience   0.0279   0.0280
                   (0.0038) (0.0036)
       experience2  -0.0006  -0.0006
                   (0.0001) (0.0001)
              educ   0.0271         
                   (0.0025)         
          klasse10           -0.0493
                            (0.0293)
           ung_udd            0.0596
                            (0.0212)
        videre_udd            0.3086
                            (0.0247)
         R_squared   0.2843   0.3472
               TSS 111.2507 111.2507
               RSS  79.6240  72.6190
               ESS  31.6267  38.6317
                 n     1078     1078


###  F-test: Er afkastet af uddannelse lineært i antallet af års uddannelse?

Vi estimerer en urestrikteret model med både en lineær effekt af uddannelse og uddannelsesdummies:

$$
\log(\text{wage}) = \beta_{0}+\delta _{1}\text{klasse10}+\delta_{2}\text{ung.udd}+\delta_{3}\text{videre.udd}
						     +\beta_{1}\text{uddannelse} +\beta_{2}\text{erfaring}+\beta_{3}\text{erfaring}^{2}+\beta _{4}\text{kvinde}+u.
$$

og en restrikteret model under nulhypotesen
$$H_0: \delta_{1}=\delta_{2}=\delta_{3}=0$$

For at teste restriktionerne på $\delta_1, \delta_2, \delta_3$ udfører vi en **F-test**, der sammenligner den **fulde (urestrikterede) model** med den **restrikterede model**.

1. **F-statistik**:
   $$
   F = \frac{(RSS_r - RSS_{ur}) / q}{RSS_{ur} / (n - k_{ur})} \sim F(q, n - k_{ur})
   $$
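   Hvor:
   - $RSS_r$: RSS for den restrikterede model
   - $RSS_{ur}$: RSS for den fulde (urestrikterede) model
   - $q$: antal restriktioner
   - $n$: antal observationer
   - $k_{ur}$: antal parametre i den urestrikterede model (inkl. konstantled)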

2. **P-værdi**:
   P-værdien beregnes som:
   $$
   p = 1 - F_{\text{cdf}}(F_{\text{stat}}, q, n - k_{ur})
   $$
   hvor $F_{\text{cdf}}$ er kumulativ fordelingsfunktion for F-fordelingen, og $F_{\text{stat}}$ er den beregnede F-statistik.


### F-test

In [20]:
from scipy import stats
def Ftest(y, X_ur, X_r):
    """Print an F-test of a restricted model against an unrestricted model.

    Both models are estimated with ``mlr.ols`` and shown side by side with
    ``mlr.summary``. The function then prints the number of restrictions, the
    degrees of freedom of both models, the F-statistic, the 5% critical value
    and the p-value. Nothing is returned.

    Parameters
    ----------
    y : dependent variable, passed unchanged to ``mlr.ols``.
    X_ur : regressors of the unrestricted (full) model.
    X_r : regressors of the restricted model.
        NOTE(review): the test is only meaningful when the restricted model
        is nested in the unrestricted one; this is not checked here.

    Raises
    ------
    ValueError
        If the unrestricted model does not have more parameters than the
        restricted model, or if it has no residual degrees of freedom.
    """
    # Fit the unrestricted and restricted models and print summary output
    m_ur = mlr.ols(X_ur, y)
    m_r = mlr.ols(X_r, y)
    mlr.summary([m_ur, m_r])

    # Number of observations (n) and parameter counts (k) reported by mlr.ols
    n = m_ur['n']
    k_ur = m_ur['k']
    k_r = m_r['k']

    # Number of restrictions (q): difference in parameter count between the models
    q = k_ur - k_r
    df_ur = n - k_ur  # residual degrees of freedom, unrestricted model
    df_r = n - k_r    # residual degrees of freedom, restricted model

    # Fail with a clear message instead of a division by zero or a meaningless statistic
    if q <= 0:
        raise ValueError(
            f"The unrestricted model must have more parameters than the restricted model (q = {q})."
        )
    if df_ur <= 0:
        raise ValueError(
            f"The unrestricted model has no residual degrees of freedom (n - k_ur = {df_ur})."
        )

    # Residual sum of squares (RSS) for both models
    RSS_ur = m_ur['RSS']
    RSS_r = m_r['RSS']

    # F-statistic: relative increase in RSS caused by imposing the q restrictions
    F_stat = ((RSS_r - RSS_ur) / q) / (RSS_ur / df_ur)

    # Upper-tail probability; sf() equals 1 - cdf() but keeps precision for tiny p-values
    p_value = stats.f.sf(F_stat, q, df_ur)

    # Display the results
    print("\nF-test for multible lineære hypotester: ")
    print(f"   Number of restrictions: {q:d}")
    print(f"   Df, unrestricted model: {df_ur:d}")
    print(f"   Df, restricted model: {df_r:d}")
    print(f"   F-statistic: {F_stat:.4f} ~ F({q:d},{df_ur:d})")
    print(f"   Critical value for F-statistic at 5% significance: {stats.f.ppf(0.95, q, df_ur):.4f}")
    print(f"   P-value: {p_value:.4f}")

### Do the F-test

In [21]:
# Common regressors, and the education dummies with 'min_udd' left out as reference category
Xvar = ['const', 'kvinde', 'experience', 'experience2']
Dvar = ['klasse10', 'ung_udd', 'videre_udd']

# Unrestricted model: years of education plus the education dummies
X_ur = df[[*Xvar, 'educ', *Dvar]]
# Restricted model: the same regressors without the education dummies
X_r = X_ur.drop(columns=Dvar)

Ftest(df['lwage'], X_ur, X_r)

                    Model 1  Model 2
Dependent variable    lwage    lwage
             const   4.6259   4.4169
                   (0.0463) (0.0387)
            kvinde  -0.1839  -0.1888
                   (0.0164) (0.0171)
        experience   0.0279   0.0279
                   (0.0036) (0.0038)
       experience2  -0.0006  -0.0006
                   (0.0001) (0.0001)
              educ   0.0023   0.0271
                   (0.0055) (0.0025)
          klasse10  -0.0574         
                   (0.0350)         
           ung_udd   0.0467         
                   (0.0373)         
        videre_udd   0.2880         
                   (0.0549)         
         R_squared   0.3474   0.2843
               TSS 111.2507 111.2507
               RSS  72.6070  79.6240
               ESS  38.6437  31.6267
                 n     1078     1078

F-test for multible lineære hypotester: 
   Number of restrictions: 3
   Df, unrestricted model: 1070
   Df, restricted model: 1073
   F-statistic: 

### Konklusion vedr specifikationstest
- Vi forkaster $H_0$: Med en F-teststørrelse på 34.4693 fås en p-værdi på 0.0000 ved opslag i F(3,1070)-fordelingen, dvs. $p = 1 - F_{\text{cdf}}(34.4693; 3, 1070)$
- Vi forkaster at effekten skulle være lineær. Tværtimod. 
- Når vi har kontrolleret for "diplom" effekterne betyder års skolegang ikke noget her (insignifikant)

# Part 3: Interaktions led

### Interaktionsled: Heterogent afkast til uddannelse på tværs af køn

In [11]:
# Interaction term: female dummy times years of education
df['kvindeXeduc'] = df['kvinde'].mul(df['educ'])
Xvar = ['const', 'educ', 'experience', 'experience2']

# Model 1: baseline; Model 2: adds the female dummy; Model 3: adds the interaction as well
mlr1 = mlr.ols(df[Xvar], df['lwage'])
mlr2 = mlr.ols(df[[*Xvar, 'kvinde']], df['lwage'])
mlr3 = mlr.ols(df[[*Xvar, 'kvinde', 'kvindeXeduc']], df['lwage'])
mlr.summary([mlr1, mlr2, mlr3])

                    Model 1  Model 2  Model 3
Dependent variable    lwage    lwage    lwage
             const   4.3155   4.4169   4.3735
                   (0.0396) (0.0387) (0.0477)
              educ   0.0274   0.0271   0.0309
                   (0.0027) (0.0025) (0.0035)
        experience   0.0254   0.0279   0.0278
                   (0.0040) (0.0038) (0.0038)
       experience2  -0.0004  -0.0006  -0.0006
                   (0.0001) (0.0001) (0.0001)
            kvinde           -0.1888  -0.0990
                            (0.0171) (0.0603)
       kvindeXeduc                    -0.0078
                                     (0.0050)
         R_squared   0.2033   0.2843   0.2859
               TSS 111.2507 111.2507 111.2507
               RSS  88.6338  79.6240  79.4453
               ESS  22.6168  31.6267  31.8054
                 n     1078     1078     1078


### Interaktionsled: Heterogent løngab for enlige og gifte kvinder

In [12]:
# Interaction term: female dummy times the single indicator
df['kvindeXsingle'] = df['kvinde'].mul(df['single'])
Xvar = ['const', 'educ', 'experience', 'experience2']

# Model 1: baseline; Model 2: adds kvinde and single; Model 3: adds their interaction as well
mlr1 = mlr.ols(df[Xvar], df['lwage'])
mlr2 = mlr.ols(df[[*Xvar, 'kvinde', 'single']], df['lwage'])
mlr3 = mlr.ols(df[[*Xvar, 'kvinde', 'single', 'kvindeXsingle']], df['lwage'])
mlr.summary([mlr1, mlr2, mlr3])

                    Model 1  Model 2  Model 3
Dependent variable    lwage    lwage    lwage
             const   4.3155   4.4314   4.4518
                   (0.0396) (0.0405) (0.0409)
              educ   0.0274   0.0270   0.0267
                   (0.0027) (0.0025) (0.0025)
        experience   0.0254   0.0271   0.0279
                   (0.0040) (0.0038) (0.0038)
       experience2  -0.0004  -0.0005  -0.0006
                   (0.0001) (0.0001) (0.0001)
            kvinde           -0.1907  -0.2236
                            (0.0172) (0.0201)
            single           -0.0234  -0.0776
                            (0.0195) (0.0260)
     kvindeXsingle                     0.1206
                                     (0.0386)
         R_squared   0.2033   0.2852   0.2917
               TSS 111.2507 111.2507 111.2507
               RSS  88.6338  79.5174  78.7992
               ESS  22.6168  31.7333  32.4515
                 n     1078     1078     1078


### Samlet virkning
|            | Mand   | Kvinde                              |
|------------|--------|-------------------------------------|
| Gift       | 0      | -0.224                              |
| Single     | -0.078 | -0.224 - 0.078 + 0.121 = -0.181     |
