## Cleaning the Data

In [189]:
##importing the necessary packages
import pandas as pd
import numpy as np
import math
import scipy.optimize as opt
from scipy.optimize import minimize
import scipy.stats as stats
import time
from statsmodels.iolib.summary2 import summary_col

In [190]:
#importing the data and dropping missing values
df1 = pd.read_stata('/Users/alexandradinu/Desktop/CompEcon_Fall17/Functions/PS3_data.dta')
df1 = df1[df1.hlabinc.isnull() !=True] #drop if missing hlabinc
df1 = df1[df1.hannhrs.isnull() !=True] #drop if missing hannhrs
df1 = df1[df1.age.isnull() !=True] #drop if missing age
df1 = df1[df1.hyrsed.isnull() !=True] #drop if missing educ

In [191]:
df1.head()

Unnamed: 0,id68,year,intid,relhh,hannhrs,wannhrs,hlabinc,wlabinc,nochild,wrace,...,redpregovinc,hsex,wsex,age,wage,hpersno,wpersno,hyrsed,wyrsed,pce
11161,402,1971,1,Head,1523.0,0.0,62928.707031,,0,1.0,...,15801.0,1.0,2.0,51.0,48.0,1.0,2.0,12.0,12.0,0.247121
11162,446,1971,2,Head,520.0,0.0,1618.640747,,0,,...,5153.0,2.0,,62.0,,1.0,,17.0,,0.247121
11164,461,1971,4,Head,2010.0,0.0,22660.970703,,0,1.0,...,5600.0,1.0,2.0,55.0,54.0,1.0,2.0,5.0,5.0,0.247121
11165,462,1971,5,Head,1960.0,0.0,12949.125977,,0,,...,3200.0,1.0,2.0,59.0,55.0,1.0,2.0,5.0,8.0,0.247121
11166,1126,1971,8,Head,2860.0,0.0,29337.865234,,1,,...,7250.0,1.0,2.0,25.0,24.0,1.0,2.0,16.0,12.0,0.247121


In [192]:
#Renaming the variable for education "Educ"
df1=df1.rename(columns= {'hyrsed': 'Educ'})

In [193]:
#Creating race dummies
df1_hrace =pd.get_dummies(df1['hrace'], prefix='race')

In [194]:
df_new = pd.concat([df1, df1_hrace], axis=1)

In [195]:
df_new

Unnamed: 0,id68,year,intid,relhh,hannhrs,wannhrs,hlabinc,wlabinc,nochild,wrace,...,age,wage,hpersno,wpersno,Educ,wyrsed,pce,race_1.0,race_2.0,race_3.0
11161,402,1971,1,Head,1523.0,0.0,62928.707031,,0,1.0,...,51.0,48.0,1.0,2.0,12.0,12.0,0.247121,1,0,0
11162,446,1971,2,Head,520.0,0.0,1618.640747,,0,,...,62.0,,1.0,,17.0,,0.247121,1,0,0
11164,461,1971,4,Head,2010.0,0.0,22660.970703,,0,1.0,...,55.0,54.0,1.0,2.0,5.0,5.0,0.247121,1,0,0
11165,462,1971,5,Head,1960.0,0.0,12949.125977,,0,,...,59.0,55.0,1.0,2.0,5.0,8.0,0.247121,1,0,0
11166,1126,1971,8,Head,2860.0,0.0,29337.865234,,1,,...,25.0,24.0,1.0,2.0,16.0,12.0,0.247121,1,0,0
11167,1585,1971,10,Head,840.0,833.0,8627.355469,6628.333984,0,1.0,...,67.0,66.0,1.0,2.0,12.0,12.0,0.247121,1,0,0
11170,97,1971,15,Head,1960.0,0.0,19678.625000,,0,1.0,...,63.0,64.0,1.0,2.0,6.0,8.0,0.247121,1,0,0
11171,237,1971,16,Head,604.0,0.0,2124.466064,,0,,...,65.0,,1.0,,8.0,,0.247121,1,0,0
11172,669,1971,17,Head,683.0,0.0,3439.611816,,0,,...,66.0,,2.0,,12.0,,0.247121,1,0,0
11173,284,1971,20,Head,2400.0,0.0,76885.437500,,2,1.0,...,39.0,36.0,1.0,2.0,16.0,12.0,0.247121,1,0,0


In [196]:
# Renaming the race dummies that I will actually use
df_new=df_new.rename(columns= {'race_1.0': 'White'})
df_new=df_new.rename(columns= {'race_2.0': 'Black'})
df_new=df_new.rename(columns= {'race_3.0': 'Other'})

In [197]:
df_new

Unnamed: 0,id68,year,intid,relhh,hannhrs,wannhrs,hlabinc,wlabinc,nochild,wrace,...,age,wage,hpersno,wpersno,Educ,wyrsed,pce,White,Black,Other
11161,402,1971,1,Head,1523.0,0.0,62928.707031,,0,1.0,...,51.0,48.0,1.0,2.0,12.0,12.0,0.247121,1,0,0
11162,446,1971,2,Head,520.0,0.0,1618.640747,,0,,...,62.0,,1.0,,17.0,,0.247121,1,0,0
11164,461,1971,4,Head,2010.0,0.0,22660.970703,,0,1.0,...,55.0,54.0,1.0,2.0,5.0,5.0,0.247121,1,0,0
11165,462,1971,5,Head,1960.0,0.0,12949.125977,,0,,...,59.0,55.0,1.0,2.0,5.0,8.0,0.247121,1,0,0
11166,1126,1971,8,Head,2860.0,0.0,29337.865234,,1,,...,25.0,24.0,1.0,2.0,16.0,12.0,0.247121,1,0,0
11167,1585,1971,10,Head,840.0,833.0,8627.355469,6628.333984,0,1.0,...,67.0,66.0,1.0,2.0,12.0,12.0,0.247121,1,0,0
11170,97,1971,15,Head,1960.0,0.0,19678.625000,,0,1.0,...,63.0,64.0,1.0,2.0,6.0,8.0,0.247121,1,0,0
11171,237,1971,16,Head,604.0,0.0,2124.466064,,0,,...,65.0,,1.0,,8.0,,0.247121,1,0,0
11172,669,1971,17,Head,683.0,0.0,3439.611816,,0,,...,66.0,,2.0,,12.0,,0.247121,1,0,0
11173,284,1971,20,Head,2400.0,0.0,76885.437500,,2,1.0,...,39.0,36.0,1.0,2.0,16.0,12.0,0.247121,1,0,0


In [198]:
#Dropping 0 values from the hours variable to avoid generating infinite values for wage variable
df_new['hours']=df_new['hannhrs'].where(df_new['hannhrs'] > 0)

In [199]:
#Creating an hourly wage variable
df_new['Wage'] = df_new['hlabinc']/df_new['hours'] 

In [200]:
df_new.describe()

Unnamed: 0,id68,year,intid,hannhrs,wannhrs,hlabinc,wlabinc,nochild,wrace,hrace,...,hpersno,wpersno,Educ,wyrsed,pce,White,Black,Other,hours,Wage
count,89727.0,89727.0,89727.0,89727.0,89727.0,89727.0,45372.0,89727.0,73851.0,89688.0,...,89727.0,62744.0,89727.0,62283.0,89727.0,89727.0,89727.0,89727.0,89537.0,89537.0
mean,1510.5825,1986.321453,3515.416474,2067.515625,763.884583,42116.45,21923.435547,0.949692,1.096911,1.123695,...,48.79987,65.535561,13.228849,13.029382,0.609883,0.897913,0.079664,0.021989,2071.903076,20.665855
std,834.683767,8.794182,2313.887855,756.214233,913.208374,46753.79,20679.59375,1.168184,0.354742,0.390376,...,73.757706,81.078194,2.526552,2.225672,0.208727,0.302765,0.270774,0.146648,750.982544,24.84012
min,1.0,1971.0,1.0,0.0,0.0,0.6353981,1.19278,0.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.247121,0.0,0.0,0.0,1.0,0.000204
25%,783.0,1979.0,1673.0,1800.0,0.0,19763.67,8042.066406,0.0,1.0,1.0,...,1.0,2.0,12.0,12.0,0.421747,1.0,0.0,0.0,1804.0,10.413431
50%,1541.0,1986.0,3322.0,2064.0,75.0,34600.22,18134.263672,0.0,1.0,1.0,...,4.0,4.0,12.0,12.0,0.614522,1.0,0.0,0.0,2064.0,16.558668
75%,2240.0,1993.0,5058.5,2452.0,1701.0,52673.09,30188.000977,2.0,1.0,1.0,...,170.0,170.0,16.0,15.0,0.786908,1.0,0.0,0.0,2456.0,24.737776
max,2930.0,2002.0,16968.0,7800.0,5840.0,3771521.0,685266.75,11.0,8.0,3.0,...,227.0,231.0,17.0,17.0,0.928007,1.0,1.0,1.0,7800.0,1865.037964


In [201]:
#Creating the y variable log of wage
df_new['lnwage'] = np.log(df_new.Wage)

In [202]:
# Selecting the final data that I will use for my analysis. 
#We need only male's that are head of household, that are 25-60 years old and make more than 7$/hour
final= df_new[(df_new['hsex']==1) & (df_new['age'] <= 60) & (df_new['age'] >= 25) & (df_new['Wage'] > 7)]

In [203]:
final.describe()

Unnamed: 0,id68,year,intid,hannhrs,wannhrs,hlabinc,wlabinc,nochild,wrace,hrace,...,wpersno,Educ,wyrsed,pce,White,Black,Other,hours,Wage,lnwage
count,57097.0,57097.0,57097.0,57097.0,57097.0,57097.0,37163.0,57097.0,51707.0,57062.0,...,49810.0,57097.0,49460.0,57097.0,57097.0,57097.0,57097.0,57097.0,57097.0,57097.0
mean,1507.17486,1986.584129,3480.671944,2228.480713,991.917847,52827.1,23085.300781,1.173337,1.086275,1.101416,...,67.981453,13.529993,13.21019,0.616286,0.920539,0.056343,0.022506,2228.480713,24.320503,3.010798
std,828.407534,8.7165,2253.042974,620.01886,926.230774,52355.79,21002.712891,1.216793,0.343647,0.369015,...,81.709457,2.44951,2.163178,0.206366,0.27046,0.230584,0.148322,620.01886,25.204367,0.544119
min,1.0,1971.0,1.0,2.0,0.0,16.6698,1.19278,0.0,1.0,1.0,...,1.0,1.0,1.0,0.247121,0.0,0.0,0.0,2.0,7.000252,1.945946
25%,782.0,1979.0,1690.0,1952.0,0.0,30373.45,8895.574219,0.0,1.0,1.0,...,2.0,12.0,12.0,0.421747,1.0,0.0,0.0,1952.0,13.950494,2.635515
50%,1542.0,1987.0,3296.0,2160.0,955.0,43811.45,19501.712891,1.0,1.0,1.0,...,4.0,13.0,12.0,0.635834,1.0,0.0,0.0,2160.0,19.914677,2.991457
75%,2225.0,1994.0,5003.0,2519.0,1900.0,61383.94,31769.908203,2.0,1.0,1.0,...,170.0,16.0,15.0,0.803488,1.0,0.0,0.0,2519.0,27.79324,3.324793
max,2930.0,2002.0,16968.0,5840.0,5840.0,3771521.0,417271.46875,11.0,8.0,3.0,...,230.0,17.0,17.0,0.928007,1.0,1.0,1.0,5840.0,1717.330322,7.448526


In [204]:
final['age'].describe()

count    57097.000000
mean        39.242939
std          9.579581
min         25.000000
25%         31.000000
50%         38.000000
75%         47.000000
max         60.000000
Name: age, dtype: float64

In [205]:
#Now splitting up the years, starting with year 1971
year1= final[(final['year']==1971)] 
year1.describe()

Unnamed: 0,id68,year,intid,hannhrs,wannhrs,hlabinc,wlabinc,nochild,wrace,hrace,...,wpersno,Educ,wyrsed,pce,White,Black,Other,hours,Wage,lnwage
count,1380.0,1380.0,1380.0,1380.0,1380.0,1380.0,727.0,1380.0,1012.0,1380.0,...,1284.0,1380.0,1272.0,1380.0,1380.0,1380.0,1380.0,1380.0,1380.0,1380.0
mean,1520.844928,1971.0,2484.165942,2239.666748,654.630432,46228.089844,15088.083984,1.594928,1.077075,1.12029,...,17.13162,12.210145,11.988208,0.247122,0.906522,0.066667,0.026812,2239.666748,21.360428,2.93699
std,838.157827,0.0,1403.973525,635.049377,817.565491,27814.853516,12095.452148,1.54003,0.314484,0.399448,...,47.952663,3.074726,2.322334,1e-06,0.291207,0.249534,0.161591,635.049377,13.763299,0.471703
min,1.0,1971.0,1.0,71.0,0.0,590.803894,36.419418,0.0,1.0,1.0,...,1.0,1.0,1.0,0.247121,0.0,0.0,0.0,71.0,7.003735,1.946444
25%,795.5,1971.0,1313.75,1946.25,0.0,29916.529297,4451.262207,0.0,1.0,1.0,...,2.0,11.0,11.0,0.247121,1.0,0.0,0.0,1946.25,13.691573,2.61678
50%,1569.5,1971.0,2399.0,2136.0,80.0,40901.029297,13353.787109,1.0,1.0,1.0,...,2.0,12.0,12.0,0.247121,1.0,0.0,0.0,2136.0,18.63097,2.924825
75%,2251.25,1971.0,3695.5,2501.0,1440.0,56652.429688,23569.432617,2.0,1.0,1.0,...,2.0,14.0,12.0,0.247121,1.0,0.0,0.0,2501.0,25.181152,3.226096
max,2927.0,1971.0,5057.0,5140.0,3000.0,396567.0,60699.03125,11.0,3.0,3.0,...,181.0,17.0,17.0,0.247121,1.0,1.0,1.0,5140.0,230.959808,5.442244


## Maximum Likelihood Estimator

In [216]:
def neg_loglike(beta):
    #Predicted Model
    yhat = beta[0] + beta[1]*year1['Educ'] + beta[2]*year1['age'] + beta[3]*year1['Black'] + beta[4]* year1['Other']
    
    
    
    #Defining the log-likelihood as a negative log likelihood which is a sum of the log of a normal pdf
    #with mean yhat and standard deviation beta[5]
    #All other years will follow a similar process
    
    logl =  -1*stats.norm(yhat, beta[5]).logpdf(year1['lnwage']).sum()
    
    return logl

In [217]:
#Defining the initial guesses for b0-b4 as well as b5 (st.deviation) respectively 

beta_start = np.array([2, -2, 0.5, 0.5, 0.5, 1])

#optimizing the log-likelihood over the given parameters using the Nelder-Mead method

result = minimize(neg_loglike, beta_start, method = 'Nelder-Mead', options={'disp': True})

print(result)


Optimization terminated successfully.
         Current function value: 728.062858
         Iterations: 681
         Function evaluations: 1059
 final_simplex: (array([[ 1.55114748,  0.06686841,  0.0143899 , -0.16395096,  0.03064082,
         0.41009259],
       [ 1.55121174,  0.06687153,  0.01438785, -0.16389915,  0.03063831,
         0.4100891 ],
       [ 1.55108379,  0.06687504,  0.01438969, -0.16392666,  0.0307296 ,
         0.41008253],
       [ 1.55111934,  0.06687736,  0.01438823, -0.16392111,  0.03058678,
         0.41009722],
       [ 1.55115975,  0.06687352,  0.01438812, -0.16392127,  0.03071874,
         0.41009128],
       [ 1.5511094 ,  0.06687689,  0.01438825, -0.16386633,  0.03057076,
         0.41008561],
       [ 1.55115496,  0.06687305,  0.01438857, -0.16394035,  0.0306875 ,
         0.41008532]]), array([ 728.06285792,  728.06285888,  728.06286502,  728.0628697 ,
        728.06287323,  728.0628809 ,  728.06288942]))
           fun: 728.06285791806476
       message: '

In [133]:
res1 =pd.DataFrame({'parameters':res1['x']})
res1.index=['Intercept', 'b1', 'b2', 'b3', 'b4', 'sigma']
res1.head()

Unnamed: 0,parameters
Intercept,1.551147
b1,0.066868
b2,0.01439
b3,-0.163951
b4,0.030641


In [287]:
#Now looking only at year 1980
year2= final[(final['year']==1980)] 
year2.describe()

Unnamed: 0,id68,year,intid,hannhrs,wannhrs,hlabinc,wlabinc,nochild,wrace,hrace,...,wpersno,Educ,wyrsed,pce,White,Black,Other,hours,Wage,lnwage
count,1856.0,1856.0,1856.0,1856.0,1856.0,1856.0,1152.0,1856.0,1575.0,1856.0,...,1633.0,1856.0,1633.0,1856.0,1856.0,1856.0,1856.0,1856.0,1856.0,1856.0
mean,1529.302263,1980.0,3218.014009,2187.516602,862.890076,48254.414062,18117.966797,1.199353,1.073651,1.106681,...,55.364971,13.217134,12.76485,0.466735,0.915409,0.0625,0.022091,2187.516602,22.684633,2.986743
std,828.231155,0.0,1843.885088,642.458801,883.042175,31733.013672,14426.77832,1.204717,0.308145,0.373573,...,77.92009,2.504283,2.095117,6e-06,0.278346,0.242127,0.147018,642.458801,16.444181,0.492932
min,4.0,1980.0,2.0,21.0,0.0,321.386169,64.277237,0.0,1.0,1.0,...,1.0,3.0,3.0,0.466728,0.0,0.0,0.0,21.0,7.040302,1.951651
25%,818.75,1980.0,1681.75,1920.0,0.0,29996.042969,6427.723145,0.0,1.0,1.0,...,2.0,12.0,12.0,0.466728,1.0,0.0,0.0,1920.0,14.056891,2.643113
50%,1591.0,1980.0,3243.5,2112.0,629.5,42851.488281,15829.339844,1.0,1.0,1.0,...,3.0,12.0,12.0,0.466728,1.0,0.0,0.0,2112.0,19.963352,2.993898
75%,2244.0,1980.0,4800.75,2483.5,1800.0,57849.511719,26570.601562,2.0,1.0,1.0,...,170.0,16.0,14.0,0.466728,1.0,0.0,0.0,2483.5,26.782181,3.287737
max,2927.0,1980.0,6615.0,5824.0,5011.0,317866.78125,145358.671875,6.0,3.0,3.0,...,181.0,17.0,17.0,0.466728,1.0,1.0,1.0,5824.0,446.369659,6.101148


In [288]:
def neg_loglike(beta):
    yhat = beta[0] + beta[1]*year2['Educ'] + beta[2]*year2['age'] + beta[3]*year2['Black'] + beta[4]* year2['Other']
    
    logl =  -1*stats.norm(yhat, beta[5]).logpdf(year2['lnwage']).sum()
    return logl

In [289]:
beta_start = np.array([2, 1, 1, -0.5, 0.5, 1])

res2 = minimize(neg_loglike, beta_start, method = 'Nelder-Mead', options={'disp': True})

print(res2)



Optimization terminated successfully.
         Current function value: 1167.197338
         Iterations: 619
         Function evaluations: 987
 final_simplex: (array([[ 1.62687924,  0.06690139,  0.01221821, -0.02486872,  0.43503576,
         0.45434005],
       [ 1.62689705,  0.06689914,  0.01221922, -0.0249499 ,  0.43507049,
         0.45434535],
       [ 1.62696523,  0.0668925 ,  0.01221956, -0.02491155,  0.43504537,
         0.45434972],
       [ 1.62686832,  0.06690002,  0.01221905, -0.02489276,  0.43504694,
         0.45434866],
       [ 1.62681401,  0.06689965,  0.0122206 , -0.02487658,  0.43504566,
         0.45434485],
       [ 1.62686546,  0.06690174,  0.01221927, -0.02493448,  0.43506732,
         0.45434025],
       [ 1.6267854 ,  0.06690298,  0.01222035, -0.02481188,  0.43501964,
         0.45433987]]), array([ 1167.19733763,  1167.19735172,  1167.19735336,  1167.19735791,
        1167.19736196,  1167.19737136,  1167.19737279]))
           fun: 1167.1973376281089
       mes

In [286]:
res2 =pd.DataFrame({'parameters':res2['x']})
res2.index=['Intercept', 'b1', 'b2', 'b3', 'b4', 'sigma']
res2.head()


Unnamed: 0,parameters
Intercept,1.626879
b1,0.066901
b2,0.012218
b3,-0.024869
b4,0.435036


In [207]:
#Now looking only at year 1990
year3= final[(final['year']==1990)] 
year3.describe()

Unnamed: 0,id68,year,intid,hannhrs,wannhrs,hlabinc,wlabinc,nochild,wrace,hrace,...,wpersno,Educ,wyrsed,pce,White,Black,Other,hours,Wage,lnwage
count,2013.0,2013.0,2013.0,2013.0,2013.0,2013.0,1451.0,2013.0,1953.0,2013.0,...,1753.0,2013.0,1750.0,2013.0,2013.0,2013.0,2013.0,2013.0,2013.0,2013.0
mean,1506.764531,1990.0,4586.804272,2275.852539,1152.904175,53131.402344,23856.576172,1.15996,1.083461,1.084451,...,78.557327,13.883756,13.554857,0.721435,0.930452,0.054645,0.014903,2275.852539,23.588531,2.988202
std,822.890325,0.0,2478.085045,592.799988,933.885193,45474.148438,20483.181641,1.183151,0.329085,0.327381,...,83.549347,2.223457,2.066671,4e-06,0.254447,0.227342,0.121195,592.799988,19.707569,0.546725
min,4.0,1990.0,1.0,50.0,0.0,1108.906982,83.16803,0.0,1.0,1.0,...,1.0,4.0,6.0,0.721431,0.0,0.0,0.0,50.0,7.000676,1.946007
25%,796.0,1990.0,2589.0,1974.0,0.0,30148.410156,9344.621094,0.0,1.0,1.0,...,3.0,12.0,12.0,0.721431,1.0,0.0,0.0,1974.0,13.431529,2.597605
50%,1559.0,1990.0,4683.0,2205.0,1300.0,44356.28125,20792.007812,1.0,1.0,1.0,...,6.0,14.0,13.0,0.721431,1.0,0.0,0.0,2205.0,19.541361,2.972533
75%,2212.0,1990.0,6572.0,2550.0,1960.0,62376.019531,33267.210938,2.0,1.0,1.0,...,171.0,16.0,16.0,0.721431,1.0,0.0,0.0,2550.0,27.930822,3.329731
max,2927.0,1990.0,9345.0,5200.0,4368.0,762373.5625,277226.75,7.0,3.0,3.0,...,191.0,17.0,17.0,0.721431,1.0,1.0,1.0,5200.0,317.283844,5.759797


In [272]:
def neg_loglike(beta):
    yhat = beta[0] + beta[1]*year3['Educ'] + beta[2]*year3['age'] + beta[3]*year3['Black'] + beta[4]* year3['Other']
    
    logl =  -1*stats.norm(yhat, beta[5]).logpdf(year3['lnwage']).sum()
    return logl

In [273]:
beta_start = np.array([2, -1, 0.5, 0.5, 0.5, 1])

res3 = minimize(neg_loglike, beta_start, method = 'Nelder-Mead', options={'disp': True})

print(res3)


Optimization terminated successfully.
         Current function value: 1393.882137
         Iterations: 741
         Function evaluations: 1180
 final_simplex: (array([[ 1.11855319,  0.09755967,  0.01346536, -0.17192547, -0.05960034,
         0.48356114],
       [ 1.11860257,  0.09755539,  0.01346572, -0.17194051, -0.05964875,
         0.48355992],
       [ 1.1185606 ,  0.09755932,  0.01346507, -0.17192997, -0.05962425,
         0.48356525],
       [ 1.11855597,  0.09755674,  0.01346629, -0.1719274 , -0.059562  ,
         0.48356224],
       [ 1.11857498,  0.09755749,  0.0134655 , -0.17190021, -0.0596607 ,
         0.48356244],
       [ 1.11856726,  0.0975581 ,  0.01346565, -0.17190866, -0.05964049,
         0.48356341],
       [ 1.11852874,  0.09756009,  0.01346598, -0.17191742, -0.05964425,
         0.4835614 ]]), array([ 1393.88213727,  1393.88215439,  1393.88215792,  1393.88216019,
        1393.88216186,  1393.8821675 ,  1393.8821763 ]))
           fun: 1393.8821372733109
       me

In [274]:
res3 =pd.DataFrame({'parameters':res3['x']})
res3.index=['Intercept', 'b1', 'b2', 'b3', 'b4', 'sigma']
res3.head()

Unnamed: 0,parameters
Intercept,1.118553
b1,0.09756
b2,0.013465
b3,-0.171925
b4,-0.0596


In [208]:
#Now looking only at year 2000
year4= final[(final['year']==2000)] 
year4.describe()

Unnamed: 0,id68,year,intid,hannhrs,wannhrs,hlabinc,wlabinc,nochild,wrace,hrace,...,wpersno,Educ,wyrsed,pce,White,Black,Other,hours,Wage,lnwage
count,2595.0,2595.0,2595.0,2595.0,2595.0,2595.0,1784.0,2595.0,2496.0,2580.0,...,2154.0,2595.0,2081.0,2595.0,2595.0,2595.0,2595.0,2595.0,2595.0,2595.0
mean,1496.867823,2000.0,3462.025048,2259.263672,1181.238037,63842.46,31749.779297,0.968786,1.10016,1.118605,...,99.139275,13.901734,13.845266,0.896233,0.909056,0.052408,0.032755,2259.263672,28.443388,3.117217
std,838.342028,0.0,2078.540111,597.081116,976.454346,74398.92,26718.029297,1.118719,0.398876,0.412911,...,82.034149,2.091758,1.975827,2.3e-05,0.287585,0.222892,0.17803,597.081116,30.680948,0.605962
min,4.0,2000.0,4.0,104.0,0.0,781.067,111.580994,0.0,1.0,1.0,...,1.0,4.0,8.0,0.89621,0.0,0.0,0.0,104.0,7.090258,1.958722
25%,762.0,2000.0,1691.0,1980.0,0.0,33474.3,15621.339844,0.0,1.0,1.0,...,5.0,12.0,12.0,0.89621,1.0,0.0,0.0,1980.0,14.949019,2.704646
50%,1520.0,2000.0,3334.0,2200.0,1428.0,46864.02,26779.439453,1.0,1.0,1.0,...,170.0,14.0,14.0,0.89621,1.0,0.0,0.0,2200.0,21.401117,3.063443
75%,2245.0,2000.0,5123.5,2550.0,2000.0,71411.84,40169.160156,2.0,1.0,1.0,...,173.0,16.0,16.0,0.89621,1.0,0.0,0.0,2550.0,31.446275,3.448278
max,2927.0,2000.0,7456.0,4900.0,4940.0,1290657.0,379375.375,8.0,8.0,3.0,...,212.0,17.0,17.0,0.89621,1.0,1.0,1.0,4900.0,526.79895,6.266819


In [269]:
def neg_loglike(beta):
    yhat = beta[0] + beta[1]*year4['Educ'] + beta[2]*year4['age'] + beta[3]*year4['Black'] + beta[4]* year4['Other']
    
    logl =  -1*stats.norm(yhat, beta[5]).logpdf(year4['lnwage']).sum()
    return logl

In [270]:
beta_start = np.array([2, -1, 0.5, -0.5, 0.5, 1])

y2000 = minimize(neg_loglike, beta_start, method = 'Nelder-Mead', options={'disp': True})

print(y2000)


Optimization terminated successfully.
         Current function value: 2081.366950
         Iterations: 753
         Function evaluations: 1181
 final_simplex: (array([[ 1.1609899 ,  0.10922855,  0.01097847, -0.24569339, -0.06044025,
         0.53964669],
       [ 1.16096633,  0.10922863,  0.01097897, -0.24570005, -0.06046799,
         0.53964897],
       [ 1.16101147,  0.10922588,  0.01097885, -0.24570704, -0.06045947,
         0.53965316],
       [ 1.16105346,  0.10922458,  0.01097827, -0.24569697, -0.06047854,
         0.53965011],
       [ 1.16095087,  0.10922982,  0.01097883, -0.24572757, -0.06042966,
         0.53964674],
       [ 1.16103173,  0.10922483,  0.01097864, -0.24569937, -0.06046346,
         0.53964801],
       [ 1.16098802,  0.10922791,  0.01097876, -0.24566238, -0.06049927,
         0.53965617]]), array([ 2081.36695014,  2081.36696865,  2081.3669721 ,  2081.36697846,
        2081.36698915,  2081.36699176,  2081.36699596]))
           fun: 2081.3669501405948
       me

In [271]:
y2000 =pd.DataFrame({'parameters':y2000['x']})
y2000.index=['Intercept', 'b1', 'b2', 'b3', 'b4', 'sigma']
y2000.head()

Unnamed: 0,parameters
Intercept,1.16099
b1,0.109229
b2,0.010978
b3,-0.245693
b4,-0.06044


## Interpretations

In these examples by year, the coefficient listed next to b1 represents the coefficient on Education. For 1971 a one year increase in education results in an approximate 6.69% increase in earnings. In 1980, a one year increase in education results in a 6.69% increase in earnings. In 1990 this figure has increased to 9.76%. Finally, in the year 2000 a one year increase in education increases expected earnings by 10.92%. From this analysis, the main conclusion is that the effect of education on earnings has increased over time. 