In [1]:
import os
from functools import partial

import pandas as pd
import numpy as np

import soepy
import estimagic
from estimagic.optimization.optimize import minimize

# Practically unbounded integer; serves as the default cap on the number of
# criterion evaluations in SimulationBasedEstimationCls.
HUGE_INT = 10 ** 11

In [6]:
def get_moments(data, num_periods=40):
    """Compute wage and choice moments per period from a panel data frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns "Period", "Choice", "Period_Wage_N",
        "Period_Wage_P" and "Period_Wage_F". The frame is only read; no
        columns are added to it.
    num_periods : int, optional
        Moments are reported for the periods 0, ..., num_periods - 1.
        Defaults to 40.

    Returns
    -------
    dict
        Nested dictionary with two groups, each keyed by period:

        * "Wage_Distribution": [mean, standard deviation] of the observed
          wage; [0.0, 0.0] if the period does not occur in ``data``.
        * "Choice_Probability": share of observations choosing 0, 1 and 2;
          0.0 for a choice (or period) that does not occur.
    """
    choice = np.asarray(data["Choice"])

    # The observed wage is the wage attached to the chosen alternative.
    # Rows whose choice is none of 0/1/2 get a wage of 0. Building the column
    # as a float array avoids writing floats into an integer column and
    # leaves the caller's frame untouched.
    wage_columns = ("Period_Wage_N", "Period_Wage_P", "Period_Wage_F")
    wage_observed = np.select(
        [choice == alternative for alternative in range(len(wage_columns))],
        [np.asarray(data[column], dtype=float) for column in wage_columns],
        default=0.0,
    )

    # Private working frame, assembled positionally from the input columns.
    panel = pd.DataFrame(
        {
            "Period": np.asarray(data["Period"]),
            "Choice": choice,
            "Wage_Observed": wage_observed,
        }
    )

    moments = {"Wage_Distribution": {}, "Choice_Probability": {}}

    # Unconditional moments of the wage distribution (missing wages are
    # ignored by describe()).
    wage_info = panel.groupby("Period")["Wage_Observed"].describe().to_dict()
    mean_by_period = wage_info.get("mean", {})
    std_by_period = wage_info.get("std", {})

    for period in range(num_periods):
        if period in mean_by_period:
            moments["Wage_Distribution"][period] = [
                mean_by_period[period],
                std_by_period[period],
            ]
        else:
            # Period not present in the data.
            moments["Wage_Distribution"][period] = [0.0, 0.0]

    # Unconditional choice probabilities, keyed by (period, choice).
    choice_info = (
        panel.groupby("Period")["Choice"].value_counts(normalize=True).to_dict()
    )

    for period in range(num_periods):
        moments["Choice_Probability"][period] = [
            choice_info.get((period, alternative), 0.00) for alternative in range(3)
        ]

    return moments

In [18]:
class SimulationBasedEstimationCls:
    """Criterion-function adapter for simulated-method-of-moments estimation.

    For a candidate parameter frame the instance simulates a data set with
    ``soepy.simulate``, reduces it to moments with the supplied ``get_moments``
    callable and returns the weighted squared distance between observed and
    simulated moments. ``get_objective`` is the callable intended to be passed
    to the optimizer.

    Parameters
    ----------
    params : pandas.DataFrame
        Start values. The logging code reads the "value" column and the first
        two entries of each index label.
    model_spec_init_file_name : str
        Passed unchanged to ``soepy.simulate``.
    moments_obs : dict
        Observed moments with the groups "Wage_Distribution" and
        "Choice_Probability", each keyed by period.
    weighting_matrix : numpy.ndarray
        Square matrix used in the quadratic form of the criterion.
    get_moments : callable
        Maps a simulated data frame to a moments dictionary shaped like
        ``moments_obs``.
    max_evals : int, optional
        ``get_objective`` raises ``RuntimeError`` once this many evaluations
        have been performed.
    """

    # Groups of the moments dictionaries, in the order in which they are
    # flattened into the moment vector.
    _MOMENT_GROUPS = ("Wage_Distribution", "Choice_Probability")

    def __init__(
        self,
        params,
        model_spec_init_file_name,
        moments_obs,
        weighting_matrix,
        get_moments,
        max_evals=HUGE_INT,
    ):
        self.params = params
        self.model_spec_init_file_name = model_spec_init_file_name
        self.moments_obs = moments_obs
        self.weighting_matrix = weighting_matrix
        self.get_moments = get_moments
        self.max_evals = max_evals

        self.num_evals = 0
        self.fval = None

        # Evaluate the criterion once at the start values; the result is
        # discarded, so this only checks that the evaluation runs through.
        self._calculate_criterion_func_value(self.params)

    def get_objective(self, params_cand):
        """Evaluate the criterion at ``params_cand`` and record the result.

        Side effects: prints the criterion value, extends the history in
        ``self.fval``, pickles the parameters to 'step.soepy.pkl' on the first
        evaluation and whenever the value improves on the best one so far,
        and appends to the monitoring log files.

        Returns
        -------
        float
            Criterion value at ``params_cand``.

        Raises
        ------
        RuntimeError
            After the evaluation that brings the counter up to ``max_evals``.
        """
        # Work on a copy without the "_fixed" column (presumably added by the
        # optimizer -- verify) so the frame owned by the caller is not altered.
        params_cand = params_cand.drop(columns=["_fixed"], errors="ignore")
        self.params = params_cand

        # Obtain criterion function value
        fval, stats_obs, stats_sim = self._calculate_criterion_func_value(params_cand)

        print(fval)

        # Track current / start / best-so-far ("step") values and keep the
        # best parameters on disk.
        if self.fval is None:
            record = {"current": fval, "start": fval, "step": fval}
            self.fval = pd.DataFrame(
                record, columns=["current", "start", "step"], index=[0]
            )
            self.params.to_pickle("step.soepy.pkl")
        else:
            best_so_far = self.fval["step"].iloc[-1]
            start = self.fval["start"].loc[0]

            if best_so_far > fval:
                record = {"current": fval, "start": start, "step": fval}
                self.params.to_pickle("step.soepy.pkl")
            else:
                record = {"current": fval, "start": start, "step": best_so_far}

            # DataFrame.append no longer exists in pandas >= 2.0.
            self.fval = pd.concat(
                [self.fval, pd.DataFrame([record])], ignore_index=True
            )

        self._logging_smm(stats_obs, stats_sim, fval)

        self.num_evals = self.num_evals + 1
        if self.num_evals >= self.max_evals:
            raise RuntimeError('maximum number of evaluations reached')

        return fval

    def _calculate_criterion_func_value(self, params_cand):
        """Simulate at ``params_cand`` and return ``(fval, stats_obs, stats_sim)``.

        ``stats_obs`` and ``stats_sim`` are the observed and simulated moments
        flattened into lists; ``fval`` is the quadratic form of their
        difference in ``self.weighting_matrix``.
        """
        self.params = params_cand

        # Generate simulated data set
        data_frame_sim = soepy.simulate(self.params, self.model_spec_init_file_name)

        # Calculate simulated moments
        moments_sim = self.get_moments(data_frame_sim)

        # Flatten both dictionaries in the same order. The instance attributes
        # are used (not notebook globals), and keys are visited in sorted
        # order, like moments_dict_to_list does for the weighting matrix.
        stats_obs, stats_sim = [], []
        for group in self._MOMENT_GROUPS:
            for key_ in sorted(self.moments_obs[group]):
                stats_obs.extend(self.moments_obs[group][key_])
                stats_sim.extend(moments_sim[group][key_])

        # Construct criterion value
        stats_dif = np.array(stats_obs) - np.array(stats_sim)
        fval = float(np.dot(np.dot(stats_dif, self.weighting_matrix), stats_dif))

        return fval, stats_obs, stats_sim

    def _format_evaluation_header(self, fval):
        """Return the text block shared by both log files for one evaluation."""
        parts = [
            "\n\n{:>8}{:>15}\n\n".format("EVALUATION", self.num_evals),
            "\n\n{:>8}{:>15}\n\n".format("fval", fval),
        ]
        fmt_param = "{:>8}" + "{:>15}" * 2 + "\n\n"
        for label in self.params.index:
            parts.append(
                fmt_param.format(label[0], label[1], self.params.loc[label, "value"])
            )
        return "".join(parts)

    def _logging_smm(self, stats_obs, stats_sim, fval):
        """This method contains logging capabilities that are just relevant for the SMM routine."""

        fname = "monitoring.estimagic.smm.info"
        fname2 = "monitoring_compact.estimagic.smm.info"

        # The counter is still 0 while the first evaluation is being logged,
        # so this is the moment to discard logs left over from earlier runs.
        if self.num_evals == 0:
            for stale in (fname, fname2):
                if os.path.exists(stale):
                    os.unlink(stale)

        header = self._format_evaluation_header(fval)

        # Full log: header plus one row per moment.
        with open(fname, "a+") as outfile:
            outfile.write(header)

            fmt_ = "{:>8}" + "{:>15}" * 4 + "\n\n"
            info = ["Moment", "Observed", "Simulated", "Difference", "Weight"]
            outfile.write(fmt_.format(*info))

            fmt_ = "{:>8}" + "{:15.5f}" * 4 + "\n"
            for idx, (stat_obs, stat_sim) in enumerate(zip(stats_obs, stats_sim)):
                outfile.write(
                    fmt_.format(
                        idx,
                        stat_obs,
                        stat_sim,
                        abs(stat_obs - stat_sim),
                        # Only the diagonal weight of the moment is reported.
                        self.weighting_matrix[idx, idx],
                    )
                )

        # Compact log: header only.
        with open(fname2, "a+") as outfile:
            outfile.write(header)


In [7]:
def get_weighting_matrix(data, num_agents_smm, num_samples, seed=None):
    """Build a diagonal weighting matrix from resampled moment variances.

    ``num_samples`` subsamples of ``num_agents_smm`` rows each are drawn from
    ``data`` with ``DataFrame.sample``; the moments of every subsample are
    computed and the inverse of each moment's variance across subsamples is
    placed on the diagonal of the returned matrix.

    NOTE(review): ``data.sample`` draws rows. If a row of ``data`` is an
    agent-period observation, a subsample is not a sample of agents -- confirm
    that this is intended.

    Parameters
    ----------
    data : pandas.DataFrame
        Observed data in the layout expected by ``get_moments``.
    num_agents_smm : int
        Number of rows per subsample.
    num_samples : int
        Number of subsamples; must be at least 1.
    seed : int, optional
        Seed for a random state shared by all draws, which makes the matrix
        reproducible. With the default ``None`` the draws are unseeded.

    Returns
    -------
    numpy.ndarray
        Square diagonal matrix with one row per moment.

    Raises
    ------
    ValueError
        If ``num_samples`` is smaller than 1.
    """
    if num_samples < 1:
        raise ValueError("num_samples must be at least 1")

    # One random state for the whole loop: successive draws differ from each
    # other, yet the full sequence is fixed by the seed.
    rng = None if seed is None else np.random.RandomState(seed)

    # One flattened moment vector per subsample.
    stats = []
    for _ in range(num_samples):
        data_sample = data.sample(n=num_agents_smm, random_state=rng)
        stats.append(moments_dict_to_list(get_moments(data_sample)))

    # Calculate sample variances for each moment
    moments_var = np.array(stats).var(axis=0)

    # Moments whose variance is undefined receive the largest defined
    # variance, i.e. the smallest weight.
    moments_var[np.isnan(moments_var)] = np.nanmax(moments_var)

    # (Near-)zero variances would give exploding weights; use 0.1 instead.
    moments_var[moments_var <= 1e-10] = 0.1

    # Construct weighting matrix
    return np.diag(moments_var ** (-1))


def moments_dict_to_list(moments_dict):
    """Flatten a moments dictionary into a single list of values.

    The groups "Wage_Distribution" and "Choice_Probability" are visited in
    that order; within a group the periods are visited in ascending key order
    and the values stored for each period are appended one after another.
    """
    group_names = ("Wage_Distribution", "Choice_Probability")
    return [
        value
        for name in group_names
        for period in sorted(moments_dict[name])
        for value in moments_dict[name][period]
    ]



In [8]:
# Lower bounds, one entry per row of the parameter frame (23 values).
# Runs of identical bounds are written as repeated blocks.
lower = np.array(
    [1.000] * 3
    + [0.050] * 3
    + [0.005] * 3
    + [0.001] * 3
    + [1.00] * 2
    + [-0.400, -0.800, -0.400, -0.800]
    + [0.001] * 5
)

# Upper bounds, one entry per row of the parameter frame (23 values).
# Runs of identical bounds are written as repeated blocks.
upper = np.array(
    [3.000] * 3
    + [0.400] * 3
    + [0.600] * 3
    + [0.150] * 3
    + [4.00] * 2
    + [-0.050, -0.150, -0.050, -0.150]
    + [0.999] * 2
    + [0.800] * 3
)

In [11]:
def prepare_estimation(
    model_params_init_file_name,
    model_spec_init_file_name,
    data_file_name,
    lower,
    upper,
    num_agents_smm=500,
    num_samples=200,
):
    """Prepares objects for SMM estimation.

    Parameters
    ----------
    model_params_init_file_name : str
        Pickle file holding the data frame of start values.
    model_spec_init_file_name : str
        Not used by this function; kept so existing calls keep working.
    data_file_name : str
        Pickle file holding the observed data.
    lower, upper : array-like
        Bounds stored in the "lower" and "upper" columns of the parameter
        frame; one entry per row.
    num_agents_smm : int, optional
        Rows per subsample when estimating moment variances. Defaults to 500.
    num_samples : int, optional
        Number of subsamples when estimating moment variances. Defaults
        to 200.

    Returns
    -------
    tuple
        ``(moments_obs, weighting_matrix, model_params_df)``.
    """
    # Read in data and init file sources.
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    model_params_df = pd.read_pickle(model_params_init_file_name)
    data = pd.read_pickle(data_file_name)

    # Attach the parameter bounds to the parameter frame.
    model_params_df["lower"] = lower
    model_params_df["upper"] = upper

    # Get moments from observed data
    moments_obs = get_moments(data)

    # Calculate weighting matrix based on resampled variances of observed moments
    weighting_matrix = get_weighting_matrix(
        data, num_agents_smm=num_agents_smm, num_samples=num_samples
    )

    return moments_obs, weighting_matrix, model_params_df

In [24]:
# Input files: start values, model specification and observed data.
model_params_init_file_name = "params_init_delta0.pkl"
model_spec_init_file_name = "model_spec_init_test_zero.yml"
data_file_name = "data_obs_3types.pkl"

# Observed moments, weighting matrix and bounded parameter frame.
moments_obs, weighting_matrix, model_params_df = prepare_estimation(
    model_params_init_file_name,
    model_spec_init_file_name,
    data_file_name,
    lower,
    upper,
)

In [25]:
# Number of criterion evaluations after which the adapter raises an error.
max_evals = 1000

# Object whose get_objective method is the criterion handed to the optimizer.
adapter_smm = SimulationBasedEstimationCls(
    params=model_params_df,
    model_spec_init_file_name=model_spec_init_file_name,
    moments_obs=moments_obs,
    weighting_matrix=weighting_matrix,
    get_moments=get_moments,
    max_evals=max_evals,
)

In [26]:
# Options forwarded to the optimization algorithm.
# NOTE(review): NLopt's own stopping-value option is spelled "stopval";
# confirm that the key "stopeval" is recognized by the optimizer interface.
algo_options = {"stopeval": 1e-9}

In [None]:
# Minimize the SMM criterion with NLopt's BOBYQA, starting from the
# parameters currently stored on the adapter.
result = minimize(
    criterion=adapter_smm.get_objective,
    params=adapter_smm.params,
    algorithm="nlopt_bobyqa",
    algo_options=algo_options,
)

2.288148813240183
2.288148813240183
2.288148813240183
117.62594066207873
171.85513422529914
362.72383522802176
7.167446716701969
16.485602976036763
43.74076966830853
2.3470151818751672
2.3253076278917737
2.605943202178503
2.2891838572329526
2.2916049250330146
2.349501546027568
3958.5238795268006
1283.812097096212
3.51475987539418
10.155911834919603
3.212617923091189
12.83515825020311
21.14788335281556
27.06027118329708
2.3930184487958392
2.275299576229437
219.2520302487295
5735.071698973661
67.69541356602744
86.33749016026138
5.7258942605138845
9.998266335304761
22.893954434174503
2.3508808760271354
2.3185824875810943
2.6626902771358307
2.295720682457498
2.300602966948591
2.3497938914485026
1283.812097096212
935.2612083124848
4.402739998965584
6.10558342564775
3.4628749471396816
9.836563600349304
17.6733879186622
20.916544310471515
2.288148813240183
2.3815428355666137
96.09808107751633
176.65220550403683
97.88908276007072
42.90838903015602
12.516175595634863
2.547677354977497
25.583531

2.1881215975325787
2.1881210125429393
2.1881208638819793
2.188120778181003
2.1881199764052455
2.188120294090919
2.188119478325488
2.18811892328717
2.188119235349072
2.1881190743750465
2.1881190518406983
2.18811924716098
2.188118959434454
2.1881191715334447
2.1881186384054594
2.188118218985449
2.188118173664152
2.188117569257546
2.1881168273154374
2.188116703319199
2.1881168123243238
2.188116792541826
2.188116692785006
2.188116422806322
2.1881163850570178
2.1881160085710065
2.188116061139424
2.1881158848409945
2.188115999444179
2.1881161863306064
2.188115616551082
2.188115058753581
2.188115369129923
2.1881135143130286
2.1881104264050553
2.1881042544811296
2.1880919261920524
2.1880673317575883
2.1880183907413864
2.1911871559929184
2.1912850479500854
2.1891120301809046
2.188018398559888
2.188013532018499
2.188013598134482
2.1880119693023747
2.1880119808762633
2.188011578057197
2.188011615416955
2.1880114512828226
2.1880114529908883
2.1880112543150547
2.188011242066132
2.1880110171309655
2