# Stata Admin Programming

In [64]:
import stata_setup
stata_setup.config("C:/Program Files/Stata17", "se")

## Set up simulated data in Stata
Data broadly resembles a year panel based on IAB data with person (persnr), establishment (betnr) and year identifiers.

In [65]:
%%stata
clear all
set obs 100000

gen betnr = runiformint(1, 1000)
gen persnr = runiformint(1,100000)
gen year = runiformint(1972, 2019)

gen double tentgelt = runiform(0, 100000) 
cap drop random
gen random = runiform(0,1)
replace tentgelt = . if random < 0.05

gen german = runiformint(0,1)
bysort persnr: replace german = german[_N]


. clear all

. set obs 100000
Number of observations (_N) was 0, now 100,000.

. 
. gen betnr = runiformint(1, 1000)

. gen persnr = runiformint(1,100000)

. gen year = runiformint(1972, 2019)

. 
. gen double tentgelt = runiform(0, 100000) 

. cap drop random

. gen random = runiform(0,1)

. replace tentgelt = . if random < 0.05
(4,942 real changes made, 4,942 to missing)

. 
. gen german = runiformint(0,1)

. bysort persnr: replace german = german[_N]
(18,436 real changes made)

. 


# Working with yearly panels
## Egen: max()
    - initialize variable
    - use max() function to get largest value into last observation determined by 'bysort'
        - note missings are ignored by max() unless *all* observations are missing
    - replace within 'bysort' by the last observation

In [66]:
%%stata
gen max_tentgelt = tentgelt
bysort persnr: replace max_tentgelt = max(max_tentgelt[_n-1], max_tentgelt)
bysort persnr: replace max_tentgelt = max_tentgelt[_N]


. gen max_tentgelt = tentgelt
(4,942 missing values generated)

. bysort persnr: replace max_tentgelt = max(max_tentgelt[_n-1], max_tentgelt)
(20,273 real changes made)

. bysort persnr: replace max_tentgelt = max_tentgelt[_N]
(18,320 real changes made)

. 


### Run-time comparison


In [67]:
%%stata
drop max_tentgelt
timer on 1
gen double max_tentgelt = tentgelt
bysort persnr: replace max_tentgelt = max(max_tentgelt[_n-1], max_tentgelt)
bysort persnr: replace max_tentgelt = max_tentgelt[_N]
timer off 1

timer on 2
egen double max_tentgelt_egen = max(tentgelt), by(persnr)
timer off 2

timer on 3
gegen double max_tentgelt_gegen = max(tentgelt), by(persnr)
timer off 3

assert max_tentgelt == max_tentgelt_egen
assert max_tentgelt == max_tentgelt_gegen

timer list


. drop max_tentgelt

. timer on 1

. gen double max_tentgelt = tentgelt
(4,942 missing values generated)

. bysort persnr: replace max_tentgelt = max(max_tentgelt[_n-1], max_tentgelt)
(20,273 real changes made)

. bysort persnr: replace max_tentgelt = max_tentgelt[_N]
(18,320 real changes made)

. timer off 1

. 
. timer on 2

. egen double max_tentgelt_egen = max(tentgelt), by(persnr)
(1,910 missing values generated)

. timer off 2

. 
. timer on 3

. gegen double max_tentgelt_gegen = max(tentgelt), by(persnr)

. timer off 3

. 
. assert max_tentgelt == max_tentgelt_egen

. assert max_tentgelt == max_tentgelt_gegen

. 
. timer list
   1:      0.16 /        1 =       0.1560
   2:      0.11 /        1 =       0.1130
   3:      0.07 /        1 =       0.0730

. 


## Egen: min()
- use bysort with implicit sorting on the variable of interest to (a) compute by groups and (b) utilize that sorting starts with the smallest value

In [68]:
%%stata
bysort persnr (tentgelt): gen double min_tentgelt = tentgelt[1]

egen double min_tentgelt_egen = min(tentgelt), by(persnr)

gegen double min_tentgelt_gegen = min(tentgelt), by(persnr)

assert min_tentgelt == min_tentgelt_egen
assert min_tentgelt == min_tentgelt_gegen


. bysort persnr (tentgelt): gen double min_tentgelt = tentgelt[1]
(1,910 missing values generated)

. 
. egen double min_tentgelt_egen = min(tentgelt), by(persnr)
(1,910 missing values generated)

. 
. gegen double min_tentgelt_gegen = min(tentgelt), by(persnr)

. 
. assert min_tentgelt == min_tentgelt_egen

. assert min_tentgelt == min_tentgelt_gegen

. 


### Run-time comparison

In [69]:
%%stata
cap drop min_tentgelt*

timer on 1
bysort persnr (tentgelt): gen double min_tentgelt = tentgelt[1]
timer off 1

timer on 2
egen double min_tentgelt_egen = min(tentgelt), by(persnr)
timer off 2

timer on 3
gegen double min_tentgelt_gegen = min(tentgelt), by(persnr)
timer off 3

timer list


. cap drop min_tentgelt*

. 
. timer on 1

. bysort persnr (tentgelt): gen double min_tentgelt = tentgelt[1]
(1,910 missing values generated)

. timer off 1

. 
. timer on 2

. egen double min_tentgelt_egen = min(tentgelt), by(persnr)
(1,910 missing values generated)

. timer off 2

. 
. timer on 3

. gegen double min_tentgelt_gegen = min(tentgelt), by(persnr)

. timer off 3

. 
. timer list
   1:      0.18 /        2 =       0.0890
   2:      0.32 /        2 =       0.1590
   3:      0.13 /        2 =       0.0645

. 


## Egen: rank()
rank is a little more subtle mostly because ties are broken in an arbitrary manner in the egen() function (and thus also for gegen)
the default egen option, however, is to assign the average rank to equal observations
while this is always unique it is harder to implement resorting only to sorting methods

### rank() - low-to-high, (non-)arbitrary tie breaks
this is probably the easiest to implement
- implicitly sort by the variable to be ranked on and then use the _n index
    - caveat: missing values will be sorted to the end (as they are handled as infinitely large in Stata)
    - this means that we can simply set the rank variable to missing for those with missing base variables
    - the sorting will not be impacted as it is low-to-high 
- to make tie-breaks non arbitrary: implicitly sort by a simulated  variable

In [70]:
%%stata
set seed 1234
cap drop sortvar
gen sortvar = runiform(0,1)

bysort betnr year (tentgelt sortvar): gen rank_tentgelt = _n ///
    if !missing(tentgelt)
assert missing(rank_tentgelt) if missing(tentgelt)
assert missing(tentgelt) if missing(rank_tentgelt)

bysort betnr year (sortvar): egen rank_tentgelt_egen = rank(tentgelt)
bysort betnr year (sortvar): gegen rank_tentgelt_gegen = rank(tentgelt)

sum rank*

assert rank_tentgelt == rank_tentgelt_egen
assert rank_tentgelt == rank_tentgelt_gegen



. set seed 1234

. cap drop sortvar

. gen sortvar = runiform(0,1)

. 
. bysort betnr year (tentgelt sortvar): gen rank_tentgelt = _n ///
>     if !missing(tentgelt)
(4,942 missing values generated)

. assert missing(rank_tentgelt) if missing(tentgelt)

. assert missing(tentgelt) if missing(rank_tentgelt)

. 
. bysort betnr year (sortvar): egen rank_tentgelt_egen = rank(tentgelt)
(4,942 missing values generated)

. bysort betnr year (sortvar): gegen rank_tentgelt_gegen = rank(tentgelt)
rank() is not a gtools function; will hash and use egen
(4942 missing values generated)

. 
. sum rank*

    Variable |        Obs        Mean    Std. dev.       Min        Max
-------------+---------------------------------------------------------
rank_tentg~t |     95,058    1.991584    1.145786          1         10
rank_t~_egen |     95,058    1.991584    1.145786          1         10
rank_t~gegen |     95,058    1.991584    1.145786          1         10

. 
. assert rank_tentgelt == rank_tentgelt

## Run-time comparison

In [72]:
%%stata
cap drop rank*
timer on 1
set seed 1234
cap drop sortvar
gen sortvar = runiform(0,1)
bysort betnr year (tentgelt sortvar): gen rank_tentgelt = _n ///
    if !missing(tentgelt)
timer off 1

timer on 2
set seed 1234
cap drop sortvar
gen sortvar = runiform(0,1)
bysort betnr year (sortvar): egen rank_tentgelt_egen = rank(tentgelt)
timer off 2

timer on 3
set seed 1234
cap drop sortvar
gen sortvar = runiform(0,1)
bysort betnr year (sortvar): gegen rank_tentgelt_gegen = rank(tentgelt)
timer off 3

timer list


. cap drop rank*

. timer on 1

. set seed 1234

. cap drop sortvar

. gen sortvar = runiform(0,1)

. bysort betnr year (tentgelt sortvar): gen rank_tentgelt = _n ///
>     if !missing(tentgelt)
(4,942 missing values generated)

. timer off 1

. 
. timer on 2

. set seed 1234

. cap drop sortvar

. gen sortvar = runiform(0,1)

. bysort betnr year (sortvar): egen rank_tentgelt_egen = rank(tentgelt)
(4,942 missing values generated)

. timer off 2

. 
. timer on 3

. set seed 1234

. cap drop sortvar

. gen sortvar = runiform(0,1)

. bysort betnr year (sortvar): gegen rank_tentgelt_gegen = rank(tentgelt)
rank() is not a gtools function; will hash and use egen
(4942 missing values generated)

. timer off 3

. 
. timer list
   1:      0.28 /        4 =       0.0690
   2:      0.96 /        4 =       0.2397
   3:      0.81 /        4 =       0.2035

. 
