# Data Cleaning
Data cleaning for ECON672 Project

In [118]:
%%capture
# Capture suppresses output to screen
# stata_setup is configured here so the later %%stata cells can run Stata code.
import stata_setup
# First argument is the Stata installation directory, second the edition ("be").
# NOTE(review): the path is specific to this macOS machine — adjust on other systems.
stata_setup.config("/Applications/Stata 17", "be")

In [119]:
%%stata
/* Define paths for data and output */
// All paths are relative, so they resolve against Stata's current working
// directory. NOTE(review): confirm the notebook is launched from the project root.
global data "Data"
global output "Output"
// Raw inputs and processed outputs both live under the data directory.
global raw "$data/Raw"
global processed "$data/Processed"
// Analysis dataset that the later cells in this notebook save to.
global finalData "$processed/taxIncentives.dta"


. /* Define paths for data and output */
. global data "Data"

. global output "Output"

. global raw "$data/Raw"

. global processed "$data/Processed"

. global finalData "$processed/taxIncentives.dta"

. 


In [120]:
%%capture
%%stata
// Close any log left open by an earlier run so re-executing this cell does not
// stop with a "log file already open" error; capture ignores the error raised
// when no log is open.
capture log close
// Record the whole cleaning session, overwriting the previous log file.
log using "$output/dataClean.log", replace

## Import Wage and Workforce data
Data are for NAICS 3364 (Aerospace Product and Parts Manufacturing) from the BLS Quarterly Census of Employment and Wages (QCEW).

| i.year | c.employ | c.wage | i.state |
| --- | ---- | --- | --- |
| year | employment | wage |  MO |
| year | employment | wage |  WA |  

In [121]:
%%stata
// List the raw input directory to confirm the expected workbooks are present.
ls $raw


total 208
-rw-r--r--@ 1 alwashere  staff  24768 Jan 20 16:58 CensusPopEst2010to2019.xlsx
-rw-r--r--@ 1 alwashere  staff  10278 Jan 20 16:48 MO_employment_QCEW.xlsx
-rw-r--r--@ 1 alwashere  staff  10205 Jan 20 16:50 MO_wages_QCEW.xlsx
-rw-r--r--@ 1 alwashere  staff  10287 Jan 20 16:44 WA_employment_QCEW.xlsx
-rw-r--r--@ 1 alwashere  staff  10118 Jan 20 16:45 WA_wages_QCEW.xlsx
-rw-r--r--@ 1 alwashere  staff  24987 Jan 26 20:26 unemploymentBLS.xlsx


---

### Missouri QCEW

In [122]:
%%stata
// Load the Missouri QCEW employment table: the first row of the cell range
// supplies the variable names. Then inspect structure and summary statistics.
import excel using "$raw/MO_employment_QCEW.xlsx", ///
    sheet("BLS Data Series") cellrange(A14:N34) firstrow clear
describe
summarize


. import excel "$raw/MO_employment_QCEW.xlsx", /// 
>     sheet("BLS Data Series") /// 
>     cellrange(A14:N34) /// 
>     firstrow ///
>     clear
(14 vars, 20 obs)

. describe

Contains data
 Observations:            20                  
    Variables:            14                  
-------------------------------------------------------------------------------
Variable      Storage   Display    Value
    name         type    format    label      Variable label
-------------------------------------------------------------------------------
Year            int     %10.0g                Year
Jan             byte    %10.0g                Jan
Feb             byte    %10.0g                Feb
Mar             byte    %10.0g                Mar
Apr             byte    %10.0g                Apr
May             byte    %10.0g                May
Jun             byte    %10.0g                Jun
Jul             byte    %10.0g                Jul
Aug             byte    %10.0g                Au

The analysis is only at the annual level, so I will drop the month fields.

In [123]:
%%stata
// Retain only the year and the annual figure, then give both columns
// lowercase, descriptive names in a single group rename.
keep Year Annual
rename (Year Annual) (year employment)
label variable employment "Annual Reported Employment (NAICS 3364)"


. keep Year Annual

. rename Year year

. rename Annual employment

. label variable employment "Annual Reported Employment (NAICS 3364)"

. 


Create a "working" frame and change to it.  
The "default" frame will be merged with the "working" frame to 
create the primary dataset for analysis.

In [124]:
%%stata
// mkf/cwf are the short synonyms for frame create / frame change:
// make an empty frame called "working" and switch to it.
mkf working
cwf working


. frame create working

. frame change working

. 


In [125]:
%%stata
// Missouri wages: read only columns A-B (Year and Annual) of the BLS sheet,
// taking variable names from the first row, then inspect the result.
import excel using "$raw/MO_wages_QCEW.xlsx", ///
    sheet("BLS Data Series") cellrange(A14:B34) firstrow clear
describe
summarize


. // Import Missouri Wages
. import excel "$raw/MO_wages_QCEW.xlsx", /// 
>     sheet("BLS Data Series") /// 
>     cellrange(A14:B34) /// 
>     firstrow ///
>     clear
(2 vars, 20 obs)

. describe

Contains data
 Observations:            20                  
    Variables:             2                  
-------------------------------------------------------------------------------
Variable      Storage   Display    Value
    name         type    format    label      Variable label
-------------------------------------------------------------------------------
Year            int     %10.0g                Year
Annual          long    %10.0g                Annual
-------------------------------------------------------------------------------
Sorted by: 
     Note: Dataset has changed since last saved.

. summarize

    Variable |        Obs        Mean    Std. dev.       Min        Max
-------------+---------------------------------------------------------
        Year |         20

In [126]:
%%stata
// Rename both columns in one group rename, then label the wage series.
rename (Year Annual) (year wages)
label variable wages "Annual Reported Wages (USD) (NAICS 3364)"


. rename Year year

. rename Annual wages

. label variable wages "Annual Reported Wages (USD) (NAICS 3364)"

. 


In [127]:
%%stata
// Return to the frame holding the employment data.
frame change default
// Create link to "working" frame by matching on year
// (1:1: each year is expected once in each frame). The link is stored as a
// new variable in the default frame; a later cell drops it as `working`.
frlink 1:1 year, frame(working)
// Copy wages across the link into the default frame.
frget wages, from(working)


. frame change default

. // Create link to "working" frame by matching on year
. frlink 1:1 year, frame(working)
  (all observations in frame default matched)

. frget wages, from(working)
  (1 variable copied from linked frame)

. 


In [128]:
%%stata
// Tag every row with the state abbreviation so the two states can be
// told apart after the datasets are appended.
generate state="MO"

// Drop frame link variable
drop working


. generate state="MO"

. 
. // Drop frame link variable
. drop working

. 


In [129]:
%%stata
// Check data: print the first ten rows to eyeball year, employment, wages, state
list in 1/10


. // Check data
. list in 1/10

     +---------------------------------+
     | year   employ~t   wages   state |
     |---------------------------------|
  1. | 2001      13998   63203      MO |
  2. | 2002      13556   70430      MO |
  3. | 2003      12552   78051      MO |
  4. | 2004      15900   76518      MO |
  5. | 2005      15452   81340      MO |
     |---------------------------------|
  6. | 2006      15306   90639      MO |
  7. | 2007      14560   86164      MO |
  8. | 2008      14549   89035      MO |
  9. | 2009      14922   87379      MO |
 10. | 2010      14657   91617      MO |
     +---------------------------------+

. 


In [130]:
%%stata
// Save Missouri data (overwrites any earlier copy); it is appended to the
// Washington data in a later cell.
save "$processed/missouri.dta", replace


. // Save Missouri data
. save "$processed/missouri.dta", replace
file Data/Processed/missouri.dta saved

. 


In [131]:
%%stata
// Reset frames so the Washington import starts from a clean frame space
// (the Missouri data has already been saved to disk).
frame reset

---

### Washington QCEW Data

In [132]:
%%stata
// Washington employment: same sheet layout and cell range as the Missouri
// workbook; first row of the range supplies the variable names.
import excel using "$raw/WA_employment_QCEW.xlsx", ///
    sheet("BLS Data Series") cellrange(A14:N34) firstrow clear
describe
summarize


. // Import Washington Employment
. import excel "$raw/WA_employment_QCEW.xlsx", /// 
>     sheet("BLS Data Series") /// 
>     cellrange(A14:N34) /// 
>     firstrow ///
>     clear
(14 vars, 20 obs)

. describe

Contains data
 Observations:            20                  
    Variables:            14                  
-------------------------------------------------------------------------------
Variable      Storage   Display    Value
    name         type    format    label      Variable label
-------------------------------------------------------------------------------
Year            int     %10.0g                Year
Jan             byte    %10.0g                Jan
Feb             byte    %10.0g                Feb
Mar             byte    %10.0g                Mar
Apr             byte    %10.0g                Apr
May             byte    %10.0g                May
Jun             byte    %10.0g                Jun
Jul             byte    %10.0g                Jul
Aug           

The analysis is only at the annual level, so I will drop the month fields.

In [133]:
%%stata
// Retain only the year and the annual figure, then rename both columns
// in a single group rename.
keep Year Annual
rename (Year Annual) (year employment)
label variable employment "Annual Reported Employment (NAICS 3364)"


. keep Year Annual

. rename Year year

. rename Annual employment

. label variable employment "Annual Reported Employment (NAICS 3364)"

. 


In [134]:
%%stata
// Create and change to working frame
// (the wages workbook is loaded there so the employment data in the
// default frame is not overwritten by the import's clear option).
frame create working
frame change working


. // Create and change to working frame
. frame create working

. frame change working

. 


In [135]:
%%stata
// Washington wages: read only columns A-B (Year and Annual) of the BLS sheet,
// taking variable names from the first row, then inspect the result.
import excel using "$raw/WA_wages_QCEW.xlsx", ///
    sheet("BLS Data Series") cellrange(A14:B34) firstrow clear
describe
summarize


. // Import Washington Wages
. import excel "$raw/WA_wages_QCEW.xlsx", /// 
>     sheet("BLS Data Series") /// 
>     cellrange(A14:B34) /// 
>     firstrow ///
>     clear
(2 vars, 20 obs)

. describe

Contains data
 Observations:            20                  
    Variables:             2                  
-------------------------------------------------------------------------------
Variable      Storage   Display    Value
    name         type    format    label      Variable label
-------------------------------------------------------------------------------
Year            int     %10.0g                Year
Annual          long    %10.0g                Annual
-------------------------------------------------------------------------------
Sorted by: 
     Note: Dataset has changed since last saved.

. summarize

    Variable |        Obs        Mean    Std. dev.       Min        Max
-------------+---------------------------------------------------------
        Year |         

In [136]:
%%stata
// Rename both columns in one group rename, then label the wage series.
rename (Year Annual) (year wages)
label variable wages "Annual Reported Wages (USD) (NAICS 3364)"


. rename Year year

. rename Annual wages

. label variable wages "Annual Reported Wages (USD) (NAICS 3364)"

. 


In [137]:
%%stata
// Rebuild the link using current dataset and pull data into primary frame.
frame change default
// Match the two frames one-to-one on year; the link variable created here
// is dropped as `working` in a later cell.
frlink 1:1 year, frame(working)
// Copy wages across the link into the default frame.
frget wages, from(working)


. // Rebuild the link using current dataset and pull data into primary frame.
. frame change default

. frlink 1:1 year, frame(working)
  (all observations in frame default matched)

. frget wages, from(working)
  (1 variable copied from linked frame)

. 


---

In [138]:
%%stata
// Tag every row with the state abbreviation
generate state="WA"
// Drop frame link variable
drop working


. generate state="WA"

. drop working

. 


In [139]:
%%stata
// Check data: print the first ten rows
list in 1/10


     +---------------------------------+
     | year   employ~t   wages   state |
     |---------------------------------|
  1. | 2001      87243   65162      WA |
  2. | 2002      75653   71465      WA |
  3. | 2003      65274   73460      WA |
  4. | 2004      61384   77747      WA |
  5. | 2005      65616   84146      WA |
     |---------------------------------|
  6. | 2006      73180   90155      WA |
  7. | 2007      80036   86735      WA |
  8. | 2008      82932   87114      WA |
  9. | 2009      82920   87887      WA |
 10. | 2010      80762   91526      WA |
     +---------------------------------+


In [140]:
%%stata
// Save Washington data (overwrites any earlier copy); it is appended to the
// Missouri data in a later cell.
save "$processed/washington.dta", replace


. // Save Washington data
. save "$processed/washington.dta", replace
file Data/Processed/washington.dta saved

. 


In [141]:
%%stata
// Clean up frame space, drop link
// (the Washington data has already been saved to disk).
frame reset


. // Clean up frame space, drop link
. frame reset

. 


### Append Missouri and Washington QCEW data

In [142]:
%%stata
// Stack the two state files into one long state-year dataset.
// Paths are quoted so they still parse if a directory name contains spaces,
// matching the quoted save commands used elsewhere in this notebook.
use "$processed/missouri", clear
append using "$processed/washington"

// Sort by year and state and check first 10 rows.
sort year state
list in 1/10


. use $processed/missouri, clear

. append using $processed/washington
(variable employment was int, now long to accommodate using data's values)

. 
. // Sort by year and state and check first 10 rows.
. sort year state

. list in 1/10

     +---------------------------------+
     | year   employ~t   wages   state |
     |---------------------------------|
  1. | 2001      13998   63203      MO |
  2. | 2001      87243   65162      WA |
  3. | 2002      13556   70430      MO |
  4. | 2002      75653   71465      WA |
  5. | 2003      12552   78051      MO |
     |---------------------------------|
  6. | 2003      65274   73460      WA |
  7. | 2004      15900   76518      MO |
  8. | 2004      61384   77747      WA |
  9. | 2005      15452   81340      MO |
 10. | 2005      65616   84146      WA |
     +---------------------------------+

. 


In [143]:
%%stata
// Save intermediate file
// (combined MO + WA employment and wages; the Census data is merged onto
// this file in a later cell).
save "$finalData", replace


. // Save intermediate file
. save "$finalData", replace
file Data/Processed/taxIncentives.dta saved

. 


---

### Add Washington and Missouri Census Data

In [144]:
%%stata
// Load the Census population-estimate sheet; the first row supplies variable names.
import excel "$raw/CensusPopEst2010to2019.xlsx", ///
    sheet("Data") ///
    firstrow ///
    clear

/*
    Drop B, which is the "2010 Census population"
    Drop C, which is the estimate base
    Data will only include population estimates 
      July 2010 - July 2019
*/
drop B C

// Drop row 1 which is also header
// (observations 2-3 are the two state rows that remain).
keep in 2/3


. import excel "$raw/CensusPopEst2010to2019.xlsx", ///
>     sheet("Data") ///
>     firstrow ///
>     clear
(13 vars, 3 obs)

. 
. /*
>     Drop B which is"2010 Census population"
>     Drop C which is the estimate base
>     Data will only include population estimates 
>       July 2010- July 2019
> */
. drop B C

. 
. // Drop row 1 which is also header
. keep in 2/3
(1 observation deleted)

. 


In [145]:
%%stata
// Column A holds the state name; recode it to the two-letter abbreviation
// used in the QCEW data so the datasets can be merged on state.
rename A state
replace state="WA" if state=="Washington"
replace state="MO" if state=="Missouri"

// Rename each population column to pop<year> for the reshape cmd.
// Columns D-M hold the ten annual estimates in order, starting with 2010.
// A local macro counter is used rather than a scalar: scalars share a
// namespace with variables, and the macro expands directly to the integer
// without a display/int() round-trip.
local yr = 2010
foreach var of varlist D-M {
    rename `var' pop`yr'
    local ++yr
}


. rename A state

. replace state="WA" if state=="Washington"
(1 real change made)

. replace state="MO" if state=="Missouri"
(1 real change made)

. 
. // Rename each of the population for reshape cmd
. scalar i = 2010

. foreach var of varlist D-M{
  2.     local j: display int(i)
  3.     rename `var' pop`j'
  4.     scalar i = i + 1
  5. }

. 


In [146]:
%%stata
// Go from one row per state (pop2010 ... pop2019) to one row per state-year,
// with the year suffix moved into a new variable named year.
reshape long pop, i(state) j(year) 

(j = 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019)

Data                               Wide   ->   Long
-----------------------------------------------------------------------------
Number of observations                2   ->   20          
Number of variables                  11   ->   3           
j variable (10 values)                    ->   year
xij variables:
            pop2010 pop2011 ... pop2019   ->   pop
-----------------------------------------------------------------------------


In [147]:
%%stata
/* 
Convert pop from string to numeric, ignoring the ",".  
  Raw data is formatted with comma separator for thousands
  and millions.
*/
destring(pop), replace ignore(",")
label variable pop "US Census Population Estimate"
// Print the reshaped data state by state as a visual check.
by state: list


. /* 
> Convert to Strings by ignoring the ",".  
>   Raw data is formated with comma separator for thousands
>   and millions.
> */
. destring(pop), replace ignore(",")
pop: character , removed; replaced as long

. label variable pop "US Census Population Estimate"

. by state: list

-------------------------------------------------------------------------------
-> state = MO

     +------------------------+
     | state   year       pop |
     |------------------------|
  1. |    MO   2010   5995974 |
  2. |    MO   2011   6010275 |
  3. |    MO   2012   6024367 |
  4. |    MO   2013   6040715 |
  5. |    MO   2014   6056202 |
     |------------------------|
  6. |    MO   2015   6071732 |
  7. |    MO   2016   6087135 |
  8. |    MO   2017   6106670 |
  9. |    MO   2018   6121623 |
 10. |    MO   2019   6137428 |
     +------------------------+

-------------------------------------------------------------------------------
-> state = WA

     +------------------------+
     | sta

---

## Merge Census and QCEW Data

In [148]:
%%stata
// Master = Census population estimates in memory; using = the saved QCEW
// employment/wage file. State-years present only in the QCEW file come in
// as _merge==2 and are kept with pop missing.
// The path is quoted so it still parses if a directory name contains spaces,
// matching the quoted save of the same file.
merge 1:1 state year using "$finalData"


    Result                      Number of obs
    -----------------------------------------
    Not matched                            20
        from master                         0  (_merge==1)
        from using                         20  (_merge==2)

    Matched                                20  (_merge==3)
    -----------------------------------------


In [149]:
%%stata
// The merge result flag is no longer needed once the match counts are reviewed.
drop _merge

In [150]:
%%stata
// Check Data
// Sort first so the by-prefix can be used, then list the years after 2008
// for each state to see where pop is and is not populated.
sort state year
by state: list year pop employment wages if year>2008


. // Check Data
. sort state year

. by state: list year pop employment wages if year>2008

-------------------------------------------------------------------------------
-> state = MO

     +------------------------------------+
     | year       pop   employ~t    wages |
     |------------------------------------|
  9. | 2009         .      14922    87379 |
 10. | 2010   5995974      14657    91617 |
 11. | 2011   6010275      14490    97100 |
 12. | 2012   6024367      14235   102881 |
 13. | 2013   6040715      17689   105111 |
     |------------------------------------|
 14. | 2014   6056202      17778   111073 |
 15. | 2015   6071732      16778   108757 |
 16. | 2016   6087135      16044   108886 |
 17. | 2017   6106670      15814   114566 |
 18. | 2018   6121623      16127   118080 |
     |------------------------------------|
 19. | 2019   6137428      17406   115738 |
 20. | 2020         .      17814   105995 |
     +------------------------------------+

-------------------

---

## Generate Calculated Parameters

### Add indicator dummies for incentive year, leads and lags
Tax incentive was passed in 2014 in WA.  In 2014 WA was selected as the expansion site.

In [151]:
%%stata
// Indicator for 2014 and later. The comparison is evaluated row by row, so
// no by-prefix is needed; the sort is kept because later cells use
// "by state:" and rely on state-year order.
sort state year
generate postIncentive = (year >= 2014)
label variable postIncentive "=1 if year >=2014"


. sort state year

. by state: generate postIncentive=(year>=2014)

. label variable postIncentive "=1 if year >=2014"

. 


In [152]:
%%stata
// T0 to examine the pre and post trends:
// t_0 marks, within each state, the first row where postIncentive switches
// from 0 to 1. Requires the data to be sorted by state and year.
by state: generate t_0 = (postIncentive==1 & postIncentive[_n-1]==0)

// Generate 4 years of lags and leads
foreach i of numlist -4/4 {
    // Offset 0 is t_0 itself. Skip it: otherwise the prefix left over from
    // the previous iteration would create a redundant t_lag0 copy of t_0.
    if `i' == 0 {
        continue
    }

    // Negative offsets look back at earlier rows (lag), positive offsets
    // look ahead at later rows (lead).
    local var = cond(`i' < 0, "lag", "lead")

    // So the lag/lead numbers are always positive
    local j = abs(`i')

    // =1 when the row `i' positions away within the state is the t_0 row.
    by state: generate t_`var'`j' = (t_0[_n + (`i')]==1)
}
// Group all lags of 4 or more years together: replace works through the rows
// in order, so once t_lag4 turns on it stays on for the rest of the state.
by state: replace t_lag4=1 if (t_lag4[_n-1]==1)


. // T0 to examine the pre and post trends
. by state: generate t_0 = (postIncentive==1 & postIncentive[_n-1]==0) 

. 
. // Generate 4 years of lags and leads
. foreach i of numlist -4/4 {
  2.     if `i' < 0 {
  3.         local var "lag"
  4.     }
  5.     if `i' > 0 {
  6.         local var "lead"
  7.     }
  8.     
.     // So the lag/lead numbers are always positive
.     local j: display int(abs(`i'))
  9.     
.     /* 
>         Note: A negative value of i will result
>             in lag.
>     */
.     by state: generate t_`var'`j'=(t_0[_n+`i']==1)
 10. }

. // Group all lags older than 4 year together
. by state: replace t_lag4=1 if (t_lag4[_n-1]==1)
(4 real changes made)

. 


In [153]:
%%stata
// Check the data
// List the two nearest leads and lags around t_0 for each state, with a
// separator line drawn wherever t_0 changes value.
by state: list state year t_lead2 t_lead1 t_0 t_lag1 t_lag2, sepby(t_0) 


. // Check the data
. by state: list state year t_lead2 t_lead1 t_0 t_lag1 t_lag2, sepby(t_0) 

-------------------------------------------------------------------------------
-> state = MO

     +----------------------------------------------------------+
     | state   year   t_lead2   t_lead1   t_0   t_lag1   t_lag2 |
     |----------------------------------------------------------|
  1. |    MO   2001         0         0     0        0        0 |
  2. |    MO   2002         0         0     0        0        0 |
  3. |    MO   2003         0         0     0        0        0 |
  4. |    MO   2004         0         0     0        0        0 |
  5. |    MO   2005         0         0     0        0        0 |
  6. |    MO   2006         0         0     0        0        0 |
  7. |    MO   2007         0         0     0        0        0 |
  8. |    MO   2008         0         0     0        0        0 |
  9. |    MO   2009         0         0     0        0        0 |
 10. |    MO   2

### Add ln(\*) for *pop*, *employment*, and *wages*

In [154]:
%%stata
// Natural logs of population, employment and wages.
// lnPop is missing wherever pop is missing.
generate lnPop=ln(pop)
generate lnEmployment=ln(employment)
generate lnWages = ln(wages)

// Indicator for Washington rows.
generate wa=(state=="WA")
label variable wa "=1 if the state is WA"

// Interaction of the two indicators: Washington rows in 2014 or later.
// The label now says ">=2014" to match how postIncentive is defined
// (it previously read "after 2014", which excludes 2014 itself).
generate treated = postIncentive*wa
label variable treated "=1 if year >=2014 and in WA"


. generate lnPop=ln(pop)
(20 missing values generated)

. generate lnEmployment=ln(employment)

. generate lnWages = ln(wages)

. 
. generate wa=(state=="WA")

. label variable wa "=1 if the state is WA"

. 
. generate treated = postIncentive*wa

. label variable treated "=1 if after 2014 and in WA"

. 


## Save Final Data

In [155]:
%%stata
// Overwrite the intermediate file with the final analysis dataset.
// The path is quoted so it still parses if a directory name contains spaces,
// matching the earlier save of the same file.
save "$finalData", replace


file Data/Processed/taxIncentives.dta saved


In [156]:
%%capture
%%stata
// Drop the data from memory and close the log opened at the top of the notebook.
clear
log close