# Data Cleaning - QCEW
Data cleaning for ECON672 Project

In [50]:
%%capture
# Capture suppresses output to screen
import stata_setup
stata_setup.config("/Applications/Stata 17", "be")

In [51]:
%%stata
/* Define paths for data and output */
global data "Data"
global output "Output"
global raw "$data/Raw"
global rawEmploy "$raw/QCEW/Employment"
global rawPay "$raw/QCEW/AnnualPay"
global processed "$data/Processed"
global processedEmploy "$processed/QCEW/Employment"
global processedPay "$processed/QCEW/AnnualPay"


global finalEmploy "$processed/QCEW/MergedEmploy.dta"
global finalQCEWData "$processed/QCEW/MergedQCEW.dta"


. /* Define paths for data and output */
. global data "Data"

. global output "Output"

. global raw "$data/Raw"

. global rawEmploy "$raw/QCEW/Employment"

. global rawPay "$raw/QCEW/AnnualPay"

. global processed "$data/Processed"

. global processedEmploy "$processed/QCEW/Employment"

. global processedPay "$processed/QCEW/AnnualPay"

. 
. 
. global finalEmploy "$processed/QCEW/MergedEmploy.dta"

. global finalQCEWData "$processed/QCEW/MergedQCEW.dta"

. 


In [52]:
%%capture
%%stata
log using "$output/dataClean-syth.log", replace

## Import Wage and Workforce data
Data is for NAICS 3364, Aircraft manufacturing from BLS QCEW

| i.year | c.employ | c.wage | i.state |
| --- | ---- | --- | --- |
| year | employment | wage |  MO |
| year | employment | wage |  WA |  

## Import  Employment Numbers from the States

In [53]:
%%stata
/* Read all the xlsx files into Stata files */   
local fileList: dir "$rawEmploy" files "*"
local n_files: word count `fileList'
 
forval i=1/`n_files' {
    local thisFile `: word `i' of `fileList''
    scalar thisState = substr("`thisFile'", 1, 2)                           
    
    di ">>>Importing data for ", thisState, ": `thisFile'"
    import excel "$rawEmploy/`thisFile'", /// 
    sheet("BLS Data Series") /// 
    cellrange(A14:N34) /// 
    firstrow ///
    clear
    
    /* This will be defaulted to int sometimes
        and long others.  Force all files to be
        long dataype
    */
    recast long Annual
    
    /* Drop unneeded data and fix variable names 
        Note: Employment data is reported annually
    */
    keep Year Annual
    rename Year year
    rename Annual employment
    label variable ///
        employment "Annual Reported Employment (NAICS 3364)"
    
    local dtaFile: di (thisState+".dta")
    save "$processedEmploy/`dtaFile'", replace
}


. /* Read all the xlsx files into Stata files */   
. local fileList: dir "$rawEmploy" files "*"

. local n_files: word count `fileList'

.  
. forval i=1/`n_files' {
  2.     local thisFile `: word `i' of `fileList''
  3.     scalar thisState = substr("`thisFile'", 1, 2)                         
>   
  4.     
.     di ">>>Importing data for ", thisState, ": `thisFile'"
  5.     import excel "$rawEmploy/`thisFile'", /// 
>     sheet("BLS Data Series") /// 
>     cellrange(A14:N34) /// 
>     firstrow ///
>     clear
  6.     
.     /* This will be defaulted to int sometimes
>         and long others.  Force all files to be
>         long dataype
>     */
.     recast long Annual
  7.     
.     /* Drop unneeded data and fix variable names 
>         Note: Employment data is reported annually
>     */
.     keep Year Annual
  8.     rename Year year
  9.     rename Annual employment
 10.     label variable ///
>         employment "Annual Reported Employment (NAICS 3364)"
 11.     
. 

In [54]:
%%stata 
/* Append all the State-level Stata files into a single file */   
local fileList: dir "$processedEmploy" files "*"
local n_files: word count `fileList'
 
forval i=1/`n_files' {
    local thisFile `: word `i' of `fileList''
    scalar thisState = substr("`thisFile'", 1, 2)                           
    

    /* Special actions for first file */
    if `i'==1 {
        di ">>>Importing data for ", thisState, ": `thisFile'"
        use "$processedEmploy/`thisFile'", clear
        generate state = thisState
    }

    /* Actions for all subsequent files */
    if `i'!=1 {
        di ">>>Importing data for ", thisState, ": `thisFile'"
        append using "$processedEmploy/`thisFile'"
        replace state = thisState if state==""
    }
    
    if `i'==`n_files'{
        save "$finalEmploy", replace
    }

}


. /* Append all the State-level Stata files into a single file */   
. local fileList: dir "$processedEmploy" files "*"

. local n_files: word count `fileList'

.  
. forval i=1/`n_files' {
  2.     local thisFile `: word `i' of `fileList''
  3.     scalar thisState = substr("`thisFile'", 1, 2)                         
>   
  4.     
. 
.     /* Special actions for first file */
.     if `i'==1 {
  5.         di ">>>Importing data for ", thisState, ": `thisFile'"
  6.         use "$processedEmploy/`thisFile'", clear
  7.         generate state = thisState
  8.     }
  9. 
.     /* Actions for all subsequent files */
.     if `i'!=1 {
 10.         di ">>>Importing data for ", thisState, ": `thisFile'"
 11.         append using "$processedEmploy/`thisFile'"
 12.         replace state = thisState if state==""
 13.     }
 14.     
.     if `i'==`n_files'{
 15.         save "$finalEmploy", replace
 16.     }
 17. 
. }
>>>Importing data for  GA : GA.dta
>>>Importing data for  KS : KS.dta
(2

In [55]:
%%stata
describe


Contains data from Data/Processed/QCEW/MergedEmploy.dta
 Observations:           680                  
    Variables:             3                  5 Feb 2022 16:28
-------------------------------------------------------------------------------
Variable      Storage   Display    Value
    name         type    format    label      Variable label
-------------------------------------------------------------------------------
year            int     %10.0g                Year
employment      long    %10.0g                Annual Reported Employment (NAICS
                                                3364)
state           str2    %9s                   
-------------------------------------------------------------------------------
Sorted by: 


---

## Import annual pay data from the states.

Create "working" frame and change to "working".  
The "default" frame with be merged with the "working" frame to 
create the primary dataset for analysis.

In [56]:
%%stata
frame create working
frame change working


. frame create working

. frame change working

. 


In [57]:
%%stata
/* Read all the xlsx files into Stata files */   
local fileList: dir "$rawPay" files "*"
local n_files: word count `fileList'
 
forval i=1/`n_files' {
    local thisFile `: word `i' of `fileList''
    scalar thisState = substr("`thisFile'", 1, 2)                           
    
    di ">>>Importing data for ", thisState, ": `thisFile'"
    import excel "$rawPay/`thisFile'", /// 
    sheet("BLS Data Series") /// 
    cellrange(A14:B34) /// 
    firstrow ///
    clear
    
    /* This will be defaulted to int sometimes
        and long others.  Force all files to be
        long dataype
    */
    recast long Annual
    
    /* Fix variable names */
    rename Year year
    rename Annual annualPay
    label variable annual "Average Annual Pay (USD) (NAICS 3364)"
    
    local dtaFile: di (thisState+".dta")
    save "$processedPay/`dtaFile'", replace
}


. /* Read all the xlsx files into Stata files */   
. local fileList: dir "$rawPay" files "*"

. local n_files: word count `fileList'

.  
. forval i=1/`n_files' {
  2.     local thisFile `: word `i' of `fileList''
  3.     scalar thisState = substr("`thisFile'", 1, 2)                         
>   
  4.     
.     di ">>>Importing data for ", thisState, ": `thisFile'"
  5.     import excel "$rawPay/`thisFile'", /// 
>     sheet("BLS Data Series") /// 
>     cellrange(A14:B34) /// 
>     firstrow ///
>     clear
  6.     
.     /* This will be defaulted to int sometimes
>         and long others.  Force all files to be
>         long dataype
>     */
.     recast long Annual
  7.     
.     /* Fix variable names */
.     rename Year year
  8.     rename Annual annualPay
  9.     label variable annual "Average Annual Pay (USD) (NAICS 3364)"
 10.     
.     local dtaFile: di (thisState+".dta")
 11.     save "$processedPay/`dtaFile'", replace
 12. }
>>>Importing data for  OH : OH.xlsx
(2 

In [58]:
%%stata 
/* Append all the State-level Stata files into a single file */   
local fileList: dir "$processedPay" files "*"
local n_files: word count `fileList'
 
forval i=1/`n_files' {
    local thisFile `: word `i' of `fileList''
    scalar thisState = substr("`thisFile'", 1, 2)                           
    

    /* Special actions for first file */
    if `i'==1 {
        di ">>>Importing data for ", thisState, ": `thisFile'"
        use "$processedPay/`thisFile'", clear
        generate state = thisState
    }

    /* Actions for all subsequent files */
    if `i'!=1 {
        di ">>>Importing data for ", thisState, ": `thisFile'"
        append using "$processedPay/`thisFile'"
        replace state = thisState if state==""
    }
    
    if `i'==`n_files'{
        save "$MergedPay", replace
    }

}


. /* Append all the State-level Stata files into a single file */   
. local fileList: dir "$processedPay" files "*"

. local n_files: word count `fileList'

.  
. forval i=1/`n_files' {
  2.     local thisFile `: word `i' of `fileList''
  3.     scalar thisState = substr("`thisFile'", 1, 2)                         
>   
  4.     
. 
.     /* Special actions for first file */
.     if `i'==1 {
  5.         di ">>>Importing data for ", thisState, ": `thisFile'"
  6.         use "$processedPay/`thisFile'", clear
  7.         generate state = thisState
  8.     }
  9. 
.     /* Actions for all subsequent files */
.     if `i'!=1 {
 10.         di ">>>Importing data for ", thisState, ": `thisFile'"
 11.         append using "$processedPay/`thisFile'"
 12.         replace state = thisState if state==""
 13.     }
 14.     
.     if `i'==`n_files'{
 15.         save "$MergedPay", replace
 16.     }
 17. 
. }
>>>Importing data for  GA : GA.dta
>>>Importing data for  KS : KS.dta
(20 real chan

Exception in thread Stata:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/threading.py", line 932, in _bootstrap_inner
    self.run()
  File "/Applications/Stata 17/utilities/pystata/core/stout.py", line 176, in run
    raise SystemError(output)
SystemError: (20 real changes made)
>>>Importing data for  OR : OR.dta
(20 real changes made)
>>>Importing data for  TX : TX.dta
(20 real changes made)
>>>Importing data for  TN : TN.dta
(20 real changes made)
>>>Importing data for  FL : FL.dta
(20 real changes made)
>>>Importing data for  PA : PA.dta
(20 real changes made)
>>>Importing data for  KY : KY.dta
(20 real changes made)
invalid file specification
r(198);
r(198);



In [59]:
%%stata
describe
tab state


. describe

Contains data from Data/Processed/QCEW/AnnualPay/GA.dta
 Observations:           680                  
    Variables:             3                  5 Feb 2022 16:28
-------------------------------------------------------------------------------
Variable      Storage   Display    Value
    name         type    format    label      Variable label
-------------------------------------------------------------------------------
year            int     %10.0g                Year
annualPay       long    %10.0g                Average Annual Pay (USD) (NAICS
                                                3364)
state           str2    %9s                   
-------------------------------------------------------------------------------
Sorted by: 
     Note: Dataset has changed since last saved.

. tab state

      state |      Freq.     Percent        Cum.
------------+-----------------------------------
         AL |         20        2.94        2.94
         AR |         20   

In [60]:
%%stata
frame change default
// Create link to "working" frame by matching on year
frlink 1:1 year state, frame(working)
frget annualPay, from(working)


. frame change default

. // Create link to "working" frame by matching on year
. frlink 1:1 year state, frame(working)
  (all observations in frame default matched)

. frget annualPay, from(working)
(4 missing values generated)
  (1 variable copied from linked frame)

. 


In [61]:
%%stata
// Drop frame link variable
drop working


. // Drop frame link variable
. drop working

. 


In [62]:
%%stata
// Check data
list if state=="WA"
list if state=="TX"
list if state=="MO"


. // Check data
. list if state=="WA"

     +------------------------------------+
     | year   employ~t   state   annual~y |
     |------------------------------------|
481. | 2001      87243      WA      65162 |
482. | 2002      75653      WA      71465 |
483. | 2003      65274      WA      73460 |
484. | 2004      61384      WA      77747 |
485. | 2005      65616      WA      84146 |
     |------------------------------------|
486. | 2006      73180      WA      90155 |
487. | 2007      80036      WA      86735 |
488. | 2008      82932      WA      87114 |
489. | 2009      82920      WA      87887 |
490. | 2010      80762      WA      91526 |
     |------------------------------------|
491. | 2011      86577      WA      97215 |
492. | 2012      94224      WA      96688 |
493. | 2013      96012      WA     100105 |
494. | 2014      93889      WA     110016 |
495. | 2015      93816      WA     107138 |
     |------------------------------------|
496. | 2016      90845      WA     1

---

## Generate Calculated Parameters

### Add ln(\*) for *pop*, *employment*, and *wages*

In [63]:
%%stata
generate lnEmployment=ln(employment)
generate lnAnnualPay = ln(annualPay)


. generate lnEmployment=ln(employment)
(4 missing values generated)

. generate lnAnnualPay = ln(annualPay)
(4 missing values generated)

. 


In [64]:
%%stata
// Combined QCEW Data
save "$finalQCEWData", replace


. // Combined QCEW Data
. save "$finalQCEWData", replace
file Data/Processed/QCEW/MergedQCEW.dta saved

. 


## Save Final Data

In [65]:
%%capture
%%stata
frame reset
clear
log close