## A.1. Import libraries

In [3]:
library(dplyr)
library(readstata13)

## A.2 Set path

In [30]:
# Use the current working directory as the root for every path built below.
path <- getwd()
path  # echo the root so it can be checked by eye

## A.3 List all files in dataset

### A.3.1 Lists all the files from 2019 Q1

In [9]:
# List the files in the 2019 Quarter 1 folder under the project root.
# Only bare file names are returned; add full.names = TRUE to the
# list.files() call to get complete paths instead.
file_2019Q1 <- list.files(paste0(path, "/Data/2019 Data/Quarter 1"))
file_2019Q1

### A.3.2 Lists all files from 2019 Q2


In [10]:
# List the files in the 2019 Quarter 2 folder under the project root.
# Only bare file names are returned; add full.names = TRUE to the
# list.files() call to get complete paths instead.
file_2019Q2 <- list.files(paste0(path, "/Data/2019 Data/Quarter 2"))
file_2019Q2

### A.3.3 Lists all the files from 2018 Q4

In [11]:
# List the files in the 2018 Quarter 4 folder under the project root.
file_2018 <- list.files(paste0(path, "/Data/2018 Data/Quarter 4"))
file_2018

# 1. Load one dataset

## 1.1. Set folder and file location

- You can skip this step if you already know the full path of the file

In [12]:
# Folder that holds the 2019 Quarter 1 files.
folder_2019Q1 <- paste0(path, "/Data/2019 Data/Quarter 1")
# Full location of the sdemt119.dta file inside that folder.
sdemt_2019Q1_path <- paste0(folder_2019Q1, "/sdemt119.dta")

## 1.2 Read dta

In [13]:
# Read the Stata file into a data frame. A plain call is simpler but shows
# some warning messages:
#   df_sdemt <- read.dta13(sdemt_2019Q1_path)
# Wrapping it in suppressWarnings() hides those messages.
df_sdemt <- suppressWarnings(read.dta13(sdemt_2019Q1_path))
head(df_sdemt, 2)  # preview the first two observations

R_DEF,LOC,MUN,EST,EST_D,AGEB,T_LOC,CD_A,ENT,CON,...,MA48ME1SM,P14APOYOS,SCIAN,T_TRA,EMP_PPAL,TUE_PPAL,TRANS_PPAL,MH_FIL2,MH_COL,SEC_INS
0,1,2,10,117,0,1,1,9,40001,...,0,0,18,1,2,2,0,3,6,4
0,1,2,10,117,0,1,1,9,40001,...,0,0,6,1,2,2,0,3,2,2


# 2. Select columns

## 2.1 Show all column names

In [22]:
# Names of the first three columns; drop the [1:3] index to list every column.
colnames(df_sdemt)[1:3]

## 2.2 Select two columns

In [23]:
# Keep only the MUN and LOC columns and preview two rows.
# Remove the head(2) step to show the whole columns.
df_sdemt %>%
  select(MUN, LOC) %>%
  head(2)

MUN,LOC
2,1
2,1


## 2.3 Show summary of data

In [24]:
# Summary statistics (min, quartiles, mean, max) for MUN and LOC.
df_sdemt %>%
  select(MUN, LOC) %>%
  summary()

      MUN              LOC         
 Min.   :  1.00   Min.   :   1.00  
 1st Qu.:  6.00   1st Qu.:   1.00  
 Median : 21.00   Median :   1.00  
 Mean   : 40.96   Mean   :  40.02  
 3rd Qu.: 50.00   3rd Qu.:   2.00  
 Max.   :570.00   Max.   :9046.00  

### 2.3.1 Show number of observations

In [29]:
# One-row table whose column n holds the number of observations.
count(df_sdemt)

n
406036


## 2.4 Rename columns

### 2.4.1 Basic renaming

In [26]:
# rename() takes NEW_NAME = ORIGINAL_NAME pairs; add more pairs as needed.
# The result is only previewed here, so df_sdemt itself is not modified.
df_sdemt %>%
  rename(Education = CS_P13_1, Municipality = MUN) %>%
  head(3)


R_DEF,LOC,Municipality,EST,EST_D,AGEB,T_LOC,CD_A,ENT,CON,...,MA48ME1SM,P14APOYOS,SCIAN,T_TRA,EMP_PPAL,TUE_PPAL,TRANS_PPAL,MH_FIL2,MH_COL,SEC_INS
0,1,2,10,117,0,1,1,9,40001,...,0,0,18,1,2,2,0,3,6,4
0,1,2,10,117,0,1,1,9,40001,...,0,0,6,1,2,2,0,3,2,2
0,1,2,10,117,0,1,1,9,40001,...,0,0,0,1,0,0,0,0,0,0


### 2.4.2 Rename and save

In [28]:
# Store the renamed copy under a new name; df_sdemt keeps its original columns.
df_sdemt_renamed <- rename(df_sdemt, Education = CS_P13_1)
# Check that the Education column now exists.
df_sdemt_renamed %>%
  select(Education) %>%
  head(2)

Education
6
7


# 3. Recode data

## 3.1 Show the unique observations for one column 

In [55]:
# Distinct values that occur in the T_TRA column.
unique(df_sdemt[["T_TRA"]])

## 3.2 Change all non-1 observations to zero

### 3.2.1 Using recode function 

In [63]:
# Build a 0/1 indicator from T_TRA with recode(): values equal to 1 stay 1,
# and .default sends every other non-missing value to 0, so the remaining
# codes do not have to be listed one by one. Numeric replacements keep the
# new column numeric instead of turning it into text.
# NOTE(review): assumes T_TRA is a numeric column — confirm.
df_sdemt %>%
  mutate(Dummy_example = recode(T_TRA, `1` = 1, .default = 0)) %>%
  select(Dummy_example) %>%
  head(3)

Dummy_example
1
1
1


### 3.2.2 Using as.numeric function

In [68]:
# T_TRA == 1 gives TRUE/FALSE; as.numeric() turns that into 1/0.
# The new column is appended on the far right of the preview.
df_sdemt %>%
  mutate(Dummy_example = as.numeric(T_TRA == 1)) %>%
  head(3)

R_DEF,LOC,MUN,EST,EST_D,AGEB,T_LOC,CD_A,ENT,CON,...,P14APOYOS,SCIAN,T_TRA,EMP_PPAL,TUE_PPAL,TRANS_PPAL,MH_FIL2,MH_COL,SEC_INS,Dummy_example
0,1,2,10,117,0,1,1,9,40001,...,0,18,1,2,2,0,3,6,4,1
0,1,2,10,117,0,1,1,9,40001,...,0,6,1,2,2,0,3,2,2,1
0,1,2,10,117,0,1,1,9,40001,...,0,0,1,0,0,0,0,0,0,1


### 3.2.3 Save the result from 3.2 into the data frame

In [69]:
# Overwrite the old data frame so the dummy column is kept;
# it appears as the last column on the far right.
df_sdemt <- mutate(df_sdemt, Dummy_example = as.numeric(T_TRA == 1))

head(df_sdemt, 2)

R_DEF,LOC,MUN,EST,EST_D,AGEB,T_LOC,CD_A,ENT,CON,...,P14APOYOS,SCIAN,T_TRA,EMP_PPAL,TUE_PPAL,TRANS_PPAL,MH_FIL2,MH_COL,SEC_INS,Dummy_example
0,1,2,10,117,0,1,1,9,40001,...,0,18,1,2,2,0,3,6,4,1
0,1,2,10,117,0,1,1,9,40001,...,0,6,1,2,2,0,3,2,2,1


# 4. Merge

In [None]:
#merge(df1, df2, by=c("INDIVIDUAL ID","HOUSEHOLD ID"), all=FALSE)