## Cameron Bale
***
#### Code for pre-processing the IPUMS data.
***

In [1]:
import pandas as pd
import numpy as np
import csv
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the raw IPUMS extract from the project's data directory.
ipums = pd.read_csv(filepath_or_buffer="../Data/IPUMS/ipums_data.csv")

In [3]:
# Display the raw frame to inspect its columns before selecting variables.
ipums

Unnamed: 0,YEAR,SERIAL,MONTH,CPSID,ASECFLAG,ASECWTH,PERNUM,CPSIDP,ASECWT,AGE,...,RACE,EDUC,WKSWORK1,UHRSWORKLY,INCWAGE,INCWAGE95,years_of_educ,potential_experience,hourly_wage,non_white
0,1994,4,March,19940302415500,1,838.36,1,19940302415501,838.36,40,...,White,111,52,20,11000,11306.594154,16.000000,18.000000,10.871725,0
1,1994,4,March,19940302415500,1,838.36,2,19940302415502,838.36,44,...,White,92,52,40,38000,39059.143440,15.000000,23.000000,18.778434,0
2,1994,4,March,19940302415500,1,838.36,3,19940302415503,1043.34,21,...,White,81,20,40,7000,7195.105370,13.000000,2.000000,8.993882,0
3,1994,4,March,19940302415500,1,838.36,4,19940302415504,884.61,17,...,White,60,52,10,2600,2672.467709,11.000000,0.000000,5.139361,0
4,1994,5,March,19940302415400,1,566.21,1,19940302415401,566.21,51,...,Asian or Pacific Islander,73,52,50,20800,21379.741672,12.000000,33.000000,8.222978,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197751,1996,63334,March,19960101534500,1,1003.70,4,19960101534502,1380.60,35,...,White,111,52,40,22500,21877.813505,16.000000,13.000000,10.518180,0
197752,1996,63336,March,19960301536000,1,827.95,1,19960301536001,827.95,33,...,White,111,44,35,4500,4375.562701,16.000000,11.000000,2.841274,0
197753,1996,63337,March,19960301535900,1,827.95,1,19960301535901,827.95,43,...,White,122,52,40,37000,35976.848875,17.737673,19.262327,17.296562,0
197754,1996,63337,March,19960301535900,1,827.95,2,19960301535902,827.95,34,...,White,92,12,20,1500,1458.520900,15.000000,13.000000,6.077170,0


Barrientos et al. use `INCWAGE`, `years_of_educ`, `potential_experience`, `non_white`, and `SEX` in their paper, so we select the same variables.

In [4]:
# Keep only the five variables named in the markdown above.
ipums = ipums[
    [
        'INCWAGE',
        'years_of_educ',
        'potential_experience',
        'non_white',
        'SEX',
    ]
]

In [5]:
# Proportion of records whose SEX value is "Female".
ipums["SEX"].eq("Female").mean()

0.4812850179008475

In [6]:
# Total of the non_white indicator column.
ipums["non_white"].sum()

28072

Rename the remaining upper-case columns (`INCWAGE` and `SEX`) so that all column names are lowercase.

In [7]:
# Lowercase the two upper-case column names; the other columns are left untouched.
ipums = ipums.rename({'INCWAGE': 'incwage', 'SEX': 'sex'}, axis="columns")

In [8]:
# Display the frame to confirm the selected columns and their new names.
ipums

Unnamed: 0,incwage,years_of_educ,potential_experience,non_white,sex
0,11000,16.000000,18.000000,0,Female
1,38000,15.000000,23.000000,0,Male
2,7000,13.000000,2.000000,0,Male
3,2600,11.000000,0.000000,0,Male
4,20800,12.000000,33.000000,1,Male
...,...,...,...,...,...
197751,22500,16.000000,13.000000,0,Male
197752,4500,16.000000,11.000000,0,Female
197753,37000,17.737673,19.262327,0,Male
197754,1500,15.000000,13.000000,0,Female


In [9]:
# Count missing values in each column.
ipums.isna().sum()

incwage                 0
years_of_educ           0
potential_experience    0
non_white               0
sex                     0
dtype: int64

There is no missing data.

We convert `sex` to a binary variable (1 = Female, 0 otherwise).

In [10]:
# 1 where sex is "Female", 0 for every other value.
new_sex = ipums["sex"].eq("Female").astype(int)

In [11]:
# Replace the string column with its 0/1 encoding, then renumber the rows from 0.
ipums = ipums.assign(sex=new_sex)
ipums = ipums.reset_index(drop=True)

In [12]:
# Display the frame to confirm that sex is now encoded as 0/1.
ipums

Unnamed: 0,incwage,years_of_educ,potential_experience,non_white,sex
0,11000,16.000000,18.000000,0,1
1,38000,15.000000,23.000000,0,0
2,7000,13.000000,2.000000,0,0
3,2600,11.000000,0.000000,0,0
4,20800,12.000000,33.000000,1,0
...,...,...,...,...,...
197751,22500,16.000000,13.000000,0,0
197752,4500,16.000000,11.000000,0,1
197753,37000,17.737673,19.262327,0,0
197754,1500,15.000000,13.000000,0,1


In [13]:
# Disabled: would round years_of_educ and potential_experience to whole numbers.
# ipums.loc[:,['years_of_educ', 'potential_experience']] = ipums.loc[:,['years_of_educ', 'potential_experience']].apply(lambda x: np.round(x, 0))

In [14]:
# Disabled: would cast years_of_educ and potential_experience to object dtype.
# ipums.loc[:,['years_of_educ', 'potential_experience']] = ipums.loc[:,['years_of_educ', 'potential_experience']].apply(lambda x: x.astype('object'))

In [15]:
# Disabled: would cast years_of_educ and potential_experience to int.
# ipums.loc[:,['years_of_educ', 'potential_experience']] = ipums.loc[:,['years_of_educ', 'potential_experience']].apply(lambda x: x.astype('int'))

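The variables for earnings (`incwage`), years of education (`years_of_educ`), and potential experience (`potential_experience`) will be treated as continuous in the GMM (but we will round to whole numbers after the synthesis), and `non_white` and `sex` will be treated as categorical. The continuous variables were to be standardized, but the standardization cell below is currently commented out, so the data are saved without normalization.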

In [16]:
# NOTE: this standardization step is disabled; every line below is commented out.
# # standardize the continuous variables (incwage, years_of_educ, potential_experience)
# means = np.mean(ipums[["incwage", "years_of_educ", "potential_experience"]], axis = 0)
# stds = np.std(ipums[["incwage", "years_of_educ", "potential_experience"]], axis = 0)
# ipums[["incwage", "years_of_educ", "potential_experience"]] = (ipums[["incwage", "years_of_educ", "potential_experience"]]-means)/stds

# standardization_stats = pd.DataFrame({"means":means, "stds":stds})

In [16]:
# Write the cleaned, non-standardized data to disk without the row index.
output_path = "../Data/IPUMS/non_normalized_cleaned_ipums_data.csv"
ipums.to_csv(output_path, index=False)
# standardization_stats.to_csv("../Data/IPUMS/ipums_standardization_stats.csv", index = False)