## Cameron Bale
***
#### Code for pre-processing the IPUMS data.
***

In [1]:
import pandas as pd
import numpy as np
import csv
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the IPUMS subset into a DataFrame; the path is relative to the current working directory.
ipums = pd.read_csv("Data/new_subset.csv")

In [3]:
ipums

Unnamed: 0,INCWAGE,years_of_educ,potential_experience,non_white,SEX
0,11000,16.000000,18.000000,0,Female
1,38000,15.000000,23.000000,0,Male
2,7000,13.000000,2.000000,0,Male
3,2600,11.000000,0.000000,0,Male
4,20800,12.000000,33.000000,1,Male
...,...,...,...,...,...
197751,22500,16.000000,13.000000,0,Male
197752,4500,16.000000,11.000000,0,Female
197753,37000,17.737673,19.262327,0,Male
197754,1500,15.000000,13.000000,0,Female


There is no missing data.

We convert the SEX variable to a binary indicator: 1 for Female and 0 otherwise (Male).

In [4]:
# 0/1 indicator for SEX: 1 where the recorded value is exactly "Female", 0 for anything else.
new_sex = ipums["SEX"].eq("Female").astype(int)

In [5]:
# Replace the text SEX column with its 0/1 encoding, then renumber the rows from 0.
ipums = ipums.assign(SEX=new_sex)
ipums = ipums.reset_index(drop=True)

In [6]:
ipums

Unnamed: 0,INCWAGE,years_of_educ,potential_experience,non_white,SEX
0,11000,16.000000,18.000000,0,1
1,38000,15.000000,23.000000,0,0
2,7000,13.000000,2.000000,0,0
3,2600,11.000000,0.000000,0,0
4,20800,12.000000,33.000000,1,0
...,...,...,...,...,...
197751,22500,16.000000,13.000000,0,0
197752,4500,16.000000,11.000000,0,1
197753,37000,17.737673,19.262327,0,0
197754,1500,15.000000,13.000000,0,1


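We will standardize the variables for earnings (`INCWAGE`), years of education (`years_of_educ`), and potential experience (`potential_experience`) by subtracting each variable's mean and dividing by its standard deviation. These will be treated as continuous in the GMM (but we will round to whole numbers after the synthesis), and `non_white` and `SEX` will be treated as categorical.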

In [7]:
# Z-score the continuous columns: subtract each column's mean and divide by its
# population standard deviation (ddof=0).
continuous_cols = ["INCWAGE", "years_of_educ", "potential_experience"]

means = ipums[continuous_cols].mean(axis=0)
stds = ipums[continuous_cols].std(axis=0, ddof=0)
ipums[continuous_cols] = (ipums[continuous_cols] - means) / stds

# One row per standardized column, recording the mean and std that were applied.
standardization_stats = pd.DataFrame({"means": means, "stds": stds})

In [8]:
means

INCWAGE                 24678.867448
years_of_educ              13.116651
potential_experience       18.481890
dtype: float64

In [9]:
stds

INCWAGE                 23607.263564
years_of_educ               2.599788
potential_experience       12.231322
dtype: float64

In [10]:
standardization_stats

Unnamed: 0,means,stds
INCWAGE,24678.867448,23607.263564
years_of_educ,13.116651,2.599788
potential_experience,18.48189,12.231322


In [11]:
# NOTE(review): leftover commented-out code. As written it would only reassign each
# column to itself; the standardized values were already written back into `ipums`
# by the standardization cell, so this cell does nothing and can be removed.
# ipums = ipums.assign(INCWAGE=ipums.INCWAGE, years_of_educ=ipums.years_of_educ, potential_experience=ipums.potential_experience)

In [12]:
ipums

Unnamed: 0,INCWAGE,years_of_educ,potential_experience,non_white,SEX
0,-0.579435,1.109071,-0.039398,0,1
1,0.564281,0.724424,0.369388,0,0
2,-0.748874,-0.044869,-1.347515,0,0
3,-0.935257,-0.814163,-1.511030,0,0
4,-0.164308,-0.429516,1.186962,1,0
...,...,...,...,...,...
197751,-0.092296,1.109071,-0.448185,0,0
197752,-0.854774,1.109071,-0.611699,0,1
197753,0.521921,1.777461,0.063806,0,0
197754,-0.981853,0.724424,-0.448185,0,1


In [13]:
# Write the processed data (standardized continuous columns, 0/1 SEX) without the row index.
ipums.to_csv("Data/cleaned_ipums_data.csv", index = False)
# Write the means and standard deviations that were used for the standardization.
# NOTE(review): index = False drops the row labels (INCWAGE, years_of_educ,
# potential_experience), so the saved file identifies each statistic only by row
# order. Confirm how downstream readers consume this file before changing it.
standardization_stats.to_csv("Data/ipums_standardization_stats.csv", index = False)