In [1]:
import os
import warnings

import pandas as pd

In [38]:
# Read the IPUMS Chicago extract from CSV into the working frame `df`.
ipums_extract_path = 'chi_ipums_10.11.23.csv'
df = pd.read_csv(ipums_extract_path)

In [40]:
# Preview the first two rows to check which columns were loaded.
df.iloc[:2]

Unnamed: 0,YEAR,SAMPLE,SERIAL,CBSERIAL,HHWT,CLUSTER,STATEFIP,COUNTYFIP,CITY,STRATA,...,LIT,EMPSTAT,EMPSTATD,OCC1990,WORKEDYR,POVERTY,MIGRATE1,MIGRATE1D,MIGPLAC1,MIGCITY1
0,1850,185001,330301,,98.34,1850003303011,17,,1190,10211703100,...,1.0,,,,,,,,,
1,1850,185001,330301,,98.34,1850003303011,17,,1190,10211703100,...,4.0,,,,,,,,,


In [41]:
# Count the rows in each YEAR to see which census years the extract contains.
rows_per_year = df.groupby('YEAR').size()
rows_per_year.reset_index()

Unnamed: 0,YEAR,0
0,1850,312
1,1860,1109
2,1870,2846
3,1880,4918
4,1900,16701
5,1910,20839
6,1920,27073
7,1930,33703
8,1940,34447
9,1950,42532


In [42]:
# Load the birthplace (BPL) code lookup table from CSV.
bpl_codes_path = 'IPUMS codes - BPL.csv'
bpl = pd.read_csv(bpl_codes_path)

In [43]:
# Attach the birthplace lookup columns to every row of df by matching
# df['BPL'] against the lookup's 'bpl' code column.
#
# - Lookup columns left over from an earlier run of this cell are dropped
#   first, so re-running the cell does not create suffixed duplicates (_x / _y).
# - how='left' keeps rows whose BPL code is missing from the lookup (their
#   lookup columns are NaN) instead of silently discarding them.
# - validate='many_to_one' raises MergeError if a code occurs more than once
#   in the lookup, which would otherwise duplicate rows of df.
stale_lookup_cols = [col for col in bpl.columns if col in df.columns and col != 'BPL']
df = pd.merge(df.drop(columns=stale_lookup_cols), bpl,
              left_on='BPL', right_on='bpl',
              how='left', validate='many_to_one')

In [90]:
# Map each census year (key) to the immigration year (value) paired with it.
# 1980 and 1990 are paired with 1975 and 1987; every other census year is
# paired with the year immediately before it.
year_dict = {
    1900: 1899,
    1910: 1909,
    1920: 1919,
    1930: 1929,
    1980: 1975,
    1990: 1987,
    2000: 1999,
}

# annual entries 2005 through 2021, each paired with the preceding year
year_dict.update({census_year: census_year - 1 for census_year in range(2005, 2022)})

In [101]:
# Count rows per YRIMMIG value within the 1980 sample to see which
# immigration years are available; year_dict pairs 1980 with 1975.
in_1980 = df['YEAR'].eq(1980)
df.loc[in_1980].groupby('YRIMMIG').size().reset_index()

Unnamed: 0,YRIMMIG,0
0,0.0,25556
1,1949.0,847
2,1950.0,681
3,1960.0,266
4,1965.0,580
5,1970.0,891
6,1975.0,1242


In [102]:
# Count rows per YRIMMIG value within the 1990 sample to see which
# immigration years are available; year_dict pairs 1990 with 1987.
# NOTE(review): the saved output of this cell lists no 1987.0 value --
# confirm 1987 really occurs in the 1990 rows (the output may be stale).
in_1990 = df['YEAR'].eq(1990)
df.loc[in_1990].groupby('YRIMMIG').size().reset_index()

Unnamed: 0,YRIMMIG,0
0,0.0,17189
1,1949.0,303
2,1950.0,404
3,1960.0,235
4,1965.0,370
5,1970.0,494
6,1975.0,630
7,1980.0,396
8,1982.0,369
9,1985.0,420


In [103]:
# Count rows per YRIMMIG value within the 2000 sample to see which
# immigration years are available; year_dict pairs 2000 with 1999.
in_2000 = df['YEAR'].eq(2000)
df.loc[in_2000].groupby('YRIMMIG').size().reset_index()

Unnamed: 0,YRIMMIG,0
0,0.0,16868
1,1910.0,1
2,1914.0,2
3,1919.0,2
4,1920.0,2
...,...,...
73,1996.0,204
74,1997.0,198
75,1998.0,282
76,1999.0,322


In [92]:
# Build one table per census year: for each (census year -> immigration year)
# pair in year_dict, keep the rows of that census year whose YRIMMIG equals
# the paired value and sum PERWT within each bpl_desc.

temp_list = []  # one (birthplace, <census year>) frame per year_dict entry, in dict order

for key, value in year_dict.items():

    # rows from census year `key` whose YRIMMIG equals the paired year
    cohort = df[(df['YEAR'] == key) & (df['YRIMMIG'] == value)]

    if cohort.empty:
        # with no matching rows the earlier pivot_table version produced no
        # value column and failed on the two-name column assignment; warn
        # instead so the mismatch between year_dict and the data is visible
        warnings.warn(f"no rows with YEAR == {key} and YRIMMIG == {value}")

    # sum PERWT per birthplace; the frame comes out labelled
    # ('birthplace', key) and keeps that shape even when `cohort` is empty
    temp = (cohort.groupby('bpl_desc')['PERWT']
                  .sum()
                  .rename_axis('birthplace')
                  .reset_index(name=key))

    # add each df to the list
    temp_list.append(temp)

In [98]:
# Combine the per-year tables into one wide table keyed on 'birthplace'.
# Outer joins keep birthplaces that appear in only some of the tables; the
# years they are missing from become NaN.
merged = temp_list[0]

# fold in every remaining table, however many the loop produced, instead of
# relying on a hard-coded count that must match len(year_dict)
for yearly in temp_list[1:]:
    # merge each successive year
    merged = pd.merge(merged, yearly, on='birthplace', how='outer')

In [99]:
# display the merged table (one row per birthplace, one column per census year)
merged

Unnamed: 0,birthplace,1900,1910,1920,1930,1980,1990,2000,2005,2006,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Austria,600.0,8630.0,,,200.0,,,,,...,,,,,,84.0,,,,
1,Canada,1002.0,2006.0,901.62,403.80,300.0,465.0,312.0,,355.0,...,1082.0,189.0,,501.0,541.0,1521.0,389.0,57.0,70.0,113.0
2,China,100.0,,,,2700.0,4620.0,1201.0,870.0,1260.0,...,3248.0,3014.0,3378.0,4493.0,2847.0,2451.0,3092.0,1105.0,3700.0,737.0
3,Czechoslovakia,502.0,,200.36,504.75,,120.0,416.0,,,...,,,563.0,,,223.0,,,,
4,Denmark,100.0,200.0,200.36,,100.0,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,Saudi Arabia,,,,,,,,,,...,329.0,44.0,943.0,163.0,513.0,761.0,,,,
65,Kuwait,,,,,,,,,,...,,113.0,,,,,,,,
66,Afghanistan,,,,,,,,,,...,,,,799.0,,,,,,
67,Cyprus,,,,,,,,,,...,,,,129.0,,,,,,


In [100]:
# export merged
# to_csv does not create missing folders, so make sure processed/ exists first
os.makedirs('processed', exist_ok=True)
# note: with the default index=True the row index is written as an unnamed first column
merged.to_csv('processed/migrants_bpl_1900_2021.csv')