## IRS Cleaning File
This file takes data from the [IRS](https://www.irs.gov/statistics/soi-tax-stats-migration-data), containing information on county inflow and outflow on a yearly basis, concatenates each year, and saves the combined inflow and outflow data in the 'cleaned-csvs' folder.
 

In [None]:
import pandas as pd

In [None]:
# Complete outflow df: one row per county per year, built from the yearly IRS outflow files.
out_cols = ['individual_outflow', 'countyfips', 'statename', 'countyname', 'statefips', 'year']

# Raw IRS column name -> name used in this notebook. The county key comes from the
# y1_* FIPS columns, while the state/county labels come from the y2_* columns.
out_rename_dict = {'y1_statefips':'statefips', 'y1_countyfips':'countyfips', 'y2_state':'statename',
                   'y2_countyname':'countyname', 'n1':'outflow_family', 'n2':'individual_outflow'}

# Collect each year's frame and concatenate once after the loop. Growing the result
# with pd.concat inside the loop re-copies all earlier rows on every iteration, and
# seeding it with an empty DataFrame can (depending on the pandas version) affect the
# dtypes of the result.
outflow_frames = []
for year in range(2014, 2020):
    # Each file is named after a two-digit year pair, e.g. year=2014 -> '1314'.
    yr_range = str(year-1)[-2:] + str(year)[-2:]
    filepath = '/work/assets/countyoutflow{}.csv'.format(yr_range)
    df = pd.read_csv(filepath)

    # Keep only the per-county "Total Migration-US and Foreign" rows (matched as a
    # literal substring; missing names count as non-matches), then drop the rows
    # whose y2_state code is FR, SS or DS.
    is_total = df['y2_countyname'].str.contains(' Total Migration-US and Foreign', regex=False, na=False)
    df = df[is_total]
    df = df[~df['y2_state'].isin(['FR', 'SS', 'DS'])]

    # Fix column format: rename, tag with the year, and keep only the output columns.
    df = df.rename(columns=out_rename_dict).assign(year=year)[out_cols]
    outflow_frames.append(df)

outflow_df = pd.concat(outflow_frames, ignore_index=True)

    

In [None]:
# Complete inflow df: one row per county per year, built from the yearly IRS inflow files.
in_cols = ['individual_inflow', 'countyfips', 'statename', 'countyname', 'statefips', 'year']

# Raw IRS column name -> name used in this notebook. The county key comes from the
# y2_* FIPS columns, while the state/county labels come from the y1_* columns.
in_rename_dict = {'y2_statefips':'statefips', 'y2_countyfips':'countyfips', 'y1_state':'statename',
                  'y1_countyname':'countyname', 'n1':'inflow_family', 'n2':'individual_inflow'}

# Collect each year's frame and concatenate once after the loop. Growing the result
# with pd.concat inside the loop re-copies all earlier rows on every iteration, and
# seeding it with an empty DataFrame can (depending on the pandas version) affect the
# dtypes of the result.
inflow_frames = []
for year in range(2014, 2020):
    # Each file is named after a two-digit year pair, e.g. year=2014 -> '1314'.
    yr_range = str(year-1)[-2:] + str(year)[-2:]
    filepath = '/work/assets/countyinflow{}.csv'.format(yr_range)
    df = pd.read_csv(filepath)

    # Keep only the per-county "Total Migration-US and Foreign" rows (matched as a
    # literal substring; missing names count as non-matches), then drop the rows
    # whose y1_state code is FR, SS or DS.
    is_total = df['y1_countyname'].str.contains(' Total Migration-US and Foreign', regex=False, na=False)
    df = df[is_total]
    df = df[~df['y1_state'].isin(['FR', 'SS', 'DS'])]

    # Fix column format: rename, tag with the year, and keep only the output columns.
    df = df.rename(columns=in_rename_dict).assign(year=year)[in_cols]
    inflow_frames.append(df)

inflow_df = pd.concat(inflow_frames, ignore_index=True)


In [None]:
# Report (rows, columns) for the inflow and outflow frames before they are merged.
inflow_shape, outflow_shape = inflow_df.shape, outflow_df.shape
print('Inflow Shape:', inflow_shape)
print('Outflow Shape:', outflow_shape)

Inflow Shape: (18779, 6)
Outflow Shape: (18799, 6)


In [None]:
# Keep only the outflow count and the merge keys; the state/county label
# columns are already carried by inflow_df.
merge_keys = ['year', 'statefips', 'countyfips']
outflow_df = outflow_df[['individual_outflow', 'year', 'statefips', 'countyfips']]

# Inner-merge inflow and outflow on (year, statefips, countyfips) and reorder columns.
# Keys present in only one of the two frames are dropped by the inner merge.
order = ['individual_inflow','individual_outflow','year','statefips','countyfips','statename','countyname']
in_out_df = inflow_df.merge(outflow_df, how='inner', on=merge_keys)[order]

# Replace -1 values with 0 in both flow columns
for flow_col in ('individual_inflow', 'individual_outflow'):
    in_out_df[flow_col] = in_out_df[flow_col].replace(-1, 0)

# Save csv (the target path has no file extension; the index is written too)
in_out_df.to_csv('/work/cleaned-csvs/irs_in_out')

print('New Shape:', in_out_df.shape)
in_out_df.head()

New Shape: (18772, 7)


Unnamed: 0,individual_inflow,individual_outflow,year,statefips,countyfips,statename,countyname
0,4342,4352,2014,1,1,AL,Autauga County Total Migration-US and Foreign
1,12068,8113,2014,1,3,AL,Baldwin County Total Migration-US and Foreign
2,1088,1197,2014,1,5,AL,Barbour County Total Migration-US and Foreign
3,1023,888,2014,1,7,AL,Bibb County Total Migration-US and Foreign
4,2490,2411,2014,1,9,AL,Blount County Total Migration-US and Foreign


In [None]:
# Explain the rows lost by the inner merge: compare the set of
# (year, statefips, countyfips) keys found in each frame.
key_cols = ['year', 'statefips', 'countyfips']
in_grouped = set(inflow_df.groupby(key_cols).count().index)
out_grouped = set(outflow_df.groupby(key_cols).count().index)
print('In Len:', len(in_grouped))
print('Out Len:', len(out_grouped))


# missing1: keys that appear in inflow_df but not in outflow_df.
# missing2: keys that appear in outflow_df but not in inflow_df.
missing1 = in_grouped.difference(out_grouped)
missing2 = out_grouped.difference(in_grouped)
missing = missing1 | missing2
print('Number Missing in Inflow', len(missing1))
print('Number Missing in Outflow', len(missing2))

# Display every key that is present in only one of the two frames.
missing

In Len: 18779
Out Len: 18799
Number Missing in Inflow 7
Number Missing in Outflow 27


{(2015, 48, 301),
 (2019, 8, 61),
 (2019, 8, 79),
 (2019, 20, 33),
 (2019, 20, 71),
 (2019, 20, 75),
 (2019, 30, 19),
 (2019, 30, 37),
 (2019, 30, 75),
 (2019, 31, 15),
 (2019, 31, 57),
 (2019, 31, 91),
 (2019, 31, 113),
 (2019, 31, 115),
 (2019, 31, 171),
 (2019, 31, 183),
 (2019, 32, 9),
 (2019, 35, 11),
 (2019, 38, 7),
 (2019, 38, 33),
 (2019, 38, 37),
 (2019, 38, 83),
 (2019, 40, 25),
 (2019, 46, 17),
 (2019, 46, 49),
 (2019, 46, 55),
 (2019, 46, 69),
 (2019, 48, 101),
 (2019, 48, 155),
 (2019, 48, 263),
 (2019, 48, 311),
 (2019, 48, 345),
 (2019, 48, 443),
 (2019, 56, 27)}

### Inflow outflow ### 
Each county is a row, and its inflow and outflow values for each year are the columns.

In [None]:
# Display the column labels of the merged inflow/outflow frame.
in_out_df.columns

Index(['individual_inflow', 'individual_outflow', 'year', 'statefips',
       'countyfips', 'statename', 'countyname'],
      dtype='object')

In [None]:
# Reshape the long (county, year) table to wide format: one row per
# (statefips, countyfips) and one column per year for each flow direction.
# The wide frames get their own names so the long-format inflow_df / outflow_df
# are not overwritten and the cells that use them stay re-runnable.
county_keys = ['statefips', 'countyfips']
inflow_wide = in_out_df.pivot(index=county_keys, columns='year', values='individual_inflow')
outflow_wide = in_out_df.pivot(index=county_keys, columns='year', values='individual_outflow')

# Prefix the year columns, e.g. 2014 -> 'in_2014' / 'out_2014'.
inflow_wide = inflow_wide.add_prefix('in_')
outflow_wide = outflow_wide.add_prefix('out_')

# Both pivots come from in_out_df with the same index columns, so an
# index-on-index inner merge lines the two halves up county by county.
in_out_one_per_county = pd.merge(inflow_wide, outflow_wide, how='inner', left_index=True, right_index=True)

In [None]:
# Write the one-row-per-county table to the cleaned-csvs folder. The target path has
# no file extension, and because index=False is not passed the index is written too.
in_out_one_per_county.to_csv('/work/cleaned-csvs/irs_in_out_one_per_county')

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=f6c76417-5fde-42f3-8920-755838dec3fa' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>