## Transform

Transform data (extracted as 51 individual CSV files) into one large dataframe. Clean up and create unique rows and columns

In [1]:
import pandas as pd

In [2]:
# Read CSV
alabama = pd.read_csv("../Data/States/Alabama.csv")

In [3]:
# Set Age (AGEP) as index
alabama = alabama.set_index("Age (AGEP)")
alabama.head()

Unnamed: 0_level_0,Total,Alabama/AL,Alaska/AK,Arizona/AZ,Arkansas/AR,California/CA,Colorado/CO,Connecticut/CT,Delaware/DE,District of Columbia/DC,...,"South America, Not Specified",Egypt,Ethiopia (2017 or later),Kenya (2017 or later),Nigeria,"Western Africa, Not Specified","Northern Africa or Other Africa, Not Specified","Eastern Africa, Not Specified",Australia,"Other US Island Areas, Oceania, Not Specified, or At Sea"
Age (AGEP),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-> Total,319852,253611,429,868,859,1674,983,393,64,167,...,162,36,0,12,105,0,69,103,109,128
-> Total ->,319852,253611,429,868,859,1674,983,393,64,167,...,162,36,0,12,105,0,69,103,109,128
Total 12th grade - no diploma,7237,5807,0,4,0,63,9,99,0,0,...,0,0,0,0,0,0,0,0,0,0
25 years Total 12th grade - no diploma,260,191,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26 years Total 12th grade - no diploma,336,287,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Transpose to switch rows with columns
alabama_df = alabama.T

In [5]:
# Add "To_State" so that each origin state on far left shows which state they moved to on far right
# Makes data multi-dimensional

alabama_df["To_State"] = "Alabama"


In [6]:
# Code for one state
alabama = pd.read_csv("../Data/States/Alabama.csv").set_index("Age (AGEP)").T
alabama["To_State"] = "Alabama"
alabama.head()

Age (AGEP),-> Total,-> Total ->,Total 12th grade - no diploma,25 years Total 12th grade - no diploma,26 years Total 12th grade - no diploma,27 years Total 12th grade - no diploma,28 years Total 12th grade - no diploma,29 years Total 12th grade - no diploma,30 years Total 12th grade - no diploma,31 years Total 12th grade - no diploma,...,67 years Total Doctorate degree,68 years Total Doctorate degree,69 years Total Doctorate degree,70 years Total Doctorate degree,71 years Total Doctorate degree,72 years Total Doctorate degree,73 years Total Doctorate degree,74 years Total Doctorate degree,75 years Total Doctorate degree,To_State
Total,319852,319852,7237,260,336,227,132,275,265,339,...,32,2,2,31,0,4,8,10,60,Alabama
Alabama/AL,253611,253611,5807,191,287,227,125,163,178,231,...,13,2,0,31,0,4,8,0,45,Alabama
Alaska/AK,429,429,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Alabama
Arizona/AZ,868,868,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Alabama
Arkansas/AR,859,859,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Alabama


In [7]:
# All states in Data folder
states_list = ["Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", \
               "Delaware", "District of Columbia", "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", \
              "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", "Massachusetts", \
              "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire", \
              "New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon", \
              "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", \
              "Vermont", "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming"]

# Dictionary to hold state name as key, dataframe as value
dict_state = {}

# For loop to iterate through all states to shape data
for state in states_list:
    df = pd.read_csv(f'../Data/States/{state}.csv').rename(columns = {"Age (AGEP)":"Origin"}).set_index("Origin").T
    df["To_State"] = f'{state}'
    # Rename and drop columns
    df.rename(columns={df.columns[0]: "Total", df.columns[1]: "Total2"}, inplace=True)
    df = df.drop(columns=["Total2"])
    dict_state[state] = df


In [8]:
# Can call any state dataframe when using state name as dictionary key
dict_state["Alabama"].head()

Origin,Total,Total 12th grade - no diploma,25 years Total 12th grade - no diploma,26 years Total 12th grade - no diploma,27 years Total 12th grade - no diploma,28 years Total 12th grade - no diploma,29 years Total 12th grade - no diploma,30 years Total 12th grade - no diploma,31 years Total 12th grade - no diploma,32 years Total 12th grade - no diploma,...,67 years Total Doctorate degree,68 years Total Doctorate degree,69 years Total Doctorate degree,70 years Total Doctorate degree,71 years Total Doctorate degree,72 years Total Doctorate degree,73 years Total Doctorate degree,74 years Total Doctorate degree,75 years Total Doctorate degree,To_State
Total,319852,7237,260,336,227,132,275,265,339,263,...,32,2,2,31,0,4,8,10,60,Alabama
Alabama/AL,253611,5807,191,287,227,125,163,178,231,218,...,13,2,0,31,0,4,8,0,45,Alabama
Alaska/AK,429,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Alabama
Arizona/AZ,868,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Alabama
Arkansas/AR,859,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Alabama


In [9]:
# Concat dictionary values into one dataframe
all_states_df = pd.concat(dict_state.values(), ignore_index=False)
all_states_df

Origin,Total,Total 12th grade - no diploma,25 years Total 12th grade - no diploma,26 years Total 12th grade - no diploma,27 years Total 12th grade - no diploma,28 years Total 12th grade - no diploma,29 years Total 12th grade - no diploma,30 years Total 12th grade - no diploma,31 years Total 12th grade - no diploma,32 years Total 12th grade - no diploma,...,67 years Total Doctorate degree,68 years Total Doctorate degree,69 years Total Doctorate degree,70 years Total Doctorate degree,71 years Total Doctorate degree,72 years Total Doctorate degree,73 years Total Doctorate degree,74 years Total Doctorate degree,75 years Total Doctorate degree,To_State
Total,319852,7237,260,336,227,132,275,265,339,263,...,32,2,2,31,0,4,8,10,60,Alabama
Alabama/AL,253611,5807,191,287,227,125,163,178,231,218,...,13,2,0,31,0,4,8,0,45,Alabama
Alaska/AK,429,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Alabama
Arizona/AZ,868,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Alabama
Arkansas/AR,859,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Alabama
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Western Africa, Not Specified",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Wyoming
"Northern Africa or Other Africa, Not Specified",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Wyoming
"Eastern Africa, Not Specified",54,54,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Wyoming
Australia,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Wyoming


In [10]:
# del_total = [" -> Total", " -> Total ->"]
# clean_df2 = []
# for x in df.columns:
#     if x == del_total:
#         clean_df = df.rename(columns={x: "Total"})
#         clean_df2 = clean_df

    

## Load

Create engine and load all_states_df into SQL database

In [11]:
# Dependencies
from login import port, pw
from sqlalchemy import create_engine
from sqlalchemy.ext.automap import automap_base

In [12]:
# Create engine
# engine = create_engine(f'postgresql://postgres:{pw}@localhost:{port}/interstate_migration_db')
engine = create_engine("postgresql://postgres:postgres@localhost:5432/interstate_migration_db")

# Connect to pgAdmin 4
conn = engine.connect()

In [13]:
base = automap_base()
base.prepare(engine, reflect=True)
base.classes.keys()

[]

In [14]:
# Load all_states_df into Postgres database
all_states_df.to_sql(name="states", con=engine, if_exists="replace", index=True)

In [15]:
# Confirm table names
engine.table_names()

['states']

In [20]:
# Query database for all listings from states
df = pd.read_sql_query("SELECT * FROM states", con=engine)
df.head()

Unnamed: 0,index,Total,Total 12th grade - no diploma,25 years Total 12th grade - no diploma,26 years Total 12th grade - no diploma,27 years Total 12th grade - no diploma,28 years Total 12th grade - no diploma,29 years Total 12th grade - no diploma,30 years Total 12th grade - no diploma,31 years Total 12th grade - no diploma,...,67 years Total Doctorate degree,68 years Total Doctorate degree,69 years Total Doctorate degree,70 years Total Doctorate degree,71 years Total Doctorate degree,72 years Total Doctorate degree,73 years Total Doctorate degree,74 years Total Doctorate degree,75 years Total Doctorate degree,To_State
0,Total,319852,7237,260,336,227,132,275,265,339,...,32,2,2,31,0,4,8,10,60,Alabama
1,Alabama/AL,253611,5807,191,287,227,125,163,178,231,...,13,2,0,31,0,4,8,0,45,Alabama
2,Alaska/AK,429,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Alabama
3,Arizona/AZ,868,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Alabama
4,Arkansas/AR,859,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Alabama


In [21]:
# Keep
df = df.rename(columns={"index":"Origin"})

In [22]:
# Keep
df.head()

Unnamed: 0,Origin,Total,Total 12th grade - no diploma,25 years Total 12th grade - no diploma,26 years Total 12th grade - no diploma,27 years Total 12th grade - no diploma,28 years Total 12th grade - no diploma,29 years Total 12th grade - no diploma,30 years Total 12th grade - no diploma,31 years Total 12th grade - no diploma,...,67 years Total Doctorate degree,68 years Total Doctorate degree,69 years Total Doctorate degree,70 years Total Doctorate degree,71 years Total Doctorate degree,72 years Total Doctorate degree,73 years Total Doctorate degree,74 years Total Doctorate degree,75 years Total Doctorate degree,To_State
0,Total,319852,7237,260,336,227,132,275,265,339,...,32,2,2,31,0,4,8,10,60,Alabama
1,Alabama/AL,253611,5807,191,287,227,125,163,178,231,...,13,2,0,31,0,4,8,0,45,Alabama
2,Alaska/AK,429,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Alabama
3,Arizona/AZ,868,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Alabama
4,Arkansas/AR,859,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Alabama


In [23]:
# Keep
df.sum(axis=1)

0       959556
1       760833
2         1287
3         2604
4         2577
         ...  
5554         0
5555         0
5556       162
5557         0
5558        42
Length: 5559, dtype: int64

In [None]:
# Keep
#Isolate From state (index) and to state, and 

In [None]:
# Keep
#df[index, to_state, -> Total]

In [None]:
# Keep
#group index by index and to_state and get sum of the total
#groupby dataframe, if doesn't match statelist, do not include it

In [27]:
# Keep
all_states_df.set_index("To_State", inplace=True)
# all_states_df


Origin,Total,Total 12th grade - no diploma,25 years Total 12th grade - no diploma,26 years Total 12th grade - no diploma,27 years Total 12th grade - no diploma,28 years Total 12th grade - no diploma,29 years Total 12th grade - no diploma,30 years Total 12th grade - no diploma,31 years Total 12th grade - no diploma,32 years Total 12th grade - no diploma,...,66 years Total Doctorate degree,67 years Total Doctorate degree,68 years Total Doctorate degree,69 years Total Doctorate degree,70 years Total Doctorate degree,71 years Total Doctorate degree,72 years Total Doctorate degree,73 years Total Doctorate degree,74 years Total Doctorate degree,75 years Total Doctorate degree
To_State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,319852,7237,260,336,227,132,275,265,339,263,...,44,32,2,2,31,0,4,8,10,60
Alabama,253611,5807,191,287,227,125,163,178,231,218,...,31,13,2,0,31,0,4,8,0,45
Alabama,429,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Alabama,868,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Alabama,859,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wyoming,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Wyoming,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Wyoming,54,54,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Wyoming,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
# Keep
# all_states_df.loc["Alabama", " -> Total"].iloc[3:].sum()

In [None]:
# Keep
all_states_df.columns

In [None]:
# Keep
all_states_df.loc["Alabama"]

In [None]:
# Keep
list(all_states_df.columns)
drop_these = [" -> Total", " -> Total ->"]

In [None]:
# Keep
all_states_df["Origin"]

In [None]:
# Keep
## need state to be a column, need origin to be a column
# reset index so they can be columns