# Setup

## Import Libraries

In [1]:
import pandas as pd
import os
import json

## Set file locations

In [2]:
# Each dataset is stored twice (CSV and flat JSON); the pairs below keep the
# two locations of the same dataset next to each other.

# tract-level data
tract_data_csv, tract_data_json = (
    "../00_Data/cleaned_data/cleaned_pdb_tract.csv",
    "../00_Data/cleaned_data/cleaned_pdb_tract.json",
)

# block-group-level data
block_data_csv, block_data_json = (
    "../00_Data/cleaned_data/cleaned_pdb_block_group.csv",
    "../00_Data/cleaned_data/cleaned_pdb_block_group.json",
)

# site (priorities list) data
site_data_csv, site_data_json = (
    "../00_Data/cleaned_data/cleaned_priorities_list.csv",
    "../00_Data/cleaned_data/cleaned_priorities_list.json",
)

# rows written out for manual inspection (CSV only)
inspection_csv = "../00_Data/inspection_files/inspection_file.csv"

# output of this notebook: the cleaned, merged data
merged_data_csv, merged_data_json = (
    "../00_Data/cleaned_data/cleaned_merged_data.csv",
    "../00_Data/cleaned_data/cleaned_merged_data.json",
)

# Import Data

In [3]:
# Load the block-group census table from its CSV file and print a summary
# (columns, non-null counts, dtypes) as a quick sanity check.
census_df = pd.read_csv(filepath_or_buffer=block_data_csv)
census_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220354 entries, 0 to 220353
Data columns (total 34 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   fips_block_group              220354 non-null  int64  
 1   state_name                    220354 non-null  object 
 2   county_name                   220354 non-null  object 
 3   tract                         220354 non-null  int64  
 4   block_group                   220354 non-null  int64  
 5   tot_population_cen_2010       220354 non-null  float64
 6   hispanic_cen_2010             220354 non-null  float64
 7   nh_blk_alone_cen_2010         220354 non-null  float64
 8   nh_aian_alone_cen_2010        220354 non-null  float64
 9   nh_asian_alone_cen_2010       220354 non-null  float64
 10  nh_nhopi_alone_cen_2010       220354 non-null  float64
 11  nh_sor_alone_cen_2010         220354 non-null  float64
 12  college_acs_09_13             220354 non-nul

In [4]:
# Load the superfund site table from its JSON file and print a summary
# (columns, non-null counts, dtypes) as a quick sanity check.
site_df = pd.read_json(path_or_buf=site_data_json)
site_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1344 entries, 0 to 1343
Data columns (total 39 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   fips_block_group              1344 non-null   int64  
 1   address                       1344 non-null   object 
 2   city                          1344 non-null   object 
 3   latitude                      1344 non-null   float64
 4   longitude                     1344 non-null   float64
 5   site_score                    1344 non-null   float64
 6   site_text                     1344 non-null   object 
 7   state_name                    1344 non-null   object 
 8   county_name                   1344 non-null   object 
 9   tract                         1344 non-null   int64  
 10  block_group                   1344 non-null   int64  
 11  tot_population_cen_2010       1344 non-null   int64  
 12  hispanic_cen_2010             1344 non-null   int64  
 13  nh_

# Merge Data

In [5]:
# Outer join on the block-group key: keeps every census row and every site
# row, whether or not the key is present on the other side. Columns that
# exist in both frames come back with pandas' default _x (census) and
# _y (site) suffixes.
merged_df = census_df.merge(site_df, on='fips_block_group', how='outer')
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 220431 entries, 0 to 220430
Data columns (total 72 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   fips_block_group                220431 non-null  int64  
 1   state_name_x                    220423 non-null  object 
 2   county_name_x                   220423 non-null  object 
 3   tract_x                         220423 non-null  float64
 4   block_group_x                   220423 non-null  float64
 5   tot_population_cen_2010_x       220423 non-null  float64
 6   hispanic_cen_2010_x             220423 non-null  float64
 7   nh_blk_alone_cen_2010_x         220423 non-null  float64
 8   nh_aian_alone_cen_2010_x        220423 non-null  float64
 9   nh_asian_alone_cen_2010_x       220423 non-null  float64
 10  nh_nhopi_alone_cen_2010_x       220423 non-null  float64
 11  nh_sor_alone_cen_2010_x         220423 non-null  float64
 12  college_acs_09_1

## Inspect/Clean the Merged Data
Looking at the `.info()` output above, we notice that some rows are missing census information: the census-side columns (suffix `_x`) have fewer non-null values than there are rows, i.e. some FIPS block group IDs from the site list do not appear in the census data. Let's take a closer look at those rows. Later, we need to decide how to deal with that data.

### Inspection

In [6]:
# Rows with no census-side state name are the ones the merge could not
# match to a census record.
missing_census_mask = merged_df['state_name_x'].isna()
inspection_df = merged_df.loc[missing_census_mask]

In [7]:
# Save the unmatched rows for manual inspection; the row index is not written.
inspection_df.to_csv(path_or_buf=inspection_csv, index=False)

### Cleaning

In [8]:
# Drop the duplicate columns: the merge gave columns present in both frames
# an _x (census side) and a _y (site side) suffix; only the _x copy is kept.
column_names = list(merged_df.columns.values)

exclusion_list = ['_y']
# Match the suffix at the END of the name only. A plain substring test would
# also drop any column that merely contains "_y" somewhere in the middle.
selected_columns = [
    name for name in column_names
    if not name.endswith(tuple(exclusion_list))
]

# Take an explicit copy so the steps below modify an independent frame
# instead of a selection of merged_df (avoids SettingWithCopyWarning).
clean_merged_df = merged_df[selected_columns].copy()

# Tidy up the column names: remove a literal trailing "_x" and nothing else.
# str.rstrip('_x') is not a suffix remover; it strips every trailing '_' and
# 'x' character, so e.g. "tax_x" would become "ta".
clean_merged_df.columns = clean_merged_df.columns.str.replace(r'_x$', '', regex=True)

# Fill in nulls with zeros (returns a new frame rather than mutating in place).
# NOTE(review): rows that exist only in the site list have nulls in every
# census-side column, including text columns such as state_name, so those
# cells become 0 as well - confirm this is the intended treatment.
clean_merged_df = clean_merged_df.fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


In [9]:
# Summarize the cleaned frame (column names, non-null counts, dtypes) to
# check the result of the cleaning step.
clean_merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 220431 entries, 0 to 220430
Data columns (total 40 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   fips_block_group              220431 non-null  int64  
 1   state_name                    220431 non-null  object 
 2   county_name                   220431 non-null  object 
 3   tract                         220431 non-null  float64
 4   block_group                   220431 non-null  float64
 5   tot_population_cen_2010       220431 non-null  float64
 6   hispanic_cen_2010             220431 non-null  float64
 7   nh_blk_alone_cen_2010         220431 non-null  float64
 8   nh_aian_alone_cen_2010        220431 non-null  float64
 9   nh_asian_alone_cen_2010       220431 non-null  float64
 10  nh_nhopi_alone_cen_2010       220431 non-null  float64
 11  nh_sor_alone_cen_2010         220431 non-null  float64
 12  college_acs_09_13             220431 non-nul

# Export the Merged Data

In [10]:
# Write the cleaned, merged frame to CSV; the row index is not written.
clean_merged_df.to_csv(path_or_buf=merged_data_csv, index=False)

In [11]:
# Write the same frame as flat JSON: a list with one object per row.
clean_merged_df.to_json(path_or_buf=merged_data_json, orient='records')