# Data Cleaning for Project 3: Block Group Data

## Setup

### Import Libraries

In [1]:
import pandas as pd
import os

### Set File Locations

In [2]:
# note that some of the raw data files are very large
# these very large files are located in a gitignored directory.

# raw data files
raw_data_file = "../00_Data/raw_data/pdb_block_group.csv"

# clean data files
cleaned_data_csv = "../00_Data/cleaned_data/cleaned_pdb_block_group.csv"
cleaned_data_json = "../00_Data/cleaned_data/cleaned_pdb_block_group.json"

## Exploring the Dataset

### Bring in the block group data

In [3]:
data_df = pd.read_csv(raw_data_file)

### Take a look at the block group data
Some things to note here:
1. There are 220354 records and 345 columns.
2. There are plenty of nulls about.
3. There are no nulls for 'has_superfund', which is our outcome of interest.
4. Financial values, e.g. 'Med_House_Value_BG_ACS_09_13', are stored as strings.

In [4]:
data_df.info(verbose = True, null_counts = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220354 entries, 0 to 220353
Data columns (total 345 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   FIPS_Block_Group                  220354 non-null  int64  
 1   State                             220354 non-null  int64  
 2   State_name                        220354 non-null  object 
 3   County                            220354 non-null  int64  
 4   County_name                       220354 non-null  object 
 5   Tract                             220354 non-null  int64  
 6   Block_Group                       220354 non-null  int64  
 7   Flag                              600 non-null     float64
 8   LAND_AREA                         220334 non-null  float64
 9   AIAN_LAND                         220334 non-null  float64
 10  URBANIZED_AREA_POP_CEN_2010       220334 non-null  float64
 11  URBAN_CLUSTER_POP_CEN_2010        220334 non-null  

### Reformat financial data

In [5]:
# take a peek at the financial data stored as strings to see what kind of fromatting we're dealing with.
data_df['Aggr_House_Value_ACS_09_13'].head()

0     $25,895,000
1     $54,637,000
2     $61,012,600
3     $30,169,400
4    $104,252,000
Name: Aggr_House_Value_ACS_09_13, dtype: object

In [6]:
# make a list of the financial format columns
# grab all the columns with 'object' dtype
string_columns = list(data_df.select_dtypes(include=['object']).columns)
string_columns

['State_name',
 'County_name',
 'Med_HHD_Inc_BG_ACS_09_13',
 'Med_HHD_Inc_BG_ACSMOE_09_13',
 'Med_HHD_Inc_TR_ACS_09_13',
 'Med_HHD_Inc_TR_ACSMOE_09_13',
 'Aggregate_HH_INC_ACS_09_13',
 'Aggregate_HH_INC_ACSMOE_09_13',
 'Med_House_Value_BG_ACS_09_13',
 'Med_House_Value_BG_ACSMOE_09_13',
 'Med_house_value_TR_ACS_09_13',
 'Med_house_value_TR_ACSMOE_09_13',
 'Aggr_House_Value_ACS_09_13',
 'Aggr_House_Value_ACSMOE_09_13',
 'avg_Agg_HH_INC_ACS_09_13',
 'avg_Agg_HH_INC_ACSMOE_09_13',
 'avg_Agg_House_Value_ACS_09_13',
 'avg_Agg_House_Value_ACSMOE_09_13']

In [7]:
# leave off the first two entries ('State_name' and 'County_name') since we don't need to reformat those
financial_columns = string_columns[2:]

In [8]:
# Loop through and reformat the columns by taking out the '$' and ',', and then changing the dtype to 'float'.
# This can take a minute or two.
for x in financial_columns:
    data_df[[x]] = (data_df[x].replace( '[\$,)]','', regex=True )
                     .replace( '[(]','-',   regex=True ).astype(float))

In [9]:
# Quick check to verify the format.
data_df['Aggr_House_Value_ACS_09_13'].head()

0     25895000.0
1     54637000.0
2     61012600.0
3     30169400.0
4    104252000.0
Name: Aggr_House_Value_ACS_09_13, dtype: float64

### Other Cleaning

In [10]:
# convert the column names to lowercase to avoid problems later on.
data_df.columns= data_df.columns.str.lower()
# fill in nulls with zeros
data_df.fillna(0, inplace = True)

### Select Columns

In [11]:
# select columns we want to keep here
# for now, proof-of-concept
desired_columns = [x.lower() for x in [
                "FIPS_Block_Group",
                "State_name",
                "County_name",
                "Tract",
                "Block_Group",
                "Tot_Population_CEN_2010",
                "Hispanic_CEN_2010",
                "NH_Blk_alone_CEN_2010",
                "NH_AIAN_alone_CEN_2010",
                "NH_Asian_alone_CEN_2010",
                "NH_NHOPI_alone_CEN_2010",
                "NH_SOR_alone_CEN_2010",
                "College_ACS_09_13",
                "No_Health_Ins_ACS_09_13",
                "Med_HHD_Inc_BG_ACS_09_13",
                "Aggregate_HH_INC_ACS_09_13",
                "Tot_Vacant_Units_CEN_2010",
                "Renter_Occp_HU_CEN_2010",
                "Owner_Occp_HU_CEN_2010",
                "No_Plumb_ACS_09_13",
                "Med_House_Value_BG_ACS_09_13",
                "pct_Hispanic_CEN_2010",
                "pct_NH_Blk_alone_CEN_2010",
                "pct_NH_AIAN_alone_CEN_2010",
                "pct_NH_Asian_alone_CEN_2010",
                "pct_NH_NHOPI_alone_CEN_2010",
                "pct_NH_SOR_alone_CEN_2010",
                "pct_Not_HS_Grad_ACS_09_13",
                "pct_No_Health_Ins_ACS_09_13",
                "pct_Vacant_Units_CEN_2010",
                "pct_Renter_Occp_HU_CEN_2010",
                "pct_Owner_Occp_HU_CEN_2010",
                "pct_No_Plumb_ACS_09_13"]]
cleaned_data_df = data_df[desired_columns]
cleaned_data_df['has_superfund'] = data_df['has_superfund']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [12]:
# last check
cleaned_data_df.info(verbose = True, null_counts = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220354 entries, 0 to 220353
Data columns (total 34 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   fips_block_group              220354 non-null  int64  
 1   state_name                    220354 non-null  object 
 2   county_name                   220354 non-null  object 
 3   tract                         220354 non-null  int64  
 4   block_group                   220354 non-null  int64  
 5   tot_population_cen_2010       220354 non-null  float64
 6   hispanic_cen_2010             220354 non-null  float64
 7   nh_blk_alone_cen_2010         220354 non-null  float64
 8   nh_aian_alone_cen_2010        220354 non-null  float64
 9   nh_asian_alone_cen_2010       220354 non-null  float64
 10  nh_nhopi_alone_cen_2010       220354 non-null  float64
 11  nh_sor_alone_cen_2010         220354 non-null  float64
 12  college_acs_09_13             220354 non-nul

## Export

In [13]:
# Export to csv
cleaned_data_df.to_csv(cleaned_data_csv, index = False)

In [14]:
# Export to flat json
cleaned_data_df.to_json(cleaned_data_json, orient='records')