#### Initial Set Up

In [14]:
# Import Dependencies
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Remove dataFrame display size restrictions
#pd.set_option("display.max_rows", None, "display.max_columns", None)

# Create path
path = "resources/brownfields.csv"

# Read in csv
df = pd.read_csv(path)

# Display first 5 records
df.head()

# Grab original dimensions before clean
original_dimensions = df.shape
print(f'Original dimensions of the brownfield site dataset (rows/columns): {original_dimensions}') 

Original dimensions of the brownfield site dataset (rows/columns): (5278, 16)


#### Clean Up

In [15]:
# Remove all Brownfield sites that outside NYC
df = df[df['Town'].str.contains("New York City", na=False)]

nyc_only = df.shape
print(f"Dimensions of the brownfield site dataset after excluding all sites outside NYC's 5 boroughs: {nyc_only}")  

Dimensions of the brownfield site dataset after excluding all sites outside NYC's 5 boroughs: (1080, 16)


In [16]:
# Remove irrelevant columns
df = df[df.columns.difference(['accuracy', "Address1", 'accuracyunit', 'x_coord', 'y_coord', "Locality", 'Town', 'Address2', 'Region', 'method', 'Program'])]

columns_removed = df.shape
print(f'The dimensions of the brownfield site dataset after removing irrelevant columns: {columns_removed}') 

The dimensions of the brownfield site dataset after removing irrelevant columns: (1080, 5)


In [17]:
# Rename column names
df = df.rename(columns={"Sitename": "sitename", "County": "borough", "ZIPCode": "zip", "SiteCode": "site_code"}) 
# Convert all strings to lowercase
df["borough"] = df["borough"].str.lower()
df

Unnamed: 0,borough,site_code,sitename,zip,siteclass
112,bronx,C203015,"Gateway Center at Bronx Terminal Market, Eastern",10451,C
113,bronx,C203023,Plaza 163 LLC,10457,N
114,bronx,C203014,Parkview Commons,10451,C
115,bronx,C203024,Melrose Commons North,10451-,N
116,bronx,C203028,"Gateway Center at Bronx Terminal Market, Western",10451,N
...,...,...,...,...,...
3998,richmond,V00318,Carole Cleaners - Staten Island Mall,10314,N
3999,richmond,V00615,Port Ivory Site (Former P & G) Site 1,10303,C
4000,richmond,V00674,Port Ivory Site (Former P & G) Site 2,10303,C
4001,richmond,V00675,Port Ivory Site (Former P & G) Site 3,10303,N


In [18]:
# Ensure zipcodes are only 5 characters in length
df['zip'].str.split("-", 1)

# Remove all whitespaces and commas


112       [10451]
113       [10457]
114       [10451]
115     [10451, ]
116       [10451]
          ...    
3998      [10314]
3999      [10303]
4000      [10303]
4001      [10303]
4002      [10302]
Name: zip, Length: 1080, dtype: object

In [6]:
# Replace county name with borough name
df['borough'] = df['borough'].replace({"richmond": "staten island"})
df['borough'] = df['borough'].replace({"new york": "manhattan"})

#### Remove completed cleanup sites

* Classification Code: 5
    * The classification assigned to a site that has been properly closed and requires no further action. This may include a site where continued operation, maintenance, or monitoring is not needed to achieve/maintain protectiveness, but the site is not suitable for delisting from the Registry (e.g., DEC is unable to obtain an institutional control).

* Classification Code: C (Completed)
    * The classification used for sites where the Department has determined that remediation has been satisfactorily completed under a remedial program (i. e., State Superfund, Brownfield Cleanup Program, Environmental Restoration Program, Voluntary Cleanup Program, and RCRA Corrective Action Program). State Superfund (Registry) sites must have completed all active operation, maintenance, or monitoring requirements before they can be delisted and made class C. Non-registry sites may be made a class C after successful completion of all required construction or after a no further action remedy has been selected by the Department. These sites will be issued a Certificate of Completion (COC), but may still require ongoing maintenance and periodic certification of institutional/engineering controls (IC/ECs).

* Classification Code: N (No Further Action at this Time)



In [7]:
df = df[df["siteclass"] != '5']
#df = df[df["siteclass"] != 'C']
df = df[df["siteclass"] != 'N']

active_sites = len(df)
print(f'The dimensions of the Brownfield site dataset after removing completed/inactive sites: {active_sites}') 

The dimensions of the Brownfield site dataset after removing completed/inactive sites: 725


* Use categorical encoding using 1-5 classification (severity) or number of active sites

#### Add Categorical Encoding & Binary Values

In [9]:
# # Convert type of columns to 'category'
# df['category'] = df['category'].astype('category')

# # Assigning numerical values and store in another column
# df['category_tier'] = df['category'].cat.codes


In [10]:
# # Generate binary values using get_dummies
# dum_df = pd.get_dummies(df, columns=["category"], prefix=["Type_is"] )

# # Merge with main df
# crime_df = df.merge(dum_df)
# crime_df