#### Initial Set Up

In [None]:
# Import Dependencies
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Remove dataFrame display size restrictions
#pd.set_option("display.max_rows", None, "display.max_columns", None)

# Create path
path = "Resources/brownfields.csv"

# Read in csv
df = pd.read_csv(path)

# Grab original dimensions before clean
original_dimensions = df.shape
print(f'Original dimensions of the brownfield site dataset (rows/columns): {original_dimensions}')

# Display first 5 records.
# Kept as the final expression of the cell: a notebook only renders the value
# of a cell's last expression, so a bare df.head() placed mid-cell shows nothing.
df.head()

#### Clean Up

In [None]:
# Keep only the Brownfield sites located in NYC.
# Rows whose Town is missing are treated as non-matching (na=False) and dropped.
is_nyc_site = df['Town'].str.contains("New York City", na=False)
df = df.loc[is_nyc_site]

# Record the dimensions after the NYC filter for comparison with the original.
nyc_only = df.shape
print(f"Dimensions of the brownfield site dataset after excluding all sites outside NYC's 5 boroughs: {nyc_only}")

In [None]:
# Columns that are not needed for the analysis
irrelevant_columns = [
    'accuracy',
    "Address1",
    'accuracyunit',
    'x_coord',
    'y_coord',
    "Locality",
    'Town',
    'Address2',
    'Region',
    'method',
    'Program',
]

# Keep every column that is not in the list above.
# Names in the list that do not exist in df are simply ignored by difference().
columns_to_keep = df.columns.difference(irrelevant_columns)
df = df[columns_to_keep]

# Record the dimensions after the column clean-up
columns_removed = df.shape
print(f'The dimensions of the brownfield site dataset after removing irrelevant columns: {columns_removed}')

In [None]:
# Map the original column names onto the snake_case names used from here on
new_column_names = {
    "Sitename": "site_name",
    "siteclass": "status",
    "County": "borough",
    "ZIPCode": "zipcode",
    "SiteCode": "site_code",
}
df = df.rename(columns=new_column_names)

# Lowercase the text columns so later comparisons are case-insensitive
for text_column in ("borough", "site_name", "site_code"):
    df[text_column] = df[text_column].str.lower()

# Show the result
df

In [None]:
# Ensure zipcodes are only 5 characters in length: keep the part before the
# first "-" so a ZIP+4 value such as "10001-1234" becomes "10001".
# The column is called "zipcode" here (renamed from "ZIPCode" earlier), and the
# result has to be assigned back, otherwise the split has no effect on df.
# NOTE(review): assumes the column holds strings -- confirm the dtype read from the csv.
zip_base = df['zipcode'].str.split("-", n=1).str[0]

# Remove all whitespaces and commas
# NOTE(review): applied to the zipcode column only -- confirm that is the intended scope.
df['zipcode'] = zip_base.str.replace(r"[\s,]", "", regex=True)


In [None]:
# Replace county name with borough name.
# Three NYC counties have a borough name that differs from the county name;
# Bronx and Queens are the same in both schemes and need no entry.
# Keys are lowercase because the column is lowercased before this step.
county_to_borough = {
    "richmond": "staten island",
    "new york": "manhattan",
    "kings": "brooklyn",
}
df['borough'] = df['borough'].replace(county_to_borough)

#### Group status in advance of classification

In [None]:
# Collapse the raw site-class codes into five status groups.
# Codes are matched against the whole (whitespace-stripped) value instead of by
# substring, so one code can never rewrite part of another value -- with
# substring replacement "PR" turned into "potential threatR".
# Values that are not listed here are left unchanged.
# NOTE(review): the code list is taken from the original cell -- confirm it
# covers every site class present in the data.
status_groups = {
    # Completed projects
    'N': 'completed - clean',
    'C': 'completed - clean',
    '5': 'completed - clean',
    # Sites that pose a significant threat
    '1': 'active - significant threat',
    '2': 'active - significant threat',
    # Sites that pose a potential threat
    'P': 'potential threat',
    'PR': 'potential threat',
    # Sites that are clean but require continued maintenance
    '4': 'continued threat',
    # Sites under active cleanup that pose a mild threat
    '3': 'active cleanup - mild threat',
    'A': 'active cleanup - mild threat',
}
df['status'] = df['status'].str.strip().replace(status_groups)

In [None]:
# Clean up the label left behind for the "PR" code: the "P" substitution
# rewrites only the leading "P", leaving a stray trailing "R".
df['status'] = df['status'].replace("potential threatR", "potential threat")

In [None]:
# Count how many sites fall into each status group
df['status'].value_counts()

#### Add Categorical Encoding & Binary Values

In [None]:
# Store status as a pandas categorical
status_as_category = df['status'].astype('category')
df['status'] = status_as_category

# Keep each row's integer category code in a separate column.
# NOTE(review): the codes follow the order of the inferred categories, not any
# ranking by threat severity -- confirm that is what "tier" should mean.
df['status_tier'] = status_as_category.cat.codes


In [None]:
# Display the DataFrame to inspect its current state
df

In [None]:
# Generate binary values using get_dummies
dum_df = pd.get_dummies(df, columns=["status"], prefix=["type_is"])

# Pick out only the indicator columns that get_dummies added
indicator_columns = [col for col in dum_df.columns if col not in df.columns]

# Attach the indicators to the main df by row index.
# A bare df.merge(dum_df) joins on every column the two frames share, so rows
# that agree on all of those columns are paired with each other and multiplied
# in the result. dum_df keeps df's index, so an index join stays one-to-one.
brownfield_df = df.join(dum_df[indicator_columns])

# merge() produced a fresh 0..n-1 index; keep that for any downstream code
brownfield_df = brownfield_df.reset_index(drop=True)
brownfield_df