#### Initial Set Up

In [1]:
# Import Dependencies
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Optional: lift pandas' display truncation to inspect every row and column
#pd.set_option("display.max_rows", None, "display.max_columns", None)

# Location of the raw brownfield export, relative to the working directory
path = "Resources/brownfields.csv"

# Load the raw dataset
df = pd.read_csv(path)

# Record the original dimensions so later cleaning steps can be compared against them
original_dimensions = df.shape
print(f'Original dimensions of the brownfield site dataset (rows/columns): {original_dimensions}')

# Preview the first 5 records. A notebook only renders the value of a cell's
# last expression, so the preview has to come after the print statement.
df.head()

Original dimensions of the brownfield site dataset (rows/columns): (5278, 16)


#### Clean Up

In [2]:
# Keep only the brownfield sites whose Town value mentions New York City;
# rows with a missing Town count as non-matches (na=False) and are dropped
is_nyc_site = df['Town'].str.contains("New York City", na=False)
df = df[is_nyc_site]

# Report the dimensions that remain after the filter
nyc_only = df.shape
print(f"Dimensions of the brownfield site dataset after excluding all sites outside NYC's 5 boroughs: {nyc_only}")

Dimensions of the brownfield site dataset after excluding all sites outside NYC's 5 boroughs: (1080, 16)


In [3]:
# Columns that are not needed for the rest of the analysis
irrelevant_columns = [
    'accuracy', "Address1", 'accuracyunit', 'x_coord', 'y_coord', "Locality",
    'Town', 'Address2', 'Region', 'method', 'Program',
]

# Index.difference yields the labels that are left over once the irrelevant
# ones are excluded; names that are not present in the frame are simply ignored
kept_columns = df.columns.difference(irrelevant_columns)
df = df[kept_columns]

# Report the dimensions that remain after dropping the columns
columns_removed = df.shape
print(f'The dimensions of the brownfield site dataset after removing irrelevant columns: {columns_removed}')

The dimensions of the brownfield site dataset after removing irrelevant columns: (1080, 5)


In [4]:
# Map the raw column names onto snake_case names
column_names = {
    "Sitename": "site_name",
    "siteclass": "status",
    "County": "borough",
    "ZIPCode": "zipcode",
    "SiteCode": "site_code",
}
df = df.rename(columns=column_names)

# Lowercase the free-text / identifier columns so values compare consistently
for text_column in ("borough", "site_name", "site_code"):
    df[text_column] = df[text_column].str.lower()

# Show the frame after renaming and lowercasing
df

Unnamed: 0,borough,site_code,site_name,zipcode,status
112,bronx,c203015,"gateway center at bronx terminal market, eastern",10451,C
113,bronx,c203023,plaza 163 llc,10457,N
114,bronx,c203014,parkview commons,10451,C
115,bronx,c203024,melrose commons north,10451-,N
116,bronx,c203028,"gateway center at bronx terminal market, western",10451,N
...,...,...,...,...,...
3998,richmond,v00318,carole cleaners - staten island mall,10314,N
3999,richmond,v00615,port ivory site (former p & g) site 1,10303,C
4000,richmond,v00674,port ivory site (former p & g) site 2,10303,C
4001,richmond,v00675,port ivory site (former p & g) site 3,10303,N


In [5]:
# Normalise ZIP codes to their 5-character form (e.g. "10451-" -> "10451").
#
# The cleaned values are built as a new Series and assigned back to the column.
# Calling .replace(..., inplace=True) on df['zipcode'] is a chained in-place
# operation that may act on a temporary object (and never updates df when pandas
# Copy-on-Write is active), and a bare .str.split() whose result is not assigned
# has no effect at all.
zipcode_clean = (
    df['zipcode']
    .str.replace(r"[\s,-]", "", regex=True)  # strip whitespace, commas and dashes
    .str[:5]                                 # keep only the first 5 characters
)
df['zipcode'] = zipcode_clean

In [6]:
# Translate county names into the corresponding borough names;
# values that are not listed here are left unchanged
county_to_borough = {
    "richmond": "staten island",
    "new york": "manhattan",
}
df['borough'] = df['borough'].replace(county_to_borough)

#### Group status in advance of classification

In [7]:
# Group the raw site classification codes into broader status segments.
#
# Series.replace with a dict matches WHOLE cell values, so each code is translated
# exactly once. Chained substring replacements (str.replace) depend on the order of
# the calls and mangle multi-character codes: 'PR' would turn into
# 'potential_threatR' once its 'P' is substituted.
# Codes that are not listed below are left unchanged.
status_groups = {
    # Completed projects
    'N': 'completed_clean',
    'C': 'completed_clean',
    '5': 'completed_clean',
    # Sites that pose a significant threat
    '1': 'active_significant_threat',
    '2': 'active_significant_threat',
    # Sites that pose a potential threat
    'P': 'potential_threat',
    'PR': 'potential_threat',
    # Sites that are clean but require continued maintenance
    '4': 'maintain_continued_threat',
    # Sites with an active cleanup and a mild threat
    '3': 'active_cleanup_mild_threat',
    'A': 'active_cleanup_mild_threat',
}
df['status'] = df['status'].replace(status_groups)

In [8]:
# Fold any leftover 'potential_threatR' label (a 'PR' code with only its 'P'
# substituted) into the regular 'potential_threat' segment
is_leftover_label = df['status'].eq("potential_threatR")
df['status'] = df['status'].mask(is_leftover_label, "potential_threat")

In [9]:
# Sanity check: number of sites that ended up in each status segment
df['status'].value_counts()

completed_clean               667
active_cleanup_mild_threat    306
potential_threat               49
active_significant_threat      44
maintain_continued_threat      14
Name: status, dtype: int64

#### Add Categorical Encoding & Binary Values

In [10]:
# Store the status column as a pandas categorical
status_as_category = df['status'].astype('category')
df['status'] = status_as_category

# Keep the integer code of each category in its own column.
# NOTE(review): the codes follow the order of the inferred categories, not a
# severity ranking - confirm that is what 'status_tier' is meant to express.
df['status_tier'] = status_as_category.cat.codes


In [11]:
# Display the frame to inspect the status and status_tier columns
df

Unnamed: 0,borough,site_code,site_name,zipcode,status,status_tier
112,bronx,c203015,"gateway center at bronx terminal market, eastern",10451,completed_clean,2
113,bronx,c203023,plaza 163 llc,10457,completed_clean,2
114,bronx,c203014,parkview commons,10451,completed_clean,2
115,bronx,c203024,melrose commons north,10451,completed_clean,2
116,bronx,c203028,"gateway center at bronx terminal market, western",10451,completed_clean,2
...,...,...,...,...,...,...
3998,staten island,v00318,carole cleaners - staten island mall,10314,completed_clean,2
3999,staten island,v00615,port ivory site (former p & g) site 1,10303,completed_clean,2
4000,staten island,v00674,port ivory site (former p & g) site 2,10303,completed_clean,2
4001,staten island,v00675,port ivory site (former p & g) site 3,10303,completed_clean,2


In [12]:
# One-hot encode the status column. dtype=int pins the indicators to 0/1 integers
# regardless of the pandas version (newer releases default to booleans).
dum_df = pd.get_dummies(df, columns=["status"], prefix=["type_is"], dtype=int)

# Only the newly created indicator columns need to be attached to the main frame
indicator_columns = [column for column in dum_df.columns if column not in df.columns]

# Attach the indicators by row label instead of df.merge(dum_df): a merge without
# `on=` joins on every shared column, so rows that happen to hold identical values
# in those columns would be multiplied. The index is then reset to 0..n-1 and named.
brownfield_df = pd.concat([df, dum_df[indicator_columns]], axis=1).reset_index(drop=True)
brownfield_df.index.name = 'index'

In [13]:
# Display the final frame, including the type_is_* indicator columns
brownfield_df

Unnamed: 0_level_0,borough,site_code,site_name,zipcode,status,status_tier,type_is_active_cleanup_mild_threat,type_is_active_significant_threat,type_is_completed_clean,type_is_maintain_continued_threat,type_is_potential_threat
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,bronx,c203015,"gateway center at bronx terminal market, eastern",10451,completed_clean,2,0,0,1,0,0
1,bronx,c203023,plaza 163 llc,10457,completed_clean,2,0,0,1,0,0
2,bronx,c203014,parkview commons,10451,completed_clean,2,0,0,1,0,0
3,bronx,c203024,melrose commons north,10451,completed_clean,2,0,0,1,0,0
4,bronx,c203028,"gateway center at bronx terminal market, western",10451,completed_clean,2,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
1075,staten island,v00318,carole cleaners - staten island mall,10314,completed_clean,2,0,0,1,0,0
1076,staten island,v00615,port ivory site (former p & g) site 1,10303,completed_clean,2,0,0,1,0,0
1077,staten island,v00674,port ivory site (former p & g) site 2,10303,completed_clean,2,0,0,1,0,0
1078,staten island,v00675,port ivory site (former p & g) site 3,10303,completed_clean,2,0,0,1,0,0


In [14]:
# Write the cleaned dataset to disk. The output folder is created first so the
# export does not fail when "output/" does not exist yet.
output_path = Path("output/brownfields_clean.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
brownfield_df.to_csv(output_path)