#### Initial Set Up

In [1]:
# Import libraries
import os
import glob
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Remove dataFrame display size restrictions
#pd.set_option("display.max_rows", None, "display.max_columns", None)

In [2]:
# Create file path
manhattan_path = "Resources/manhattan_sales.csv"
bronx_path = "Resources/bronx_sales.csv"
staten_island_path = "Resources/staten_island_sales.csv"
queens_path = "Resources/queens_sales.csv"
brooklyn_path = "Resources/brooklyn_sales.csv"

# Read in files
manhattan = pd.read_csv(manhattan_path)
bronx = pd.read_csv(bronx_path)
staten_island = pd.read_csv(staten_island_path)
queens = pd.read_csv(queens_path)
brooklyn = pd.read_csv(brooklyn_path)

# Grab original dimensions before clean
manhattan_rows = len(manhattan)
bronx_rows = len(bronx)
staten_island_rows = len(staten_island)
queens_rows = len(queens)
brooklyn_rows = len(brooklyn)

print(f' Total Sales (rows) in Manhattan dataset Prior to Merge/Clean: {manhattan_rows}')
print(f' Total Sales (rows) in Staten Island dataset Prior to Merge/Clean: {staten_island_rows}')
print(f' Total Sales (rows) in The Bronx dataset Prior to Merge/Clean: {bronx_rows}')
print(f' Total Sales (rows) in The Queens dataset Prior to Merge/Clean: {queens_rows}')
print(f' Total Sales (rows) in The Brooklyn dataset Prior to Merge/Clean: {brooklyn_rows}') 

 Total Sales (rows) in Manhattan dataset Prior to Merge/Clean: 11031
 Total Sales (rows) in Staten Island dataset Prior to Merge/Clean: 5729
 Total Sales (rows) in The Bronx dataset Prior to Merge/Clean: 3936
 Total Sales (rows) in The Queens dataset Prior to Merge/Clean: 18076
 Total Sales (rows) in The Brooklyn dataset Prior to Merge/Clean: 11091


#### Merge

In [3]:
# Create list of csvs 
list_boroughs = [manhattan, bronx, staten_island, queens, brooklyn] 

# Join using concat( ) function
df = pd.concat(list_boroughs,ignore_index=True)

In [4]:
# Display original dimensions of dataframe pre-clean
orig_rows = len(df)
orig_columns = len(df.columns) 
print(f' Total Sales (rows) in The New York City Real Estate dataset Prior to Merge/Clean: {orig_rows}') 
print(f' Total fields (columns) in The New York City Real Estate dataset Prior to Merge/Clean: {orig_columns}') 


 Total Sales (rows) in The New York City Real Estate dataset Prior to Merge/Clean: 49863
 Total fields (columns) in The New York City Real Estate dataset Prior to Merge/Clean: 45


#### Clean Up

In [5]:
# Display columns before transformation process
df.columns

Index(['Address', 'Building Name', 'Neighborhood', 'City',
       'Price (Last Known)', 'PPSF', 'R', 'Bd', 'Total Ba', 'MLS Ba',
       'Full Ba', 'HBa', 'Sq Ft', 'Monthly Fees', 'Monthly Fees & Taxes',
       'Taxes Per Mo.', 'MLS Property Type', 'MLS Property Subtype',
       'Compass Property Type', 'Status', 'DOM', 'Updated Date', 'Listed Date',
       'Listed Price', 'Last Asking', 'Contract Date', 'Sold Price',
       'Closed Price Verification', 'Sold Date', 'Open House',
       'Open House Type', 'ZIP', 'Latitude', 'Longitude', 'Building Size',
       'Lot Size', 'Outdoor Space', 'Year Built', 'Service Level',
       'Building Units', 'Notes', 'Unnamed: 41', 'Unnamed: 42', 'Unnamed: 32',
       'Unnamed: 33'],
      dtype='object')

In [7]:
# Drop irrelevant columns
df = df.loc[:, ['Address', 'Neighborhood', 'City',
       'Bd','MLS Ba','Compass Property Type','DOM', 'Listed Date',
       'Listed Price', 'Sold Price','Sold Date', 'ZIP', 'Latitude', 'Longitude', 'Year Built']]  

clean_columns01 = len(df.columns)

print(f' Total fields (columns) in The New York City Real Estate dataset after dropping columns: {clean_columns01}')

 Total fields (columns) in The New York City Real Estate dataset after dropping columns: 15


In [8]:
# Rename columns
df.rename(columns={'Address': 'address', 'Building Name': 'building_name', 'Neighborhood': 'neighborhood', 
                              'City': 'borough', 'Bd': 'bed', 'MLS Ba': 'bath',
                              'Compass Property Type': 'property_type', 
                              'DOM': 'days_on_market', 'Listed Date': 'listed_date',
                              'Listed Price': 'listed_price', 'Sold Price': 'sold_price', 'Sold Date': 'sold_date',
                              'Latitude': 'lat', 'Longitude': 'long',
                              'ZIP': 'zipcode','Year Built': 'year_built'}, inplace=True)

df.index.name = 'index'

In [10]:
# Drop all rows where zipcode equals NaN
df = df.dropna(subset=['zipcode'])

rows_after_zipcode = len(df)
print(f'After removing all rows with a NaN value under zipcode column, there were {rows_after_zipcode} rows (sold listings) remaining')

After removing all rows with a NaN value under zipcode column, there were 37477 rows (sold listings) remaining


In [11]:
# Drop all rows where sold price equals NaN
df = df.dropna(subset=['sold_price'])

rows_after_soldprice = len(df)
print(f'After removing all rows with a NaN value under sold price column, there were {rows_after_soldprice} rows (sold listings) remaining')

After removing all rows with a NaN value under sold price column, there were 36243 rows (sold listings) remaining


In [12]:
# In bed column, replace 'Studio' + Alcove with '0'
df['bed'] = df['bed'].str.replace('Studio', '0')
df['bed'] = df['bed'].str.replace('Alcove', '0')

df['bed'] = df['bed'].astype(float)

In [13]:
# Change all string column values to lower case
df = df.applymap(lambda s:s.lower() if type(s) == str else s)

In [14]:
# Remove decimal and zero from string 
df['zipcode'] = df['zipcode'].astype(str).replace('\.0', '', regex=True)

df['year_built'] = df['year_built'].astype(str).replace('\.0', '', regex=True)

In [16]:
# Convert listed date, sold date and year built to datetime
df.year_built = pd.to_datetime(df.year_built, format='%Y')
df.sold_date = pd.to_datetime(df.year_built, format='%m/%d/%Y')
df.listed_date = pd.to_datetime(df.listed_date, format='%m/%d/%Y')

# Convert days on market columns to floats
df['days_on_market'] = df['days_on_market'].str.replace(',', '') # Removes commas from any properties on the market for 1000+ days
df['days_on_market'] = pd.to_numeric(df['days_on_market'])

# Convert bed column to float
df['bed'] = df.bed.astype(float)

# Remove commas and dollar signs from sold price listed price and convert to float
df['sold_price'] = df['sold_price'].str.replace('$', '')
df['sold_price'] = df['sold_price'].str.replace(',', '')
df['sold_price'] = pd.to_numeric(df['sold_price'])

df['listed_price'] = df['listed_price'].str.replace('$', '')
df['listed_price'] = df['listed_price'].str.replace(',', '')
df['listed_price'] = pd.to_numeric(df['listed_price'])


ValueError: time data '9/28/19' does not match format '%m/%d/%Y' (match)

In [None]:
df.dtypes

In [None]:
# Update incorrect borough names
#df['borough'].value_counts()

df['borough'] = df['borough'].str.replace('the bronx', 'bronx')
df['borough'] = df['borough'].str.replace('richmond', 'staten island')
df['borough'] = df['borough'].str.replace('s. ozone park', 'queens')
df['borough'] = df['borough'].str.replace('kew gardens', 'queens')

In [None]:
df['borough'].value_counts()

In [None]:
df['listed_date'].isna().value_counts()

In [None]:
# Export Clean DataFrame to CSV
df.to_csv("output/re_clean.csv")