The dataset contains real estate sales records in NYC.
The following code performs an exploratory analysis on this dataset.

In [14]:
import pandas as pd
import numpy as np

In [15]:

# Build the CSV location with a forward slash: it is accepted on Windows,
# macOS and Linux, whereas a hard-coded backslash only resolves on Windows.
file_path = "Files/"
dataset_name = "nyc-rolling-sales.csv"
path = file_path + dataset_name
df_nyc_rolling_sales = pd.read_csv(path, header=0)
# The sale price and both square-footage columns are handled as text here:
# trim surrounding whitespace, treat a bare "-" as missing, then convert each
# column to the nullable integer dtype so missing values are preserved.
column_names_to_reformat = ["SALE PRICE", "LAND SQUARE FEET", "GROSS SQUARE FEET"]
for name in column_names_to_reformat:
    df_nyc_rolling_sales[name] = (
        df_nyc_rolling_sales[name]
        .str.strip()
        .replace('-', pd.NA)
        .astype('Int64')
    )
df_nyc_rolling_sales.head()



Unnamed: 0.1,Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,EASE-MENT,BUILDING CLASS AT PRESENT,ADDRESS,...,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE
0,4,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,392,6,,C2,153 AVENUE B,...,5,0,5,1633,6440,1900,2,C2,6625000.0,2017-07-19 00:00:00
1,5,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2,399,26,,C7,234 EAST 4TH STREET,...,28,3,31,4616,18690,1900,2,C7,,2016-12-14 00:00:00
2,6,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2,399,39,,C7,197 EAST 3RD STREET,...,16,1,17,2212,7803,1900,2,C7,,2016-12-09 00:00:00
3,7,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2B,402,21,,C4,154 EAST 7TH STREET,...,10,0,10,2272,6794,1913,2,C4,3936272.0,2016-09-23 00:00:00
4,8,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,404,55,,C2,301 EAST 10TH STREET,...,6,0,6,2369,4615,1900,2,C2,8000000.0,2016-11-17 00:00:00


# Clean and filter the dataset
An explanation for each data column can be found on the NYC website: https://www.nyc.gov/site/finance/taxes/

Remove unnecessary columns

In [16]:
# Columns this notebook treats as unnecessary for the analysis.
unneeded_columns = [
    'Unnamed: 0',
    'LOT',
    'EASE-MENT',
    'APARTMENT NUMBER',
    'ADDRESS',
    'ZIP CODE',
]
df_nyc_rolling_sales.drop(columns=unneeded_columns, inplace=True)
df_nyc_rolling_sales.head()

Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,BUILDING CLASS AT PRESENT,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE
0,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,392,C2,5,0,5,1633,6440,1900,2,C2,6625000.0,2017-07-19 00:00:00
1,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2,399,C7,28,3,31,4616,18690,1900,2,C7,,2016-12-14 00:00:00
2,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2,399,C7,16,1,17,2212,7803,1900,2,C7,,2016-12-09 00:00:00
3,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2B,402,C4,10,0,10,2272,6794,1913,2,C4,3936272.0,2016-09-23 00:00:00
4,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,404,C2,6,0,6,2369,4615,1900,2,C2,8000000.0,2016-11-17 00:00:00


Remove duplicate rows

In [17]:
# Count the fully duplicated rows before dropping them so the number removed
# can be reported afterwards.
no_duplicates = int(df_nyc_rolling_sales.duplicated().sum())
df_nyc_rolling_sales.drop_duplicates(inplace=True)
print(f"{no_duplicates} duplicate rows have been removed")

2751 duplicate rows have been removed


Sale Price is the value we are measuring against, so rows with a null Sale Price
cannot be used in the analysis and are removed.

In [18]:
# Record the row count before and after dropping rows without a sale price;
# `orig_no_rows` and `rows_deleted` are reused by the following cell.
orig_no_rows = df_nyc_rolling_sales.shape[0]
df_nyc_rolling_sales.dropna(subset="SALE PRICE", inplace=True)
curr_no_rows = df_nyc_rolling_sales.shape[0]
rows_deleted = orig_no_rows - curr_no_rows
# Share of the pre-filter rows that had no sale price, as a percentage.
percentage = (rows_deleted / orig_no_rows) * 100
print(
    f"There were {orig_no_rows} rows. "
    f"After deleting {round(percentage, 2)}% of them with no Sales Price values "
    f"there are now {curr_no_rows}"
)

There were 81797 rows. After deleting 15.73% of them with no Sales Price values there are now 68928


An explanation for each data column can be found on the NYC website: https://www.nyc.gov/site/finance/taxes/
From the website:

```text
A $0 sale indicates that there was a transfer of ownership without a cash consideration. There can be a number of reasons for a $0 sale including transfers of ownership from parents to children.
```

We also need to remove these entries, because a zero-price transfer is a special case that does not indicate the sale price of the property.

In [19]:
# Keep only rows with a non-zero sale price. The explicit .copy() makes the
# result an independent frame, so the column assignments in later cells do not
# trigger pandas' SettingWithCopyWarning on a filtered selection.
has_nonzero_price = df_nyc_rolling_sales['SALE PRICE'] != 0
df_nyc_rolling_sales = df_nyc_rolling_sales.loc[has_nonzero_price].copy()
curr_no_rows = len(df_nyc_rolling_sales)
# `orig_no_rows` and `rows_deleted` come from the null-removal cell above, so
# this isolates the rows removed by the zero-price filter alone.
rows_deleted1 = orig_no_rows - (curr_no_rows + rows_deleted)
# Expressed as a share of the row count before the null-price rows were removed.
percentage = (rows_deleted1 / orig_no_rows) * 100
print(f"There were originally {orig_no_rows} rows. After deleting {round(percentage, 2)}% of them with ZERO Sales Price values there are now {curr_no_rows}")

There were originally 81797 rows. After deleting 11.7% of them with ZERO Sales Price values there are now 59354


Create a column that expresses the Sale Price in millions, which makes the values easier to read in tables and plots

In [20]:
# Cast the nullable-integer price to float64, then rescale it to millions.
df_nyc_rolling_sales['SALE_PRICE_MILLIONS'] = (
    df_nyc_rolling_sales['SALE PRICE']
    .astype(np.float64)
    .div(1_000_000)
)

Replace the borough numbers with their names

In [21]:
# Lookup from the numeric borough code to its display name.
borough_names = {
    1: 'Manhattan',
    2: 'Bronx',
    3: 'Brooklyn',
    4: 'Queens',
    5: 'Staten Island',
}
borough_codes = df_nyc_rolling_sales['BOROUGH']
df_nyc_rolling_sales['BOROUGH_NAME'] = borough_codes.map(borough_names)
df_nyc_rolling_sales.head()

Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,BUILDING CLASS AT PRESENT,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE,SALE_PRICE_MILLIONS,BOROUGH_NAME
0,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,392,C2,5,0,5,1633,6440,1900,2,C2,6625000,2017-07-19 00:00:00,6.625,Manhattan
3,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2B,402,C4,10,0,10,2272,6794,1913,2,C4,3936272,2016-09-23 00:00:00,3.936272,Manhattan
4,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,404,C2,6,0,6,2369,4615,1900,2,C2,8000000,2016-11-17 00:00:00,8.0,Manhattan
6,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2B,406,C4,8,0,8,1750,4226,1920,2,C4,3192840,2016-09-23 00:00:00,3.19284,Manhattan
9,1,ALPHABET CITY,08 RENTALS - ELEVATOR APARTMENTS,2,387,D9,24,0,24,4489,18523,1920,2,D9,16232000,2016-11-07 00:00:00,16.232,Manhattan


In [22]:
# Print the frame's columns with their non-null counts and dtypes.
df_nyc_rolling_sales.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59354 entries, 0 to 84547
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   BOROUGH                         59354 non-null  int64  
 1   NEIGHBORHOOD                    59354 non-null  object 
 2   BUILDING CLASS CATEGORY         59354 non-null  object 
 3   TAX CLASS AT PRESENT            59354 non-null  object 
 4   BLOCK                           59354 non-null  int64  
 5   BUILDING CLASS AT PRESENT       59354 non-null  object 
 6   RESIDENTIAL UNITS               59354 non-null  int64  
 7   COMMERCIAL UNITS                59354 non-null  int64  
 8   TOTAL UNITS                     59354 non-null  int64  
 9   LAND SQUARE FEET                38284 non-null  Int64  
 10  GROSS SQUARE FEET               37738 non-null  Int64  
 11  YEAR BUILT                      59354 non-null  int64  
 12  TAX CLASS AT TIME OF SALE       

Create a Sales Month column representing the month of the sales date

In [23]:
# Parse the sale date once, store it back as a datetime column, and derive
# the calendar month number from the parsed values.
sale_dates = pd.to_datetime(df_nyc_rolling_sales['SALE DATE'])
df_nyc_rolling_sales['SALE DATE'] = sale_dates
df_nyc_rolling_sales['SALE_MONTH'] = sale_dates.dt.month

df_nyc_rolling_sales.head()

Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,BUILDING CLASS AT PRESENT,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE,SALE_PRICE_MILLIONS,BOROUGH_NAME,SALE_MONTH
0,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,392,C2,5,0,5,1633,6440,1900,2,C2,6625000,2017-07-19,6.625,Manhattan,7
3,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2B,402,C4,10,0,10,2272,6794,1913,2,C4,3936272,2016-09-23,3.936272,Manhattan,9
4,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,404,C2,6,0,6,2369,4615,1900,2,C2,8000000,2016-11-17,8.0,Manhattan,11
6,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2B,406,C4,8,0,8,1750,4226,1920,2,C4,3192840,2016-09-23,3.19284,Manhattan,9
9,1,ALPHABET CITY,08 RENTALS - ELEVATOR APARTMENTS,2,387,D9,24,0,24,4489,18523,1920,2,D9,16232000,2016-11-07,16.232,Manhattan,11


In [24]:
# Print the columns, non-null counts and dtypes again, now that SALE DATE has
# been converted with pd.to_datetime and SALE_MONTH has been added.
df_nyc_rolling_sales.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59354 entries, 0 to 84547
Data columns (total 19 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   BOROUGH                         59354 non-null  int64         
 1   NEIGHBORHOOD                    59354 non-null  object        
 2   BUILDING CLASS CATEGORY         59354 non-null  object        
 3   TAX CLASS AT PRESENT            59354 non-null  object        
 4   BLOCK                           59354 non-null  int64         
 5   BUILDING CLASS AT PRESENT       59354 non-null  object        
 6   RESIDENTIAL UNITS               59354 non-null  int64         
 7   COMMERCIAL UNITS                59354 non-null  int64         
 8   TOTAL UNITS                     59354 non-null  int64         
 9   LAND SQUARE FEET                38284 non-null  Int64         
 10  GROSS SQUARE FEET               37738 non-null  Int64         
 11  YE

Now the data is ready for EDA. Some of the data is categorical and some of the data is continuous.

Let's have a look at the correlations

In [25]:
# Correlate the numeric columns only. Selecting them explicitly avoids relying
# on DataFrame.corr() silently dropping text/datetime columns, a default that
# was deprecated in pandas 1.5 and raises an error in pandas 2.x.
numeric_columns = df_nyc_rolling_sales.select_dtypes(include="number")
# Absolute values: only the strength of each relationship is shown, not its sign.
corr = numeric_columns.corr().abs()
corr

Unnamed: 0,BOROUGH,BLOCK,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,SALE PRICE,SALE_PRICE_MILLIONS,SALE_MONTH
BOROUGH,1.0,0.442627,0.015391,0.000783,0.012149,0.032238,0.071813,0.116259,0.364667,0.076983,0.076983,0.003719
BLOCK,0.442627,1.0,0.007733,0.002335,0.005187,0.001953,0.042323,0.087651,0.329072,0.062297,0.062297,0.014593
RESIDENTIAL UNITS,0.015391,0.007733,1.0,0.014332,0.81344,0.343801,0.713107,0.019808,0.000328,0.142244,0.142244,0.012747
COMMERCIAL UNITS,0.000783,0.002335,0.014332,1.0,0.593202,0.035839,0.058455,0.004015,0.04233,0.04495,0.04495,0.004626
TOTAL UNITS,0.012149,0.005187,0.81344,0.593202,1.0,0.297506,0.60774,0.017806,0.026798,0.140743,0.140743,0.012984
LAND SQUARE FEET,0.032238,0.001953,0.343801,0.035839,0.297506,1.0,0.619104,0.023567,0.068748,0.061439,0.061439,0.001467
GROSS SQUARE FEET,0.071813,0.042323,0.713107,0.058455,0.60774,0.619104,1.0,0.025664,0.123834,0.513771,0.513771,0.005809
YEAR BUILT,0.116259,0.087651,0.019808,0.004015,0.017806,0.023567,0.025664,1.0,0.160291,0.006765,0.006765,0.002495
TAX CLASS AT TIME OF SALE,0.364667,0.329072,0.000328,0.04233,0.026798,0.068748,0.123834,0.160291,1.0,0.11127,0.11127,0.003461
SALE PRICE,0.076983,0.062297,0.142244,0.04495,0.140743,0.061439,0.513771,0.006765,0.11127,1.0,1.0,0.004048
