# 📚 ***Import libraries***

In [1]:
# base libraries for data science
from pathlib import Path

import numpy as np
import pandas as pd
import pyarrow as pa

# 💾 ***Load data***

In [2]:
pd.set_option("display.max_columns", None)

In [3]:
# data directory path
DATA_DIR = Path.cwd().resolve().parents[1] / "data"

nyc_houses_df = pd.read_csv(DATA_DIR / "01_raw/nyc_houses_raw.csv")

# 📊 ***Data description***

In [4]:
nyc_houses_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137276 entries, 0 to 137275
Data columns (total 26 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   BUILDING CLASS AT PRESENT       137274 non-null  object 
 1   COMMERCIAL UNITS                137274 non-null  float64
 2   value                           137274 non-null  float64
 3   BUILDING CLASS CATEGORY         137274 non-null  object 
 4   BUILDING CLASS AT TIME OF SALE  137274 non-null  object 
 5   LOT                             137274 non-null  float64
 6   LOT                             137274 non-null  float64
 7   TOTAL UNITS                     137274 non-null  float64
 8   TAX CLASS AT TIME OF SALE       137274 non-null  float64
 9   ADDRESS                         137274 non-null  object 
 10  EASE-MENT                       137274 non-null  object 
 11  TAX CLASS AT PRESENT            137274 non-null  object 
 12  ZIP CODE        

In [5]:
nyc_houses_df.sample(10)

Unnamed: 0,BUILDING CLASS AT PRESENT,COMMERCIAL UNITS,value,BUILDING CLASS CATEGORY,BUILDING CLASS AT TIME OF SALE,LOT,LOT.1,TOTAL UNITS,TAX CLASS AT TIME OF SALE,ADDRESS,EASE-MENT,TAX CLASS AT PRESENT,ZIP CODE,SALE DATE,NEIGHBORHOOD,SALE DATE.1,RESIDENTIAL UNITS,COMMERCIAL UNITS.1,GROSS SQUARE FEET,BOROUGH,BLOCK,SALE PRICE,APARTMENT NUMBER,YEAR BUILT,LAND SQUARE FEET,ZIP CODE.1
116964,D4,0.0,243637.0,10 COOPS - ELEVATOR APARTMENTS,D4,136.0,136.0,0.0,2.0,"76-35 113TH ST, 2D",,2,11375.0,2017-06-26 00:00:00,FLUSHING MEADOW PARK,2017-06-26 00:00:00,0.0,0.0,-,4.0,2267.0,-,,1941.0,-,11375.0
120317,D4,0.0,243637.0,10 COOPS - ELEVATOR APARTMENTS,D4,1.0,1.0,0.0,2.0,"2601 GLENWOOD ROAD, 2M",,2,11210.0,2016-09-23 00:00:00,FLATBUSH-CENTRAL,2016-09-23 00:00:00,0.0,0.0,0,3.0,5247.0,179000,,1940.0,0,11210.0
40328,B2,0.0,243637.0,02 TWO FAMILY DWELLINGS,B2,25.0,25.0,2.0,1.0,144-15 LUX ROAD,,1,11435.0,2017-05-31 00:00:00,SOUTH JAMAICA,2017-05-31 00:00:00,2.0,0.0,1800,4.0,10077.0,630000,,2004.0,2500,11435.0
10058,A2,0.0,243637.0,01 ONE FAMILY DWELLINGS,A2,48.0,48.0,1.0,1.0,614 BRITTON AVENUE,,1,10304.0,2016-10-17 00:00:00,GRASMERE,2016-10-17 00:00:00,1.0,0.0,832,5.0,3170.0,450000,,1940.0,4000,10304.0
114460,D4,0.0,243637.0,10 COOPS - ELEVATOR APARTMENTS,D4,3.0,3.0,0.0,2.0,"1410 AVENUE S, 5F",,2,11229.0,2017-01-23 00:00:00,MADISON,2017-01-23 00:00:00,0.0,0.0,0,3.0,7293.0,403077,,1939.0,0,11229.0
75822,D4,0.0,243637.0,10 COOPS - ELEVATOR APARTMENTS,D4,51.0,51.0,0.0,2.0,"34-20 PARSONS BLVD, 1W",,2,11354.0,2017-04-20 00:00:00,FLUSHING-NORTH,2017-04-20 00:00:00,0.0,0.0,-,4.0,4994.0,245000,,1962.0,-,11354.0
128715,S1,1.0,243637.0,01 ONE FAMILY DWELLINGS,S1,1.0,1.0,2.0,1.0,2901 CRUGER AVE,,1,10467.0,2017-06-29 00:00:00,BRONXDALE,2017-06-29 00:00:00,1.0,1.0,3592,2.0,4546.0,0,,1940.0,4515,10467.0
93384,D4,0.0,243637.0,10 COOPS - ELEVATOR APARTMENTS,D4,15.0,15.0,0.0,2.0,"525 EAST 82ND STREET, 9G",,2,10028.0,2016-11-07 00:00:00,UPPER EAST SIDE (79-96),2016-11-07 00:00:00,0.0,0.0,-,1.0,1579.0,1050000,,1962.0,-,10028.0
79577,C6,0.0,243637.0,09 COOPS - WALKUP APARTMENTS,C6,60.0,60.0,0.0,2.0,"15-78 212 STREET, 259",,2,11360.0,2017-05-15 00:00:00,BAYSIDE,2017-05-15 00:00:00,0.0,0.0,-,4.0,5863.0,312500,,1957.0,-,11360.0
78260,R4,0.0,243637.0,13 CONDOS - ELEVATOR APARTMENTS,R4,1072.0,1072.0,1.0,2.0,245 EAST 93RD STREET,,2,10128.0,2017-03-07 00:00:00,UPPER EAST SIDE (79-96),2017-03-07 00:00:00,1.0,0.0,-,1.0,1539.0,3713000,7C,1985.0,-,10128.0


# 🧐***Verify duplicates***

In [None]:
duplicated_columns_content = []
for col1 in nyc_houses_df.columns:
    for col2 in nyc_houses_df.columns:
        if col1 != col2 and nyc_houses_df[col1].equals(nyc_houses_df[col2]):
            duplicated_columns_content.append((col1, col2))

# Mostrar columnas duplicadas después de limpieza
print("Duplicates:", duplicated_columns_content)

# Select the columns to check
columns_to_check = [
    "COMMERCIAL UNITS ",
    "COMMERCIAL UNITS",
    "LOT ",
    "LOT",
    "ZIP CODE ",
    "ZIP CODE",
    "SALE DATE ",
    "SALE DATE",
]

# Check if the columns have the same content
for i in range(0, len(columns_to_check), 2):  # Iterate in pairs
    col1 = columns_to_check[i]
    col2 = columns_to_check[i + 1]

    if nyc_houses_df[col1].equals(nyc_houses_df[col2]):
        print(f"✅ The columns '{col1}' and '{col2}' have exactly the same content.")
    else:
        print(f"❌ The columns '{col1}' and '{col2}' have different content.")

Duplicates: [('COMMERCIAL UNITS', 'COMMERCIAL UNITS '), ('LOT ', 'LOT'), ('LOT', 'LOT '), ('ZIP CODE', 'ZIP CODE '), ('SALE DATE', 'SALE DATE '), ('SALE DATE ', 'SALE DATE'), ('COMMERCIAL UNITS ', 'COMMERCIAL UNITS'), ('ZIP CODE ', 'ZIP CODE')]
✅ The columns 'COMMERCIAL UNITS ' and 'COMMERCIAL UNITS' have exactly the same content.
✅ The columns 'LOT ' and 'LOT' have exactly the same content.
✅ The columns 'ZIP CODE ' and 'ZIP CODE' have exactly the same content.
✅ The columns 'SALE DATE ' and 'SALE DATE' have exactly the same content.


✖️ ***The following columns will be removed because they are duplicates***

In [7]:
# Identify columns with extra spaces
columns_to_remove = [col for col in columns_to_check if col.endswith(" ")]
columns_to_remove

['COMMERCIAL UNITS ', 'LOT ', 'ZIP CODE ', 'SALE DATE ']

In [8]:
nyc_houses_df = nyc_houses_df.drop(columns=columns_to_remove)

In [9]:
nyc_houses_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137276 entries, 0 to 137275
Data columns (total 22 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   BUILDING CLASS AT PRESENT       137274 non-null  object 
 1   COMMERCIAL UNITS                137274 non-null  float64
 2   value                           137274 non-null  float64
 3   BUILDING CLASS CATEGORY         137274 non-null  object 
 4   BUILDING CLASS AT TIME OF SALE  137274 non-null  object 
 5   LOT                             137274 non-null  float64
 6   TOTAL UNITS                     137274 non-null  float64
 7   TAX CLASS AT TIME OF SALE       137274 non-null  float64
 8   ADDRESS                         137274 non-null  object 
 9   EASE-MENT                       137274 non-null  object 
 10  TAX CLASS AT PRESENT            137274 non-null  object 
 11  ZIP CODE                        137274 non-null  float64
 12  SALE DATE       

In [10]:
nyc_houses_df.sample(10)

Unnamed: 0,BUILDING CLASS AT PRESENT,COMMERCIAL UNITS,value,BUILDING CLASS CATEGORY,BUILDING CLASS AT TIME OF SALE,LOT,TOTAL UNITS,TAX CLASS AT TIME OF SALE,ADDRESS,EASE-MENT,TAX CLASS AT PRESENT,ZIP CODE,SALE DATE,NEIGHBORHOOD,RESIDENTIAL UNITS,GROSS SQUARE FEET,BOROUGH,BLOCK,SALE PRICE,APARTMENT NUMBER,YEAR BUILT,LAND SQUARE FEET
17196,A1,0.0,243637.0,01 ONE FAMILY DWELLINGS,A1,58.0,1.0,1.0,170 DANIEL LOW TERRACE,,1,10301.0,2017-02-02 00:00:00,NEW BRIGHTON,1.0,1680,5.0,26.0,-,,1899.0,5000
52183,RB,0.0,243637.0,43 CONDO OFFICE BUILDINGS,RB,1470.0,1.0,4.0,633 3 AVENUE,,4,10017.0,2017-04-21 00:00:00,MIDTOWN EAST,0.0,-,1.0,1314.0,-,9D,1960.0,-
83313,B2,0.0,243637.0,02 TWO FAMILY DWELLINGS,B2,16.0,2.0,1.0,20 ACKERMAN STREET,,1,10308.0,2017-04-26 00:00:00,GREAT KILLS,2.0,2455,5.0,5212.0,854311,,2016.0,4342
12705,B3,0.0,243637.0,02 TWO FAMILY DWELLINGS,B3,66.0,2.0,1.0,140 BAY 7TH ST,,1,11228.0,2016-11-16 00:00:00,BATH BEACH,2.0,2260,3.0,6391.0,0,,1960.0,1933
109852,RB,0.0,243637.0,43 CONDO OFFICE BUILDINGS,RB,1205.0,1.0,4.0,188 EAST 70TH STREET,,4,10021.0,2017-04-27 00:00:00,UPPER EAST SIDE (59-79),0.0,-,1.0,1404.0,1300000,C1,1986.0,-
128253,D4,0.0,243637.0,10 COOPS - ELEVATOR APARTMENTS,D4,40.0,0.0,2.0,"15030 71 AVE, 2F",,2,11367.0,2017-05-31 00:00:00,FLUSHING-SOUTH,0.0,-,4.0,6698.0,-,,1952.0,-
75783,R4,0.0,243637.0,13 CONDOS - ELEVATOR APARTMENTS,R4,1210.0,1.0,2.0,133 MULBERRY STREET,,2,10013.0,2017-03-15 00:00:00,CHINATOWN,1.0,-,1.0,236.0,2100000,4C,1920.0,-
119343,R4,0.0,243637.0,13 CONDOS - ELEVATOR APARTMENTS,R4,4859.0,1.0,2.0,1604 METROPOLITAN AVENUE,,2,10462.0,2017-05-11 00:00:00,PARKCHESTER,1.0,0,2.0,3943.0,170000,8F,0.0,0
29177,R3,0.0,243637.0,04 TAX CLASS 1 CONDOS,R3,1105.0,1.0,1.0,8 JOSEPH LANE,,1A,10305.0,2017-06-29 00:00:00,ROSEBANK,1.0,-,5.0,3024.0,-,,1977.0,-
30427,R4,0.0,243637.0,13 CONDOS - ELEVATOR APARTMENTS,R4,1714.0,1.0,2.0,28 LYNCH STREET,,2,11206.0,2017-05-25 00:00:00,WILLIAMSBURG-CENTRAL,1.0,0,3.0,2235.0,0,7R,0.0,0


In [11]:
unique_easements = nyc_houses_df["EASE-MENT"].unique()
print(unique_easements)

[' ' nan]


***Easement:***
An easement is a right, such as a right of way, which allows an entity to make limited use of 
another’s real property. For example:  MTA railroad tracks that run across a portion of another 
property.

***This column is empty, therefore it does not represent any information for the analysis.***

In [12]:
nyc_houses_df = nyc_houses_df.drop(columns=["EASE-MENT"])

***Value is a column that has only one value that is repeated for all rows, so it does not provide any contribution to the analysis.***

In [13]:
nyc_houses_df["value"].unique()

array([243637.,     nan])

In [14]:
nyc_houses_df = nyc_houses_df.drop(columns=["value"])

In [15]:
nyc_houses_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137276 entries, 0 to 137275
Data columns (total 20 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   BUILDING CLASS AT PRESENT       137274 non-null  object 
 1   COMMERCIAL UNITS                137274 non-null  float64
 2   BUILDING CLASS CATEGORY         137274 non-null  object 
 3   BUILDING CLASS AT TIME OF SALE  137274 non-null  object 
 4   LOT                             137274 non-null  float64
 5   TOTAL UNITS                     137274 non-null  float64
 6   TAX CLASS AT TIME OF SALE       137274 non-null  float64
 7   ADDRESS                         137274 non-null  object 
 8   TAX CLASS AT PRESENT            137274 non-null  object 
 9   ZIP CODE                        137274 non-null  float64
 10  SALE DATE                       137274 non-null  object 
 11  NEIGHBORHOOD                    137274 non-null  object 
 12  RESIDENTIAL UNIT

## 🔠 ***Categorical Variables***
### ***Ordinal***
- ***TAX CLASS AT TIME OF SALE / TAX CLASS AT PRESENT:*** Every property in the city is assigned to one of four tax classes (Classes 1, 2, 3, and 4), 
based on the use of the property.  
    - Class 1: Includes most residential property of up to three units (such as one-, 
two-, and three-family homes and small stores or offices with one or two 
attached apartments), vacant land that is zoned for residential use, and most 
condominiums that are not more than three stories.  
    - Class 2: Includes all other property that is primarily residential, such as 
cooperatives and condominiums.  
    - Class 3: Includes property with equipment owned by a gas, telephone or electric 
company.  
    - Class 4: Includes all other properties not included in class 1,2, and 3, such as 
offices, factories, warehouses, garage buildings, etc.

### ***Nominal***
- ***BUILDING CLASS AT PRESENT:***  used to describe a property’s constructive use. The first position of the Building Class is a letter that is used to describe a general class of properties. The second position, a number, adds more specific information about the property’s use or construction style.
- ***BUILDING CLASS CATEGORY***:  field that we are including so that users of the Rolling Sales Files can easily identify similar properties.
- ***BUILDING CLASS AT TIME OF SALE:*** used to describe a property’s constructive use.
- ***LOT:***  a subdivision of a Tax Block and represents the property unique location.
- ***ADDRESS:*** The street address of the property as listed on the Sales File.
- ***ZIP CODE:*** The property’s postal code.
- ***NEIGHBORHOOD:*** Department of Finance assessors determine the neighborhood name in the course of valuing 
properties.
- ***BOROUGH:*** The name of the borough in which the property is located. 
- ***BLOCK:*** A Tax Block is a sub-division of the borough on which real properties are located. The Department of Finance uses a Borough-Block-Lot classification to label all real property in the City.
- ***APARTMENT NUMBER***


## 🔢 ***Numerical variables***
### ***Discrete***
- ***COMMERCIAL UNITS:*** The number of commercial units at the listed property.
- ***TOTAL UNITS:*** The total number of units at the listed property.
- ***RESIDENTIAL UNITS:*** The number of residential units at the listed property.

### ***Continuous***
- ***LAND SQUARE FEET:*** The land area of the property listed in square feet.
- ***GROSS SQUARE FEET:*** The total area of all the floors of a building as measured from the exterior surfaces of the 
outside walls of the building, including the land area and space within any building or structure 
on the property.
- ***SALE PRICE:*** Price paid for the property.


## 🗓️ ***Dates***
- ***SALE DATE:*** Date the property sold.
- ***YEAR BUILT:*** Year the structure on the property was built.

# 👁️‍🗨️ ***Convert data types***

In [16]:
# Unify null values
nyc_houses_df.replace(["NULL", "None", "", "?", " ", "  "], np.nan, inplace=True)

print(nyc_houses_df.isnull().sum())

BUILDING CLASS AT PRESENT           1233
COMMERCIAL UNITS                       2
BUILDING CLASS CATEGORY                2
BUILDING CLASS AT TIME OF SALE         2
LOT                                    2
TOTAL UNITS                            2
TAX CLASS AT TIME OF SALE              2
ADDRESS                                2
TAX CLASS AT PRESENT                1233
ZIP CODE                               2
SALE DATE                              2
NEIGHBORHOOD                           2
RESIDENTIAL UNITS                      2
GROSS SQUARE FEET                      2
BOROUGH                                2
BLOCK                                  2
SALE PRICE                             2
APARTMENT NUMBER                  106257
YEAR BUILT                             2
LAND SQUARE FEET                       2
dtype: int64


In [17]:
cols_categorical_nom = [
    "TAX CLASS AT TIME OF SALE",
    "TAX CLASS AT PRESENT",
    "BUILDING CLASS AT PRESENT",
    "BUILDING CLASS CATEGORY",
    "BUILDING CLASS AT TIME OF SALE",
    "LOT",
    "ADDRESS",
    "ZIP CODE",
    "NEIGHBORHOOD",
    "BOROUGH",
    "BLOCK",
    "APARTMENT NUMBER",
]
nyc_houses_df[cols_categorical_nom] = nyc_houses_df[cols_categorical_nom].astype("category")

In [18]:
cols_numeric_int = ["COMMERCIAL UNITS", "TOTAL UNITS", "RESIDENTIAL UNITS"]
nyc_houses_df[cols_numeric_int] = nyc_houses_df[cols_numeric_int].astype("Int64")

In [19]:
cols_numeric_float = ["LAND SQUARE FEET", "GROSS SQUARE FEET", "SALE PRICE"]

nyc_houses_df[cols_numeric_float] = nyc_houses_df[cols_numeric_float].replace(" -  ", np.nan)

nyc_houses_df[cols_numeric_float] = nyc_houses_df[cols_numeric_float].astype("float")

In [20]:
# cols_dates = ['SALE DATE', 'YEAR BUILT']
nyc_houses_df["SALE DATE"] = nyc_houses_df["SALE DATE"].astype("datetime64[ns]")

In [21]:
nyc_houses_df["YEAR BUILT"] = nyc_houses_df["YEAR BUILT"].astype("Int64")

In [22]:
nyc_houses_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137276 entries, 0 to 137275
Data columns (total 20 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   BUILDING CLASS AT PRESENT       136043 non-null  category      
 1   COMMERCIAL UNITS                137274 non-null  Int64         
 2   BUILDING CLASS CATEGORY         137274 non-null  category      
 3   BUILDING CLASS AT TIME OF SALE  137274 non-null  category      
 4   LOT                             137274 non-null  category      
 5   TOTAL UNITS                     137274 non-null  Int64         
 6   TAX CLASS AT TIME OF SALE       137274 non-null  category      
 7   ADDRESS                         137274 non-null  category      
 8   TAX CLASS AT PRESENT            136043 non-null  category      
 9   ZIP CODE                        137274 non-null  category      
 10  SALE DATE                       137274 non-null  datetim

In [23]:
# Generate a random sample index
nyc_houses_df.sample(5)

Unnamed: 0,BUILDING CLASS AT PRESENT,COMMERCIAL UNITS,BUILDING CLASS CATEGORY,BUILDING CLASS AT TIME OF SALE,LOT,TOTAL UNITS,TAX CLASS AT TIME OF SALE,ADDRESS,TAX CLASS AT PRESENT,ZIP CODE,SALE DATE,NEIGHBORHOOD,RESIDENTIAL UNITS,GROSS SQUARE FEET,BOROUGH,BLOCK,SALE PRICE,APARTMENT NUMBER,YEAR BUILT,LAND SQUARE FEET
54293,R1,0,15 CONDOS - 2-10 UNIT RESIDENTIAL,R1,1704.0,1,2.0,139 MESEROLE STREET,2C,11206.0,2016-09-30,WILLIAMSBURG-EAST,1,0.0,3.0,3043.0,621132.0,3A,2015,0.0
51437,V0,0,05 TAX CLASS 1 VACANT LAND,V0,21.0,0,1.0,135TH STREET,1B,0.0,2017-04-05,SOUTH OZONE PARK,0,,4.0,11702.0,65000.0,,0,2088.0
101688,B1,0,02 TWO FAMILY DWELLINGS,B1,29.0,2,1.0,42-76 149TH STREET,1,11355.0,2017-01-05,FLUSHING-NORTH,2,2383.0,4.0,5379.0,1290000.0,,1965,2041.0
100064,B3,0,02 TWO FAMILY DWELLINGS,B3,60.0,2,1.0,33-44 59TH STREET,1,11377.0,2017-06-28,WOODSIDE,2,1784.0,4.0,1182.0,920000.0,,1940,1800.0
71222,D4,0,10 COOPS - ELEVATOR APARTMENTS,D4,6.0,0,2.0,"8301 RIDGE BOULEVARD, 6B",2,11209.0,2017-01-12,BAY RIDGE,0,0.0,3.0,6016.0,735000.0,,1963,0.0


In [24]:
schema = pa.Table.from_pandas(nyc_houses_df).schema

# 💾 ***Save dataframe with data types***

In [25]:
output_dir = DATA_DIR / "02_intermediate"
Path(output_dir).mkdir(parents=True, exist_ok=True)

In [26]:
nyc_houses_df.to_parquet(output_dir / "nyc_houses_fixed.parquet", schema=schema, index=False)