# 📚 ***Import libraries***

In [1]:
# base libraries for data science
from pathlib import Path
import numpy as np
import pandas as pd
import pyarrow as pa

# 💾 ***Load data***

In [2]:
# Show every column when a DataFrame is rendered (None removes the column cap).
pd.options.display.max_columns = None

In [3]:
# The "data" folder is a sibling of the current working directory
# (parent of the cwd, then "data").
DATA_DIR = Path.cwd().resolve().parents[0].joinpath("data")

# Read the raw NYC sales extract from the 01_raw stage.
raw_csv_path = DATA_DIR.joinpath("01_raw", "nyc_houses_raw.csv")
nyc_houses_df = pd.read_csv(raw_csv_path)

# 📊 ***Data description***

In [4]:
# Print column names, non-null counts and dtypes of the frame as loaded.
nyc_houses_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137276 entries, 0 to 137275
Data columns (total 26 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   BUILDING CLASS AT PRESENT       137274 non-null  object 
 1   COMMERCIAL UNITS                137274 non-null  float64
 2   value                           137274 non-null  float64
 3   BUILDING CLASS CATEGORY         137274 non-null  object 
 4   BUILDING CLASS AT TIME OF SALE  137274 non-null  object 
 5   LOT                             137274 non-null  float64
 6   LOT                             137274 non-null  float64
 7   TOTAL UNITS                     137274 non-null  float64
 8   TAX CLASS AT TIME OF SALE       137274 non-null  float64
 9   ADDRESS                         137274 non-null  object 
 10  EASE-MENT                       137274 non-null  object 
 11  TAX CLASS AT PRESENT            137274 non-null  object 
 12  ZIP CODE        

In [5]:
# Eyeball ten random rows. The seed is pinned so the displayed rows are the
# same after every Restart & Run All instead of changing on each execution.
nyc_houses_df.sample(10, random_state=42)

Unnamed: 0,BUILDING CLASS AT PRESENT,COMMERCIAL UNITS,value,BUILDING CLASS CATEGORY,BUILDING CLASS AT TIME OF SALE,LOT,LOT.1,TOTAL UNITS,TAX CLASS AT TIME OF SALE,ADDRESS,EASE-MENT,TAX CLASS AT PRESENT,ZIP CODE,SALE DATE,NEIGHBORHOOD,SALE DATE.1,RESIDENTIAL UNITS,COMMERCIAL UNITS.1,GROSS SQUARE FEET,BOROUGH,BLOCK,SALE PRICE,APARTMENT NUMBER,YEAR BUILT,LAND SQUARE FEET,ZIP CODE.1
119178,D4,0.0,243637.0,10 COOPS - ELEVATOR APARTMENTS,D4,29.0,29.0,0.0,2.0,"155 EAST 49TH STREET, 6A",,2,10017.0,2017-08-10 00:00:00,MIDTOWN CBD,2017-08-10 00:00:00,0.0,0.0,-,1.0,1304.0,460000,,1924.0,-,10017.0
101681,A5,0.0,243637.0,01 ONE FAMILY DWELLINGS,A5,8.0,8.0,1.0,1.0,32-03 163 ST,,1,11358.0,2017-07-17 00:00:00,FLUSHING-NORTH,2017-07-17 00:00:00,1.0,0.0,1224,4.0,4911.0,685000,,1950.0,1710,11358.0
29852,A2,0.0,243637.0,01 ONE FAMILY DWELLINGS,A2,137.0,137.0,1.0,1.0,99 BENEDICT AVENUE,,1,10314.0,2017-06-07 00:00:00,CASTLETON CORNERS,2017-06-07 00:00:00,1.0,0.0,765,5.0,356.0,640000,,1963.0,7400,10314.0
3009,B1,0.0,243637.0,02 TWO FAMILY DWELLINGS,B1,69.0,69.0,2.0,1.0,2423 GERRITSEN AVENUE,,1,11229.0,2017-01-31 00:00:00,GERRITSEN BEACH,2017-01-31 00:00:00,2.0,0.0,1672,3.0,8828.0,0,,1950.0,2033,11229.0
70138,A5,0.0,243637.0,01 ONE FAMILY DWELLINGS,A5,20.0,20.0,1.0,1.0,1117 EAST 36TH STREET,,1,11210.0,2017-08-23 00:00:00,FLATBUSH-EAST,2017-08-23 00:00:00,1.0,0.0,1400,3.0,7636.0,280000,,1905.0,2000,11210.0
5630,A5,0.0,243637.0,01 ONE FAMILY DWELLINGS,A5,141.0,141.0,1.0,1.0,254 LEGION STREET,,1,11212.0,2016-10-25 00:00:00,BROWNSVILLE,2016-10-25 00:00:00,1.0,0.0,1116,3.0,3567.0,0,,1985.0,1800,11212.0
107400,V1,1.0,243637.0,31 COMMERCIAL VACANT LAND,V1,11.0,11.0,1.0,4.0,189 MONTAGUE STREET,,4,11201.0,2016-11-15 00:00:00,BROOKLYN HEIGHTS,2016-11-15 00:00:00,0.0,1.0,0,3.0,244.0,0,5.0,1900.0,10000,11201.0
37437,B2,0.0,243637.0,02 TWO FAMILY DWELLINGS,B2,11.0,11.0,2.0,1.0,60 ELMWOOD AVENUE,,1,10308.0,2017-04-24 00:00:00,GREAT KILLS,2017-04-24 00:00:00,2.0,0.0,1339,5.0,5261.0,625000,,1950.0,3800,10308.0
136111,B9,0.0,243637.0,02 TWO FAMILY DWELLINGS,B9,88.0,88.0,2.0,1.0,85 GROTON STREET,,1,10312.0,2017-07-28 00:00:00,GREAT KILLS,2017-07-28 00:00:00,2.0,0.0,1176,5.0,5316.0,520000,,2000.0,1530,10312.0
136072,A1,0.0,243637.0,01 ONE FAMILY DWELLINGS,A1,25.0,25.0,1.0,1.0,3952 HILL AVENUE,,1,10466.0,2016-09-28 00:00:00,BAYCHESTER,2016-09-28 00:00:00,1.0,0.0,1479,2.0,4959.0,485000,,1931.0,2375,10466.0


# 🧐***Verify duplicates***

In [6]:
# Collect every ordered pair of distinct column names whose contents are
# identical. A matching pair therefore appears twice: (a, b) and (b, a).
all_columns = nyc_houses_df.columns
duplicated_columns_content = [
    (left, right)
    for left in all_columns
    for right in all_columns
    if left != right and nyc_houses_df[left].equals(nyc_houses_df[right])
]

# Show the duplicated column pairs that were found
print("Duplicates:", duplicated_columns_content)

# Columns to verify, laid out flat as consecutive pairs:
# the trailing-space name first, then its counterpart without the space
columns_to_check = ['COMMERCIAL UNITS ', 'COMMERCIAL UNITS', 'LOT ', 'LOT', 'ZIP CODE ', 'ZIP CODE', 'SALE DATE ', 'SALE DATE']

# Walk the list two names at a time and report whether each pair matches
for spaced_name, clean_name in zip(columns_to_check[0::2], columns_to_check[1::2]):
    same_content = nyc_houses_df[spaced_name].equals(nyc_houses_df[clean_name])
    if same_content:
        print(f"✅ The columns '{spaced_name}' and '{clean_name}' have exactly the same content.")
    else:
        print(f"❌ The columns '{spaced_name}' and '{clean_name}' have different content.")

Duplicates: [('COMMERCIAL UNITS', 'COMMERCIAL UNITS '), ('LOT ', 'LOT'), ('LOT', 'LOT '), ('ZIP CODE', 'ZIP CODE '), ('SALE DATE', 'SALE DATE '), ('SALE DATE ', 'SALE DATE'), ('COMMERCIAL UNITS ', 'COMMERCIAL UNITS'), ('ZIP CODE ', 'ZIP CODE')]
✅ The columns 'COMMERCIAL UNITS ' and 'COMMERCIAL UNITS' have exactly the same content.
✅ The columns 'LOT ' and 'LOT' have exactly the same content.
✅ The columns 'ZIP CODE ' and 'ZIP CODE' have exactly the same content.
✅ The columns 'SALE DATE ' and 'SALE DATE' have exactly the same content.


✖️ ***The following columns will be removed because they are duplicates***

In [7]:
# From the checked names, keep the ones whose last character is a space;
# those are the copies that will be dropped.
columns_to_remove = [name for name in columns_to_check if name[-1:] == ' ']
columns_to_remove

['COMMERCIAL UNITS ', 'LOT ', 'ZIP CODE ', 'SALE DATE ']

In [8]:
# Drop the trailing-space duplicates; their twins without the space remain.
nyc_houses_df = nyc_houses_df.drop(labels=columns_to_remove, axis="columns")

In [9]:
# Re-inspect the frame after dropping the duplicated columns.
nyc_houses_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137276 entries, 0 to 137275
Data columns (total 22 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   BUILDING CLASS AT PRESENT       137274 non-null  object 
 1   COMMERCIAL UNITS                137274 non-null  float64
 2   value                           137274 non-null  float64
 3   BUILDING CLASS CATEGORY         137274 non-null  object 
 4   BUILDING CLASS AT TIME OF SALE  137274 non-null  object 
 5   LOT                             137274 non-null  float64
 6   TOTAL UNITS                     137274 non-null  float64
 7   TAX CLASS AT TIME OF SALE       137274 non-null  float64
 8   ADDRESS                         137274 non-null  object 
 9   EASE-MENT                       137274 non-null  object 
 10  TAX CLASS AT PRESENT            137274 non-null  object 
 11  ZIP CODE                        137274 non-null  float64
 12  SALE DATE       

In [10]:
# Eyeball ten random rows after the duplicate columns were dropped. The seed
# is pinned so the displayed rows are reproducible across kernel restarts.
nyc_houses_df.sample(10, random_state=42)

Unnamed: 0,BUILDING CLASS AT PRESENT,COMMERCIAL UNITS,value,BUILDING CLASS CATEGORY,BUILDING CLASS AT TIME OF SALE,LOT,TOTAL UNITS,TAX CLASS AT TIME OF SALE,ADDRESS,EASE-MENT,TAX CLASS AT PRESENT,ZIP CODE,SALE DATE,NEIGHBORHOOD,RESIDENTIAL UNITS,GROSS SQUARE FEET,BOROUGH,BLOCK,SALE PRICE,APARTMENT NUMBER,YEAR BUILT,LAND SQUARE FEET
123883,A5,0.0,243637.0,01 ONE FAMILY DWELLINGS,A5,49.0,1.0,1.0,840 THROGGS NECK EXPRESS,,1,10465.0,2017-07-28 00:00:00,COUNTRY CLUB,1.0,1998,2.0,5472.0,459000,,1965.0,1800
113246,R4,0.0,243637.0,13 CONDOS - ELEVATOR APARTMENTS,R4,1068.0,1.0,2.0,42-42 UNION STREET,,2,11355.0,2017-02-03 00:00:00,FLUSHING-NORTH,1.0,-,4.0,5180.0,-,4-F,0.0,-
135715,R4,0.0,243637.0,13 CONDOS - ELEVATOR APARTMENTS,R4,1005.0,1.0,2.0,41-28 HAIGHT STREET,,2,11355.0,2017-03-28 00:00:00,FLUSHING-NORTH,1.0,-,4.0,5062.0,427000,2A,0.0,-
63598,R4,0.0,243637.0,13 CONDOS - ELEVATOR APARTMENTS,R4,1004.0,1.0,2.0,30 EAST 85TH STREET,,2,10028.0,2017-02-21 00:00:00,UPPER EAST SIDE (79-96),1.0,-,1.0,1496.0,-,3A,1987.0,-
122164,A1,0.0,243637.0,01 ONE FAMILY DWELLINGS,A1,12.0,1.0,1.0,1669 KIMBALL STREET,,1,11234.0,2017-05-10 00:00:00,MARINE PARK,1.0,1260,3.0,7864.0,0,,1920.0,2000
134576,R1,0.0,243637.0,15 CONDOS - 2-10 UNIT RESIDENTIAL,R1,1004.0,1.0,2.0,116 COVERT STREET,,2C,11207.0,2016-11-03 00:00:00,BUSHWICK,1.0,0,3.0,3422.0,427975,2B,2015.0,0
58893,R4,0.0,243637.0,13 CONDOS - ELEVATOR APARTMENTS,R4,1004.0,1.0,2.0,77 EAST 110 STREET,,2,10029.0,2016-09-27 00:00:00,HARLEM-EAST,1.0,-,1.0,1616.0,173394,2B,2013.0,-
101101,C6,0.0,243637.0,09 COOPS - WALKUP APARTMENTS,C6,1.0,0.0,2.0,"155-15 79TH STREET, 183",,2,11414.0,2017-04-06 00:00:00,HOWARD BEACH,0.0,-,4.0,11459.0,155000,,1958.0,-
46249,B3,0.0,243637.0,02 TWO FAMILY DWELLINGS,B3,1.0,2.0,1.0,215-01 102ND AVENUE,,1,11429.0,2017-03-17 00:00:00,QUEENS VILLAGE,2.0,1670,4.0,11089.0,636525,,1930.0,2500
78590,R1,0.0,243637.0,15 CONDOS - 2-10 UNIT RESIDENTIAL,R1,1202.0,1.0,2.0,81 SAINT MARKS PLACE,,2C,11217.0,2017-02-03 00:00:00,PARK SLOPE,1.0,0,3.0,932.0,999000,2,0.0,0


In [11]:
# List the distinct values present in the EASE-MENT column.
easement_column = nyc_houses_df['EASE-MENT']
unique_easements = easement_column.unique()
print(unique_easements)

[' ' nan]


***Easement:***
An easement is a right, such as a right of way, which allows an entity to make limited use of 
another’s real property. For example:  MTA railroad tracks that run across a portion of another 
property.

***This column contains only blank strings and missing values, so it carries no information for the analysis.***

In [12]:
# Remove the EASE-MENT column from the working frame.
nyc_houses_df = nyc_houses_df.drop(labels=['EASE-MENT'], axis="columns")

***The `value` column holds a single constant (243637) in every non-null row, so it contributes nothing to the analysis.***

In [13]:
# Display the distinct values held by the 'value' column.
nyc_houses_df['value'].unique()

array([243637.,     nan])

In [14]:
# Remove the 'value' column from the working frame.
nyc_houses_df = nyc_houses_df.drop(labels=['value'], axis="columns")

In [15]:
# Re-inspect the frame after dropping the 'EASE-MENT' and 'value' columns.
nyc_houses_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137276 entries, 0 to 137275
Data columns (total 20 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   BUILDING CLASS AT PRESENT       137274 non-null  object 
 1   COMMERCIAL UNITS                137274 non-null  float64
 2   BUILDING CLASS CATEGORY         137274 non-null  object 
 3   BUILDING CLASS AT TIME OF SALE  137274 non-null  object 
 4   LOT                             137274 non-null  float64
 5   TOTAL UNITS                     137274 non-null  float64
 6   TAX CLASS AT TIME OF SALE       137274 non-null  float64
 7   ADDRESS                         137274 non-null  object 
 8   TAX CLASS AT PRESENT            137274 non-null  object 
 9   ZIP CODE                        137274 non-null  float64
 10  SALE DATE                       137274 non-null  object 
 11  NEIGHBORHOOD                    137274 non-null  object 
 12  RESIDENTIAL UNIT

## 🔠 ***Categorical Variables***
### ***Ordinal***
- ***TAX CLASS AT TIME OF SALE / TAX CLASS AT PRESENT:*** Every property in the city is assigned to one of four tax classes (Classes 1, 2, 3, and 4), 
based on the use of the property.  
    - Class 1: Includes most residential property of up to three units (such as one-, 
two-, and three-family homes and small stores or offices with one or two 
attached apartments), vacant land that is zoned for residential use, and most 
condominiums that are not more than three stories.  
    - Class 2: Includes all other property that is primarily residential, such as 
cooperatives and condominiums.  
    - Class 3: Includes property with equipment owned by a gas, telephone or electric 
company.  
    - Class 4: Includes all other properties not included in classes 1, 2, and 3, such as 
offices, factories, warehouses, garage buildings, etc.

### ***Nominal***
- ***BUILDING CLASS AT PRESENT:***  used to describe a property’s constructive use. The first position of the Building Class is a letter that is used to describe a general class of properties. The second position, a number, adds more specific information about the property’s use or construction style.
- ***BUILDING CLASS CATEGORY:*** A field included so that users of the Rolling Sales Files can easily identify similar properties.
- ***BUILDING CLASS AT TIME OF SALE:*** used to describe a property’s constructive use.
- ***LOT:*** A Tax Lot is a subdivision of a Tax Block and represents the property's unique location.
- ***ADDRESS:*** The street address of the property as listed on the Sales File.
- ***ZIP CODE:*** The property’s postal code.
- ***NEIGHBORHOOD:*** Department of Finance assessors determine the neighborhood name in the course of valuing 
properties.
- ***BOROUGH:*** The name of the borough in which the property is located. 
- ***BLOCK:*** A Tax Block is a sub-division of the borough on which real properties are located. The Department of Finance uses a Borough-Block-Lot classification to label all real property in the City.
- ***APARTMENT NUMBER***


## 🔢 ***Numerical variables***
### ***Discrete***
- ***COMMERCIAL UNITS:*** The number of commercial units at the listed property.
- ***TOTAL UNITS:*** The total number of units at the listed property.
- ***RESIDENTIAL UNITS:*** The number of residential units at the listed property.

### ***Continuous***
- ***LAND SQUARE FEET:*** The land area of the property listed in square feet.
- ***GROSS SQUARE FEET:*** The total area of all the floors of a building as measured from the exterior surfaces of the 
outside walls of the building, including the land area and space within any building or structure 
on the property.
- ***SALE PRICE:*** Price paid for the property.


## 🗓️ ***Dates***
- ***SALE DATE:*** Date the property sold.
- ***YEAR BUILT:*** Year the structure on the property was built.

# 👁️‍🗨️ ***Convert data types***

In [16]:
# Unify null values
nyc_houses_df.replace(["NULL", "None", "", "?", " ", "  "], np.nan, inplace=True)

print(nyc_houses_df.isnull().sum())

BUILDING CLASS AT PRESENT           1233
COMMERCIAL UNITS                       2
BUILDING CLASS CATEGORY                2
BUILDING CLASS AT TIME OF SALE         2
LOT                                    2
TOTAL UNITS                            2
TAX CLASS AT TIME OF SALE              2
ADDRESS                                2
TAX CLASS AT PRESENT                1233
ZIP CODE                               2
SALE DATE                              2
NEIGHBORHOOD                           2
RESIDENTIAL UNITS                      2
GROSS SQUARE FEET                      2
BOROUGH                                2
BLOCK                                  2
SALE PRICE                             2
APARTMENT NUMBER                  106257
YEAR BUILT                             2
LAND SQUARE FEET                       2
dtype: int64


In [17]:
# Columns stored as pandas 'category'.
# NOTE(review): the earlier .info() output lists LOT, ZIP CODE and
# TAX CLASS AT TIME OF SALE as float64, so their category labels come out
# as floats (e.g. 10460.0) — consider casting to Int64 first; confirm with
# downstream consumers before changing.
# NOTE(review): the two TAX CLASS columns are described as ordinal in the
# notes above, yet they are cast as unordered categories here — confirm.
cols_categorical_nom = [
    'TAX CLASS AT TIME OF SALE',
    'TAX CLASS AT PRESENT',
    'BUILDING CLASS AT PRESENT',
    'BUILDING CLASS CATEGORY',
    'BUILDING CLASS AT TIME OF SALE',
    'LOT',
    'ADDRESS',
    'ZIP CODE',
    'NEIGHBORHOOD',
    'BOROUGH',
    'BLOCK',
    'APARTMENT NUMBER',
]
nyc_houses_df = nyc_houses_df.astype({name: 'category' for name in cols_categorical_nom})

In [18]:
# Unit counts become nullable integers (Int64 keeps missing values as <NA>).
cols_numeric_int = ['COMMERCIAL UNITS', 'TOTAL UNITS', 'RESIDENTIAL UNITS']
nyc_houses_df = nyc_houses_df.astype({name: 'Int64' for name in cols_numeric_int})

In [19]:
# Area and price columns: map the ' -  ' placeholder to NaN, then cast to float.
cols_numeric_float = ['LAND SQUARE FEET', 'GROSS SQUARE FEET', 'SALE PRICE']

nyc_houses_df[cols_numeric_float] = (
    nyc_houses_df[cols_numeric_float]
    .replace(' -  ', np.nan)
    .astype('float')
)

In [20]:
# Convert SALE DATE to datetime64[ns].
nyc_houses_df = nyc_houses_df.astype({'SALE DATE': 'datetime64[ns]'})

In [21]:
# Store YEAR BUILT as a nullable integer.
# NOTE(review): some sampled rows show YEAR BUILT == 0, which looks like a
# missing-value sentinel — confirm and map it to <NA> if so.
nyc_houses_df = nyc_houses_df.astype({'YEAR BUILT': 'Int64'})

In [22]:
# Confirm the dtypes after the conversions above.
nyc_houses_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137276 entries, 0 to 137275
Data columns (total 20 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   BUILDING CLASS AT PRESENT       136043 non-null  category      
 1   COMMERCIAL UNITS                137274 non-null  Int64         
 2   BUILDING CLASS CATEGORY         137274 non-null  category      
 3   BUILDING CLASS AT TIME OF SALE  137274 non-null  category      
 4   LOT                             137274 non-null  category      
 5   TOTAL UNITS                     137274 non-null  Int64         
 6   TAX CLASS AT TIME OF SALE       137274 non-null  category      
 7   ADDRESS                         137274 non-null  category      
 8   TAX CLASS AT PRESENT            136043 non-null  category      
 9   ZIP CODE                        137274 non-null  category      
 10  SALE DATE                       137274 non-null  datetim

In [23]:
# Display five random rows of the typed frame. The seed is pinned so the
# displayed rows are reproducible across kernel restarts.
nyc_houses_df.sample(5, random_state=42)

Unnamed: 0,BUILDING CLASS AT PRESENT,COMMERCIAL UNITS,BUILDING CLASS CATEGORY,BUILDING CLASS AT TIME OF SALE,LOT,TOTAL UNITS,TAX CLASS AT TIME OF SALE,ADDRESS,TAX CLASS AT PRESENT,ZIP CODE,SALE DATE,NEIGHBORHOOD,RESIDENTIAL UNITS,GROSS SQUARE FEET,BOROUGH,BLOCK,SALE PRICE,APARTMENT NUMBER,YEAR BUILT,LAND SQUARE FEET
134130,K1,2,22 STORE BUILDINGS,K1,26.0,2,4.0,812-818 EAST TREMONT AVENUE,4,10460.0,2017-02-27,EAST TREMONT,0,4100.0,2.0,2956.0,2040000.0,,1931,5000.0
76235,A5,0,01 ONE FAMILY DWELLINGS,A5,109.0,1,1.0,388 COLON AVENUE,1,10308.0,2017-05-26,GREAT KILLS,1,1092.0,5.0,5458.0,,,1975,2121.0
57312,R9,0,17 CONDO COOPS,R9,1901.0,0,2.0,"551 MAIN STREET, RES",2,10044.0,2016-10-05,ROOSEVELT ISLAND,0,,1.0,1373.0,444009.0,,1975,
74439,O6,35,21 OFFICE BUILDINGS,O6,64.0,35,4.0,240 WEST 35TH STREET,4,10001.0,2016-12-20,FASHION,0,145372.0,1.0,784.0,108000000.0,,1924,9875.0
132556,R4,0,13 CONDOS - ELEVATOR APARTMENTS,R4,1038.0,1,2.0,90 FURMAN STREET,2,11201.0,2016-12-19,BROOKLYN HEIGHTS,1,0.0,3.0,199.0,1850000.0,N-522,2015,0.0


In [24]:
# Capture the Arrow schema that pyarrow derives from the typed frame;
# it is reused when the frame is written to parquet.
arrow_table = pa.Table.from_pandas(nyc_houses_df)
schema = arrow_table.schema

# 💾 ***Save dataframe with data types***

In [25]:
# Make sure the intermediate-stage folder exists (no error if it already does).
output_dir = DATA_DIR.joinpath("02_intermediate")
output_dir.mkdir(parents=True, exist_ok=True)

In [26]:
# Write the typed frame to parquet with the captured schema, without the index.
parquet_path = output_dir / "nyc_houses_fixed.parquet"
nyc_houses_df.to_parquet(parquet_path, index=False, schema=schema)