# 📚 ***Import libraries***

In [1]:
# base libraries for data science
from pathlib import Path
import numpy as np
import pandas as pd
import pyarrow as pa

# 💾 ***Load data***

In [2]:
# The data folder lives one level above the current working directory
working_dir_parent = Path.cwd().resolve().parents[0]
DATA_DIR = working_dir_parent / "data"

# Read the raw sales extract into a DataFrame
raw_csv_path = DATA_DIR / "01_raw/nyc_houses_raw.csv"
nyc_houses_df = pd.read_csv(raw_csv_path)

# 📊 ***Data description***

In [3]:
# Print the column names, non-null counts and dtypes of the freshly loaded frame
nyc_houses_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137276 entries, 0 to 137275
Data columns (total 26 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   BUILDING CLASS AT PRESENT       137274 non-null  object 
 1   COMMERCIAL UNITS                137274 non-null  float64
 2   value                           137274 non-null  float64
 3   BUILDING CLASS CATEGORY         137274 non-null  object 
 4   BUILDING CLASS AT TIME OF SALE  137274 non-null  object 
 5   LOT                             137274 non-null  float64
 6   LOT                             137274 non-null  float64
 7   TOTAL UNITS                     137274 non-null  float64
 8   TAX CLASS AT TIME OF SALE       137274 non-null  float64
 9   ADDRESS                         137274 non-null  object 
 10  EASE-MENT                       137274 non-null  object 
 11  TAX CLASS AT PRESENT            137274 non-null  object 
 12  ZIP CODE        

In [4]:
# Draw 10 rows with a fixed seed and keep their index labels for reuse below
sample_indices = nyc_houses_df.sample(n=10, random_state=42).index

# Split the frame by column position so it can be shown as two narrower tables
first_half, second_half = nyc_houses_df.iloc[:, :15], nyc_houses_df.iloc[:, 15:]

# Restrict both column slices to the same sampled rows
first_half_sample, second_half_sample = (
    first_half.loc[sample_indices],
    second_half.loc[sample_indices],
)

display(first_half_sample, second_half_sample)

Unnamed: 0,BUILDING CLASS AT PRESENT,COMMERCIAL UNITS,value,BUILDING CLASS CATEGORY,BUILDING CLASS AT TIME OF SALE,LOT,LOT.1,TOTAL UNITS,TAX CLASS AT TIME OF SALE,ADDRESS,EASE-MENT,TAX CLASS AT PRESENT,ZIP CODE,SALE DATE,NEIGHBORHOOD
127912,D4,0.0,243637.0,10 COOPS - ELEVATOR APARTMENTS,D4,9001.0,9001.0,0.0,2.0,"860 UNITED NATIONS PLAZA, 30A",,2,10017.0,2017-06-15 00:00:00,MIDTOWN EAST
84960,D4,0.0,243637.0,10 COOPS - ELEVATOR APARTMENTS,D4,1.0,1.0,0.0,2.0,"305 EAST 40TH STREET, 6N",,2,10016.0,2016-10-20 00:00:00,MIDTOWN EAST
43690,B2,0.0,243637.0,02 TWO FAMILY DWELLINGS,B2,19.0,19.0,2.0,1.0,1237 EAST 89TH STREET,,1,11236.0,2017-01-27 00:00:00,CANARSIE
34975,A1,0.0,243637.0,01 ONE FAMILY DWELLINGS,A1,24.0,24.0,1.0,1.0,21-40 74TH STREET,,1,11370.0,2017-04-24 00:00:00,JACKSON HEIGHTS
46232,R4,0.0,243637.0,13 CONDOS - ELEVATOR APARTMENTS,R4,1002.0,1002.0,1.0,2.0,520 WEST 19TH STREET,,2,10011.0,2017-06-13 00:00:00,CHELSEA
137220,B2,0.0,243637.0,02 TWO FAMILY DWELLINGS,B2,120.0,120.0,2.0,1.0,70-08 58TH ROAD,,1,11378.0,2017-07-26 00:00:00,MIDDLE VILLAGE
60625,R9,0.0,243637.0,17 CONDO COOPS,R9,1002.0,1002.0,0.0,2.0,"7-11 EAST 13TH STREET, 3G",,2,10003.0,2016-12-08 00:00:00,GREENWICH VILLAGE-CENTRAL
52228,C1,0.0,243637.0,07 RENTALS - WALKUP APARTMENTS,C1,25.0,25.0,8.0,2.0,262 ST MARKS AVENUE,,2B,11238.0,2017-06-30 00:00:00,PROSPECT HEIGHTS
87398,C6,0.0,243637.0,09 COOPS - WALKUP APARTMENTS,C6,139.0,139.0,0.0,2.0,"163-28 17TH AVENUE, 4-295",,2,11357.0,2016-10-14 00:00:00,WHITESTONE
92487,R4,0.0,243637.0,13 CONDOS - ELEVATOR APARTMENTS,R4,1227.0,1227.0,1.0,2.0,111 FULTON STREET,,2,10038.0,2017-04-26 00:00:00,SOUTHBRIDGE


Unnamed: 0,SALE DATE,RESIDENTIAL UNITS,COMMERCIAL UNITS,GROSS SQUARE FEET,BOROUGH,BLOCK,SALE PRICE,APARTMENT NUMBER,YEAR BUILT,LAND SQUARE FEET,ZIP CODE
127912,2017-06-15 00:00:00,0.0,0.0,-,1.0,1360.0,4000000,,1967.0,-,10017.0
84960,2016-10-20 00:00:00,0.0,0.0,-,1.0,1333.0,592000,,1963.0,-,10016.0
43690,2017-01-27 00:00:00,2.0,0.0,1890,3.0,8068.0,0,,1920.0,3000,11236.0
34975,2017-04-24 00:00:00,1.0,0.0,838,4.0,969.0,1400000,,1925.0,2500,11370.0
46232,2017-06-13 00:00:00,1.0,0.0,-,1.0,690.0,2560000,2A,2007.0,-,10011.0
137220,2017-07-26 00:00:00,2.0,0.0,1580,4.0,2801.0,860000,,1940.0,2375,11378.0
60625,2016-12-08 00:00:00,0.0,0.0,-,1.0,571.0,990000,,1980.0,-,10003.0
52228,2017-06-30 00:00:00,8.0,0.0,5652,3.0,1152.0,0,,1931.0,3537,11238.0
87398,2016-10-14 00:00:00,0.0,0.0,-,4.0,5740.0,230000,,1951.0,-,11357.0
92487,2017-04-26 00:00:00,1.0,0.0,-,1.0,91.0,750000,323,1940.0,-,10038.0


# 🧐 ***Verify duplicates***

In [5]:
# Collect every ordered pair of differently named columns whose contents are identical.
# Both orderings, (a, b) and (b, a), end up in the list.
all_columns = nyc_houses_df.columns
duplicated_columns_content = [
    (left, right)
    for left in all_columns
    for right in all_columns
    if left != right and nyc_houses_df[left].equals(nyc_houses_df[right])
]

# Show the duplicated column pairs that were found
print("Duplicates:", duplicated_columns_content)

# Columns to check, listed as consecutive pairs:
# the name with a trailing space first, then the same name without it
columns_to_check = [
    'COMMERCIAL UNITS ', 'COMMERCIAL UNITS',
    'LOT ', 'LOT',
    'ZIP CODE ', 'ZIP CODE',
    'SALE DATE ', 'SALE DATE',
]

# Walk the list two names at a time and report whether each pair holds the same content
for col1, col2 in zip(columns_to_check[::2], columns_to_check[1::2]):
    contents_match = nyc_houses_df[col1].equals(nyc_houses_df[col2])
    if not contents_match:
        print(f"❌ The columns '{col1}' and '{col2}' have different content.")
    else:
        print(f"✅ The columns '{col1}' and '{col2}' have exactly the same content.")

Duplicates: [('COMMERCIAL UNITS', 'COMMERCIAL UNITS '), ('LOT ', 'LOT'), ('LOT', 'LOT '), ('ZIP CODE', 'ZIP CODE '), ('SALE DATE', 'SALE DATE '), ('SALE DATE ', 'SALE DATE'), ('COMMERCIAL UNITS ', 'COMMERCIAL UNITS'), ('ZIP CODE ', 'ZIP CODE')]
✅ The columns 'COMMERCIAL UNITS ' and 'COMMERCIAL UNITS' have exactly the same content.
✅ The columns 'LOT ' and 'LOT' have exactly the same content.
✅ The columns 'ZIP CODE ' and 'ZIP CODE' have exactly the same content.
✅ The columns 'SALE DATE ' and 'SALE DATE' have exactly the same content.


✖️ ***The following columns will be removed because they are duplicates***

In [6]:
# Keep only the names ending in a space: these are the redundant copies to drop
columns_to_remove = list(filter(lambda name: name.endswith(' '), columns_to_check))
columns_to_remove

['COMMERCIAL UNITS ', 'LOT ', 'ZIP CODE ', 'SALE DATE ']

In [7]:
# Drop the trailing-space duplicates, keeping the cleanly named columns
nyc_houses_df = nyc_houses_df.drop(columns_to_remove, axis="columns")

In [8]:
# Re-inspect the frame after dropping the trailing-space duplicate columns
nyc_houses_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137276 entries, 0 to 137275
Data columns (total 22 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   BUILDING CLASS AT PRESENT       137274 non-null  object 
 1   COMMERCIAL UNITS                137274 non-null  float64
 2   value                           137274 non-null  float64
 3   BUILDING CLASS CATEGORY         137274 non-null  object 
 4   BUILDING CLASS AT TIME OF SALE  137274 non-null  object 
 5   LOT                             137274 non-null  float64
 6   TOTAL UNITS                     137274 non-null  float64
 7   TAX CLASS AT TIME OF SALE       137274 non-null  float64
 8   ADDRESS                         137274 non-null  object 
 9   EASE-MENT                       137274 non-null  object 
 10  TAX CLASS AT PRESENT            137274 non-null  object 
 11  ZIP CODE                        137274 non-null  float64
 12  SALE DATE       

In [9]:
# Same seeded 10-row sample as before, taken from the de-duplicated frame
sample_indices = nyc_houses_df.sample(n=10, random_state=42).index

# Show the frame as two column slices: the first 10 columns, then the rest
first_half, second_half = nyc_houses_df.iloc[:, :10], nyc_houses_df.iloc[:, 10:]

# Restrict both column slices to the sampled rows
first_half_sample, second_half_sample = (
    first_half.loc[sample_indices],
    second_half.loc[sample_indices],
)

display(first_half_sample, second_half_sample)

Unnamed: 0,BUILDING CLASS AT PRESENT,COMMERCIAL UNITS,value,BUILDING CLASS CATEGORY,BUILDING CLASS AT TIME OF SALE,LOT,TOTAL UNITS,TAX CLASS AT TIME OF SALE,ADDRESS,EASE-MENT
127912,D4,0.0,243637.0,10 COOPS - ELEVATOR APARTMENTS,D4,9001.0,0.0,2.0,"860 UNITED NATIONS PLAZA, 30A",
84960,D4,0.0,243637.0,10 COOPS - ELEVATOR APARTMENTS,D4,1.0,0.0,2.0,"305 EAST 40TH STREET, 6N",
43690,B2,0.0,243637.0,02 TWO FAMILY DWELLINGS,B2,19.0,2.0,1.0,1237 EAST 89TH STREET,
34975,A1,0.0,243637.0,01 ONE FAMILY DWELLINGS,A1,24.0,1.0,1.0,21-40 74TH STREET,
46232,R4,0.0,243637.0,13 CONDOS - ELEVATOR APARTMENTS,R4,1002.0,1.0,2.0,520 WEST 19TH STREET,
137220,B2,0.0,243637.0,02 TWO FAMILY DWELLINGS,B2,120.0,2.0,1.0,70-08 58TH ROAD,
60625,R9,0.0,243637.0,17 CONDO COOPS,R9,1002.0,0.0,2.0,"7-11 EAST 13TH STREET, 3G",
52228,C1,0.0,243637.0,07 RENTALS - WALKUP APARTMENTS,C1,25.0,8.0,2.0,262 ST MARKS AVENUE,
87398,C6,0.0,243637.0,09 COOPS - WALKUP APARTMENTS,C6,139.0,0.0,2.0,"163-28 17TH AVENUE, 4-295",
92487,R4,0.0,243637.0,13 CONDOS - ELEVATOR APARTMENTS,R4,1227.0,1.0,2.0,111 FULTON STREET,


Unnamed: 0,TAX CLASS AT PRESENT,ZIP CODE,SALE DATE,NEIGHBORHOOD,RESIDENTIAL UNITS,GROSS SQUARE FEET,BOROUGH,BLOCK,SALE PRICE,APARTMENT NUMBER,YEAR BUILT,LAND SQUARE FEET
127912,2,10017.0,2017-06-15 00:00:00,MIDTOWN EAST,0.0,-,1.0,1360.0,4000000,,1967.0,-
84960,2,10016.0,2016-10-20 00:00:00,MIDTOWN EAST,0.0,-,1.0,1333.0,592000,,1963.0,-
43690,1,11236.0,2017-01-27 00:00:00,CANARSIE,2.0,1890,3.0,8068.0,0,,1920.0,3000
34975,1,11370.0,2017-04-24 00:00:00,JACKSON HEIGHTS,1.0,838,4.0,969.0,1400000,,1925.0,2500
46232,2,10011.0,2017-06-13 00:00:00,CHELSEA,1.0,-,1.0,690.0,2560000,2A,2007.0,-
137220,1,11378.0,2017-07-26 00:00:00,MIDDLE VILLAGE,2.0,1580,4.0,2801.0,860000,,1940.0,2375
60625,2,10003.0,2016-12-08 00:00:00,GREENWICH VILLAGE-CENTRAL,0.0,-,1.0,571.0,990000,,1980.0,-
52228,2B,11238.0,2017-06-30 00:00:00,PROSPECT HEIGHTS,8.0,5652,3.0,1152.0,0,,1931.0,3537
87398,2,11357.0,2016-10-14 00:00:00,WHITESTONE,0.0,-,4.0,5740.0,230000,,1951.0,-
92487,2,10038.0,2017-04-26 00:00:00,SOUTHBRIDGE,1.0,-,1.0,91.0,750000,323,1940.0,-


In [10]:
# List the distinct values stored in the easement column
easement_column = nyc_houses_df['EASE-MENT']
unique_easements = easement_column.unique()
print(unique_easements)

[' ' nan]


***Easement:***
An easement is a right, such as a right of way, which allows an entity to make limited use of 
another’s real property. For example:  MTA railroad tracks that run across a portion of another 
property.

***This column is empty (it contains only blanks and missing values), so it provides no information for the analysis.***

In [11]:
# Remove the easement column, which holds only blanks and missing values
nyc_houses_df = nyc_houses_df.drop('EASE-MENT', axis="columns")

***The `value` column holds a single value repeated in every non-null row, so it contributes nothing to the analysis.***

In [12]:
# Display the distinct entries of the `value` column to confirm it is constant
nyc_houses_df['value'].unique()

array([243637.,     nan])

In [13]:
# Remove the constant `value` column
nyc_houses_df = nyc_houses_df.drop('value', axis="columns")

In [14]:
# Inspect the remaining columns after dropping `EASE-MENT` and `value`
nyc_houses_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137276 entries, 0 to 137275
Data columns (total 20 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   BUILDING CLASS AT PRESENT       137274 non-null  object 
 1   COMMERCIAL UNITS                137274 non-null  float64
 2   BUILDING CLASS CATEGORY         137274 non-null  object 
 3   BUILDING CLASS AT TIME OF SALE  137274 non-null  object 
 4   LOT                             137274 non-null  float64
 5   TOTAL UNITS                     137274 non-null  float64
 6   TAX CLASS AT TIME OF SALE       137274 non-null  float64
 7   ADDRESS                         137274 non-null  object 
 8   TAX CLASS AT PRESENT            137274 non-null  object 
 9   ZIP CODE                        137274 non-null  float64
 10  SALE DATE                       137274 non-null  object 
 11  NEIGHBORHOOD                    137274 non-null  object 
 12  RESIDENTIAL UNIT

## 🔠 ***Categorical Variables***
### ***Ordinal***
- ***TAX CLASS AT TIME OF SALE / TAX CLASS AT PRESENT:*** Every property in the city is assigned to one of four tax classes (Classes 1, 2, 3, and 4), 
based on the use of the property.  
    - Class 1: Includes most residential property of up to three units (such as one-, 
two-, and three-family homes and small stores or offices with one or two 
attached apartments), vacant land that is zoned for residential use, and most 
condominiums that are not more than three stories.  
    - Class 2: Includes all other property that is primarily residential, such as 
cooperatives and condominiums.  
    - Class 3: Includes property with equipment owned by a gas, telephone or electric 
company.  
    - Class 4: Includes all other properties not included in class 1,2, and 3, such as 
offices, factories, warehouses, garage buildings, etc.

### ***Nominal***
- ***BUILDING CLASS AT PRESENT:***  used to describe a property’s constructive use. The first position of the Building Class is a letter that is used to describe a general class of properties. The second position, a number, adds more specific information about the property’s use or construction style.
- ***BUILDING CLASS CATEGORY:*** A field included so that users of the Rolling Sales Files can easily identify similar properties.
- ***BUILDING CLASS AT TIME OF SALE:*** used to describe a property’s constructive use.
- ***LOT:*** A Tax Lot is a subdivision of a Tax Block and represents the property's unique location.
- ***ADDRESS:*** The street address of the property as listed on the Sales File.
- ***ZIP CODE:*** The property’s postal code.
- ***NEIGHBORHOOD:*** Department of Finance assessors determine the neighborhood name in the course of valuing 
properties.
- ***BOROUGH:*** The name of the borough in which the property is located. 
- ***BLOCK:*** A Tax Block is a sub-division of the borough on which real properties are located. The Department of Finance uses a Borough-Block-Lot classification to label all real property in the City.
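- ***APARTMENT NUMBER:*** The apartment (unit) identifier of the property, where one is recorded; it is missing for most rows.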


## 🔢 ***Numerical variables***
### ***Discrete***
- ***COMMERCIAL UNITS:*** The number of commercial units at the listed property.
- ***TOTAL UNITS:*** The total number of units at the listed property.
- ***RESIDENTIAL UNITS:*** The number of residential units at the listed property.

### ***Continuous***
- ***LAND SQUARE FEET:*** The land area of the property listed in square feet.
- ***GROSS SQUARE FEET:*** The total area of all the floors of a building as measured from the exterior surfaces of the 
outside walls of the building, including the land area and space within any building or structure 
on the property.
- ***SALE PRICE:*** Price paid for the property.


## 🗓️ ***Dates***
- ***SALE DATE:*** Date the property sold.
- ***YEAR BUILT:*** Year the structure on the property was built.

# 👁️‍🗨️ ***Convert data types***

In [15]:
# Unify null values.
# Placeholder strings that stand for "missing" are matched literally. Blank entries are
# matched with an anchored regex instead of an enumerated list, so an empty string or a
# whitespace-only string of ANY length (three spaces, a tab, ...) becomes NaN too.
# The result is rebound rather than modified in place, so re-running the cell is safe.
null_placeholders = ["NULL", "None", "?"]
nyc_houses_df = (
    nyc_houses_df
    .replace(null_placeholders, np.nan)
    .replace(r"^\s*$", np.nan, regex=True)
)

# Missing-value count per column after unification
print(nyc_houses_df.isnull().sum())

BUILDING CLASS AT PRESENT           1233
COMMERCIAL UNITS                       2
BUILDING CLASS CATEGORY                2
BUILDING CLASS AT TIME OF SALE         2
LOT                                    2
TOTAL UNITS                            2
TAX CLASS AT TIME OF SALE              2
ADDRESS                                2
TAX CLASS AT PRESENT                1233
ZIP CODE                               2
SALE DATE                              2
NEIGHBORHOOD                           2
RESIDENTIAL UNITS                      2
GROSS SQUARE FEET                      2
BOROUGH                                2
BLOCK                                  2
SALE PRICE                             2
APARTMENT NUMBER                  106257
YEAR BUILT                             2
LAND SQUARE FEET                       2
dtype: int64


In [16]:
# Columns that act as labels or identifiers rather than quantities
cols_categorical_nom = [
    'TAX CLASS AT TIME OF SALE',
    'TAX CLASS AT PRESENT',
    'BUILDING CLASS AT PRESENT',
    'BUILDING CLASS CATEGORY',
    'BUILDING CLASS AT TIME OF SALE',
    'LOT',
    'ADDRESS',
    'ZIP CODE',
    'NEIGHBORHOOD',
    'BOROUGH',
    'BLOCK',
    'APARTMENT NUMBER',
]

# Store them with the pandas category dtype.
# NOTE(review): columns read as float64 (e.g. LOT, ZIP CODE) keep float-looking
# category labels such as 10017.0 — confirm this is acceptable downstream.
label_columns = nyc_houses_df[cols_categorical_nom]
nyc_houses_df[cols_categorical_nom] = label_columns.astype('category')

In [17]:
# Unit counts are whole numbers; the nullable Int64 dtype allows missing entries
cols_numeric_int = ['COMMERCIAL UNITS', 'TOTAL UNITS', 'RESIDENTIAL UNITS']
unit_counts = nyc_houses_df[cols_numeric_int]
nyc_houses_df[cols_numeric_int] = unit_counts.astype('Int64')

In [18]:
# Continuous measurements that must end up as floats
cols_numeric_float = ['LAND SQUARE FEET', 'GROSS SQUARE FEET', 'SALE PRICE']

# These columns use a lone dash as a "no value" placeholder. Match it with an anchored
# regex so the dash is recognised whatever padding surrounds it (' -  ', '-', ' - ', ...).
# Numbers, including negative ones, are never matched because the pattern allows nothing
# but whitespace around the dash.
dash_placeholder = r'^\s*-\s*$'
measurements = nyc_houses_df[cols_numeric_float].replace(dash_placeholder, np.nan, regex=True)

# Any other non-numeric text still raises here instead of being silently discarded
nyc_houses_df[cols_numeric_float] = measurements.astype('float')

In [19]:
# Convert the sale date column to the datetime64[ns] dtype
nyc_houses_df = nyc_houses_df.astype({'SALE DATE': 'datetime64[ns]'})

In [20]:
# Construction year is a whole number; nullable Int64 keeps missing years as <NA>
nyc_houses_df = nyc_houses_df.astype({'YEAR BUILT': 'Int64'})

In [21]:
# Verify the dtypes assigned by the conversion cells above
nyc_houses_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137276 entries, 0 to 137275
Data columns (total 20 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   BUILDING CLASS AT PRESENT       136043 non-null  category      
 1   COMMERCIAL UNITS                137274 non-null  Int64         
 2   BUILDING CLASS CATEGORY         137274 non-null  category      
 3   BUILDING CLASS AT TIME OF SALE  137274 non-null  category      
 4   LOT                             137274 non-null  category      
 5   TOTAL UNITS                     137274 non-null  Int64         
 6   TAX CLASS AT TIME OF SALE       137274 non-null  category      
 7   ADDRESS                         137274 non-null  category      
 8   TAX CLASS AT PRESENT            136043 non-null  category      
 9   ZIP CODE                        137274 non-null  category      
 10  SALE DATE                       137274 non-null  datetim

In [22]:
# Seeded 5-row sample of the typed frame
sample_indices = nyc_houses_df.sample(n=5, random_state=42).index

# Show the frame as two column slices: the first 10 columns, then the rest
first_half, second_half = nyc_houses_df.iloc[:, :10], nyc_houses_df.iloc[:, 10:]

# Restrict both column slices to the sampled rows
first_half_sample, second_half_sample = (
    first_half.loc[sample_indices],
    second_half.loc[sample_indices],
)

display(first_half_sample, second_half_sample)

Unnamed: 0,BUILDING CLASS AT PRESENT,COMMERCIAL UNITS,BUILDING CLASS CATEGORY,BUILDING CLASS AT TIME OF SALE,LOT,TOTAL UNITS,TAX CLASS AT TIME OF SALE,ADDRESS,TAX CLASS AT PRESENT,ZIP CODE
127912,D4,0,10 COOPS - ELEVATOR APARTMENTS,D4,9001.0,0,2.0,"860 UNITED NATIONS PLAZA, 30A",2,10017.0
84960,D4,0,10 COOPS - ELEVATOR APARTMENTS,D4,1.0,0,2.0,"305 EAST 40TH STREET, 6N",2,10016.0
43690,B2,0,02 TWO FAMILY DWELLINGS,B2,19.0,2,1.0,1237 EAST 89TH STREET,1,11236.0
34975,A1,0,01 ONE FAMILY DWELLINGS,A1,24.0,1,1.0,21-40 74TH STREET,1,11370.0
46232,R4,0,13 CONDOS - ELEVATOR APARTMENTS,R4,1002.0,1,2.0,520 WEST 19TH STREET,2,10011.0


Unnamed: 0,SALE DATE,NEIGHBORHOOD,RESIDENTIAL UNITS,GROSS SQUARE FEET,BOROUGH,BLOCK,SALE PRICE,APARTMENT NUMBER,YEAR BUILT,LAND SQUARE FEET
127912,2017-06-15,MIDTOWN EAST,0,,1.0,1360.0,4000000.0,,1967,
84960,2016-10-20,MIDTOWN EAST,0,,1.0,1333.0,592000.0,,1963,
43690,2017-01-27,CANARSIE,2,1890.0,3.0,8068.0,0.0,,1920,3000.0
34975,2017-04-24,JACKSON HEIGHTS,1,838.0,4.0,969.0,1400000.0,,1925,2500.0
46232,2017-06-13,CHELSEA,1,,1.0,690.0,2560000.0,2A,2007,


In [23]:
# Convert the frame to an Arrow table and keep only its schema,
# which is passed to `to_parquet` when the frame is saved
schema = pa.Table.from_pandas(nyc_houses_df).schema

# 💾 ***Save dataframe with data types***

In [24]:
# Folder for intermediate artifacts; create it (and any missing parents) if needed
output_dir = DATA_DIR / "02_intermediate"
output_dir.mkdir(parents=True, exist_ok=True)

In [25]:
# Write the typed frame to Parquet with the captured schema, leaving out the row index
parquet_path = output_dir / "nyc_houses_fixed.parquet"
nyc_houses_df.to_parquet(parquet_path, schema=schema, index=False)