# ***Feature Engineering***

# 📚 ***Import libraries***

In [1]:
# base libraries for data science
from pathlib import Path

import pandas as pd
import sklearn as sk
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# 💾 ***Load data***

In [3]:
DATA_DIR = Path.cwd().resolve().parents[1] / "data"

nyc_houses_df = pd.read_parquet(
    DATA_DIR / "02_intermediate/nyc_houses_fixed.parquet", engine="pyarrow"
)

In [4]:
# print library version for reproducibility

print("Pandas version: ", pd.__version__)
print("sklearn version: ", sk.__version__)

Pandas version:  2.2.3
sklearn version:  1.5.2


# 👷 ***Data preparation***

In [5]:
nyc_houses_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137276 entries, 0 to 137275
Data columns (total 20 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   BUILDING CLASS AT PRESENT       136043 non-null  category      
 1   COMMERCIAL UNITS                137274 non-null  Int64         
 2   BUILDING CLASS CATEGORY         137274 non-null  category      
 3   BUILDING CLASS AT TIME OF SALE  137274 non-null  category      
 4   LOT                             137274 non-null  float64       
 5   TOTAL UNITS                     137274 non-null  Int64         
 6   TAX CLASS AT TIME OF SALE       137274 non-null  float64       
 7   ADDRESS                         137274 non-null  category      
 8   TAX CLASS AT PRESENT            136043 non-null  category      
 9   ZIP CODE                        137274 non-null  float64       
 10  SALE DATE                       137274 non-null  datetim

***For visualization purposes, we are going to see again how many nulls (and zero values) each column has***

In [None]:
nulos_count = nyc_houses_df.isnull().sum()
nulos_percentage = (nulos_count / len(nyc_houses_df)) * 100
zero_values_count = nyc_houses_df.eq(0).sum()
zero_values_percentage = (zero_values_count / len(nyc_houses_df)) * 100

nulos_df = pd.DataFrame({
    'Nulls': nulos_count,
    'Percentage Nulls (%)': nulos_percentage,
    'Zero values': zero_values_count,
    'Percentage Zeros (%)': zero_values_percentage
}).sort_values(by='Percentage Nulls (%)', ascending=False)

nulos_df

Unnamed: 0,Nulls,Percentage Nulls (%),Zero values,Percentage Zeros (%)
APARTMENT NUMBER,106257,77.403916,0,0.0
GROSS SQUARE FEET,44868,32.684519,18633,13.573385
LAND SQUARE FEET,42602,31.03383,16845,12.2709
SALE PRICE,23668,17.241178,16546,12.05309
TAX CLASS AT PRESENT,1233,0.898191,0,0.0
BUILDING CLASS AT PRESENT,1233,0.898191,0,0.0
COMMERCIAL UNITS,2,0.001457,129036,93.997494
BUILDING CLASS CATEGORY,2,0.001457,0,0.0
BUILDING CLASS AT TIME OF SALE,2,0.001457,0,0.0
LOT,2,0.001457,0,0.0


According to the glossary of variables, there are some that it makes ***NO SENSE*** to have a zero as data, therefore these will be converted to null, for a possible imputation of data.

In [None]:
# Columns to nullify (replace 0 by NaN)
columns_to_nullify = [
    "GROSS SQUARE FEET",
    "LAND SQUARE FEET",
    "SALE PRICE",
    "TOTAL UNITS",
    "ZIP CODE",
    "YEAR BUILT"
]

nyc_houses_df[columns_to_nullify] = nyc_houses_df[columns_to_nullify].replace(0, np.nan)

In [None]:
nulos_count = nyc_houses_df.isnull().sum()
nulos_percentage = (nulos_count / len(nyc_houses_df)) * 100
zero_values_count = nyc_houses_df.eq(0).sum()
zero_values_percentage = (zero_values_count / len(nyc_houses_df)) * 100

nulos_df = pd.DataFrame({
    'Nulls': nulos_count,
    'Percentage Nulls (%)': nulos_percentage,
    'Zero values': zero_values_count,
    'Percentage Zeros (%)': zero_values_percentage
}).sort_values(by='Percentage Nulls (%)', ascending=False)

nulos_df

The ***ADDRESS & APARTMENT NUMBER*** columns will be droped because aren't relevant for the model. In addition ***`APARTMENT NUMBER`*** has almost 80% of the data as null.

In [6]:
selected_features = [
    "BUILDING CLASS AT PRESENT",
    "COMMERCIAL UNITS",
    "BUILDING CLASS CATEGORY",
    "BUILDING CLASS AT TIME OF SALE",
    "LOT",
    "TOTAL UNITS",
    "TAX CLASS AT TIME OF SALE",
    "TAX CLASS AT PRESENT",
    "ZIP CODE",
    "SALE DATE",
    "NEIGHBORHOOD",
    "RESIDENTIAL UNITS",
    "GROSS SQUARE FEET",
    "BOROUGH",
    "BLOCK",
    "SALE PRICE",
    "YEAR BUILT",
    "LAND SQUARE FEET"
]

nyc_houses_features = nyc_houses_df[selected_features].copy()
nyc_houses_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137276 entries, 0 to 137275
Data columns (total 18 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   BUILDING CLASS AT PRESENT       136043 non-null  category      
 1   COMMERCIAL UNITS                137274 non-null  Int64         
 2   BUILDING CLASS CATEGORY         137274 non-null  category      
 3   BUILDING CLASS AT TIME OF SALE  137274 non-null  category      
 4   LOT                             137274 non-null  float64       
 5   TOTAL UNITS                     137274 non-null  Int64         
 6   TAX CLASS AT TIME OF SALE       137274 non-null  float64       
 7   TAX CLASS AT PRESENT            136043 non-null  category      
 8   ZIP CODE                        137274 non-null  float64       
 9   SALE DATE                       137274 non-null  datetime64[ns]
 10  NEIGHBORHOOD                    137274 non-null  categor

### ***Missing values***

In [6]:
nyc_houses_features.isna().sum()

BUILDING CLASS AT PRESENT          1233
COMMERCIAL UNITS                      2
BUILDING CLASS CATEGORY               2
BUILDING CLASS AT TIME OF SALE        2
LOT                                   2
TOTAL UNITS                           2
TAX CLASS AT TIME OF SALE             2
TAX CLASS AT PRESENT               1233
ZIP CODE                              2
SALE DATE                             2
NEIGHBORHOOD                          2
RESIDENTIAL UNITS                     2
GROSS SQUARE FEET                 44868
BOROUGH                               2
BLOCK                                 2
SALE PRICE                        23668
YEAR BUILT                            2
LAND SQUARE FEET                  42602
dtype: int64

In [18]:
nulos_count = nyc_houses_features.isnull().sum()
nulos_percentage = (nulos_count / len(nyc_houses_features)) * 100

In [19]:
# Dataframe with the count and percentage of missing values
nulos_df = pd.DataFrame({
    'Nulos': nulos_count,
    'Porcentaje (%)': nulos_percentage
}).sort_values(by='Porcentaje (%)', ascending=False)

nulos_df

Unnamed: 0,Nulos,Porcentaje (%)
GROSS SQUARE FEET,21897,32.671362
LAND SQUARE FEET,20799,31.033094
SALE PRICE,11354,16.940706
BUILDING CLASS AT PRESENT,607,0.905673
TAX CLASS AT PRESENT,607,0.905673
COMMERCIAL UNITS,1,0.001492
TOTAL UNITS,1,0.001492
BUILDING CLASS CATEGORY,1,0.001492
LOT,1,0.001492
BUILDING CLASS AT TIME OF SALE,1,0.001492


***The number of nulls less than 30%, therefore they will be imputed by mode.***

In [7]:
duplicate_rows = nyc_houses_features.duplicated().sum()
print("Number of duplicate rows: ", duplicate_rows)

Number of duplicate rows:  70254


In [8]:
nyc_houses_features = nyc_houses_features.drop_duplicates()
nyc_houses_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 67022 entries, 0 to 67637
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   BUILDING CLASS AT PRESENT       66415 non-null  category      
 1   COMMERCIAL UNITS                67021 non-null  Int64         
 2   BUILDING CLASS CATEGORY         67021 non-null  category      
 3   BUILDING CLASS AT TIME OF SALE  67021 non-null  category      
 4   LOT                             67021 non-null  float64       
 5   TOTAL UNITS                     67021 non-null  Int64         
 6   TAX CLASS AT TIME OF SALE       67021 non-null  float64       
 7   TAX CLASS AT PRESENT            66415 non-null  category      
 8   ZIP CODE                        67021 non-null  float64       
 9   SALE DATE                       67021 non-null  datetime64[ns]
 10  NEIGHBORHOOD                    67021 non-null  category      
 11  RESIDEN

In [9]:
cols_categorical = ['TAX CLASS AT TIME OF SALE', 'LOT', 'ZIP CODE', 'BOROUGH', 'BLOCK']
nyc_houses_features[cols_categorical] = nyc_houses_features[cols_categorical].astype('category')

In [10]:
nyc_houses_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 67022 entries, 0 to 67637
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   BUILDING CLASS AT PRESENT       66415 non-null  category      
 1   COMMERCIAL UNITS                67021 non-null  Int64         
 2   BUILDING CLASS CATEGORY         67021 non-null  category      
 3   BUILDING CLASS AT TIME OF SALE  67021 non-null  category      
 4   LOT                             67021 non-null  category      
 5   TOTAL UNITS                     67021 non-null  Int64         
 6   TAX CLASS AT TIME OF SALE       67021 non-null  category      
 7   TAX CLASS AT PRESENT            66415 non-null  category      
 8   ZIP CODE                        67021 non-null  category      
 9   SALE DATE                       67021 non-null  datetime64[ns]
 10  NEIGHBORHOOD                    67021 non-null  category      
 11  RESIDEN

### ***PipeLines***

In [14]:
cols_numeric = ['COMMERCIAL UNITS', 'TOTAL UNITS', 'RESIDENTIAL UNITS', 'GROSS SQUARE FEET', 'SALE PRICE', 'YEAR BUILT', 'LAND SQUARE FEET']
cols_categoric = ['BUILDING CLASS AT PRESENT', 'BUILDING CLASS CATEGORY', 'BUILDING CLASS AT TIME OF SALE', 'LOT', 'ZIP CODE', 'NEIGHBORHOOD', 'BOROUGH', 'BLOCK']
cols_categoric_ord = ['TAX CLASS AT TIME OF SALE', 'TAX CLASS AT PRESENT']

In [15]:
numeric_pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
    ]
)

categorical_pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder()),
    ]
)

categorical_ord_pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OrdinalEncoder()),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_pipe, cols_numeric),
        ("categoric", categorical_pipe, cols_categoric),
        ("categoric ordinales", categorical_ord_pipe, cols_categoric_ord),
    ]
)


In [16]:
preprocessor

#### Example of the data preprocessing pipeline

***Train / Test split***

In [17]:
X_features = nyc_houses_features.drop("SALE PRICE", axis="columns")
Y_target = nyc_houses_features["SALE PRICE"]

# 80% train, 20% test
x_train, x_test, y_train, y_test = train_test_split(
    X_features, Y_target, test_size=0.2, stratify=Y_target
)


ValueError: Input y contains NaN.