In [1]:
import category_encoders as ce
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from cvxopt import sparse
from scipy.stats import zscore
from sklearn.preprocessing import OneHotEncoder
# Seed NumPy's global random state so NumPy-based randomness is repeatable across runs.
np.random.seed(0)

In [2]:
# Location of the CSV input that this notebook encodes.
file_path = 'modified_dataset/us_project_heatmap_remove_outliers.csv'

# Comma is read_csv's default delimiter, so it is not spelled out here.
us_ds = pd.read_csv(file_path)

In [3]:
# Display the loaded frame's dimensions as (rows, columns).
us_ds.shape

(1789138, 74)

In [4]:
# Display the loaded frame for a visual sanity check (pandas truncates the rendering).
us_ds

Unnamed: 0,intercept,period_begin,period_end,period_duration,region_type,region_type_id,table_id,is_seasonally_adjusted,region,city,...,price_increased_yoy,inventory_turnover,sale_to_list_ppsf_ratio,supply_demand_balance,fast_selling,sin_year,cos_year,sin_month,cos_month,normalized_median_sale_price
0,1.0,2017-09-01,2017-09-30,30,place,6,29470,f,"Chicago, IL",Chicago,...,1,-0.659877,-0.223963,3.247222,0,0.573151,1.273751,-1.345297,0.094627,0.039266
1,1.0,2020-07-01,2020-07-31,30,place,6,37598,f,"Parsippany, NJ",Parsippany,...,0,0.591989,-0.005179,0.179183,0,1.103099,-0.638699,-0.637291,-1.137098,0.073469
2,1.0,2021-08-01,2021-08-31,30,place,6,24993,f,"Oakbrook, KY",Oakbrook,...,1,3.177449,-0.029312,0.179183,1,0.573151,-1.151138,-1.155588,-0.616510,0.040100
3,1.0,2018-08-01,2018-08-31,30,place,6,29754,f,"Dunstable, MA",Dunstable,...,0,-0.183649,0.082212,0.002180,0,1.103099,0.761311,-1.155588,-0.616510,0.079157
4,1.0,2023-01-01,2023-01-31,30,place,6,10728,f,"Kalamazoo, MI",Kalamazoo,...,1,-0.142691,0.004767,-0.469826,1,-0.874693,-1.151138,0.778721,1.326353,0.028345
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1789133,1.0,2014-03-01,2014-03-31,30,place,6,7158,f,"Fullerton, CA",Fullerton,...,0,-0.571468,-0.050774,0.120182,0,-1.404641,0.761311,1.486727,0.094627,0.072711
1789134,1.0,2024-06-01,2024-06-30,30,place,6,31296,f,"Roebling, NJ",Roebling,...,0,0.074897,-0.201923,0.002180,1,-1.404641,-0.638699,0.070715,-1.327647,0.035171
1789135,1.0,2016-11-01,2016-11-30,30,place,6,245,f,"Albany, NY",Albany,...,0,-0.445167,-0.269634,-0.587827,0,-0.150771,1.461316,-0.637291,1.326353,0.012646
1789136,1.0,2019-12-01,2019-12-31,30,place,6,10832,f,"Liberty, TX",Liberty,...,0,-0.093720,-0.157056,-0.115821,0,1.297073,0.061306,0.070715,1.516901,0.011433


In [5]:
# Labels of all object-dtype columns in us_ds, printed for inspection.
string_columns = us_ds.select_dtypes(include=['object']).columns
print(string_columns)

Index(['period_begin', 'period_end', 'region_type', 'is_seasonally_adjusted',
       'region', 'city', 'state', 'state_code', 'property_type',
       'parent_metro_region', 'last_updated'],
      dtype='object')


In [6]:
# Object-dtype columns that are deliberately left out of categorical encoding.
exclude_columns = [
    'period_begin',
    'period_end',
    'region_type',
    'is_seasonally_adjusted',
    'parent_metro_region',
    'last_updated',
    'state',
    'city',
    'region',
]

# Always build a plain list (never the raw pandas Index) so later cells receive
# the same type whether or not anything is excluded. An empty exclude_columns
# simply keeps every string column, so no special-case branch is needed.
columns_for_encoding = [col for col in string_columns if col not in exclude_columns]

print(columns_for_encoding)

['state_code', 'property_type']


In [7]:
# One-hot encode the selected categorical columns. sparse_output=False makes
# fit_transform return a dense array rather than a SciPy sparse matrix.
# handle_unknown='ignore' only matters when transforming data containing
# categories not seen during fit (see the scikit-learn OneHotEncoder docs).
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
us_ds_one_hot = encoder.fit_transform(us_ds[columns_for_encoding])

# Reuse the source frame's index so the indicator columns stay row-aligned with
# us_ds if the two are later joined or concatenated; without it the new frame
# would get a fresh RangeIndex regardless of how us_ds is indexed.
us_ds_one_hot_df = pd.DataFrame(
    us_ds_one_hot,
    columns=encoder.get_feature_names_out(columns_for_encoding),
    index=us_ds.index,
)

In [8]:
# Preview the first 200 rows of the one-hot indicator frame.
us_ds_one_hot_df.head(200)

Unnamed: 0,state_code_AK,state_code_AL,state_code_AR,state_code_AZ,state_code_CA,state_code_CO,state_code_CT,state_code_DC,state_code_DE,state_code_FL,...,state_code_VA,state_code_VT,state_code_WA,state_code_WI,state_code_WV,property_type_All Residential,property_type_Condo/Co-op,property_type_Multi-Family (2-4 Unit),property_type_Single Family Residential,property_type_Townhouse
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
198,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [9]:
# Apply Binary Encoding.
# `ce` (category_encoders) is imported with the other dependencies in the
# notebook's first cell so that all requirements are visible in one place.
# Only the columns listed in columns_for_encoding are passed as `cols`; the
# encoder is fitted on, and transforms, the full us_ds frame.
# NOTE(review): columns outside `cols` are presumably passed through unchanged --
# confirm against the category_encoders documentation.
binary_encoder = ce.BinaryEncoder(cols=columns_for_encoding)
us_ds_binary = binary_encoder.fit_transform(us_ds)


In [10]:
# Preview the first 100 rows of the binary-encoded frame.
us_ds_binary.head(100)

Unnamed: 0,intercept,period_begin,period_end,period_duration,region_type,region_type_id,table_id,is_seasonally_adjusted,region,city,...,price_increased_yoy,inventory_turnover,sale_to_list_ppsf_ratio,supply_demand_balance,fast_selling,sin_year,cos_year,sin_month,cos_month,normalized_median_sale_price
0,1.0,2017-09-01,2017-09-30,30,place,6,29470,f,"Chicago, IL",Chicago,...,1,-0.659877,-0.223963,3.247222,0,0.573151,1.273751,-1.345297,0.094627,0.039266
1,1.0,2020-07-01,2020-07-31,30,place,6,37598,f,"Parsippany, NJ",Parsippany,...,0,0.591989,-0.005179,0.179183,0,1.103099,-0.638699,-0.637291,-1.137098,0.073469
2,1.0,2021-08-01,2021-08-31,30,place,6,24993,f,"Oakbrook, KY",Oakbrook,...,1,3.177449,-0.029312,0.179183,1,0.573151,-1.151138,-1.155588,-0.616510,0.040100
3,1.0,2018-08-01,2018-08-31,30,place,6,29754,f,"Dunstable, MA",Dunstable,...,0,-0.183649,0.082212,0.002180,0,1.103099,0.761311,-1.155588,-0.616510,0.079157
4,1.0,2023-01-01,2023-01-31,30,place,6,10728,f,"Kalamazoo, MI",Kalamazoo,...,1,-0.142691,0.004767,-0.469826,1,-0.874693,-1.151138,0.778721,1.326353,0.028345
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1.0,2024-07-01,2024-07-31,30,place,6,16583,f,"Saco, ME",Saco,...,0,0.074897,0.079835,-0.115821,1,-1.404641,-0.638699,-0.637291,-1.137098,0.085983
96,1.0,2020-01-01,2020-01-31,30,place,6,10233,f,"Lancaster, CA",Lancaster,...,1,0.066610,0.008287,-2.180848,0,1.103099,-0.638699,0.778721,1.326353,0.045409
97,1.0,2013-09-01,2013-09-30,30,place,6,24533,f,"Ocean Bluff-Brant Rock, MA",Ocean Bluff-Brant Rock,...,0,-0.839473,-0.057222,-0.174822,0,-1.598615,0.061306,-1.345297,0.094627,0.032327
98,1.0,2024-06-01,2024-06-30,30,place,6,15188,f,"Ogden, IA",Ogden,...,1,0.998275,0.330595,-0.115821,0,-1.404641,-0.638699,0.070715,-1.327647,0.041389


In [11]:
# Persist the binary-encoded frame; index=False leaves the row index out of the file.
output_file = 'modified_dataset/us_project_encoding.csv'
us_ds_binary.to_csv(path_or_buf=output_file, index=False)