In [2]:
import numpy as np
import pandas as pd
from cvxopt import sparse
from scipy.stats import zscore
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
# Seed NumPy's legacy global RNG up front so any later code that draws from
# np.random.* gives the same numbers on every Restart & Run All.
np.random.seed(seed=0)

In [3]:
# Read the project CSV into `us_ds`. The path is relative to the notebook's
# working directory. read_csv's default separator is a comma, so no `sep`
# argument is needed.
file_path = 'modified_dataset/us_project_encoding.csv'
us_ds = pd.read_csv(file_path)

In [4]:
# Sanity check: (rows, columns) of the frame just loaded.
us_ds.shape

(1789138, 81)

In [5]:
# Preview the first 100 rows of the raw frame. The rendered table elides the
# middle rows and columns with "...".
us_ds.head(100)

Unnamed: 0,intercept,period_begin,period_end,period_duration,region_type,region_type_id,table_id,is_seasonally_adjusted,region,city,...,price_increased_yoy,inventory_turnover,sale_to_list_ppsf_ratio,supply_demand_balance,fast_selling,sin_year,cos_year,sin_month,cos_month,normalized_median_sale_price
0,1.0,2017-09-01,2017-09-30,30,place,6,29470,f,"Chicago, IL",Chicago,...,1,-0.659877,-0.223963,3.247222,0,0.573151,1.273751,-1.345297,0.094627,0.039266
1,1.0,2020-07-01,2020-07-31,30,place,6,37598,f,"Parsippany, NJ",Parsippany,...,0,0.591989,-0.005179,0.179183,0,1.103099,-0.638699,-0.637291,-1.137098,0.073469
2,1.0,2021-08-01,2021-08-31,30,place,6,24993,f,"Oakbrook, KY",Oakbrook,...,1,3.177449,-0.029312,0.179183,1,0.573151,-1.151138,-1.155588,-0.616510,0.040100
3,1.0,2018-08-01,2018-08-31,30,place,6,29754,f,"Dunstable, MA",Dunstable,...,0,-0.183649,0.082212,0.002180,0,1.103099,0.761311,-1.155588,-0.616510,0.079157
4,1.0,2023-01-01,2023-01-31,30,place,6,10728,f,"Kalamazoo, MI",Kalamazoo,...,1,-0.142691,0.004767,-0.469826,1,-0.874693,-1.151138,0.778721,1.326353,0.028345
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1.0,2024-07-01,2024-07-31,30,place,6,16583,f,"Saco, ME",Saco,...,0,0.074897,0.079835,-0.115821,1,-1.404641,-0.638699,-0.637291,-1.137098,0.085983
96,1.0,2020-01-01,2020-01-31,30,place,6,10233,f,"Lancaster, CA",Lancaster,...,1,0.066610,0.008287,-2.180848,0,1.103099,-0.638699,0.778721,1.326353,0.045409
97,1.0,2013-09-01,2013-09-30,30,place,6,24533,f,"Ocean Bluff-Brant Rock, MA",Ocean Bluff-Brant Rock,...,0,-0.839473,-0.057222,-0.174822,0,-1.598615,0.061306,-1.345297,0.094627,0.032327
98,1.0,2024-06-01,2024-06-30,30,place,6,15188,f,"Ogden, IA",Ogden,...,1,0.998275,0.330595,-0.115821,0,-1.404641,-0.638699,0.070715,-1.327647,0.041389


In [6]:
# Columns kept for modelling, grouped by kind. The order of this list is the
# column order of the resulting frame.
columns_for_model = [
    # state_code_* columns
    'state_code_0', 'state_code_1', 'state_code_2',
    'state_code_3', 'state_code_4', 'state_code_5',
    # property_type_* columns
    'property_type_0', 'property_type_1', 'property_type_2',
    # price, listing and market-activity measures
    'median_sale_price',
    'median_dom',
    'price_drops',
    'inventory_turnover',
    'price_increased_mom',
    'median_sale_price_mom',
    'median_list_price',
    'median_ppsf',
    'median_list_ppsf',
    'avg_sale_to_list',
    'sold_above_list',
    'sale_to_list_ratio',
    # sin/cos year and month features
    'sin_year', 'cos_year', 'sin_month', 'cos_month',
]

# Selecting with a list of labels raises KeyError if any name is absent from
# us_ds, so a renamed or missing upstream column fails loudly here.
us_ds_for_model = us_ds[columns_for_model]

In [7]:
# Preview the first 100 rows of the reduced frame to confirm that only the
# selected model columns remain.
us_ds_for_model.head(100)

Unnamed: 0,state_code_0,state_code_1,state_code_2,state_code_3,state_code_4,state_code_5,property_type_0,property_type_1,property_type_2,median_sale_price,...,median_list_price,median_ppsf,median_list_ppsf,avg_sale_to_list,sold_above_list,sale_to_list_ratio,sin_year,cos_year,sin_month,cos_month
0,0,0,0,0,0,1,0,0,1,-0.374561,...,-0.208773,-0.106914,-0.015376,-0.216742,-0.029813,-0.125209,0.573151,1.273751,-1.345297,0.094627
1,0,0,0,0,1,0,0,1,0,0.349894,...,0.131598,0.027611,0.045262,-0.052589,0.349915,0.053787,1.103099,-0.638699,-0.637291,-1.137098
2,0,0,0,0,1,1,0,1,0,-0.356891,...,-0.252977,-0.056876,-0.083672,0.769254,0.850733,0.059784,0.573151,-1.151138,-1.155588,-0.616510
3,0,0,0,1,0,0,0,1,0,0.470368,...,0.224250,0.000360,-0.060149,0.091501,-0.490964,0.003292,1.103099,0.761311,-1.155588,-0.616510
4,0,0,0,1,0,1,0,1,0,-0.605872,...,-0.394607,-0.075282,-0.135217,-0.490857,-0.596074,0.098951,-0.874693,-1.151138,0.778721,1.326353
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0,1,1,1,0,1,1,0,0,0.614938,...,0.291617,0.188111,0.211821,0.845293,0.980575,0.025559,-1.404641,-0.638699,-0.637291,-1.137098
96,0,0,1,0,1,1,1,0,0,-0.244448,...,-0.164746,-0.030393,-0.062615,0.295494,0.510632,-0.027308,1.103099,-0.638699,0.778721,1.326353
97,0,0,0,1,0,0,1,0,0,-0.521540,...,0.029841,0.064302,0.168451,-2.033189,-1.226734,-0.808543,-1.598615,0.061306,-1.345297,0.094627
98,0,1,1,0,1,0,1,0,0,-0.329583,...,-0.558074,-0.054269,-0.211645,-0.065575,-1.226734,3.647632,-1.404641,-0.638699,0.070715,-1.327647


In [8]:
# Write the model-ready frame to CSV. index=False leaves the DataFrame index
# out of the file, so only the selected columns are written.
output_file = 'modified_dataset/us_project_final_dataset.csv'
us_ds_for_model.to_csv(path_or_buf=output_file, index=False)