In [None]:
# Standard-library helpers needed to extend the module search path.
import sys
from pathlib import Path
# Put <current working directory>/src at the front of sys.path before importing
# the project package. This depends on the directory the kernel was started in;
# presumably the nyc_sales package lives under src/ (verify against the repo layout).
sys.path.insert(0, str(Path.cwd() / 'src')) 
# Pipeline classes, bound to the short aliases used by every later cell.
from nyc_sales.clean import DataCleaner as dc  
from nyc_sales.extract import DataExtractor as de  
from nyc_sales.ingest import DataIngester as di 
from nyc_sales.visualize import Visualizer as v   
from nyc_sales.metrics import MetricsCalculator as mc

In [2]:
# Step 1: Fetch the raw NYC sales Excel workbooks into the data/r directory.
# Both spellings of Staten Island and both Excel extensions are passed to the
# downloader.
# NOTE(review): range(2015, 2025) stops at 2024 (the end value is exclusive),
# while the recorded output of this cell reports 55 files and later steps refer
# to 2025 data -- confirm whether 2025 is meant to be requested here.
borough_names = ['manhattan', 'bronx', 'brooklyn', 'queens', 'statenisland', 'staten_island']
sale_years = range(2015, 2025)
excel_extensions = ['xlsx', 'xls']
raw_dir = Path('data/r')
de.download(borough_names, sale_years, excel_extensions, raw_dir)

55 files were downloaded successfully to data/r


In [3]:
# Step 2: Run the extractor over the raw Excel files in data/r, normalizing
# column names, with data/c as the target directory.
raw_sales = de.extract(
    src_dir='data/r',
    trgt_dir='data/c',
)

55 files were processed successfully to data/c


In [4]:
# Step 3: Read the extracted CSVs from data/c back as one concatenated DataFrame.
# NOTE(review): despite its name, clean_sales holds the extracted (not yet
# cleaned) data, and no later cell shown here reads it -- confirm it is still needed.
clean_sales = de.load(
    src_dir='data/c',
)

In [5]:
# Step 4: Clean and transform the extracted data in data/c for downstream
# analysis; data/i and the file name below are passed as the target location.
intermediate_sales = dc.clean(
    src_dir='data/c',
    trgt_dir='data/i',
    file_name='nyc_sales_2015_2025.csv',
)

In [6]:
# Step 5: Ingest the cleaned data into aggregated, year-partitioned summary files.
# The arguments are positional; their presumed meaning is noted per line
# (verify against DataIngester.ingest).
summary_sales = di.ingest(
    'data/i',                 # presumably the source directory (Step 4 target)
    'data/p',                 # presumably the target directory
    'nyc_sales_summary.csv',  # presumably the output file name
)

In [7]:
# Step 6: Build the custom matrix (affordability + market breadth per borough
# and year). The same directory, data/p, is passed in both positional slots --
# presumably source and target (verify against MetricsCalculator.compute).
matrix = mc.compute(
    'data/p',
    'data/p',
    'nyc_sales_custom_matrix.csv',
)

In [8]:
# Step 7: Build and save the figures that address each research question.
# Every figure is saved immediately after it is created.

# Q1: How have neighborhood prices evolved across boroughs between 2015 and 2025?
borough_trajectories_fig = v.create_borough_trajectories(
    df=intermediate_sales,
)
v.savefig(borough_trajectories_fig, 'borough_trajectories.png')

# Q2: Which boroughs experienced the steepest declines in entry-level affordability?
affordability_fig = v.create_affordability_index_plot(
    df=matrix,
)
v.savefig(affordability_fig, 'borough_affordability_index.png')

# Q3: Was the post-COVID rebound broad-based (high Market Breadth) or concentrated in select neighborhoods?
market_breadth_fig = v.create_market_breadth_plot(
    df=matrix,
)
v.savefig(market_breadth_fig, 'market_breadth.png')

# Q4: Where does 2025 YTD stand relative to pre-COVID and prior-cycle peaks?
# The snapshot call's result is iterated, so one numbered PNG is saved per figure.
snapshot_fig = v.create_2025_snapshot(
    df=intermediate_sales,
)
for position, snapshot in enumerate(snapshot_fig):
    snapshot_name = f'snapshot_2025_vs_benchmarks_{position}.png'
    v.savefig(snapshot, snapshot_name)