In [1]:
import polars as pl

import seaborn as sns
import matplotlib.pyplot as plt

sns.set()

In [2]:
df = pl.read_csv("ingested.csv")

df_filter = df.filter(
    (pl.col("year_of_sale").is_in([2023, 2024])),
).with_columns(
    (pl.col("price").ge(1000000).cast(pl.Int64).alias("ge_1M")),
)
display(df_filter.sample())
print(list(df_filter.columns))

address,lease_remaining,minimum_floor,maximum_floor,built_year,sqft,sqm,flat_type,road,town,year_of_sale,month_of_sale,months_ago,price,psf,band_name,price_is_geq_one_million,ge_1M
str,i64,i64,i64,i64,i64,i64,str,str,str,i64,i64,i64,i64,f64,str,i64,i64
"""494D TAMPINES STREET 43""",90,1,3,2015,731,68,"""3 ROOM""","""TAMPINES STREET 43""","""TAMPINES""",2024,4,8,435000,595.07,"""90 - 94""",0,0


['address', 'lease_remaining', 'minimum_floor', 'maximum_floor', 'built_year', 'sqft', 'sqm', 'flat_type', 'road', 'town', 'year_of_sale', 'month_of_sale', 'months_ago', 'price', 'psf', 'band_name', 'price_is_geq_one_million', 'ge_1M']


In [3]:
with pl.Config(tbl_rows=23, tbl_cols=20):
    display(
        df_filter.filter(pl.col("year_of_sale").eq(2023), pl.col("ge_1M").eq(1))
        .group_by("year_of_sale", "town")
        .agg(pl.col("address").len().alias("count_transactions"))
        .with_columns(
            pl.col("year_of_sale").alias("y2023"),
            pl.col("count_transactions").alias("transactions2023"),
        )
        .select("y2023", "town", "transactions2023")
        .join(
            df_filter.filter(pl.col("year_of_sale").eq(2024), pl.col("ge_1M").eq(1))
            .group_by("year_of_sale", "town")
            .agg(pl.col("address").len().alias("count_transactions"))
            .with_columns(
                pl.col("year_of_sale").alias("y2024"),
                pl.col("count_transactions").alias("transactions2024"),
            )
            .select("y2024", "town", "transactions2024"),
            on="town",
            how="full",
        ).select(
            "town_right", "transactions2023", "transactions2024"
        ).sort(
            "transactions2024", descending=True
        )
    )


town_right,transactions2023,transactions2024
str,u32,u32
"""KALLANG/WHAMPOA""",54.0,154
"""TOA PAYOH""",56.0,141
"""BUKIT MERAH""",62.0,133
"""QUEENSTOWN""",48.0,106
"""BISHAN""",45.0,77
"""ANG MO KIO""",31.0,67
"""CLEMENTI""",31.0,51
"""GEYLANG""",11.0,48
"""CENTRAL AREA""",48.0,45
"""HOUGANG""",5.0,31


In [4]:
with pl.Config(tbl_rows=23, tbl_cols=20):
    display(
        df_filter.filter(pl.col("year_of_sale").eq(2023), pl.col("ge_1M").eq(1))
        .group_by("year_of_sale", "town")
        .agg(pl.col("address").len().alias("count_transactions"))
        .with_columns(
            pl.col("year_of_sale").alias("y2023"),
            pl.col("count_transactions").alias("transactions2023"),
        )
        .select("y2023", "town", "transactions2023")
        .join(
            df_filter.filter(pl.col("year_of_sale").eq(2024), pl.col("ge_1M").eq(1))
            .group_by("year_of_sale", "town")
            .agg(pl.col("address").len().alias("count_transactions"))
            .with_columns(
                pl.col("year_of_sale").alias("y2024"),
                pl.col("count_transactions").alias("transactions2024"),
            )
            .select("y2024", "town", "transactions2024"),
            on="town",
            how="full",
        ).select(
            "town_right", "transactions2023", "transactions2024"
        ).sort(
            "transactions2024", descending=True
        ).filter(
            pl.col("transactions2024").ge(pl.col("transactions2023"))
        ).with_columns(
            (pl.col("transactions2024") / (pl.col("transactions2023"))).alias("yoy_growth")
        ).sort("yoy_growth", descending=True)
    )


town_right,transactions2023,transactions2024,yoy_growth
str,u32,u32,f64
"""TAMPINES""",2,22,11.0
"""BUKIT PANJANG""",1,9,9.0
"""HOUGANG""",5,31,6.2
"""GEYLANG""",11,48,4.363636
"""KALLANG/WHAMPOA""",54,154,2.851852
"""BEDOK""",9,25,2.777778
"""TOA PAYOH""",56,141,2.517857
"""PASIR RIS""",2,5,2.5
"""YISHUN""",4,9,2.25
"""QUEENSTOWN""",48,106,2.208333


In [5]:
with pl.Config(tbl_rows=23, tbl_cols=20):
    display(
        df_filter.filter(
            pl.col("year_of_sale").eq(2024) & 
            pl.col("ge_1M").eq(1) &
            pl.col("town").eq("SENGKANG")
        ).select(
            "year_of_sale", "month_of_sale", 
            "address", "road", "minimum_floor", "maximum_floor", "price"
        ).sort("month_of_sale")
    )


year_of_sale,month_of_sale,address,road,minimum_floor,maximum_floor,price
i64,i64,str,str,i64,i64,i64
2024,4,"""205B COMPASSVALE LANE""","""COMPASSVALE LANE""",7,9,1000000
2024,7,"""216B COMPASSVALE DRIVE""","""COMPASSVALE DRIVE""",16,18,1000000
2024,8,"""102 RIVERVALE WALK""","""RIVERVALE WALK""",10,12,1005000
2024,11,"""216B COMPASSVALE DRIVE""","""COMPASSVALE DRIVE""",13,15,1050000


In [18]:
with pl.Config(tbl_rows=60, tbl_cols=40):
    display(
        df_filter.filter(
            pl.col("year_of_sale").eq(2024) & 
            pl.col("ge_1M").eq(0) &
            pl.col("town").is_in(["JURONG WEST", "CHOA CHU KANG", "SEMBAWANG"])
        ).select(
            "year_of_sale", "month_of_sale", "town",
            "address", "road", "minimum_floor", "maximum_floor", "price"
        ).sort("price", descending=True).head(10)
    )


year_of_sale,month_of_sale,town,address,road,minimum_floor,maximum_floor,price
i64,i64,str,str,str,i64,i64,i64
2024,6,"""JURONG WEST""","""471 JURONG WEST STREET 41""","""JURONG WEST STREET 41""",10,12,980000
2024,9,"""JURONG WEST""","""469 JURONG WEST STREET 41""","""JURONG WEST STREET 41""",4,6,960000
2024,11,"""JURONG WEST""","""138D YUAN CHING ROAD""","""YUAN CHING ROAD""",13,15,952888
2024,12,"""JURONG WEST""","""472 JURONG WEST STREET 41""","""JURONG WEST STREET 41""",13,15,940000
2024,6,"""JURONG WEST""","""138C YUAN CHING ROAD""","""YUAN CHING ROAD""",13,15,920000
2024,11,"""JURONG WEST""","""470 JURONG WEST STREET 41""","""JURONG WEST STREET 41""",10,12,920000
2024,8,"""CHOA CHU KANG""","""342 CHOA CHU KANG LOOP""","""CHOA CHU KANG LOOP""",10,12,915000
2024,4,"""JURONG WEST""","""138B YUAN CHING ROAD""","""YUAN CHING ROAD""",19,21,910000
2024,10,"""CHOA CHU KANG""","""601 CHOA CHU KANG STREET 62""","""CHOA CHU KANG STREET 62""",4,6,908000
2024,6,"""CHOA CHU KANG""","""342 CHOA CHU KANG LOOP""","""CHOA CHU KANG LOOP""",7,9,907888


In [38]:
with pl.Config(tbl_rows=50, tbl_cols=20):
    display(
        df_filter.filter(
            pl.col("year_of_sale").eq(2024) & 
            pl.col("ge_1M").eq(1)
        ).select(
            "year_of_sale", "month_of_sale", "lease_remaining",
            "address", "road", "minimum_floor", "maximum_floor", "price"
        ).with_columns(
            pl.col("lease_remaining").ge(85).cast(pl.Int64).alias("_15_yrs_or_less")
        ).group_by(
#             "year_of_sale", "lease_remaining", 
            "_15_yrs_or_less"
        ).agg(
            pl.col("_15_yrs_or_less").len().alias("count_txns")
        )
    )


_15_yrs_or_less,count_txns
i64,u32
0,503
1,513


In [44]:
with pl.Config(tbl_rows=50, tbl_cols=20):
    display(
        df_filter.filter(
            pl.col("year_of_sale").eq(2024) & 
            pl.col("ge_1M").eq(1) &
            pl.col("town").eq("KALLANG/WHAMPOA")
        ).select(
            "year_of_sale", "month_of_sale", "lease_remaining",
            "address", "road", "minimum_floor", "maximum_floor", "price"
        ).group_by("road", "lease_remaining").agg(pl.col("address").len().alias("count_txns"))
        .sort("count_txns", descending=True)
        .with_columns(
            (pl.col("count_txns") / pl.col("count_txns").sum()).alias("pct_txns")
        )
    )


road,lease_remaining,count_txns,pct_txns
str,i64,u32,f64
"""SAINT GEORGE'S LANE""",95,67,0.435065
"""MCNAIR ROAD""",92,23,0.149351
"""BOON KENG ROAD""",86,16,0.103896
"""BENDEMEER ROAD""",92,7,0.045455
"""UPPER BOON KENG ROAD""",81,7,0.045455
"""UPPER BOON KENG ROAD""",92,5,0.032468
"""TOWNER ROAD""",58,4,0.025974
"""UPPER BOON KENG ROAD""",74,3,0.019481
"""BOON KENG ROAD""",71,3,0.019481
"""BENDEMEER ROAD""",69,3,0.019481


In [54]:
with pl.Config(tbl_rows=50, tbl_cols=20):
    display(
        df_filter.filter(
            pl.col("year_of_sale").eq(2024) & 
            pl.col("ge_1M").eq(1)
        ).with_columns(
            pl.col("built_year").ge(2013).cast(pl.Int64).alias("lease_commencing_2013")
        ).select(
            "year_of_sale", "month_of_sale", "lease_remaining", "lease_commencing_2013", "built_year",
            "address", "road", "minimum_floor", "maximum_floor", "price"
        ).group_by("lease_commencing_2013").agg(pl.col("address").len().alias("count_txns"))
        .sort("lease_commencing_2013", descending=True)
        .with_columns(
            (pl.col("count_txns") / pl.col("count_txns").sum()).alias("pct_txns")
        )
    )


lease_commencing_2013,count_txns,pct_txns
i64,u32,f64
1,376,0.370079
0,640,0.629921


In [57]:
with pl.Config(tbl_rows=50, tbl_cols=20):
    display(
        df_filter.with_columns(
            (pl.col("price") // 100000 * 100000).alias("price_round_down_100k")
        )
        .filter(pl.col("price_is_geq_one_million").eq(1))
        .group_by("price_round_down_100k")
        .agg(pl.len().alias("count_transactions"))
        .with_columns(
            (pl.col("count_transactions") / pl.col("count_transactions").sum()).alias(
                "pct_transactions"
            )
        )
        .sort("price_round_down_100k")
    )


price_round_down_100k,count_transactions,pct_transactions
i64,u32,f64
1000000,846,0.569697
1100000,298,0.200673
1200000,180,0.121212
1300000,89,0.059933
1400000,50,0.03367
1500000,22,0.014815


In [None]:
with pl.Config(tbl_rows=50, tbl_cols=20):
    display(
        df_filter.with_columns(
            (pl.col("price") // 100000 * 100000).alias("price_round_down_100k")
        )
        .filter(pl.col("price_is_geq_one_million").eq(1))
        .group_by("price_round_down_100k")
        .agg(pl.len().alias("count_transactions"))
        .with_columns(
            (pl.col("count_transactions") / pl.col("count_transactions").sum()).alias(
                "pct_transactions"
            )
        )
        .sort("price_round_down_100k")
    )


In [None]:
with pl.Config(set_tbl_rows=30):
    display(
        df.filter(
            (pl.col("year_of_sale").eq(2024)),
            (pl.col("price_is_geq_one_million").eq(1)),
        )
        .group_by("town")
        .agg(pl.col("address").len().alias("count_transactions"))
        .sort("count_transactions", descending=True)
    )


In [None]:
df.filter(
    (pl.col("year_of_sale").eq(2024)),
    (pl.col("price_is_geq_one_million").eq(1)),
    (pl.col("town").eq("SENGKANG"))
).sort("month_of_sale").select(
    "address", "sqft", "minimum_floor", "maximum_floor", "flat_type",
    "price", "year_of_sale", "month_of_sale"
)

In [None]:
df.filter(
    (pl.col("year_of_sale").eq(2024)),
    (pl.col("price_is_geq_one_million").eq(0)),
    (pl.col("town").eq("JURONG WEST"))
).sort("price", descending=True).select(
    "address", "sqft", "minimum_floor", "maximum_floor", "flat_type",
    "price", "year_of_sale", "month_of_sale"
)

In [None]:
df.filter(
    (pl.col("year_of_sale").eq(2024)),
    (pl.col("price_is_geq_one_million").eq(1)),
).sort("price", descending=True).select(
    "address", "sqft", "minimum_floor", "maximum_floor", "flat_type",
    "price", "year_of_sale", "month_of_sale", "psf", "band_name"
)

In [None]:
df.filter(
    (pl.col("year_of_sale").eq(2024)),
    (pl.col("price_is_geq_one_million").eq(1)),
).group_by("band_name").agg(
    pl.col("address").len().alias("count_transactions")
).with_columns(
    pl.col("count_transactions")/pl.col("count_transactions").sum().alias("pct_transactions")
).sort("band_name", descending=True)

In [None]:
df.filter(
    (pl.col("year_of_sale").eq(2024)),
).sort("psf", descending=True).select(
    "address", "sqft", "minimum_floor", "maximum_floor", "flat_type",
    "price", "year_of_sale", "month_of_sale", "psf", "band_name"
)

In [None]:
df_by_sale = df.group_by(
    (pl.col("year_of_sale"))
).agg(
    pl.col("address").len().alias("count_transactions")
).sort(
    "year_of_sale"
)
display(df_by_sale)

In [None]:
sns.barplot(
    x=df_by_sale.select("year_of_sale").to_numpy().flatten(), 
    y=df_by_sale.select("count_transactions").to_numpy().flatten()
)

In [None]:
df_twn = df.filter(
    pl.col("price_is_geq_one_million").eq(1),
    pl.col("year_of_sale").is_in([2022, 2023, 2024])
).group_by(
    "year_of_sale", "town"
).agg(
    pl.col("address").len().alias("count_transactions")
).sort(
    "town", "year_of_sale"
)

In [None]:
# ... existing code ...

plt.figure(figsize=(15, 8))  # Width: 15 inches, Height: 8 inches
sns.barplot(
    x=df_twn.select("town").to_numpy().flatten(), 
    y=df_twn.select("count_transactions").to_numpy().flatten(),
    hue=df_twn.select("year_of_sale").to_numpy().flatten(),
)
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability