In [1]:
import polars as pl
pl.Config.set_tbl_rows(300) 

polars.config.Config

In [2]:
df = pl.read_csv("ingested.csv")

In [3]:
df = df.with_columns(
    (1_000_000 <= pl.col("price")).cast(pl.Int8).alias("price_is_geq_one_million")
)

In [4]:
display(df.sample())

address,lease_remaining,minimum_floor,maximum_floor,built_year,sqft,sqm,flat_type,road,town,year_of_sale,month_of_sale,months_ago,price,psf,band_name,price_is_geq_one_million
str,i64,i64,i64,i64,i64,i64,str,str,str,i64,i64,i64,i64,f64,str,i8
"""432 CHOA CHU KANG AVENUE 4""",68,1,3,1993,1119,104,"""4 ROOM""","""CHOA CHU KANG AVENUE 4""","""CHOA CHU KANG""",2022,9,26,480000,42895.44,"""65 - 69""",0


In [5]:
df.group_by("flat_type").agg(
    pl.col("flat_type").len().alias("no_of_units_sold")
).sort("flat_type")

flat_type,no_of_units_sold
str,u32
"""1 ROOM""",24
"""2 ROOM""",1835
"""3 ROOM""",18595
"""4 ROOM""",32948
"""5 ROOM""",18138
"""EXECUTIVE""",4921
"""MULTI-GENERATION""",20


In [6]:
df_filter = df.select(
    ["flat_type", "year_of_sale", "month_of_sale", "months_ago", "price_is_geq_one_million"]
)
df_filter = df_filter.filter(
    pl.col("flat_type").is_in(["3 ROOM", "4 ROOM", "5 ROOM", "EXECUTIVE", "MULTI-GENERATION"])
)

In [7]:
df_filter.sample()

flat_type,year_of_sale,month_of_sale,months_ago,price_is_geq_one_million
str,i64,i64,i64,i8
"""3 ROOM""",2023,1,22,0


In [8]:
df_filter.group_by("flat_type").agg(
    pl.col("months_ago").len()
).sort("flat_type")

flat_type,months_ago
str,u32
"""3 ROOM""",18595
"""4 ROOM""",32948
"""5 ROOM""",18138
"""EXECUTIVE""",4921
"""MULTI-GENERATION""",20


In [9]:
dm = df_filter.group_by(["year_of_sale", "month_of_sale", "months_ago", "flat_type"]).agg(
    pl.col("flat_type").len().alias("no_of_flats_sold"),
    pl.col("price_is_geq_one_million").sum().alias("no_of_million_dollar_flats_sold"),    
)

In [10]:
display(dm.sort(
    [pl.col("months_ago"), pl.col("flat_type")], descending=(False, False)
).head(12))

year_of_sale,month_of_sale,months_ago,flat_type,no_of_flats_sold,no_of_million_dollar_flats_sold
i64,i64,i64,str,u32,i64
2024,11,0,"""3 ROOM""",47,0
2024,11,0,"""4 ROOM""",79,7
2024,11,0,"""5 ROOM""",46,3
2024,11,0,"""EXECUTIVE""",13,0
2024,10,1,"""3 ROOM""",571,0
2024,10,1,"""4 ROOM""",897,41
2024,10,1,"""5 ROOM""",495,33
2024,10,1,"""EXECUTIVE""",139,29
2024,9,2,"""3 ROOM""",554,0
2024,9,2,"""4 ROOM""",926,42


In [11]:
md_sold_per_year = dm.group_by("year_of_sale").agg(
    [
        pl.col("no_of_flats_sold").sum().alias("number_of_flats_sold"),
        pl.col("no_of_million_dollar_flats_sold").sum().alias("number_of_million_dollar_flats_sold"),        
    ]
)
md_sold_per_year = md_sold_per_year.with_columns(
    (pl.col("number_of_million_dollar_flats_sold") / pl.col("number_of_flats_sold"))
    .round(3).alias("percentage_of_million_dollar_flats_sold")
)
display(md_sold_per_year.sort("year_of_sale"))

year_of_sale,number_of_flats_sold,number_of_million_dollar_flats_sold,percentage_of_million_dollar_flats_sold
i64,u32,i64,f64
2022,26254,369,0.014
2023,25061,469,0.019
2024,23307,863,0.037


In [12]:
total_sold = md_sold_per_year.select("number_of_flats_sold").sum()
print(total_sold)
total_million_dollar_flats_sold = md_sold_per_year.select("number_of_million_dollar_flats_sold").sum()
print(total_million_dollar_flats_sold)
print(total_million_dollar_flats_sold/total_sold)

shape: (1, 1)
┌──────────────────────┐
│ number_of_flats_sold │
│ ---                  │
│ u32                  │
╞══════════════════════╡
│ 74622                │
└──────────────────────┘
shape: (1, 1)
┌─────────────────────────────────┐
│ number_of_million_dollar_flats… │
│ ---                             │
│ i64                             │
╞═════════════════════════════════╡
│ 1701                            │
└─────────────────────────────────┘
shape: (1, 1)
┌─────────────────────────────────┐
│ number_of_million_dollar_flats… │
│ ---                             │
│ f64                             │
╞═════════════════════════════════╡
│ 0.022795                        │
└─────────────────────────────────┘


In [13]:
df_mil = df.filter(
    pl.col("price_is_geq_one_million").eq(1)
).select(
    ["address", "lease_remaining", "flat_type", "sqft", "price", "year_of_sale", "months_ago"]
).sample(150)
df_mil = df_mil.with_columns(
    (pl.col("price") / pl.col("sqft")).round(0).alias("psf")
)

In [14]:
display(df_mil.sort("psf", descending=True))

address,lease_remaining,flat_type,sqft,price,year_of_sale,months_ago,psf
str,i64,str,i64,i64,i64,i64,f64
"""1B CANTONMENT ROAD""",86,"""4 ROOM""",1011,1410000,2023,15,1395.0
"""1G CANTONMENT ROAD""",86,"""4 ROOM""",1011,1365000,2024,5,1350.0
"""1C CANTONMENT ROAD""",86,"""4 ROOM""",1001,1325000,2024,1,1324.0
"""1A CANTONMENT ROAD""",86,"""5 ROOM""",1130,1435000,2024,2,1270.0
"""1B CANTONMENT ROAD""",86,"""4 ROOM""",1001,1248888,2024,6,1248.0
"""139A LORONG 1A TOA PAYOH""",87,"""5 ROOM""",1259,1568888,2024,10,1246.0
"""8B UPPER BOON KENG ROAD""",92,"""4 ROOM""",1054,1300888,2023,13,1234.0
"""1G CANTONMENT ROAD""",86,"""5 ROOM""",1140,1400000,2023,20,1228.0
"""9A BOON TIONG ROAD""",91,"""5 ROOM""",1205,1450000,2024,8,1203.0
"""130A LORONG 1 TOA PAYOH""",93,"""4 ROOM""",1001,1200888,2024,2,1200.0
