In [1]:
import polars as pl
pl.Config.set_tbl_rows(300) 

polars.config.Config

In [2]:
df = pl.read_csv("ingested.csv")

In [3]:
df = df.with_columns(
    (1_000_000 <= pl.col("price")).cast(pl.Int8).alias("price_is_geq_one_million")
)

In [4]:
display(df.sample())

address,lease_remaining,minimum_floor,maximum_floor,built_year,sqft,sqm,flat_type,road,town,year_of_sale,month_of_sale,months_ago,price,psf,band_name,price_is_geq_one_million
str,i64,i64,i64,i64,i64,i64,str,str,str,i64,i64,i64,i64,f64,str,i8
"""133 GEYLANG EAST AVENUE 1""",62,10,12,1987,1550,144,"""EXECUTIVE""","""GEYLANG EAST AVENUE 1""","""GEYLANG""",2024,8,3,1038000,66967.74,"""60 - 64""",1


In [5]:
df_filter = df.select(
    ("flat_type", "year_of_sale", "month_of_sale", "months_ago", "price_is_geq_one_million")
)

In [6]:
display(df_filter.sample())

flat_type,year_of_sale,month_of_sale,months_ago,price_is_geq_one_million
str,i64,i64,i64,i8
"""4 ROOM""",2022,8,27,0


In [7]:
dm = df_filter.group_by(["year_of_sale", "month_of_sale", "months_ago", "flat_type"]).agg(
    pl.col("flat_type").len().alias("no_of_flats_sold"),
    pl.col("price_is_geq_one_million").sum().alias("no_of_million_dollar_flats_sold"),    
)

In [8]:
display(dm.sort(
    "months_ago", descending=False
).head(10))

year_of_sale,month_of_sale,months_ago,flat_type,no_of_flats_sold,no_of_million_dollar_flats_sold
i64,i64,i64,str,u32,i64
2024,11,0,"""2 ROOM""",2,0
2024,11,0,"""1 ROOM""",1,0
2024,11,0,"""EXECUTIVE""",13,0
2024,11,0,"""3 ROOM""",47,0
2024,11,0,"""4 ROOM""",79,7
2024,11,0,"""5 ROOM""",46,3
2024,10,1,"""3 ROOM""",571,0
2024,10,1,"""5 ROOM""",495,33
2024,10,1,"""1 ROOM""",1,0
2024,10,1,"""EXECUTIVE""",139,29


In [9]:
md_sold_per_year = dm.group_by("year_of_sale").agg(
    [
        pl.col("no_of_flats_sold").sum().alias("number_of_flats_sold"),
        pl.col("no_of_million_dollar_flats_sold").sum().alias("number_of_million_dollar_flats_sold"),        
    ]
)
md_sold_per_year = md_sold_per_year.with_columns(
    (pl.col("number_of_million_dollar_flats_sold") / pl.col("number_of_flats_sold"))
    .round(3).alias("percentage_of_million_dollar_flats_sold")
)
display(md_sold_per_year.sort("year_of_sale"))

year_of_sale,number_of_flats_sold,number_of_million_dollar_flats_sold,percentage_of_million_dollar_flats_sold
i64,u32,i64,f64
2022,26720,369,0.014
2023,25755,469,0.018
2024,24006,863,0.036


In [10]:
total_sold = md_sold_per_year.select("number_of_flats_sold").sum()
print(total_sold)
total_million_dollar_flats_sold = md_sold_per_year.select("number_of_million_dollar_flats_sold").sum()
print(total_million_dollar_flats_sold)
print(total_million_dollar_flats_sold/total_sold)

shape: (1, 1)
┌──────────────────────┐
│ number_of_flats_sold │
│ ---                  │
│ u32                  │
╞══════════════════════╡
│ 76481                │
└──────────────────────┘
shape: (1, 1)
┌─────────────────────────────────┐
│ number_of_million_dollar_flats… │
│ ---                             │
│ i64                             │
╞═════════════════════════════════╡
│ 1701                            │
└─────────────────────────────────┘
shape: (1, 1)
┌─────────────────────────────────┐
│ number_of_million_dollar_flats… │
│ ---                             │
│ f64                             │
╞═════════════════════════════════╡
│ 0.022241                        │
└─────────────────────────────────┘


In [11]:
md_sold_per_year_4r = dm.filter(
    pl.col("flat_type").eq("4 ROOM")
).group_by(["year_of_sale", "flat_type"]).agg(
    [
        pl.col("no_of_flats_sold").sum().alias("number_of_flats_sold"),
        pl.col("no_of_million_dollar_flats_sold").sum().alias("number_of_million_dollar_flats_sold"),        
    ]
)
md_sold_per_year_4r = md_sold_per_year_4r.with_columns(
    (pl.col("number_of_million_dollar_flats_sold") / pl.col("number_of_flats_sold"))
    .round(3).alias("percentage_of_million_dollar_flats_sold")
)
display(md_sold_per_year_4r.sort("year_of_sale"))

year_of_sale,flat_type,number_of_flats_sold,number_of_million_dollar_flats_sold,percentage_of_million_dollar_flats_sold
i64,str,u32,i64,f64
2022,"""4 ROOM""",11309,38,0.003
2023,"""4 ROOM""",11357,125,0.011
2024,"""4 ROOM""",10282,302,0.029


In [12]:
total_sold = md_sold_per_year_4r.select("number_of_flats_sold").sum()
print(total_sold)
total_million_dollar_flats_sold = md_sold_per_year_4r.select("number_of_million_dollar_flats_sold").sum()
print(total_million_dollar_flats_sold)
print(total_million_dollar_flats_sold/total_sold)

shape: (1, 1)
┌──────────────────────┐
│ number_of_flats_sold │
│ ---                  │
│ u32                  │
╞══════════════════════╡
│ 32948                │
└──────────────────────┘
shape: (1, 1)
┌─────────────────────────────────┐
│ number_of_million_dollar_flats… │
│ ---                             │
│ i64                             │
╞═════════════════════════════════╡
│ 465                             │
└─────────────────────────────────┘
shape: (1, 1)
┌─────────────────────────────────┐
│ number_of_million_dollar_flats… │
│ ---                             │
│ f64                             │
╞═════════════════════════════════╡
│ 0.014113                        │
└─────────────────────────────────┘
