In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found,
# so the available datasets show up in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install git+https://github.com/remykarem/pandas-lightning#egg=pandas-lightning

In [None]:
import pandas_lightning
import seaborn as sns

# Apply seaborn's default theme once, overriding only the default figure size.
# `sns.set` is an alias of `sns.set_theme`, so a separate bare `set_theme()`
# call beforehand is redundant: this single call yields the same settings.
sns.set_theme(rc={'figure.figsize': (14.7, 8.27)})

In [None]:
# Load the raw training set; `df_` holds the frame exactly as read from disk.
TRAIN_CSV_PATH = "/kaggle/input/cs5228-2020-semester-2-final-project/train.csv"
df_ = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
def get_average_storey(storey):
    """Return the midpoint of each storey range.

    Parameters
    ----------
    storey : pandas.Series of str
        Ranges written as ``"<low> to <high>"``, e.g. ``"01 to 03"``.

    Returns
    -------
    pandas.Series of float
        ``(low + high) / 2`` for every row, on the same index as ``storey``.

    Raises
    ------
    ValueError
        If either bound of a row cannot be converted to ``int`` (including
        rows that do not contain the ``" to "`` separator).
    """
    # Tuple-unpacking the `.str` accessor (`a, b = ....str`) relies on iterating
    # it, which was deprecated in pandas 1.0 and fails on current pandas.
    # Indexing the split lists works across versions and on empty input.
    bounds = storey.str.split(" to ")
    low = bounds.str[0].astype(int)
    high = bounds.str[1].astype(int)
    return (high + low) / 2

def get_flat_age(year, lease_commence_date):
    """Return the flat's age: ``year`` minus ``lease_commence_date``.

    Works on anything that supports subtraction, e.g. plain numbers or
    pandas Series (subtracted element-wise).
    """
    age = year - lease_commence_date
    return age

def is_inauspicious(block):
    """Flag block labels that are exactly "13", "4", "44" or "444".

    ``block`` is a pandas Series of block labels; the result is a boolean
    Series on the same index. Only whole-label matches count, so a label
    such as "44A" is not flagged.
    """
    unlucky_labels = ["13", "4", "44", "444"]
    flagged = block.isin(unlucky_labels)
    return flagged

def is_before_covid(date):
    """Return whether ``date`` is strictly earlier than the cutoff "20200201".

    The check is simply ``date < cutoff`` with the cutoff given as a string,
    so the comparison semantics are those of ``date``'s own ``<`` operator.
    """
    covid_cutoff = "20200201"
    return date < covid_cutoff

def planning_area_to_district(area):
    """Bin planning-area names into district numbers 1-28 as a categorical.

    ``area`` must expose ``map_categorical_binning`` (provided elsewhere, not
    by core pandas); it is called with the district -> area-names table below
    and its result is cast to the ``"category"`` dtype.

    Sources for the table:
    https://www.ura.gov.sg/realEstateIIWeb/resources/misc/list_of_postal_districts.html
    https://www.harrylau.com/basic-real-estate-knowledge-you-must-know/singapore-district-and-planning-area/
    """
    # NOTE(review): "braddell" is listed under both 13 and 20; how
    # map_categorical_binning resolves a name that appears twice is not
    # visible here -- confirm which district is intended.
    district_to_areas = {
        1: ["raffles place", "cecil", "marina", "people’s park"],
        2: ["anson", "tanjong pagar"],
        3: ["queenstown", "tiong bahru", "alexandra"],
        4: ["telok blangah", "harbourfront", "sentosa", "keppel", "mount faber"],
        5: ["pasir panjang", "buona vista", "dover", "west coast", "clementi new town"],
        6: ["high street", "beach road", "city hall"],
        7: ["middle road", "golden mile", "bugis", "rochor"],
        8: ["little india", "farrer park", "serangoon road"],
        9: ["orchard", "cairnhill", "river valley"],
        10: ["ardmore", "bukit timah", "holland road", "tanglin"],
        11: ["watten estate", "newton", "novena", "thomson"],
        12: ["balestier", "toa payoh", "serangoon"],
        13: ["macpherson", "braddell", "potong pasir"],
        14: ["geylang", "paya lebar", "eunos", "kembangan"],
        15: ["katong", "joo chiat", "amber road", "marine parade", "tanjong rhu", "meyer"],
        16: ["bedok", "upper east coast", "eastwood", "kew drive", "chai chee", "siglap"],
        17: ["loyang", "changi"],
        18: ["tampines", "pasir ris", "simei"],
        19: ["serangoon garden", "hougang", "punggol", "sengkang"],
        20: ["bishan", "ang mo kio", "braddell"],
        21: ["upper bukit timah", "ulu pandan"],
        22: ["jurong", "boon lay", "tuas", "lakeside"],
        23: ["hillview", "dairy farm", "bukit panjang", "choa chu kang", "bukit batok"],
        24: ["lim chu kang"],
        25: ["kranji", "woodgrove", "woodlands"],
        26: ["upper thomson", "springleaf"],
        27: ["yishun", "sembawang", "admiralty"],
        28: ["seletar", "yio chu kang"],
    }
    binned = area.map_categorical_binning(district_to_areas)
    return binned.astype("category")

def is_prime_district(district):
    """Flag rows whose district number is 9, 10 or 11.

    ``district`` is a pandas Series of district numbers; a boolean Series on
    the same index is returned.

    Source: https://www.propertyguru.com.sg/property-guides/ccr-ocr-rcr-region-singapore-ura-map-21045
    """
    prime_districts = [9, 10, 11]
    return district.isin(prime_districts)

def is_core(prime_district, planning_area):
    """Flag rows that are in a prime district OR in one of the listed core areas.

    ``prime_district`` is a boolean Series and ``planning_area`` a Series of
    area names; the two conditions are combined element-wise with ``|``.

    Source: https://www.propertyguru.com.sg/property-guides/ccr-ocr-rcr-region-singapore-ura-map-21045
    """
    core_areas = [
        "bugis", "city hall", "sentosa", "shenton way", "tanjong pagar",
        "boat quay", "raffles place", "marina downtown", "suntec city",
    ]
    in_core_area = planning_area.isin(core_areas)
    return prime_district | in_core_area

In [None]:
# Build the working frame `df` from the raw `df_` in one chained expression.
# NOTE(review): `.lambdas(...)`, `.sapply(...)` and the keyword form of
# `.astype(...)` are not core pandas (presumably pandas_lightning); their exact
# semantics are not visible in this notebook, so the notes below that describe
# them are assumptions to confirm.
# The chain starts from a copy of `df_` rather than `df_` itself.
df = df_.copy(

).rename(
    # The "month" column is referred to as "date" by every later step.
    columns={"month": "date"}
    
).lambdas(inplace=True).astype(
    # Per-column dtype casts. "datetime" is presumably shorthand for a
    # datetime64 conversion -- TODO confirm.
    planning_area="category",
    flat_model="category",
    subzone="category",
    region="category",
    town="category",
    date="datetime"

).lambdas(inplace=True).sapply(
    # Each keyword names the column to produce. The tuple form is
    # (input column(s), function); the bare-callable form presumably receives
    # the column of the same name -- TODO confirm.
    # Several entries read columns produced by earlier entries in this same
    # call (`year` -> `age`, `district` -> `prime_district` ->
    # `core_central_region`), so this presumably relies on keywords being
    # applied in the order written -- TODO confirm.
    year=("date", lambda date: date.dt.year),
    flat_type=lambda s: s.str.replace("-", " "),
    avg_storey=("storey_range", get_average_storey),
    age=(["year", "lease_commence_date"], get_flat_age),
    inauspicious=("block", is_inauspicious),
    before_covid=("date", is_before_covid),
    district=("planning_area", planning_area_to_district),
    prime_district=("district", is_prime_district),
    core_central_region=(["prime_district", "planning_area"], is_core),
    
).lambdas(inplace=True).astype(
    # Passing a list presumably casts to a categorical whose categories follow
    # this order -- TODO confirm. The labels use spaces, matching the
    # "-" -> " " replacement applied to `flat_type` above.
    flat_type=['1 room', '2 room', '3 room', '4 room', '5 room', 'executive', 'multi generation']
    
).drop(
    # Remove these columns from the result; four of them were only read as
    # inputs above ("eco_category" and "elevation" are not used at all).
    columns=["storey_range", "eco_category", "elevation", "block", "lease_commence_date"]
    
)

# Show the resulting frame as the cell output.
df

In [None]:
# Build the per-column summary via the `tests` accessor, then show it with the
# rows ordered by the "dtype" column.
column_summary = df.tests.info(pctg=False)
column_summary.sort_values(by="dtype")

In [None]:
# Count plot of the derived `district` column via the `quickplot` helper.
# (The author left `numerical=["resale_price"]` disabled; it stays out here too.)
district_plotter = df.quickplot(categorical=["district"])
district_plotter.countplot()