In [1]:
%matplotlib notebook

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split

In [3]:
df=pd.read_csv("auxiliary-data/sg-population-demographics.csv")

In [4]:
df

Unnamed: 0,plannin_area,subzone,age_group,sex,count
0,ang mo kio,ang mo kio town centre,0-4,m,130
1,ang mo kio,cheng san,0-4,m,670
2,ang mo kio,chong boon,0-4,m,460
3,ang mo kio,kebun bahru,0-4,m,380
4,ang mo kio,sembawang hills,0-4,m,90
...,...,...,...,...,...
7831,yishun,springleaf,85+,f,30
7832,yishun,yishun central,85+,f,10
7833,yishun,yishun east,85+,f,160
7834,yishun,yishun south,85+,f,240


In [5]:
values=df["plannin_area"].unique()
print(np.sort(values))

['ang mo kio' 'bedok' 'bishan' 'bukit batok' 'bukit merah' 'bukit panjang'
 'bukit timah' 'changi' 'choa chu kang' 'clementi' 'downtown core'
 'geylang' 'hougang' 'jurong east' 'jurong west' 'kallang' 'mandai'
 'marine parade' 'museum' 'newton' 'novena' 'orchard' 'outram' 'pasir ris'
 'punggol' 'queenstown' 'river valley' 'rochor' 'seletar' 'sembawang'
 'sengkang' 'serangoon' 'singapore river' 'southern islands'
 'sungei kadut' 'tampines' 'tanglin' 'toa payoh' 'western water catchment'
 'woodlands' 'yishun']


In [6]:
values=df["subzone"].unique()
print(np.sort(values))

['admiralty' 'alexandra hill' 'alexandra north' 'aljunied' 'anak bukit'
 'anchorvale' 'ang mo kio town centre' 'balestier' 'bangkit' 'bayshore'
 'bedok north' 'bedok reservoir' 'bedok south' 'bencoolen' 'bendemeer'
 'bishan east' 'boat quay' 'boon keng' 'boon lay place' 'boon teck'
 'boulevard' 'braddell' 'bugis' 'bukit batok central' 'bukit batok east'
 'bukit batok south' 'bukit batok west' 'bukit ho swee' 'bukit merah'
 'cairnhill' 'cecil' 'central subzone' 'changi point' 'changi west'
 'chatsworth' 'cheng san' 'china square' 'chinatown'
 'choa chu kang central' 'choa chu kang north' 'chong boon' 'clarke quay'
 'clementi central' 'clementi north' 'clementi west' 'clementi woods'
 'commonwealth' 'compassvale' 'coronation road' 'crawford' 'dairy farm'
 'depot road' 'dhoby ghaut' 'dover' 'dunearn' 'everton park' 'faber'
 'fajar' 'farrer court' 'farrer park' 'fernvale' 'flora drive'
 'fort canning' 'frankel' 'gali batu' 'geylang bahru' 'geylang east'
 'ghim moh' 'gombak' 'goodwood park'

In [7]:
values=df["age_group"].unique()
print(np.sort(values))

['0-4' '10-14' '15-19' '20-24' '25-29' '30-34' '35-39' '40-44' '45-49'
 '5-9' '50-54' '55-59' '60-64' '65-69' '70-74' '75-79' '80-84' '85+']


In [8]:
values=df["sex"].unique()
print(np.sort(values))

['f' 'm']


In [9]:
df_new = df.groupby(['plannin_area', 'age_group'])['count'].agg('sum').reset_index()

In [10]:
df_new.head()

Unnamed: 0,plannin_area,age_group,count
0,ang mo kio,0-4,6790
1,ang mo kio,10-14,8300
2,ang mo kio,15-19,9340
3,ang mo kio,20-24,10310
4,ang mo kio,25-29,11180


In [11]:
df['0-4'] = df['count'].where(df['age_group'] == '0-4')
df['5-9'] = df['count'].where(df['age_group'] == '5-9')
df['10-14'] = df['count'].where(df['age_group'] == '10-14')
df['15-19'] = df['count'].where(df['age_group'] == '15-19')
df['20-24'] = df['count'].where(df['age_group'] == '20-24')
df['25-29'] = df['count'].where(df['age_group'] == '25-29')
df['30-34'] = df['count'].where(df['age_group'] == '30-34')
df['35-39'] = df['count'].where(df['age_group'] == '35-39')
df['40-44'] = df['count'].where(df['age_group'] == '40-44')
df['45-49'] = df['count'].where(df['age_group'] == '45-49')
df['50-54'] = df['count'].where(df['age_group'] == '50-54')
df['55-59'] = df['count'].where(df['age_group'] == '55-59')
df['60-64'] = df['count'].where(df['age_group'] == '60-64')
df['65-69'] = df['count'].where(df['age_group'] == '65-69')
df['70-74'] = df['count'].where(df['age_group'] == '70-74')
df['75-79'] = df['count'].where(df['age_group'] == '75-79')
df['80-84'] = df['count'].where(df['age_group'] == '80-84')
df['85+'] = df['count'].where(df['age_group'] == '85+')
df.iloc[300:320]

Unnamed: 0,plannin_area,subzone,age_group,sex,count,0-4,5-9,10-14,15-19,20-24,...,40-44,45-49,50-54,55-59,60-64,65-69,70-74,75-79,80-84,85+
300,hougang,hougang west,5-9,m,1010,,1010.0,,,,...,,,,,,,,,,
301,hougang,kangkar,5-9,m,580,,580.0,,,,...,,,,,,,,,,
302,hougang,kovan,5-9,m,610,,610.0,,,,...,,,,,,,,,,
303,hougang,lorong ah soo,5-9,m,690,,690.0,,,,...,,,,,,,,,,
304,hougang,tai seng,5-9,m,360,,360.0,,,,...,,,,,,,,,,
305,hougang,trafalgar,5-9,m,1220,,1220.0,,,,...,,,,,,,,,,
306,jurong east,lakeside,5-9,m,20,,20.0,,,,...,,,,,,,,,,
307,jurong east,teban gardens,5-9,m,460,,460.0,,,,...,,,,,,,,,,
308,jurong east,toh guan,5-9,m,390,,390.0,,,,...,,,,,,,,,,
309,jurong east,yuhua east,5-9,m,590,,590.0,,,,...,,,,,,,,,,


In [12]:
df=df.drop(columns=["subzone","age_group","sex","count"])
df.head()

Unnamed: 0,plannin_area,0-4,5-9,10-14,15-19,20-24,25-29,30-34,35-39,40-44,45-49,50-54,55-59,60-64,65-69,70-74,75-79,80-84,85+
0,ang mo kio,130.0,,,,,,,,,,,,,,,,,
1,ang mo kio,670.0,,,,,,,,,,,,,,,,,
2,ang mo kio,460.0,,,,,,,,,,,,,,,,,
3,ang mo kio,380.0,,,,,,,,,,,,,,,,,
4,ang mo kio,90.0,,,,,,,,,,,,,,,,,


In [19]:
aggregations = {
    '0-4':'sum',
    '5-9':'sum',
    '10-14':'sum',
    '15-19':'sum',
    '20-24':'sum',
    '25-29':'sum',
    '30-34':'sum',
    '35-39':'sum',
    '40-44':'sum',
    '45-49':'sum',
    '50-54':'sum',
    '55-59':'sum',
    '60-64':'sum',
    '65-69':'sum',
    '70-74':'sum',
    '75-79':'sum',
    '80-84':'sum',
    '85+':'sum',
}

In [20]:
df.groupby('plannin_area').agg(aggregations)

Unnamed: 0_level_0,0-4,5-9,10-14,15-19,20-24,25-29,30-34,35-39,40-44,45-49,50-54,55-59,60-64,65-69,70-74,75-79,80-84,85+
plannin_area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
ang mo kio,6790.0,7680.0,8300.0,9340.0,10310.0,11180.0,12240.0,13080.0,13720.0,13000.0,14020.0,13810.0,13000.0,11050.0,6680.0,5150.0,3240.0,2330.0
bedok,11690.0,13420.0,14770.0,16940.0,19480.0,19870.0,19290.0,20850.0,22540.0,21450.0,23420.0,23360.0,20580.0,16740.0,9330.0,7340.0,4770.0,4010.0
bishan,3430.0,4330.0,4720.0,5530.0,6870.0,6470.0,5720.0,5990.0,7070.0,6810.0,7540.0,7710.0,6360.0,4860.0,2730.0,2150.0,1370.0,1090.0
bukit batok,5510.0,6900.0,7960.0,9240.0,10090.0,10460.0,9770.0,9960.0,10990.0,11320.0,12190.0,11780.0,9380.0,6000.0,3050.0,2250.0,1460.0,1110.0
bukit merah,7220.0,7470.0,6650.0,6860.0,7730.0,9360.0,11120.0,12840.0,12730.0,11320.0,11340.0,11760.0,10940.0,10060.0,6480.0,5610.0,3530.0,3060.0
bukit panjang,7320.0,7620.0,8470.0,9420.0,10400.0,10990.0,10680.0,10040.0,11180.0,10560.0,11630.0,10680.0,7900.0,5200.0,2750.0,2010.0,1310.0,1040.0
bukit timah,3120.0,4660.0,4820.0,4820.0,4990.0,4300.0,3780.0,4950.0,6630.0,6340.0,5820.0,5330.0,4550.0,3950.0,2380.0,1900.0,1240.0,1010.0
changi,180.0,230.0,220.0,160.0,130.0,110.0,170.0,240.0,300.0,220.0,140.0,110.0,100.0,80.0,50.0,30.0,30.0,40.0
choa chu kang,7190.0,9350.0,11380.0,14820.0,15250.0,11940.0,11530.0,11970.0,13720.0,16000.0,16190.0,12910.0,8830.0,5660.0,2990.0,2220.0,1430.0,1050.0
clementi,3910.0,4410.0,4490.0,4680.0,4870.0,5830.0,6640.0,7310.0,7610.0,6950.0,6970.0,6560.0,6650.0,6200.0,3360.0,2600.0,1480.0,1210.0
