In [1]:
#######################
### Import Packages ###
#######################

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import geopandas as gpd

## A.1. Import Data

### A.1.1. Import Geoboundaries Data
The geographical boundaries data is a product of the non-profit academic project [geoBoundaries](https://www.geoboundaries.org). Specifically, the data can be obtained [here](https://www.geoboundaries.org/countryDownloads.html). Do check out and support the author/project owner at his [github page](https://github.com/DanRunfola).

In [2]:
# Read the ADM2 boundary shapefile and expose its name column as 'district',
# the key used by every other table in this notebook.
district_boundaries = (
    gpd.read_file("geo_MYS/geoBoundaries-MYS-ADM2.shp")
    .rename(columns={'shapeName': 'district'})
)
district_boundaries.head(10)

Unnamed: 0,district,shapeISO,shapeID,shapeGroup,shapeType,geometry
0,Tumpat,,92858781B54418175160244,MYS,ADM2,"POLYGON ((102.23256 6.20976, 102.23256 6.19821..."
1,Kota Bharu,,92858781B26165935379781,MYS,ADM2,"POLYGON ((102.22043 6.08860, 102.22093 6.09363..."
2,Pasir Puteh,,92858781B22030714456206,MYS,ADM2,"POLYGON ((102.26366 5.89071, 102.26329 5.89626..."
3,Pasir Mas,,92858781B75737561234517,MYS,ADM2,"POLYGON ((102.09278 6.15509, 102.09483 6.15372..."
4,Machang,,92858781B41710084977616,MYS,ADM2,"POLYGON ((102.19669 5.91545, 102.19746 5.91327..."
5,Tanah Merah,,92858781B67391800412495,MYS,ADM2,"POLYGON ((101.93337 5.90660, 101.93600 5.90551..."
6,Jeli,,92858781B21811088790673,MYS,ADM2,"POLYGON ((101.88168 5.81062, 101.88319 5.80561..."
7,Kuala Krai,,92858781B31305353938168,MYS,ADM2,"POLYGON ((102.11378 5.64341, 102.11401 5.64274..."
8,Hulu Perak,,92858781B35407838405861,MYS,ADM2,"POLYGON ((101.68943 5.75677, 101.68831 5.75196..."
9,Selama,,92858781B87749635219289,MYS,ADM2,"POLYGON ((100.86695 5.42816, 100.87682 5.41716..."


### A.1.2. Import Socio-Demographical Data
All the data can be obtained via [openDOSM](https://open.dosm.gov.my), a website created by the Department of Statistics, Malaysia in an effort to make public sector data more accessible. The website consolidates data from various government departments and agencies, but unfortunately not all of them. For the purpose of this analysis, only data with a district-level breakdown is used. The data can be accessed via this [link](https://open.dosm.gov.my/data-catalogue?geography=DISTRICT).

In [3]:
########################
### Helper Functions ###
########################

def import_openDOSM(url, date_col=None):
    """
    Read a parquet table published by OpenDOSM into a pandas DataFrame.
    Arguments:
        url (str): Location of the parquet file, passed straight to pandas.read_parquet.
        date_col (str, optional): Column to convert to datetime. When None (the default),
            the table is returned exactly as read.
    Returns:
        df (DataFrame): The loaded table, with date_col converted to datetime if requested.
    """
    frame = pd.read_parquet(url)
    # Nothing to convert: hand the table back untouched.
    if date_col is None:
        return frame
    frame[date_col] = pd.to_datetime(frame[date_col])
    return frame


# For the analysis, an interesting comparison would be the period before and after the COVID19 pandemic.
# However, not all datasets have data prior to 2020.
def selectYears(df, date_col, yearCompare=2022):
    """
    Extracts two periods for pre- and post-pandemic comparison. For pre-pandemic, extracts 2019 data if
    available, otherwise extracts 2020. Comparison year is 2022 by default.
    Arguments:
        df (DataFrame): DataFrame for analysis.
        date_col (str): Name of the date column (must already be of datetime dtype).
        yearCompare (int, optional): Year for comparison. (Default: 2022)
    Returns:
        df (DataFrame): Rows of the input whose year is either the baseline year or yearCompare.
    """
    years = df[date_col].dt.year
    # Pick the baseline by checking that 2019 rows actually exist. Testing only the earliest
    # year (min() <= 2019) would select 2019 for a table that starts earlier but has no 2019
    # observations, silently leaving the result without any baseline rows.
    baseline_year = 2019 if (years == 2019).any() else 2020
    return df[years.isin((baseline_year, yearCompare))]


# Combines both helper functions above into one.
def GetDataForAnalysis(url, date_col='date', yearCompare=2022):
    """
    Download an OpenDOSM parquet table and keep only the two years used for the
    pre- vs post-pandemic comparison.

    The table is loaded with import_openDOSM (which converts date_col to datetime) and
    then filtered with selectYears.
    Arguments:
        url (str): Parquet URL from OpenDOSM.
        date_col (str, optional): Name of the column to be parsed as datetime. (Default: 'date')
        yearCompare (int, optional): Year for comparison. (Default: 2022)
    Returns:
        df (DataFrame): Independent copy of the rows belonging to the two selected years.
    """
    loaded = import_openDOSM(url, date_col)
    two_periods = selectYears(loaded, date_col, yearCompare)
    # Return a copy so later column assignments do not act on a slice of the loaded table.
    return two_periods.copy()

In [4]:
# Import relevant data
# Each call downloads one district-level parquet table, parses its 'date' column and keeps
# only the two comparison years selected by GetDataForAnalysis (comparison year defaults to 2022).
df_crime = GetDataForAnalysis('https://storage.data.gov.my/publicsafety/crime_district.parquet')
df_pop = GetDataForAnalysis('https://storage.dosm.gov.my/population/population_district.parquet')
df_births = GetDataForAnalysis('https://storage.dosm.gov.my/demography/birth_district_sex.parquet')
df_deaths = GetDataForAnalysis('https://storage.dosm.gov.my/demography/death_district_sex.parquet')
df_lfs = GetDataForAnalysis('https://storage.dosm.gov.my/labour/lfs_district.parquet')
df_hinc = GetDataForAnalysis('https://storage.dosm.gov.my/hies/hh_income_district.parquet')
df_gini = GetDataForAnalysis('https://storage.dosm.gov.my/hies/hh_inequality_district.parquet')
df_poverty = GetDataForAnalysis('https://storage.dosm.gov.my/hies/hh_poverty_district.parquet')
df_education = GetDataForAnalysis('https://storage.data.gov.my/education/enrolment_school_district.parquet')
df_teachers = GetDataForAnalysis('https://storage.data.gov.my/education/teachers_district.parquet')
df_schools = GetDataForAnalysis('https://storage.data.gov.my/education/schools_district.parquet')

# Edit the schools data due to misleading rows.
# The rows are matched on the exact raw district spelling and removed by their index labels.
# NOTE(review): these drops run before the MOE district renaming applied further down, which maps
# 'Larut dan Matang' to 'Larut Dan Matang'. If the raw schools table uses the lowercase 'dan'
# spelling, the second drop matches nothing and the rows survive -- confirm against the raw data.
df_schools = df_schools.drop(df_schools[df_schools['district']=='Selangau'].index)          # No data is actually available.
df_schools = df_schools.drop(df_schools[df_schools['district']=='Larut Dan Matang'].index)  # No data is actually available.

## A.2. Clean District Columns
Given that district would be the main level of analysis, consistent naming across datasets is important. The official list as of 2020 can be obtained on DOSM's website [here](https://www.mycensus.gov.my/index.php/census-product/publication/census-2020/list-of-administrative-district-mukim-and-local-authority-area). Nonetheless, the exercise is not as straightforward as it sounds. The crime dataset, which is provided by the Royal Malaysia Police, does not have consistent district naming - potentially the data is reported by the main police stations, which may not have followed the various redrawings of districts. Mapping is done on a best-effort basis, mainly relying on Wikipedia.

<br><br>
### A.2.1. Create mappings to consistent district names

In [5]:
# Clean district names for geoboundaries dataset.
# The nested {'district': {old: new}} form restricts the replacement to the 'district' column.
rename_district_map = {'district': {'Kuala Lumpur': 'W.P. Kuala Lumpur',
                                    'Labuan': 'W.P. Labuan',
                                    'Kulaijaya': 'Kulai',
                                    'Larut dan Matang': 'Larut Dan Matang',
                                    'Nabawan / Persiangan': 'Nabawan',
                                    'Ledang': 'Tangkak'}}
district_boundaries = district_boundaries.replace(rename_district_map)


# Clean district names for crimes dataset.
# Many reported areas are folded into one administrative district (e.g. several Kuala Lumpur and
# Johor Bahru areas), so the same district name appears on multiple rows after this step; the
# crime cleaning section sums those rows back together.
# NOTE(review): 'Meradong' is mapped to 'Sarikei' and 'Matu Daro' to 'Matu', while the shapefile
# lists 'Maradong' and 'Daro' as districts of their own (both show up as lacking crime data in
# the completeness check) -- confirm these two mappings are intended.
rename_district_crime = {'district': {'Arau': 'Perlis', 'Kangar': 'Perlis', 'Padang Besar': 'Perlis',
                                      'Brickfields': 'W.P. Kuala Lumpur', 'Cheras': 'W.P. Kuala Lumpur', 'Dang Wangi': 'W.P. Kuala Lumpur', 'Sentul': 'W.P. Kuala Lumpur', 'Wangsa Maju': 'W.P. Kuala Lumpur',
                                      'Ampang Jaya': 'Gombak', 'Hulu Selangor': 'Ulu Selangor', 'Kajang': 'Ulu Langat', 'Klang Selatan': 'Klang', 'Klang Utara': 'Klang',
                                      'Petaling Jaya': 'Petaling', 'Serdang': 'Petaling', 'Sg. Buloh': 'Petaling', 'Shah Alam': 'Petaling', 'Subang Jaya': 'Petaling','Sungai Buloh': 'Petaling',
                                      'Batu Gajah': 'Kinta', 'Gerik': 'Hulu Perak', 'Ipoh': 'Kinta', 'Pengkalan Hulu': 'Hulu Perak', 'Sungai Siput': 'Kuala Kangsar', 'Taiping': 'Larut Dan Matang',
                                      'Tanjong Malim': 'Muallim', 'Tapah': 'Batang Padang', 'Iskandar Puteri': 'Johor Bahru', 'Johor Bahru Selatan': 'Johor Bahru', 'Johor Bahru Utara': 'Johor Bahru',
                                      'Kulaijaya': 'Kulai', 'Ledang': 'Tangkak', 'Nusajaya': 'Johor Bahru', 'Seri Alam': 'Johor Bahru', 'Cameron Highland': 'Cameron Highlands', 'Kuala Lipis': 'Lipis',
                                      'Bandar Bharu': 'Bandar Baharu', 'Nilai': 'Seremban', 'Kota Kinabatangan': 'Kinabatangan', 'Kota Samarahan': 'Samarahan', 'Matu Daro': 'Matu', 'Meradong': 'Sarikei',
                                      'Padawan': 'Kuching'}}
df_crime = df_crime.replace(rename_district_crime)
# Drop the rows labelled 'All' in the district column so only individual districts remain.
df_crime = df_crime[df_crime['district']!='All'].copy()


# Clean district names for the DOSM datasets.
# Applied to the population, income, gini and poverty tables only.
rename_district_DOSM = {'district': {'Sp Selatan': 'Seberang Perai Selatan',
                                     'Sp Tengah': 'Seberang Perai Tengah',
                                     'Sp Utara': 'Seberang Perai Utara',
                                     'Cameron Highland': 'Cameron Highlands',
                                     'Larut dan Matang': 'Larut Dan Matang'}}
df_pop = df_pop.replace(rename_district_DOSM)
df_hinc = df_hinc.replace(rename_district_DOSM)
df_gini = df_gini.replace(rename_district_DOSM)
df_poverty = df_poverty.replace(rename_district_DOSM)

# Clean district names for the MOE datasets.
# Each table is renamed first, then its 'All Districts' rows are removed.
rename_district_MOE = {'district': {'Manjung (Dinding)': 'Manjung', 'Kuala Lipis': 'Lipis', 'Hulu Langat': 'Ulu Langat', 'Hulu Selangor': 'Ulu Selangor', 'Larut dan Matang': 'Larut Dan Matang'}}
df_education = df_education.replace(rename_district_MOE)
df_education = df_education[df_education['district']!='All Districts'].copy()
df_teachers = df_teachers.replace(rename_district_MOE)
df_teachers = df_teachers[df_teachers['district']!='All Districts'].copy()
df_schools = df_schools.replace(rename_district_MOE)
df_schools = df_schools[df_schools['district']!='All Districts'].copy()

### A.2.2. Check Completeness of Mapping

In [6]:
# Register every cleaned dataset under its variable name so both checks can loop over them.
dataset_dict = dict(
    df_crime=df_crime, df_pop=df_pop, df_births=df_births, df_deaths=df_deaths,
    df_lfs=df_lfs, df_hinc=df_hinc, df_gini=df_gini, df_poverty=df_poverty,
    df_education=df_education, df_teachers=df_teachers, df_schools=df_schools,
)

# District names present in the shapefile: the reference set for both directions of the check.
district_name_geo = set(district_boundaries['district'].unique())

# Direction 1: names used by a dataset that have no polygon in the shapefile.
print("Disctricts that do not appear in the shapefile:")
for k, v in dataset_dict.items():
    district_names = set(v['district'].unique())
    print(f"{k} : {district_names.difference(district_name_geo)}")

# Direction 2: shapefile districts for which a dataset has no rows.
print("\nDistricts in the shapefile not within the datasets:")
for k, v in dataset_dict.items():
    district_names = set(v['district'].unique())
    print(f"{k} : {district_name_geo.difference(district_names)}")


Disctricts that do not appear in the shapefile:
df_crime : {'W.P. Putrajaya'}
df_pop : {'W.P. Putrajaya'}
df_births : {'W.P. Putrajaya'}
df_deaths : {'W.P. Putrajaya'}
df_lfs : set()
df_hinc : {'W.P. Putrajaya'}
df_gini : {'W.P. Putrajaya'}
df_poverty : {'W.P. Putrajaya'}
df_education : {'W.P. Putrajaya', 'Larut Matang & Selama'}
df_teachers : {'W.P. Putrajaya', 'Larut Matang & Selama'}
df_schools : {'W.P. Putrajaya', 'Larut Matang & Selama'}

Districts in the shapefile not within the datasets:
df_crime : {'Sebauh', 'Asajaya', 'Bukit Mabong', 'Telupid', 'Bagan Datuk', 'Tanjung Manis', 'Tambunan', 'Nabawan', 'Pusa', 'Tongod', 'Subis', 'Telang Usan', 'Kuala Nerus', 'Maradong', 'Pokok Sena', 'Kalabakan', 'Pitas', 'Putatan', 'Selangau', 'Kuala Penyu', 'Kecil Lojing', 'Beluru', 'Kabong', 'Daro', 'Pakan', 'Tebedu'}
df_pop : set()
df_births : set()
df_deaths : set()
df_lfs : {'Sebauh', 'Bukit Mabong', 'Telupid', 'Bagan Datuk', 'Tanjung Manis', 'Muallim', 'Perlis', 'Pusa', 'Subis', 'Telang Usa

Observations:
- Unfortunately, the shapefile does not contain the polygon for Putrajaya. Given that Putrajaya is a small administrative state, it is dropped and will not be covered in this analysis.
- There are also a number of districts without crime, labour force and education data.
- For the MOE datasets, there is no breakdown between Larut + Matang and Selama. Would need to address that somehow.

## A.3. Cleaning the Datasets
Given that most of the datasets are currently in long format, each table needs to be converted to wide format for further analysis. In this case, wide format is also more intuitive.
<br><br>
### A.3.1. Crime Dataset

In [7]:
# Crime: Aggregate
# The earlier district renaming leaves several rows per district, so counts are summed per
# (date, state, district, category) before pivoting the categories into columns.
df_crime_agg = (
    df_crime[df_crime['type'] == 'all']
    .groupby(['date', 'state', 'district', 'category'])['crimes']
    .sum()
    .reset_index()
    .pivot(index=['date', 'state', 'district'], columns='category', values='crimes')
)
df_crime_agg['total'] = df_crime_agg['assault'] + df_crime_agg['property']      # Overall crime count.
df_crime_agg.columns = ["crime_" + label for label in df_crime_agg.columns]     # Prefix for easier identification.

# Crime: By type
# Same aggregation, but keeping the individual crime types; the (category, type) column pairs
# are flattened into single 'crime_<category>_<type>' names.
df_crime_type = (
    df_crime[df_crime['type'] != 'all']
    .groupby(['date', 'state', 'district', 'category', 'type'])['crimes']
    .sum()
    .reset_index()
    .pivot(index=['date', 'state', 'district'], columns=['category', 'type'], values='crimes')
)
df_crime_type.columns = ["crime_" + "_".join(levels) for levels in df_crime_type.columns]

# Combine the crime tables.
df_crime_mod = (
    df_crime_agg
    .merge(df_crime_type, on=['date', 'state', 'district'])
    .reset_index()
)
df_crime_mod

Unnamed: 0,date,state,district,crime_assault,crime_property,crime_total,crime_assault_causing_injury,crime_assault_murder,crime_assault_rape,crime_assault_robbery_gang_armed,crime_assault_robbery_gang_unarmed,crime_assault_robbery_solo_armed,crime_assault_robbery_solo_unarmed,crime_property_break_in,crime_property_theft_other,crime_property_theft_vehicle_lorry,crime_property_theft_vehicle_motorcar,crime_property_theft_vehicle_motorcycle
0,2019-01-01,Johor,Batu Pahat,139,524,663,41,3,29,0,37,0,29,157,127,4,28,208
1,2019-01-01,Johor,Johor Bahru,1208,4972,6180,362,20,101,1,543,1,180,716,1078,158,778,2242
2,2019-01-01,Johor,Kluang,127,458,585,60,4,17,0,35,0,11,56,114,12,41,235
3,2019-01-01,Johor,Kota Tinggi,70,401,471,16,6,11,0,27,0,10,79,94,10,32,186
4,2019-01-01,Johor,Kulai,143,543,686,43,1,9,0,53,0,37,112,188,19,44,180
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263,2022-01-01,Terengganu,Kuala Terengganu,71,402,473,35,1,21,0,9,0,5,139,169,4,9,81
264,2022-01-01,Terengganu,Marang,18,90,108,10,0,5,0,0,0,3,35,40,0,1,14
265,2022-01-01,Terengganu,Setiu,11,40,51,3,0,4,0,0,0,4,9,10,1,5,15
266,2022-01-01,W.P. Kuala Lumpur,W.P. Kuala Lumpur,1034,3796,4830,304,16,73,2,319,1,319,525,1439,83,534,1215


### A.3.2. Population Dataset
While the dataset is quite granular, for the purpose of this analysis I made the simplifying assumption to look at each variable separately.

In [8]:
# Data: Total Overall Population
# Keep only the rows that aggregate over every breakdown, indexed by date/state/district.
df_pop_tot = (
    df_pop
    .loc[(df_pop['sex'] == 'both') & (df_pop['age'] == 'overall') & (df_pop['ethnicity'] == 'overall')]
    .drop(columns=['age', 'sex', 'ethnicity'])
    .set_index(['date', 'state', 'district'])
)

# Data: Population by Gender
# One column per sex; the 'both' column is dropped because the total is already in df_pop_tot.
df_pop_sex = (
    df_pop
    .loc[(df_pop['age'] == 'overall') & (df_pop['ethnicity'] == 'overall')]
    .drop(columns=['age', 'ethnicity'])
    .pivot(index=['date', 'state', 'district'], columns='sex', values='population')
    .drop(columns='both')
)
df_pop_sex.columns = ["sex_" + label for label in df_pop_sex.columns]

# Data: Population by Age Group
# Zero-pad the two youngest bands (directly on df_pop) so the pivoted columns sort in age order.
df_pop.loc[df_pop['age'] == '0-4', 'age'] = '00-04'
df_pop.loc[df_pop['age'] == '5-9', 'age'] = '05-09'
df_pop_age = (
    df_pop
    .loc[(df_pop['sex'] == 'both') & (df_pop['ethnicity'] == 'overall')]
    .drop(columns=['sex', 'ethnicity'])
    .pivot(index=['date', 'state', 'district'], columns='age', values='population')
    .drop(columns='overall')
)
# Prefix the band and swap the dash for an underscore, e.g. '00-04' -> 'age_00_04'.
df_pop_age.columns = [("age_" + label).replace("-", "_") for label in df_pop_age.columns]

# Data: Population by Ethnicity
df_pop_ethnic = (
    df_pop
    .loc[(df_pop['age'] == 'overall') & (df_pop['sex'] == 'both')]
    .drop(columns=['age', 'sex'])
    .pivot(index=['date', 'state', 'district'], columns='ethnicity', values='population')
    .drop(columns='overall')
)
df_pop_ethnic.columns = ["ethnic_" + label for label in df_pop_ethnic.columns]

# Combine the various population tables.
df_pop_mod = (
    df_pop_tot
    .merge(df_pop_sex, on=['date', 'state', 'district'])
    .merge(df_pop_age, on=['date', 'state', 'district'])
    .merge(df_pop_ethnic, on=['date', 'state', 'district'])
    .reset_index()
)
df_pop_mod

Unnamed: 0,date,state,district,population,sex_female,sex_male,age_00_04,age_05_09,age_10_14,age_15_19,age_20_24,age_25_29,age_30_34,age_35_39,age_40_44,age_45_49,age_50_54,age_55_59,age_60_64,age_65_69,age_70_74,age_75_79,age_80_84,age_85+,ethnic_bumi_malay,ethnic_bumi_other,ethnic_chinese,ethnic_indian,ethnic_other_citizen,ethnic_other_noncitizen
0,2020-01-01,Johor,Batu Pahat,495.3,233.7,261.6,30.3,35.6,40.0,45.0,49.0,45.3,43.1,40.4,32.7,29.7,27.1,24.3,20.3,12.7,8.8,5.1,3.3,2.6,311.3,5.1,140.1,6.9,1.8,30.2
1,2020-01-01,Johor,Johor Bahru,1711.2,803.9,907.3,158.7,149.9,141.8,142.0,158.9,147.7,137.9,131.1,109.1,98.5,89.2,79.3,66.2,39.2,27.4,15.9,10.2,8.3,794.4,32.4,606.3,145.0,10.0,123.2
2,2020-01-01,Johor,Kluang,323.8,144.4,179.3,19.7,26.9,28.6,27.7,32.5,30.7,28.0,26.1,21.2,19.0,17.0,15.1,12.5,7.3,5.0,2.9,1.9,1.7,166.6,4.8,92.3,24.8,1.4,33.8
3,2020-01-01,Johor,Kota Tinggi,222.4,104.0,118.4,17.0,17.4,18.2,20.4,21.9,20.2,19.4,18.2,14.2,12.6,11.5,10.3,8.4,5.0,3.4,1.9,1.2,1.1,180.0,3.6,18.6,5.5,0.8,13.8
4,2020-01-01,Johor,Kulai,329.5,151.8,177.7,21.3,22.8,24.6,27.9,34.8,32.8,29.4,27.1,22.1,19.6,17.5,15.5,12.8,8.2,5.6,3.2,2.2,1.9,151.4,3.9,95.3,30.2,1.5,47.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
315,2022-01-01,Terengganu,Marang,121.5,59.6,61.9,12.4,10.6,12.0,12.5,10.7,9.1,8.7,8.7,7.5,6.1,6.1,5.5,4.5,3.3,1.7,1.2,0.6,0.3,117.9,0.2,1.5,0.0,0.2,1.7
316,2022-01-01,Terengganu,Setiu,61.5,30.2,31.3,6.4,5.1,6.3,6.4,6.1,5.0,4.2,3.9,3.7,3.0,2.8,2.6,2.1,1.6,1.1,0.6,0.3,0.2,59.8,0.1,0.1,0.0,0.1,1.4
317,2022-01-01,W.P. Kuala Lumpur,W.P. Kuala Lumpur,1961.2,920.3,1040.8,115.4,126.2,129.9,150.9,111.0,152.9,215.5,234.8,208.3,150.7,98.2,72.3,62.4,47.3,40.3,25.0,12.5,7.5,837.2,19.9,730.7,177.0,12.7,183.6
318,2022-01-01,W.P. Labuan,W.P. Labuan,96.9,47.0,49.8,9.0,9.5,7.9,7.6,8.6,8.5,8.3,10.0,7.0,4.9,4.1,3.9,3.0,2.0,1.2,0.6,0.3,0.5,34.9,39.9,9.7,0.9,1.1,10.3


### A.3.3. Household Income-Related Dataset

In [9]:
# Merge the household income, inequality (gini) and poverty tables on their shared keys.
df_income = df_hinc.merge(df_gini, on=['date', 'state', 'district'])
df_income = df_income.merge(df_poverty, on=['date', 'state', 'district'])
df_income = df_income.rename(columns={'gini': 'income_gini'})
# DataFrame.info() prints its summary itself and returns None, so it is called directly;
# wrapping it in print() only appended a stray "None" line to the cell output.
df_income.info()

# Identify why row count is less than what it should be.
district_count = df_income.groupby('district')['date'].nunique()        # Number of distinct periods available per district.
district_problem = district_count[district_count!=2].index.values       # Each district should have two periods; keep the names of those that do not.
district_problem_df = df_income[df_income['district'].isin(district_problem)].reset_index(drop=True)
district_problem_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 318 entries, 0 to 317
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   state             318 non-null    object        
 1   district          318 non-null    object        
 2   date              318 non-null    datetime64[ns]
 3   income_mean       318 non-null    int64         
 4   income_median     318 non-null    int64         
 5   income_gini       318 non-null    float64       
 6   poverty_absolute  318 non-null    float64       
 7   poverty_relative  318 non-null    float64       
dtypes: datetime64[ns](1), float64(3), int64(2), object(2)
memory usage: 20.0+ KB
None


Unnamed: 0,state,district,date,income_mean,income_median,income_gini,poverty_absolute,poverty_relative
0,Perak,Selama,2022-01-01,4439,3673,0.299,5.3,18.9
1,Sabah,Kalabakan,2022-01-01,4938,3931,0.368,33.3,21.8


For these two districts (Selama and Kalabakan), data is only available for 2022. Impute the 2022 value for 2019. 

In [10]:
# Append a copy of the single-period districts' rows to the income table.
# NOTE: district_problem_df must be the frame produced by the income check; the same
# name is reassigned later for the birth/death check, so run the cells in order.
df_income_mod = pd.concat([df_income, district_problem_df], ignore_index=True)
# Re-date only the rows that were just appended. With ignore_index=True they occupy the
# positions after the original len(df_income) rows, so they are located from that length
# instead of hard-coded labels (318, 319), which break or overwrite real observations as
# soon as the upstream row count changes.
df_income_mod.loc[df_income_mod.index[len(df_income):], 'date'] = pd.Timestamp(2019, 1, 1)
df_income_mod

Unnamed: 0,state,district,date,income_mean,income_median,income_gini,poverty_absolute,poverty_relative
0,Johor,Batu Pahat,2019-01-01,7392,6504,0.295,2.9,9.0
1,Johor,Batu Pahat,2022-01-01,7419,6347,0.338,5.1,19.4
2,Johor,Johor Bahru,2019-01-01,9315,7342,0.388,3.3,12.8
3,Johor,Johor Bahru,2022-01-01,9869,8232,0.359,3.7,10.4
4,Johor,Kluang,2019-01-01,5953,4933,0.333,5.0,24.9
...,...,...,...,...,...,...,...,...
315,W.P. Labuan,W.P. Labuan,2022-01-01,8250,6904,0.300,2.5,7.0
316,W.P. Putrajaya,W.P. Putrajaya,2019-01-01,12840,9983,0.361,0.4,12.1
317,W.P. Putrajaya,W.P. Putrajaya,2022-01-01,13473,10056,0.368,0.1,11.4
318,Perak,Selama,2019-01-01,4439,3673,0.299,5.3,18.9


### A.3.4. Labour Force Survey Tables

In [11]:
# Distinct districts per state: labour force table vs. the full population table.
# The outer merge keeps states absent from either side; their missing count becomes 0.
check_district = (
    df_lfs.groupby('state')['district'].nunique().rename('district_lfs').to_frame()
    .merge(
        df_pop_mod.groupby('state')['district'].nunique().rename('district_full').to_frame(),
        on='state',
        how='outer',
    )
    .fillna(0)
)
# Positive values are districts that the labour force table does not cover.
check_district['diff'] = check_district['district_full'] - check_district['district_lfs']
check_district.sort_values('diff', ascending=False)

Unnamed: 0_level_0,district_lfs,district_full,diff
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Sarawak,31.0,40,9.0
Perak,10.0,13,3.0
Sabah,25.0,27,2.0
Kelantan,10.0,11,1.0
Perlis,0.0,1,1.0
Terengganu,7.0,8,1.0
W.P. Kuala Lumpur,0.0,1,1.0
W.P. Labuan,0.0,1,1.0
W.P. Putrajaya,0.0,1,1.0
Johor,10.0,10,0.0


As highlighted earlier, the LFS data seem to lack a lot of district-level data, mostly in Sarawak. From the metadata on the website, it has been highlighted that data for around 20 districts are not available due to _"constraints on data availability at district level for more remote areas"_.

Additionally, the district dataset also omits states where there are no districts (i.e., W.P. Kuala Lumpur, W.P. Labuan, W.P. Putrajaya and Perlis). The data for these would need to be taken from the state-level dataset.

In [12]:
# Import state-level data.
df_lfs_state = GetDataForAnalysis('https://storage.dosm.gov.my/labour/lfs_state_sex.parquet')

# States that have no rows in the district-level table; their state-level figures stand in.
missing_states = ['W.P. Kuala Lumpur', 'W.P. Labuan', 'W.P. Putrajaya', 'Perlis']
df_lfs_state_missing = (
    df_lfs_state[df_lfs_state['state'].isin(missing_states)]
    .loc[lambda rows: rows['sex'] == 'both']            # Keep the combined figure, not the per-sex rows.
    .drop(columns='sex')
    .assign(district=lambda rows: rows['state'])        # The state name doubles as the district name.
)

# Concatenate with district-level data.
df_lfs_mod = pd.concat([df_lfs, df_lfs_state_missing], ignore_index=True)
df_lfs_mod

Unnamed: 0,state,district,date,lf,lf_employed,lf_unemployed,lf_outside,p_rate,u_rate,ep_ratio
0,Johor,Batu Pahat,2019-01-01,213.6,209.8,3.7,90.4,70.3,1.7,69.035867
1,Johor,Batu Pahat,2022-01-01,220.8,216.8,4.0,94.6,70.0,1.8,68.738110
2,Johor,Johor Bahru,2019-01-01,792.4,767.8,24.6,293.7,73.0,3.1,70.693306
3,Johor,Johor Bahru,2022-01-01,829.5,799.0,30.5,292.6,73.9,3.7,71.205775
4,Johor,Kluang,2019-01-01,166.1,160.5,5.6,66.9,71.3,3.4,68.884120
...,...,...,...,...,...,...,...,...,...,...
283,W.P. Kuala Lumpur,W.P. Kuala Lumpur,2022-01-01,884.9,857.5,27.4,337.7,72.4,3.1,58.900000
284,W.P. Labuan,W.P. Labuan,2019-01-01,43.3,40.3,3.0,26.4,62.2,6.9,58.700000
285,W.P. Labuan,W.P. Labuan,2022-01-01,48.5,45.0,3.5,23.8,67.0,7.2,68.300000
286,W.P. Putrajaya,W.P. Putrajaya,2019-01-01,38.6,38.1,0.4,12.2,75.9,1.2,58.600000


In [13]:
# Check district difference again, this time using the table that includes the state-level rows.
check_district = (
    df_lfs_mod.groupby('state')['district'].nunique().rename('district_lfs').to_frame()
    .merge(
        df_pop_mod.groupby('state')['district'].nunique().rename('district_full').to_frame(),
        on='state',
        how='outer',
    )
    .fillna(0)
)
# Positive values are districts still missing from the labour force table.
check_district['diff'] = check_district['district_full'] - check_district['district_lfs']
check_district.sort_values('diff', ascending=False)

Unnamed: 0_level_0,district_lfs,district_full,diff
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Sarawak,31,40,9
Perak,10,13,3
Sabah,25,27,2
Kelantan,10,11,1
Terengganu,7,8,1
Johor,10,10,0
Kedah,12,12,0
Melaka,3,3,0
Negeri Sembilan,7,7,0
Pahang,11,11,0


### A.3.5. Birth and Death Rates

In [14]:
# Combine birth and death rates datasets, then keep only the combined-sex rows.
# Columns shared by both tables get the '_births' / '_deaths' suffixes. The filter is applied
# after the merge, so the surviving rows keep their row labels from the merged table.
df_life = (
    df_births
    .merge(df_deaths, on=['date', 'state', 'district', 'sex'], suffixes=('_births', '_deaths'))
    .loc[lambda merged: merged['sex'] == 'both']
    .drop(columns=['sex'])
)
df_life

Unnamed: 0,date,state,district,abs_births,rate_births,abs_deaths,rate_deaths
0,2020-01-01,Johor,Batu Pahat,6653,13.432263,3021,6.099334
3,2020-01-01,Johor,Johor Bahru,24675,14.419705,6609,3.862202
6,2020-01-01,Johor,Kluang,4274,13.199506,2060,6.361952
9,2020-01-01,Johor,Kota Tinggi,4182,18.803957,1503,6.758094
12,2020-01-01,Johor,Kulai,3742,11.356601,1243,3.772382
...,...,...,...,...,...,...,...
906,2022-01-01,Terengganu,Marang,2776,22.847737,877,7.218107
909,2022-01-01,Terengganu,Setiu,1449,23.560976,521,8.471545
912,2022-01-01,W.P. Kuala Lumpur,W.P. Kuala Lumpur,19853,10.122884,9962,5.079543
915,2022-01-01,W.P. Labuan,W.P. Labuan,1374,14.179567,392,4.045408


In [15]:
# Number of distinct periods available for each district in the birth/death table.
district_count = df_life.groupby('district')['date'].nunique()
# Every district is expected to appear in two periods; collect the names of those that do not.
district_problem = district_count.loc[district_count.ne(2)].index.values
# Show the rows belonging to those districts.
district_problem_df = (
    df_life
    .loc[df_life['district'].isin(district_problem)]
    .reset_index(drop=True)
)
district_problem_df

Unnamed: 0,date,state,district,abs_births,rate_births,abs_deaths,rate_deaths
0,2022-01-01,Kelantan,Kecil Lojing,226,20.925926,76,7.037037
1,2022-01-01,Perak,Selama,603,17.478261,422,12.231884
2,2022-01-01,Sabah,Kalabakan,271,5.815451,94,2.017167
3,2022-01-01,Sabah,Telupid,337,11.6609,125,4.32526
4,2022-01-01,Sarawak,Beluru,194,6.928571,201,7.178571
5,2022-01-01,Sarawak,Bukit Mabong,104,10.0,135,12.980769
6,2022-01-01,Sarawak,Kabong,309,16.612903,196,10.537634
7,2022-01-01,Sarawak,Pusa,333,16.903553,157,7.969543
8,2022-01-01,Sarawak,Sebauh,269,9.243986,195,6.701031
9,2022-01-01,Sarawak,Subis,647,11.431095,448,7.915194


From the metadata on the website: _"Furthermore, it should be noted that the number of administrative districts increased from 147 to 160 in 2021, due to the creation of new districts in Kelantan (1), Perak (1), Sabah (2), and Sarawak (9)."_

### A.3.6. Education-related Variables

In [16]:
# Data: Education Level.
# Combined-sex enrolment, one column per stage; stages absent for a district count as 0.
df_education_mod = (
    df_education[df_education['sex'] == 'both']
    .drop(columns=['sex'])
    .pivot(index=['date', 'state', 'district'], columns='stage', values='students')
    .fillna(0)
)
df_education_mod.columns = ["edu_" + stage for stage in df_education_mod.columns]
df_education_mod['edu_total'] = (
    df_education_mod['edu_primary']
    + df_education_mod['edu_secondary']
    + df_education_mod['edu_post_secondary']
)

# Data: Teachers statistic.
# Same shape as the enrolment table; the total covers primary and secondary teachers.
df_teachers_mod = (
    df_teachers[df_teachers['sex'] == 'both']
    .drop(columns=['sex'])
    .pivot(index=['date', 'state', 'district'], columns='stage', values='teachers')
    .fillna(0)
)
df_teachers_mod.columns = ["teach_" + stage for stage in df_teachers_mod.columns]
df_teachers_mod['teach_total'] = df_teachers_mod['teach_primary'] + df_teachers_mod['teach_secondary']

# Data: Schools statistic.
# School counts are first summed per stage within each district, then pivoted to one column per stage.
df_schools_mod = (
    df_schools
    .groupby(['date', 'state', 'district', 'stage'])['schools']
    .sum()
    .reset_index()
    .pivot(index=['date', 'state', 'district'], columns='stage', values='schools')
    .fillna(0)
)
df_schools_mod.columns = ["school_" + stage for stage in df_schools_mod.columns]
df_schools_mod['school_total'] = (
    df_schools_mod['school_primary']
    + df_schools_mod['school_secondary']
    + df_schools_mod['school_tertiary']
)

# Combine the various education-related tables.
df_edu_mod = (
    df_education_mod
    .merge(df_teachers_mod, on=['date', 'state', 'district'])
    .merge(df_schools_mod, on=['date', 'state', 'district'])
    .reset_index()
)
df_edu_mod

Unnamed: 0,date,state,district,edu_post_secondary,edu_primary,edu_secondary,edu_total,teach_primary,teach_secondary,teach_total,school_primary,school_secondary,school_tertiary,school_total
0,2019-01-01,Johor,Batu Pahat,1911.0,36797.0,28659.0,67367.0,3643.0,2654.0,6297.0,144.0,32.0,1.0,177.0
1,2019-01-01,Johor,Johor Bahru,3918.0,149749.0,97020.0,250687.0,9778.0,7688.0,17466.0,180.0,87.0,1.0,268.0
2,2019-01-01,Johor,Kluang,1324.0,25430.0,19397.0,46151.0,2373.0,1960.0,4333.0,88.0,29.0,0.0,117.0
3,2019-01-01,Johor,Kota Tinggi,1302.0,21719.0,16925.0,39946.0,2150.0,1772.0,3922.0,83.0,28.0,0.0,111.0
4,2019-01-01,Johor,Kulai,667.0,21970.0,14630.0,37267.0,1587.0,1247.0,2834.0,37.0,12.0,0.0,49.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,2022-01-01,Terengganu,Marang,234.0,12763.0,9305.0,22302.0,1086.0,930.0,2016.0,29.0,13.0,0.0,42.0
296,2022-01-01,Terengganu,Setiu,137.0,7826.0,6596.0,14559.0,940.0,767.0,1707.0,43.0,14.0,0.0,57.0
297,2022-01-01,W.P. Kuala Lumpur,W.P. Kuala Lumpur,1737.0,133531.0,86197.0,221465.0,8644.0,7994.0,16638.0,188.0,104.0,2.0,294.0
298,2022-01-01,W.P. Labuan,W.P. Labuan,281.0,9925.0,6821.0,17027.0,705.0,645.0,1350.0,17.0,11.0,0.0,28.0


## A.4. Combining all the Datasets

In [17]:
# Create dataset for pre-pandemic.
# The population table is the base; every other source is first restricted to
# its reference year, stripped of its own `date`/`state` columns, and then
# outer-joined on `district` so that no district from either side is dropped.
main_df_pre = (
    df_pop_mod.loc[lambda src: src['date'].dt.year == 2020]        # Population dataset. Year: 2020.
    .drop(columns='date')
    .merge(                                                        # Crime dataset. Year: 2019.
        df_crime_mod.loc[lambda src: src['date'].dt.year == 2019].drop(columns=['date', 'state']),
        on=['district'], how='outer',
    )
    .merge(                                                        # Household income-related dataset. Year: 2019.
        df_income_mod.loc[lambda src: src['date'].dt.year == 2019].drop(columns=['date', 'state']),
        on=['district'], how='outer',
    )
    .merge(                                                        # Labour Force Survey dataset. Year: 2019.
        df_lfs_mod.loc[lambda src: src['date'].dt.year == 2019].drop(columns=['date', 'state']),
        on=['district'], how='outer',
    )
    .merge(                                                        # Birth and death rates. Year: 2020.
        df_life.loc[lambda src: src['date'].dt.year == 2020].drop(columns=['date', 'state']),
        on=['district'], how='outer',
    )
    .merge(                                                        # Education datasets. Year: 2019.
        df_edu_mod.loc[lambda src: src['date'].dt.year == 2019].drop(columns=['date', 'state']),
        on=['district'], how='outer',
    )
)

main_df_pre

Unnamed: 0,state,district,population,sex_female,sex_male,age_00_04,age_05_09,age_10_14,age_15_19,age_20_24,age_25_29,age_30_34,age_35_39,age_40_44,age_45_49,age_50_54,age_55_59,age_60_64,age_65_69,age_70_74,age_75_79,age_80_84,age_85+,ethnic_bumi_malay,ethnic_bumi_other,ethnic_chinese,ethnic_indian,ethnic_other_citizen,ethnic_other_noncitizen,crime_assault,crime_property,crime_total,crime_assault_causing_injury,crime_assault_murder,crime_assault_rape,crime_assault_robbery_gang_armed,crime_assault_robbery_gang_unarmed,crime_assault_robbery_solo_armed,crime_assault_robbery_solo_unarmed,crime_property_break_in,crime_property_theft_other,crime_property_theft_vehicle_lorry,crime_property_theft_vehicle_motorcar,crime_property_theft_vehicle_motorcycle,income_mean,income_median,income_gini,poverty_absolute,poverty_relative,lf,lf_employed,lf_unemployed,lf_outside,p_rate,u_rate,ep_ratio,abs_births,rate_births,abs_deaths,rate_deaths,edu_post_secondary,edu_primary,edu_secondary,edu_total,teach_primary,teach_secondary,teach_total,school_primary,school_secondary,school_tertiary,school_total
0,Melaka,Alor Gajah,249.4,120.6,128.8,13.5,17.5,17.1,34.2,26.8,20.6,23.8,17.8,25.5,10.7,9.4,9.2,8.0,6.8,4.0,2.2,1.5,0.8,181.6,2.8,27.9,18.2,0.7,18.1,64.0,277.0,341.0,30.0,1.0,8.0,0.0,15.0,0.0,10.0,67.0,58.0,2.0,16.0,134.0,7050.0,5907.0,0.356,3.8,18.8,95.3,93.8,1.5,46.3,67.3,1.6,66.196189,3175.0,12.730553,1318.0,5.284683,916.0,20209.0,14366.0,35491.0,2043.0,1516.0,3559.0,77.0,20.0,1.0,98.0
1,Sarawak,Asajaya,33.6,16.5,17.1,2.4,2.7,3.3,3.3,2.5,2.5,2.3,2.0,2.0,2.0,2.1,1.8,1.5,1.3,0.9,0.6,0.3,0.2,28.7,3.0,1.5,0.0,0.0,0.3,,,,,,,,,,,,,,,,4316.0,3101.0,0.383,20.0,28.4,16.9,16.3,0.6,8.4,66.8,3.4,64.426877,634.0,18.869048,227.0,6.755952,331.0,4120.0,3835.0,8286.0,528.0,326.0,854.0,27.0,5.0,0.0,32.0
2,Kelantan,Bachok,157.3,81.1,76.2,14.3,13.4,15.4,22.1,16.2,11.1,9.5,9.0,7.7,8.3,7.5,7.4,5.7,3.7,2.9,1.7,0.9,0.5,153.8,0.5,1.4,0.1,0.4,1.2,42.0,200.0,242.0,24.0,1.0,10.0,0.0,2.0,0.0,5.0,70.0,44.0,3.0,21.0,62.0,4728.0,3621.0,0.361,13.1,10.7,57.1,55.3,1.8,45.0,55.9,3.1,54.162586,2941.0,18.696758,1019.0,6.478067,0.0,15311.0,12790.0,28101.0,1408.0,1338.0,2746.0,34.0,19.0,0.0,53.0
3,Perak,Bagan Datuk,82.8,36.9,45.9,3.4,4.6,6.5,12.2,4.8,6.2,8.1,5.9,9.9,4.7,3.8,3.6,3.1,2.7,1.8,0.9,0.5,0.3,40.9,0.3,14.3,17.1,0.2,9.9,,,,,,,,,,,,,,,,5705.0,4503.0,0.364,6.4,11.0,,,,,,,,748.0,9.033816,493.0,5.954106,204.0,5250.0,5089.0,10543.0,850.0,589.0,1439.0,58.0,10.0,0.0,68.0
4,Kedah,Baling,142.6,69.5,73.1,13.1,13.2,12.2,12.8,14.6,12.7,10.1,8.8,7.7,7.4,7.1,6.6,5.7,4.2,2.9,1.6,1.1,0.7,128.0,0.7,4.2,4.5,2.3,2.9,36.0,270.0,306.0,20.0,1.0,7.0,0.0,6.0,0.0,2.0,49.0,50.0,4.0,14.0,153.0,4313.0,3583.0,0.339,17.9,23.9,70.5,68.9,1.6,33.3,67.9,2.3,66.441659,2994.0,20.995792,1164.0,8.162693,481.0,14359.0,10437.0,25277.0,1460.0,1072.0,2532.0,62.0,16.0,0.0,78.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156,Selangor,Ulu Selangor,243.0,113.7,129.3,20.1,20.1,18.1,19.2,18.8,20.9,25.7,23.2,17.4,14.5,12.0,10.0,7.6,6.9,4.6,2.1,1.1,0.9,168.5,5.6,26.3,30.9,1.2,10.6,123.0,373.0,496.0,20.0,1.0,32.0,0.0,55.0,0.0,15.0,75.0,125.0,7.0,30.0,136.0,7598.0,6249.0,0.339,1.5,26.6,128.4,125.4,3.0,40.9,75.9,2.3,74.069699,3489.0,14.358025,1168.0,4.806584,742.0,24261.0,17080.0,42083.0,1916.0,1362.0,3278.0,52.0,17.0,0.0,69.0
157,W.P. Kuala Lumpur,W.P. Kuala Lumpur,1982.1,923.2,1058.9,129.1,125.9,140.8,134.8,116.2,183.7,224.3,250.9,187.7,143.5,85.9,78.0,50.9,52.2,37.5,21.6,11.5,7.5,824.6,21.7,737.2,178.1,12.1,208.4,2891.0,8035.0,10926.0,499.0,22.0,118.0,0.0,1165.0,2.0,1085.0,1044.0,2636.0,254.0,1406.0,2695.0,13257.0,10549.0,0.350,0.2,10.6,863.4,841.0,22.4,369.6,70.0,2.6,64.900000,23087.0,11.647747,8318.0,4.196559,3101.0,130276.0,84761.0,218138.0,8979.0,8326.0,17305.0,191.0,104.0,2.0,297.0
158,W.P. Labuan,W.P. Labuan,95.1,46.2,48.9,10.0,8.2,7.8,8.7,8.1,8.7,8.6,10.0,5.4,4.7,4.3,3.8,2.7,1.7,1.0,0.5,0.4,0.4,23.6,49.0,9.8,0.9,0.9,10.8,20.0,232.0,252.0,9.0,0.0,6.0,0.0,3.0,0.0,2.0,56.0,166.0,0.0,1.0,9.0,8319.0,6726.0,0.333,3.1,12.9,43.3,40.3,3.0,26.4,62.2,6.9,58.700000,1710.0,17.981073,314.0,3.301788,607.0,9393.0,5765.0,15765.0,741.0,646.0,1387.0,17.0,10.0,0.0,27.0
159,W.P. Putrajaya,W.P. Putrajaya,109.2,55.6,53.6,12.9,17.3,11.3,5.9,4.2,6.4,11.4,17.6,9.4,4.5,3.0,2.3,1.4,0.8,0.4,0.2,0.1,0.1,101.8,2.3,0.7,1.3,0.3,2.8,23.0,223.0,246.0,6.0,0.0,4.0,0.0,8.0,0.0,5.0,43.0,95.0,0.0,17.0,68.0,12840.0,9983.0,0.361,0.4,12.1,38.6,38.1,0.4,12.2,75.9,1.2,58.600000,2142.0,19.615385,210.0,1.923077,258.0,18586.0,8465.0,27309.0,1249.0,824.0,2073.0,14.0,11.0,0.0,25.0


In [18]:
# Create dataset for post-pandemic.
# The population table is the base; every other source is first restricted to
# 2022, stripped of its own `date`/`state` columns, and then outer-joined on
# `district` so that no district from either side is dropped.
main_df_post = (
    df_pop_mod.loc[lambda src: src['date'].dt.year == 2022]        # Population dataset. Year: 2022.
    .drop(columns='date')
    .merge(                                                        # Crime dataset. Year: 2022.
        df_crime_mod.loc[lambda src: src['date'].dt.year == 2022].drop(columns=['date', 'state']),
        on=['district'], how='outer',
    )
    .merge(                                                        # Household income-related dataset. Year: 2022.
        df_income_mod.loc[lambda src: src['date'].dt.year == 2022].drop(columns=['date', 'state']),
        on=['district'], how='outer',
    )
    .merge(                                                        # Labour Force Survey dataset. Year: 2022.
        df_lfs_mod.loc[lambda src: src['date'].dt.year == 2022].drop(columns=['date', 'state']),
        on=['district'], how='outer',
    )
    .merge(                                                        # Birth and death rates. Year: 2022.
        df_life.loc[lambda src: src['date'].dt.year == 2022].drop(columns=['date', 'state']),
        on=['district'], how='outer',
    )
    .merge(                                                        # Education datasets. Year: 2022.
        df_edu_mod.loc[lambda src: src['date'].dt.year == 2022].drop(columns=['date', 'state']),
        on=['district'], how='outer',
    )
)

main_df_post

Unnamed: 0,state,district,population,sex_female,sex_male,age_00_04,age_05_09,age_10_14,age_15_19,age_20_24,age_25_29,age_30_34,age_35_39,age_40_44,age_45_49,age_50_54,age_55_59,age_60_64,age_65_69,age_70_74,age_75_79,age_80_84,age_85+,ethnic_bumi_malay,ethnic_bumi_other,ethnic_chinese,ethnic_indian,ethnic_other_citizen,ethnic_other_noncitizen,crime_assault,crime_property,crime_total,crime_assault_causing_injury,crime_assault_murder,crime_assault_rape,crime_assault_robbery_gang_armed,crime_assault_robbery_gang_unarmed,crime_assault_robbery_solo_armed,crime_assault_robbery_solo_unarmed,crime_property_break_in,crime_property_theft_other,crime_property_theft_vehicle_lorry,crime_property_theft_vehicle_motorcar,crime_property_theft_vehicle_motorcycle,income_mean,income_median,income_gini,poverty_absolute,poverty_relative,lf,lf_employed,lf_unemployed,lf_outside,p_rate,u_rate,ep_ratio,abs_births,rate_births,abs_deaths,rate_deaths,edu_post_secondary,edu_primary,edu_secondary,edu_total,teach_primary,teach_secondary,teach_total,school_primary,school_secondary,school_tertiary,school_total
0,Melaka,Alor Gajah,251.8,122.1,129.8,13.1,17.2,18.2,29.7,28.1,22.1,24.5,18.0,22.3,15.5,9.7,8.8,8.4,6.9,4.6,2.4,1.5,0.8,185.4,2.6,27.7,18.3,0.7,17.2,55.0,204.0,259.0,23.0,0.0,15.0,1.0,9.0,0.0,7.0,69.0,84.0,3.0,6.0,42.0,7235.0,5611.0,0.343,4.8,14.1,107.0,104.1,2.9,39.1,73.3,2.7,71.252567,3071.0,12.196187,1668.0,6.624305,511.0,20950.0,14749.0,36210.0,2155.0,1506.0,3661.0,77.0,20.0,1.0,98.0
1,Sarawak,Asajaya,34.1,16.7,17.4,2.6,2.5,3.1,3.5,2.6,2.5,2.3,2.2,2.0,2.0,2.0,1.9,1.6,1.3,0.9,0.7,0.4,0.1,29.3,3.0,1.5,0.0,0.0,0.3,,,,,,,,,,,,,,,,4557.0,3929.0,0.301,17.9,19.9,17.1,16.6,0.6,9.3,64.9,3.3,62.878788,566.0,16.598240,275.0,8.064516,155.0,3974.0,4411.0,8540.0,508.0,342.0,850.0,27.0,5.0,0.0,32.0
2,Kelantan,Bachok,160.7,82.7,78.0,15.1,13.8,14.9,19.8,18.2,12.5,10.1,9.0,7.9,7.9,7.7,7.2,6.3,4.3,2.9,1.8,0.8,0.5,157.5,0.2,1.3,0.1,0.4,1.1,43.0,157.0,200.0,28.0,0.0,7.0,0.0,5.0,0.0,3.0,36.0,45.0,1.0,8.0,67.0,4811.0,3625.0,0.374,12.1,10.4,60.5,58.5,2.0,47.6,56.0,3.4,54.116559,3138.0,19.527069,1152.0,7.168637,548.0,15621.0,11769.0,27938.0,1426.0,1314.0,2740.0,34.0,19.0,0.0,53.0
3,Perak,Bagan Datuk,82.3,36.9,45.3,3.3,4.1,6.0,12.4,5.4,6.0,7.8,5.8,8.4,6.2,3.7,3.4,3.3,2.8,1.8,1.1,0.5,0.3,41.3,0.3,14.3,17.0,0.2,9.2,,,,,,,,,,,,,,,,5160.0,4076.0,0.361,12.4,19.8,,,,,,,,749.0,9.100851,682.0,8.286756,79.0,5081.0,5144.0,10304.0,843.0,573.0,1416.0,58.0,10.0,0.0,68.0
4,Kedah,Baling,144.9,70.8,74.1,13.1,13.2,12.6,12.2,14.2,13.8,10.7,9.1,7.9,7.3,7.1,6.6,5.8,4.5,3.2,1.9,1.1,0.6,130.7,0.5,4.2,4.5,2.3,2.8,47.0,268.0,315.0,19.0,1.0,12.0,0.0,7.0,0.0,8.0,94.0,84.0,2.0,8.0,80.0,4263.0,3400.0,0.347,14.4,19.7,74.1,72.5,1.6,35.3,67.8,2.2,66.270567,2855.0,19.703244,1492.0,10.296756,258.0,14255.0,10748.0,25261.0,1476.0,1001.0,2477.0,62.0,16.0,0.0,78.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156,Selangor,Ulu Selangor,246.4,115.8,130.5,19.0,20.1,18.4,18.9,19.2,19.2,24.0,24.8,19.2,15.1,12.7,10.5,8.3,6.9,5.3,2.7,1.2,0.9,173.2,5.2,26.0,31.0,1.2,9.8,94.0,235.0,329.0,17.0,3.0,36.0,0.0,23.0,0.0,15.0,56.0,83.0,8.0,19.0,69.0,8779.0,7678.0,0.297,1.7,23.9,139.1,135.4,3.7,42.5,76.6,2.7,74.600551,3409.0,13.835227,1474.0,5.982143,341.0,24754.0,17562.0,42657.0,1913.0,1361.0,3274.0,51.0,17.0,0.0,68.0
157,W.P. Kuala Lumpur,W.P. Kuala Lumpur,1961.2,920.3,1040.8,115.4,126.2,129.9,150.9,111.0,152.9,215.5,234.8,208.3,150.7,98.2,72.3,62.4,47.3,40.3,25.0,12.5,7.5,837.2,19.9,730.7,177.0,12.7,183.6,1034.0,3796.0,4830.0,304.0,16.0,73.0,2.0,319.0,1.0,319.0,525.0,1439.0,83.0,534.0,1215.0,13325.0,10234.0,0.380,1.4,12.7,884.9,857.5,27.4,337.7,72.4,3.1,58.900000,19853.0,10.122884,9962.0,5.079543,1737.0,133531.0,86197.0,221465.0,8644.0,7994.0,16638.0,188.0,104.0,2.0,294.0
158,W.P. Labuan,W.P. Labuan,96.9,47.0,49.8,9.0,9.5,7.9,7.6,8.6,8.5,8.3,10.0,7.0,4.9,4.1,3.9,3.0,2.0,1.2,0.6,0.3,0.5,34.9,39.9,9.7,0.9,1.1,10.3,24.0,87.0,111.0,9.0,0.0,14.0,0.0,0.0,0.0,1.0,13.0,67.0,0.0,0.0,7.0,8250.0,6904.0,0.300,2.5,7.0,48.5,45.0,3.5,23.8,67.0,7.2,68.300000,1374.0,14.179567,392.0,4.045408,281.0,9925.0,6821.0,17027.0,705.0,645.0,1350.0,17.0,11.0,0.0,28.0
159,W.P. Putrajaya,W.P. Putrajaya,117.0,59.4,57.6,11.5,16.9,14.9,7.8,4.4,5.9,8.8,17.3,13.6,6.1,3.5,2.7,1.8,1.0,0.5,0.2,0.1,0.1,108.7,2.3,0.6,1.4,0.3,3.7,15.0,69.0,84.0,4.0,1.0,2.0,0.0,6.0,0.0,2.0,17.0,39.0,1.0,0.0,12.0,13473.0,10056.0,0.368,0.1,11.4,37.9,37.5,0.4,12.1,75.7,1.0,52.200000,1839.0,15.717949,253.0,2.162393,139.0,22503.0,11436.0,34078.0,1515.0,927.0,2442.0,16.0,11.0,0.0,27.0


Uncomment the lines in the cell below to save the combined datasets and the district boundaries to the `data/` folder inside the working directory.

In [None]:
# Save the dataset to the folder.
# All three export statements below are intentionally commented out, so running
# this cell writes nothing; remove the leading "#" from a line to enable it.
# Pre-pandemic table -> CSV, written without the row index (index=False).
#main_df_pre.to_csv("data/0_main_df_pre.csv", index=False)
# Post-pandemic table -> CSV, written without the row index (index=False).
#main_df_post.to_csv("data/0_main_df_post.csv", index=False)
# District boundaries GeoDataFrame -> ESRI Shapefile.
#district_boundaries.to_file('data/district_boundaries.shp', driver='ESRI Shapefile')