## Shapefile Converter
A tool to convert Telkomsel's shapefile into GeoJSON format.    

**Writer** : ngakan.gandhi@dsanalytics.com  
**License** : MIT License.

In [1]:
# Import modules
import geopandas as gpd
import json 
import pandas as pd
import shapely
from pathlib import Path

In [2]:
# Set the path towards our data directory
data_dir = str(Path.cwd()) + '/PETA_V21'
out_dir = str(Path.cwd()) + '/results'

In [3]:
def load_shapefile_into_geodataframe(shp_in):
    """Load the .shp file and return a GeoDataFrame"""
    print("Reading shapefile input...")
    return gpd.read_file(shp_in)

In [4]:
def subset_geodataframe(geodataframe, list_of_cols_to_keep):
    """Only keep columns of a Geodataframe as we specify"""
    gdf = geodataframe[list_of_cols_to_keep]
    return gdf

In [5]:
def rename_geodataframe_columns(geodataframe, col_name_new):
    """Rename geodataframe column names as we specify.
    But keep 'geometry' as is."""
    col_name_orig = list(geodataframe.columns)
    col_name_dict = dict(zip(col_name_orig, col_name_new))
    gdf.rename(columns=col_name_dict, inplace=True)
    return gdf

In [6]:
def create_gdf_hierarchy(geodataframe):
    """Construct a new column 'hierarchy' representing
    the structure used in Dataspark's GeoJSON""" 
    def create_hierarchy(row):
        dict_container = [{'id': row.ID_DESA, 'level': 'kelurahan', 'name': row.DESA},
                          {'id': row.ID_KEC, 'level': 'kecamatan', 'name': row.KECAMATAN},
                          {'id': row.ID_KAB, 'level': 'kabupaten', 'name': row.KABUPATEN},
                          {'id': row.ID_PROV, 'level': 'province', 'name': row.PROVINSI}]
        return pd.Series(str(dict_container))
    geodataframe['hierarchy'] = geodataframe.apply(lambda row: create_hierarchy(row), axis=1)
    return geodataframe

In [7]:
def slice_geodataframe(geodataframe, start_idx, end_idx):
    """Slice a geodataframe to return a smaller set of it"""
    print("Sliced GeoDataFrame has length of : {}".format(len(geodataframe[start_idx:end_idx])))
    gdf = geodataframe[start_idx:end_idx]
    gdf.crs = None
    return gdf

In [8]:
def write_geodataframe_into_geojson(geodataframe, geojson_path):
    """Write the geodataframe file as a GeoJSON file."""
    print("Writing GeoDataFrame to GeoJSON...")
    # unset CRS
    geodataframe.crs = None
    geodataframe = geodataframe[['geometry','hierarchy']]
    geodataframe.to_file(geojson_path, driver='GeoJSON')
    print("Done writing shapefile input to GeoJSON!")

In [9]:
def json_neater(raw_geojson_in, neat_geojson_out):
    """Indentise the GeoJSON converted by load_shapefile_into_geojson()"""
    print("Loading raw GeoJSON file...")
    with open(raw_geojson_in, encoding='utf-8', errors='ignore') as json_data:
        data = json.load(json_data, strict=False)
        # Some checks to replace 'Feature' to 'Features'
        # As workaround to nullpointerexception
        for p in data['features']:
            p['type'] = p['type'].replace('Feature', 'Features')
        # Checks finished
        with open(neat_geojson_out, 'w') as outfile:
            json.dump(data, outfile, indent=4)
    print("Done prettify GeoJSON!")

In [10]:
def get_geodataframe_row_from_id_desa(geodataframe, id_desa):
    """Return the index of the row containing specified ID_DESA"""
    row_idx = geodataframe.index[geodataframe['ID_DESA'] == str(id_desa)].tolist()
    return row_idx

In [11]:
# Set shp_in, geojson, final json path
shp_in = data_dir + '/BTS_DESA_NASIONAL_V21_0.shp'
geojson_path = out_dir + '/BTS_DESA_NASIONAL_V21_0.json'
final_json = out_dir + '/BTS_DESA_NASIONAL_V21_0_PRETTY.json'

In [12]:
# Read the GeoDataFrame
gdf = load_shapefile_into_geodataframe(shp_in)

gdf.head(2)

Reading shapefile input...


Unnamed: 0,ID_DESA,DESA,ID_KEC,KECAMATAN,ID_KAB,KABUPATEN,ID_PROV,PROVINSI,ID_REG,REGION,...,ID_BRANCH,BRANCH,ID_SUBBRAN,SUBBRANCH,ID_CLUSTER,CLUSTER,MITRA_AD,LONGITUDE,LATITUDE,geometry
0,1101010001,LATIUNG,1101010,TEUPAH SELATAN,1101,SIMEULUE,11,NANGGROE ACEH DARUSSALAM,1,SUMBAGUT,...,50,BANDA ACEH,105,MEULABOH,113,MEULABOH,,96.4556,2.37651,"POLYGON ((96.48015 2.34382, 96.47920 2.34325, ..."
1,1101010002,LABUHAN BAJAU,1101010,TEUPAH SELATAN,1101,SIMEULUE,11,NANGGROE ACEH DARUSSALAM,1,SUMBAGUT,...,50,BANDA ACEH,105,MEULABOH,113,MEULABOH,,96.4729,2.39231,"POLYGON ((96.49244 2.38675, 96.49196 2.38556, ..."


In [13]:
# Create a 'hierarchy' column
gdf = create_gdf_hierarchy(gdf)

gdf.head(2)

Unnamed: 0,ID_DESA,DESA,ID_KEC,KECAMATAN,ID_KAB,KABUPATEN,ID_PROV,PROVINSI,ID_REG,REGION,...,BRANCH,ID_SUBBRAN,SUBBRANCH,ID_CLUSTER,CLUSTER,MITRA_AD,LONGITUDE,LATITUDE,geometry,hierarchy
0,1101010001,LATIUNG,1101010,TEUPAH SELATAN,1101,SIMEULUE,11,NANGGROE ACEH DARUSSALAM,1,SUMBAGUT,...,BANDA ACEH,105,MEULABOH,113,MEULABOH,,96.4556,2.37651,"POLYGON ((96.48015 2.34382, 96.47920 2.34325, ...","[{'id': '1101010001', 'level': 'kelurahan', 'n..."
1,1101010002,LABUHAN BAJAU,1101010,TEUPAH SELATAN,1101,SIMEULUE,11,NANGGROE ACEH DARUSSALAM,1,SUMBAGUT,...,BANDA ACEH,105,MEULABOH,113,MEULABOH,,96.4729,2.39231,"POLYGON ((96.49244 2.38675, 96.49196 2.38556, ...","[{'id': '1101010002', 'level': 'kelurahan', 'n..."


In [14]:
# Filter columns
cols_to_keep = ['hierarchy', 'geometry', 'ID_DESA']
gdf = subset_geodataframe(gdf, cols_to_keep)

gdf.head(2)

Unnamed: 0,hierarchy,geometry,ID_DESA
0,"[{'id': '1101010001', 'level': 'kelurahan', 'n...","POLYGON ((96.48015 2.34382, 96.47920 2.34325, ...",1101010001
1,"[{'id': '1101010002', 'level': 'kelurahan', 'n...","POLYGON ((96.49244 2.38675, 96.49196 2.38556, ...",1101010002


In [15]:
# Re-set the logging path
geojson_path = out_dir + '/BTS_DESA_NASIONAL_V21_0.json'
final_json = out_dir + '/BTS_DESA_NASIONAL_V21_0_PRETTY.json'

In [16]:
# Write GeoDataFrame into a GeoJSON
write_geodataframe_into_geojson(gdf, geojson_path)

Writing GeoDataFrame to GeoJSON...
Done writing shapefile input to GeoJSON!


In [17]:
# Make the GeoJSON neater
json_neater(geojson_path, final_json)

Loading raw GeoJSON file...
Done prettify GeoJSON!


In [18]:
# Slice 50 geodataframe
gdf_smaller = slice_geodataframe(gdf, 0, 50)

gdf_smaller.head()

Sliced GeoDataFrame has length of : 50


Unnamed: 0,hierarchy,geometry,ID_DESA
0,"[{'id': '1101010001', 'level': 'kelurahan', 'n...","POLYGON ((96.48015 2.34382, 96.47920 2.34325, ...",1101010001
1,"[{'id': '1101010002', 'level': 'kelurahan', 'n...","POLYGON ((96.49244 2.38675, 96.49196 2.38556, ...",1101010002
2,"[{'id': '1101010003', 'level': 'kelurahan', 'n...","POLYGON ((96.37330 2.34174, 96.37327 2.34161, ...",1101010003
3,"[{'id': '1101010004', 'level': 'kelurahan', 'n...","POLYGON ((96.47211 2.43549, 96.47207 2.43430, ...",1101010004
4,"[{'id': '1101010005', 'level': 'kelurahan', 'n...","POLYGON ((96.47117 2.44948, 96.46327 2.44910, ...",1101010005


In [19]:
# Re-set the testing path
geojson_path = out_dir + '/BTS_DESA_NASIONAL_V21_0_smaller.json'
final_json = out_dir + '/BTS_DESA_NASIONAL_V21_0_PRETTY_smaller.json'

In [20]:
# Write GeoDataFrame into a GeoJSON
write_geodataframe_into_geojson(gdf_smaller, geojson_path)

Writing GeoDataFrame to GeoJSON...
Done writing shapefile input to GeoJSON!


In [21]:
# Make the GeoJSON neater
json_neater(geojson_path, final_json)

Loading raw GeoJSON file...
Done prettify GeoJSON!


### Some Experimentation Using Smaller Dataframe

In [22]:
gdf_smaller.head()

Unnamed: 0,hierarchy,geometry,ID_DESA
0,"[{'id': '1101010001', 'level': 'kelurahan', 'n...","POLYGON ((96.48015 2.34382, 96.47920 2.34325, ...",1101010001
1,"[{'id': '1101010002', 'level': 'kelurahan', 'n...","POLYGON ((96.49244 2.38675, 96.49196 2.38556, ...",1101010002
2,"[{'id': '1101010003', 'level': 'kelurahan', 'n...","POLYGON ((96.37330 2.34174, 96.37327 2.34161, ...",1101010003
3,"[{'id': '1101010004', 'level': 'kelurahan', 'n...","POLYGON ((96.47211 2.43549, 96.47207 2.43430, ...",1101010004
4,"[{'id': '1101010005', 'level': 'kelurahan', 'n...","POLYGON ((96.47117 2.44948, 96.46327 2.44910, ...",1101010005


In [27]:
# Test the get_geodataframe_row_from_id_desa
row_idx = get_geodataframe_row_from_id_desa(gdf, 6409040009)
print(row_idx)

[61823]


In [30]:
# Slice 50 geodataframe from idx 61823
gdf_smaller = slice_geodataframe(gdf, 61823, 61900)
gdf_smaller.head()

Sliced GeoDataFrame has length of : 77


Unnamed: 0,hierarchy,geometry,ID_DESA
61823,"[{'id': '6409040009', 'level': 'kelurahan', 'n...","POLYGON ((116.89270 -0.83148, 116.88484 -0.851...",6409040009
61824,"[{'id': '6409040010', 'level': 'kelurahan', 'n...","POLYGON ((116.80891 -0.88567, 116.80876 -0.886...",6409040010
61825,"[{'id': '6409040011', 'level': 'kelurahan', 'n...","POLYGON ((116.77336 -0.89259, 116.77274 -0.894...",6409040011
61826,"[{'id': '6409040012', 'level': 'kelurahan', 'n...","POLYGON ((116.73363 -0.80521, 116.73363 -0.805...",6409040012
61827,"[{'id': '6409040013', 'level': 'kelurahan', 'n...","POLYGON ((116.82831 -0.81411, 116.81700 -0.831...",6409040013


In [31]:
# Re-set the testing path
geojson_path = out_dir + '/BTS_DESA_NASIONAL_V21_0_smaller.json'
final_json = out_dir + '/BTS_DESA_NASIONAL_V21_0_PRETTY_smaller.json'

In [33]:
# Write GeoDataFrame into a GeoJSON
write_geodataframe_into_geojson(gdf_smaller, geojson_path)

# Make the GeoJSON neater
json_neater(geojson_path, final_json)

Writing GeoDataFrame to GeoJSON...
Done writing shapefile input to GeoJSON!
Loading raw GeoJSON file...
Done prettify GeoJSON!


In [34]:
# Last index of the smaller dataframe...
gdf_smaller.tail()

Unnamed: 0,hierarchy,geometry,ID_DESA
61895,"[{'id': '6471040002', 'level': 'kelurahan', 'n...","MULTIPOLYGON (((116.85766 -1.25411, 116.85718 ...",6471040002
61896,"[{'id': '6471040003', 'level': 'kelurahan', 'n...","POLYGON ((116.83692 -1.25130, 116.83609 -1.251...",6471040003
61897,"[{'id': '6471040004', 'level': 'kelurahan', 'n...","MULTIPOLYGON (((116.84709 -1.24301, 116.84703 ...",6471040004
61898,"[{'id': '6471040005', 'level': 'kelurahan', 'n...","POLYGON ((116.86203 -1.25231, 116.86196 -1.252...",6471040005
61899,"[{'id': '6471040006', 'level': 'kelurahan', 'n...","MULTIPOLYGON (((116.83976 -1.26042, 116.83975 ...",6471040006


In [37]:
# Get a slightly bigger dataframe 61500 <-> 65900
gdf_smaller = slice_geodataframe(gdf, 61500, 65900)
gdf_smaller.head()

Sliced GeoDataFrame has length of : 4400


Unnamed: 0,hierarchy,geometry,ID_DESA
61500,"[{'id': '6403170002', 'level': 'kelurahan', 'n...","POLYGON ((116.49088 0.16920, 116.48942 0.15772...",6403170002
61501,"[{'id': '6403170003', 'level': 'kelurahan', 'n...","POLYGON ((116.49362 0.22411, 116.49316 0.20274...",6403170003
61502,"[{'id': '6403170004', 'level': 'kelurahan', 'n...","POLYGON ((116.49303 0.26881, 116.48029 0.26295...",6403170004
61503,"[{'id': '6403170005', 'level': 'kelurahan', 'n...","POLYGON ((116.38799 0.37283, 116.38655 0.37027...",6403170005
61504,"[{'id': '6403170006', 'level': 'kelurahan', 'n...","POLYGON ((116.20876 0.24730, 116.20549 0.24076...",6403170006


In [38]:
# Write GeoDataFrame into a GeoJSON
write_geodataframe_into_geojson(gdf_smaller, geojson_path)

# Make the GeoJSON neater
json_neater(geojson_path, final_json)

Writing GeoDataFrame to GeoJSON...
Done writing shapefile input to GeoJSON!
Loading raw GeoJSON file...
Done prettify GeoJSON!


In [39]:
# What is the very last index?
gdf.tail()

Unnamed: 0,hierarchy,geometry,ID_DESA
79515,"[{'id': '9471040004', 'level': 'kelurahan', 'n...","POLYGON ((140.72340 -2.52798, 140.72339 -2.528...",9471040004
79516,"[{'id': '9471040005', 'level': 'kelurahan', 'n...","POLYGON ((140.72162 -2.50982, 140.71920 -2.512...",9471040005
79517,"[{'id': '9471040006', 'level': 'kelurahan', 'n...","POLYGON ((140.73134 -2.51706, 140.73132 -2.517...",9471040006
79518,"[{'id': '9471040007', 'level': 'kelurahan', 'n...","POLYGON ((140.74226 -2.52283, 140.74203 -2.523...",9471040007
79519,"[{'id': '9471040008', 'level': 'kelurahan', 'n...","POLYGON ((140.74825 -2.53426, 140.74809 -2.534...",9471040008


In [40]:
# Get a slightly bigger dataframe 61500 <-> end
gdf_smaller = slice_geodataframe(gdf, 61500, 79519)
gdf_smaller.head()

Sliced GeoDataFrame has length of : 18019


Unnamed: 0,hierarchy,geometry,ID_DESA
61500,"[{'id': '6403170002', 'level': 'kelurahan', 'n...","POLYGON ((116.49088 0.16920, 116.48942 0.15772...",6403170002
61501,"[{'id': '6403170003', 'level': 'kelurahan', 'n...","POLYGON ((116.49362 0.22411, 116.49316 0.20274...",6403170003
61502,"[{'id': '6403170004', 'level': 'kelurahan', 'n...","POLYGON ((116.49303 0.26881, 116.48029 0.26295...",6403170004
61503,"[{'id': '6403170005', 'level': 'kelurahan', 'n...","POLYGON ((116.38799 0.37283, 116.38655 0.37027...",6403170005
61504,"[{'id': '6403170006', 'level': 'kelurahan', 'n...","POLYGON ((116.20876 0.24730, 116.20549 0.24076...",6403170006


In [41]:
# Write GeoDataFrame into a GeoJSON
write_geodataframe_into_geojson(gdf_smaller, geojson_path)

# Make the GeoJSON neater
json_neater(geojson_path, final_json)

Writing GeoDataFrame to GeoJSON...
Done writing shapefile input to GeoJSON!
Loading raw GeoJSON file...
Done prettify GeoJSON!
