In [1]:
from geopy.geocoders import Nominatim
from geopandas import gpd
import pandas as pd
import numpy as np
import folium

In [2]:
address='Sheung Shui'
geolocator = Nominatim(user_agent="Your_Name")
location = geolocator.geocode(address)
print(location.address)
print((location.latitude, location.longitude))


上水 Sheung Shui, 北區 North District, 新界 New Territories, 香港 Hong Kong, 518005, 中国
(22.5014146, 114.1287033)


In [3]:
# remove nan to test
districts = ['Kowloon Bay', 'Tai Koo', 'Ngau Tau Kok', 'Lantau Island',
'district', 'Sheung Wan', 'Shatin Area', 'Tung Chung', 'Aberdeen',
'Tseung Kwan O', 'Chai Wan', 'Central', 'Central & Western Area',
'Quarry Bay', 'Wan Chai', 'Admiralty', 'Kwai Hing',
'Overseas', 'Airport Area', 'Southern Area', 'Tsim Sha Tsui',
'Kwun Tong', 'Tsing Yi', 'Tai Po Area', 'Others', 'North Point',
'Hung Hom', 'Kwun Tong Area', 'Causeway Bay', 'Kwai Tsing Area',
'Lai Chi Kok', 'Yau Tsim Mong Area', 'Cheung Sha Wan',
'Wong Tai Sin Area', 'Sai Wan Ho', 'Tin Hau', 'Tsuen Wan Area',
'Shau Kei Wan', 'Mong Kok']

In [4]:
# important parameters
file_name = 'data-scientist-2022-12-06-1.csv'
df_raw = pd.read_csv(file_name)


In [5]:
df_1 = df_raw.copy()

In [6]:
df_1['district'].replace('district',np.nan)

0        Kowloon Bay
1            Tai Koo
2       Ngau Tau Kok
3      Lantau Island
4                NaN
           ...      
344              NaN
345         Wan Chai
346         Wan Chai
347    Tseung Kwan O
348    Lantau Island
Name: district, Length: 349, dtype: object

In [7]:
def my_geocoder(row):
    try:
        point = geolocator.geocode(row).point
        return pd.Series({'Latitude': point.latitude, 'Longitude': point.longitude})
    except:
        return None

df_1[['Latitude', 'Longitude']] = df_1.apply(lambda x: my_geocoder(x['district']), axis=1)

print("{}% of addresses were geocoded!".format(
    (1 - sum(np.isnan(df_1["Latitude"])) / len(df_1)) * 100))


99.42693409742121% of addresses were geocoded!


In [8]:
df_1.head()

Unnamed: 0,title,salary,company,posted,district,job_description,Career Level,Years of Experience,Company Website,Qualification,Job Type,Job Functions,url,Latitude,Longitude
0,Data Scientist,salary,Connexe Search Limited,2022-12-06,Kowloon Bay,Our client is one of the leading financial ins...,,,,,Contract,"Banking / Finance, Others, Information Technol...",https://hk.jobsdb.com/hk/en/job/data-scientist...,22.323467,114.214068
1,Data Analyst (Data & AI),salary,NCSI (HK) Limited,2022-12-06,Tai Koo,Key Role Responsibilities:\n\nParticipate in c...,,,http://www.ncs-i.com,,"Full Time, Permanent","Information Technology (IT), Product Managemen...",https://hk.jobsdb.com/hk/en/job/data-analyst-d...,22.284532,114.216352
2,Data Analyst (Data Governance),salary,Gientech Technology (Hong Kong) Limited,2022-12-06,Ngau Tau Kok,Duties:\nManage & support data governance mode...,Middle,3 years,,Degree,"Full Time, Permanent","Information Technology (IT), Data Scientist",https://hk.jobsdb.com/hk/en/job/data-analyst-d...,22.315512,114.219092
3,Solution Analyst - ICC - Data Integration,salary,Cathay Pacific Airways Ltd,2022-12-06,Lantau Island,Reports to: Solution Lead / Senior Solution Le...,,,http://www.cathaypacific.com/careers,,"Full Time, Permanent","Transportation & Logistics, Aviation Services,...",https://hk.jobsdb.com/hk/en/job/solution-analy...,22.271703,113.957319
4,Analyst Programmer (Data Analytics),salary,China State Construction Engineering (Hong Kon...,2022-12-06,district,Job Description:\nEstablish and maintain high-...,Entry Level,3 years,https://www.cscechk.com/en/,Degree,"Full Time, Permanent","Information Technology (IT), DBA, Others, Data...",https://hk.jobsdb.com/hk/en/job/analyst-progra...,33.811953,-84.378525


In [9]:
df_2 = gpd.GeoDataFrame(
    df_1, geometry=gpd.points_from_xy(df_1.Longitude, df_1.Latitude))
df_2.crs = {'init': 'epsg:4326'}


  in_crs_string = _prepare_from_proj_string(in_crs_string)


In [10]:
df_2

Unnamed: 0,title,salary,company,posted,district,job_description,Career Level,Years of Experience,Company Website,Qualification,Job Type,Job Functions,url,Latitude,Longitude,geometry
0,Data Scientist,salary,Connexe Search Limited,2022-12-06,Kowloon Bay,Our client is one of the leading financial ins...,,,,,Contract,"Banking / Finance, Others, Information Technol...",https://hk.jobsdb.com/hk/en/job/data-scientist...,22.323467,114.214068,POINT (114.21407 22.32347)
1,Data Analyst (Data & AI),salary,NCSI (HK) Limited,2022-12-06,Tai Koo,Key Role Responsibilities:\n\nParticipate in c...,,,http://www.ncs-i.com,,"Full Time, Permanent","Information Technology (IT), Product Managemen...",https://hk.jobsdb.com/hk/en/job/data-analyst-d...,22.284532,114.216352,POINT (114.21635 22.28453)
2,Data Analyst (Data Governance),salary,Gientech Technology (Hong Kong) Limited,2022-12-06,Ngau Tau Kok,Duties:\nManage & support data governance mode...,Middle,3 years,,Degree,"Full Time, Permanent","Information Technology (IT), Data Scientist",https://hk.jobsdb.com/hk/en/job/data-analyst-d...,22.315512,114.219092,POINT (114.21909 22.31551)
3,Solution Analyst - ICC - Data Integration,salary,Cathay Pacific Airways Ltd,2022-12-06,Lantau Island,Reports to: Solution Lead / Senior Solution Le...,,,http://www.cathaypacific.com/careers,,"Full Time, Permanent","Transportation & Logistics, Aviation Services,...",https://hk.jobsdb.com/hk/en/job/solution-analy...,22.271703,113.957319,POINT (113.95732 22.27170)
4,Analyst Programmer (Data Analytics),salary,China State Construction Engineering (Hong Kon...,2022-12-06,district,Job Description:\nEstablish and maintain high-...,Entry Level,3 years,https://www.cscechk.com/en/,Degree,"Full Time, Permanent","Information Technology (IT), DBA, Others, Data...",https://hk.jobsdb.com/hk/en/job/analyst-progra...,33.811953,-84.378525,POINT (-84.37852 33.81195)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
344,"Regional Senior Manager, Enterprise Architectu...",salary,Michael Page,2022-11-09,district,Large conglomerate\nGroup Head Office\nDigital...,Senior,9 years,http://www.michaelpage.com.hk,Degree,"Full Time, Permanent","Professional Services, Business Analysis / Dat...",https://hk.jobsdb.com/hk/en/job/regional-senio...,33.811953,-84.378525,POINT (-84.37852 33.81195)
345,Senior Data Engineer (Financial Services),salary,Emperor Financial Services Group,2022-11-08,Wan Chai,"Job Description \nTo design, implement and mai...",Middle,5 years,,Degree,"Full Time, Permanent","Information Technology (IT), DBA, Software Dev...",https://hk.jobsdb.com/hk/en/job/senior-data-en...,22.279015,114.172483,POINT (114.17248 22.27901)
346,Data Engineer / Data Analyst,salary,eCloudvalley Technology (HK) Limited,2022-11-07,Wan Chai,Job Duties\nBuild end-to-end data architecture...,Entry Level,,,,"Full Time, Permanent","Information Technology (IT), Product Managemen...",https://hk.jobsdb.com/hk/en/job/data-engineer-...,22.279015,114.172483,POINT (114.17248 22.27901)
347,Senior Business Analysis Engineer,salary,China Unicom (Hong Kong) Operations Limited,2022-11-07,Tseung Kwan O,Job Description\n\n1. Responsible for the plan...,Middle,,http://www.chinaunicomglobal.com/hk,Degree,"Full Time, Permanent","Information Technology (IT), DBA, Product Mana...",https://hk.jobsdb.com/hk/en/job/senior-busines...,22.292258,114.257724,POINT (114.25772 22.29226)


In [11]:
df_2.to_csv('geo_data.csv', index = False)

In [12]:
df_3 = df_2[df_2['district'].notna()]
df_3 = df_3[df_3['Latitude'].notna()]
df_3 = df_3[df_3['Longitude'].notna()]
df_3 = df_3[df_3['geometry'].notna()]

In [13]:
df_3.isna().sum()

title                    0
salary                   0
company                  0
posted                   0
district                 0
job_description          0
Career Level            83
Years of Experience    114
Company Website        133
Qualification           67
Job Type                 0
Job Functions            0
url                      0
Latitude                 0
Longitude                0
geometry                 0
dtype: int64

In [14]:
# further cleanup 
to_drop = ['district',
            'Aberdeen',
            'Overseas',
            'Others',
            'Southern Area']
for i in to_drop:
    df_3.drop(df_3.loc[df_3['district']==i].index, inplace=True)

In [15]:
# Create a map
m = folium.Map(location=[22.34, 114.1], tiles='openstreetmap', zoom_start=11, control_scale=True)

# Add points to the map
for idx, row in df_3.iterrows():
    folium.Marker([row['Latitude'], row['Longitude']], popup=row['district']).add_to(m)

# Display the map
m

In [16]:
m.save('hk_data_job_geodistribution.html')