In [1]:
import pandas as pd
# Read the salaries dataset from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='ds_salaries.csv')
df.head(n=5)

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M


In [2]:
# Keep only the columns needed for the analysis: job title, salary in USD and
# company location.
# The explicit .copy() makes the result an independent DataFrame, so assigning
# to one of its columns later in the notebook does not trigger pandas'
# SettingWithCopyWarning.
df = df[['job_title', 'salary_in_usd', 'company_location']].copy()
df.head()

Unnamed: 0,job_title,salary_in_usd,company_location
0,Principal Data Scientist,85847,ES
1,ML Engineer,30000,US
2,ML Engineer,25500,US
3,Data Scientist,175000,CA
4,Data Scientist,120000,CA


In [3]:
# List the distinct company locations, in order of first appearance.
pd.unique(df['company_location'])

array(['ES', 'US', 'CA', 'DE', 'GB', 'NG', 'IN', 'HK', 'NL', 'CH', 'CF',
       'FR', 'FI', 'UA', 'IE', 'IL', 'GH', 'CO', 'SG', 'AU', 'SE', 'SI',
       'MX', 'BR', 'PT', 'RU', 'TH', 'HR', 'VN', 'EE', 'AM', 'BA', 'KE',
       'GR', 'MK', 'LV', 'RO', 'PK', 'IT', 'MA', 'PL', 'AL', 'AR', 'LT',
       'AS', 'CR', 'IR', 'BS', 'HU', 'AT', 'SK', 'CZ', 'TR', 'PR', 'DK',
       'BO', 'PH', 'BE', 'ID', 'EG', 'AE', 'LU', 'MY', 'HN', 'JP', 'DZ',
       'IQ', 'CN', 'NZ', 'CL', 'MD', 'MT'], dtype=object)

In [4]:
#convert location from ISO-2 to ISO-3
import pycountry

# Function to convert ISO 3166-1 alpha-2 codes to ISO 3166-1 alpha-3 codes
def alpha2_to_alpha3(alpha2_code):
    """Convert an ISO 3166-1 alpha-2 country code to its alpha-3 code.

    Parameters
    ----------
    alpha2_code : str
        Two-letter country code, e.g. ``'ES'``.

    Returns
    -------
    str or None
        The three-letter code (e.g. ``'ESP'``), or ``None`` when pycountry
        has no matching country for ``alpha2_code``.
    """
    try:
        country = pycountry.countries.get(alpha_2=alpha2_code)
    except (AttributeError, LookupError):
        # NOTE(review): depending on the installed pycountry release, an
        # unknown or non-string code may raise (e.g. KeyError/LookupError)
        # instead of returning None - treat both as "no match".
        return None

    # Explicit "not found" check instead of relying on the AttributeError
    # raised by ``None.alpha_3``.
    if country is None:
        return None
    return country.alpha_3
    
# Replace every alpha-2 company location with its alpha-3 equivalent.
df['company_location'] = df['company_location'].map(alpha2_to_alpha3)

In [5]:
# Check the converted codes: list the distinct company locations again.
pd.unique(df['company_location'])

array(['ESP', 'USA', 'CAN', 'DEU', 'GBR', 'NGA', 'IND', 'HKG', 'NLD',
       'CHE', 'CAF', 'FRA', 'FIN', 'UKR', 'IRL', 'ISR', 'GHA', 'COL',
       'SGP', 'AUS', 'SWE', 'SVN', 'MEX', 'BRA', 'PRT', 'RUS', 'THA',
       'HRV', 'VNM', 'EST', 'ARM', 'BIH', 'KEN', 'GRC', 'MKD', 'LVA',
       'ROU', 'PAK', 'ITA', 'MAR', 'POL', 'ALB', 'ARG', 'LTU', 'ASM',
       'CRI', 'IRN', 'BHS', 'HUN', 'AUT', 'SVK', 'CZE', 'TUR', 'PRI',
       'DNK', 'BOL', 'PHL', 'BEL', 'IDN', 'EGY', 'ARE', 'LUX', 'MYS',
       'HND', 'JPN', 'DZA', 'IRQ', 'CHN', 'NZL', 'CHL', 'MDA', 'MLT'],
      dtype=object)

In [6]:
# Mean of the numeric columns for every (job title, company location) pair.
df.groupby(by=['job_title', 'company_location']).agg('mean')

Unnamed: 0_level_0,Unnamed: 1_level_0,salary_in_usd
job_title,company_location,Unnamed: 2_level_1
3D Computer Vision Researcher,ALB,10000.000000
3D Computer Vision Researcher,ASM,20000.000000
3D Computer Vision Researcher,CRI,50000.000000
3D Computer Vision Researcher,IND,5409.000000
AI Developer,BIH,120000.000000
...,...,...
Research Scientist,USA,179146.206897
Software Data Engineer,AUS,50000.000000
Software Data Engineer,SGP,75020.000000
Staff Data Analyst,CAN,15000.000000


In [7]:
# Draw a world choropleth ("heatmap") of the average salary per company
# location for each job title, and save all maps to a single PDF file.
# Each PDF page is titled with its job title and shows the average salary
# for each location for that job title.
import plotly.express as px
from matplotlib.backends.backend_pdf import PdfPages
import io
from PIL import Image
import matplotlib.pyplot as plt

# Write one PDF page per job title. Each page is a world choropleth coloured
# by the mean salary (USD) of that job title in every company location.
with PdfPages('job_title_heatmaps.pdf') as pdf:
    # sort=False keeps the job titles in order of first appearance in df.
    for job, job_rows in df.groupby('job_title', sort=False):
        # A choropleth shows a single value per country, so collapse the raw
        # rows to one mean salary per location before plotting.
        job_avg = (
            job_rows
            .groupby('company_location', as_index=False)['salary_in_usd']
            .mean()
        )

        choropleth = px.choropleth(job_avg,
                                   locations='company_location',
                                   color='salary_in_usd',
                                   hover_name='company_location',
                                   locationmode='ISO-3',
                                   color_continuous_scale=px.colors.sequential.Plasma)
        choropleth.update_layout(title_text=job)

        # PdfPages.savefig takes a Matplotlib figure, so rasterise the Plotly
        # map to PNG in memory and load it as a PIL image.
        img_buf = io.BytesIO()
        choropleth.write_image(img_buf, format='png')
        img_buf.seek(0)

        with Image.open(img_buf) as pil_img:
            # Wrap the PNG in an axis-less Matplotlib figure for the PDF page.
            page_fig, ax = plt.subplots(figsize=(10, 6))
            ax.imshow(pil_img)
            ax.axis('off')

            # Save this page, then close exactly this figure so figures do
            # not accumulate in memory across the loop.
            pdf.savefig(page_fig)
            plt.close(page_fig)

      