<a href="https://colab.research.google.com/github/cbonnin88/EDA_Projects/blob/main/EDA_AI_jobs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [46]:
import polars as pl
import pandas as pd
import plotly.express as px
import gdown as gd

In [47]:
# Import the CVS via google drive
url = 'https://drive.google.com/uc?id=1ziyV2F8W-aEXSgGuCGPF-V06PrLUQKna'
gd.download(url,"ai_job_dataset-cleaned.csv",quiet=True)

ai_jobs = pl.read_csv("ai_job_dataset-cleaned.csv")

In [48]:
ai_jobs.head()

job_id,job_title,salary_usd,salary_currency,experience_level,employment_type,job_location,company_size,remote_ratio,required_skills,education_required,years_experience,industry,posting_date,application_deadline,job_description_length,benefits_score,company_name
str,str,i64,str,str,str,str,str,i64,str,str,i64,str,str,str,i64,f64,str
"""AI00001""","""AI Research Scientist""",90376,"""USD""","""Senior""","""Contract""","""France""","""Mid Size""",50,"""Tableau, PyTorch, Kubernetes, …","""Bachelor""",9,"""Automotive""","""2024-10-18""","""2024-11-07""",1076,5.9,"""Smart Analytics"""
"""AI00002""","""AI Software Engineer""",61895,"""USD""","""Entry Level""","""Contract""","""Canada""","""Mid Size""",100,"""Deep Learning, AWS, Mathematic…","""Master""",1,"""Media""","""2024-11-20""","""2025-01-11""",1268,5.2,"""TechCorp Inc"""
"""AI00003""","""AI Specialist""",152626,"""USD""","""Mid Level""","""Full-time""","""Switzerland""","""Large""",0,"""Kubernetes, Deep Learning, Jav…","""Associate""",2,"""Education""","""2025-03-18""","""2025-04-07""",1974,9.4,"""Autonomous Tech"""
"""AI00004""","""NLP Engineer""",80215,"""USD""","""Senior""","""Full-time""","""India""","""Mid Size""",50,"""Scala, SQL, Linux, Python""","""PhD""",7,"""Consulting""","""2024-12-23""","""2025-02-24""",1345,8.6,"""Future Systems"""
"""AI00005""","""AI Consultant""",54624,"""EUR""","""Entry Level""","""Part-time""","""France""","""Small""",100,"""MLOps, Java, Tableau, Python""","""Master""",0,"""Media""","""2025-04-15""","""2025-06-23""",1989,6.6,"""Advanced Robotics"""


In [49]:
ai_jobs.describe()

statistic,job_id,job_title,salary_usd,salary_currency,experience_level,employment_type,job_location,company_size,remote_ratio,required_skills,education_required,years_experience,industry,posting_date,application_deadline,job_description_length,benefits_score,company_name
str,str,str,f64,str,str,str,str,str,f64,str,str,f64,str,str,str,f64,f64,str
"""count""","""15000""","""15000""",15000.0,"""15000""","""15000""","""15000""","""15000""","""15000""",15000.0,"""15000""","""15000""",15000.0,"""15000""","""15000""","""15000""",15000.0,15000.0,"""15000"""
"""null_count""","""0""","""0""",0.0,"""0""","""0""","""0""","""0""","""0""",0.0,"""0""","""0""",0.0,"""0""","""0""","""0""",0.0,0.0,"""0"""
"""mean""",,,115348.965133,,,,,,49.483333,,,6.2532,,,,1503.314733,7.504273,
"""std""",,,60260.940438,,,,,,40.812712,,,5.545768,,,,576.127083,1.45087,
"""min""","""AI00001""","""AI Architect""",32519.0,"""EUR""","""Entry Level""","""Contract""","""Australia""","""Large""",0.0,"""AWS, Azure, GCP, Docker""","""Associate""",0.0,"""Automotive""","""2024-01-01""","""2024-01-16""",500.0,5.0,"""AI Innovations"""
"""25%""",,,70180.0,,,,,,0.0,,,2.0,,,,1004.0,6.2,
"""50%""",,,99724.0,,,,,,50.0,,,5.0,,,,1512.0,7.5,
"""75%""",,,146407.0,,,,,,100.0,,,10.0,,,,2000.0,8.8,
"""max""","""AI15000""","""Robotics Engineer""",399095.0,"""USD""","""Senior""","""Part-time""","""United States""","""Small""",100.0,"""TensorFlow, Tableau, SQL, Hado…","""PhD""",19.0,"""Transportation""","""2025-04-30""","""2025-07-11""",2499.0,10.0,"""TechCorp Inc"""


In [50]:
print(f"Rows: {ai_jobs.shape[0]}")
print(f"Columns: {ai_jobs.shape[1]}")

Rows: 15000
Columns: 18


In [51]:
ai_jobs.dtypes

[String,
 String,
 Int64,
 String,
 String,
 String,
 String,
 String,
 Int64,
 String,
 String,
 Int64,
 String,
 String,
 String,
 Int64,
 Float64,
 String]

In [52]:
display(ai_jobs.columns)

['job_id',
 'job_title',
 'salary_usd',
 'salary_currency',
 'experience_level',
 'employment_type',
 'job_location',
 'company_size',
 'remote_ratio',
 'required_skills',
 'education_required',
 'years_experience',
 'industry',
 'posting_date',
 'application_deadline',
 'job_description_length',
 'benefits_score',
 'company_name']

In [53]:
# Initial Missing values:
missing_values = ai_jobs.null_count()
display(missing_values)

job_id,job_title,salary_usd,salary_currency,experience_level,employment_type,job_location,company_size,remote_ratio,required_skills,education_required,years_experience,industry,posting_date,application_deadline,job_description_length,benefits_score,company_name
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# **Cleaning up the Data**

In [54]:
# Handling duplicate rows
initial_rows = ai_jobs.shape[0]
ai_jobs = ai_jobs.unique()
rows_after_duplicates = ai_jobs.shape[0]
print(f"Removed {initial_rows - rows_after_duplicates} duplicated rows")

Removed 0 duplicated rows


In [55]:
# changing  'salary-usd to salary-eur'
ai_jobs = ai_jobs.rename({'salary_usd':'salary_eur'})
display(ai_jobs.columns)

['job_id',
 'job_title',
 'salary_eur',
 'salary_currency',
 'experience_level',
 'employment_type',
 'job_location',
 'company_size',
 'remote_ratio',
 'required_skills',
 'education_required',
 'years_experience',
 'industry',
 'posting_date',
 'application_deadline',
 'job_description_length',
 'benefits_score',
 'company_name']

In [56]:
# I am ensuring that salary_eur is a numerical data type
if 'salary_eur' in ai_jobs.columns:
    ai_jobs = ai_jobs.with_columns(
        pl.col('salary_eur').cast(pl.Float64, strict=False)
    )
    print("'salary_eur changed to Float64")
else :
    print("salary_eur is already a numerical data type")

'salary_eur changed to Float64


In [57]:
# Filling missing salary data with the median
median_salary = ai_jobs.select(pl.col('salary_eur').median()).item()

if median_salary is not None:
  ai_jobs = ai_jobs.with_columns(
      pl.col('salary_eur').fill_null(median_salary)
  )
  print(f"Filled Missing 'salary_eur' with median {median_salary}")
else:
    print("Median salary cound not be calculated or was null, skipping fill for 'salary_eur'")

Filled Missing 'salary_eur' with median 99705.0


In [58]:
if 'salary_currency' in ai_jobs.columns:
  ai_jobs = ai_jobs.with_columns(
      pl.col('salary_currency').cast(pl.Float64, strict=False)
  )
  print('salary_curreny changed to Floatt64')
else:
    print("salary_currency is already a numerical data type")

salary_curreny changed to Floatt64


In [59]:
median_salary2 = ai_jobs.select(pl.col('salary_currency').median()).item()

if median_salary is not None:
  ai_jobs = ai_jobs.with_columns(
      pl.col('salary_currency').fill_null(median_salary)
  )
  print(f"Filled Missing 'salary_currency' with median {median_salary}")
else:
    print("Median salary cound not be calculated or was null, skipping fill for 'salary_currency'")

Filled Missing 'salary_currency' with median 99705.0


In [60]:
ai_jobs.describe()

statistic,job_id,job_title,salary_eur,salary_currency,experience_level,employment_type,job_location,company_size,remote_ratio,required_skills,education_required,years_experience,industry,posting_date,application_deadline,job_description_length,benefits_score,company_name
str,str,str,f64,f64,str,str,str,str,f64,str,str,f64,str,str,str,f64,f64,str
"""count""","""15000""","""15000""",15000.0,15000.0,"""15000""","""15000""","""15000""","""15000""",15000.0,"""15000""","""15000""",15000.0,"""15000""","""15000""","""15000""",15000.0,15000.0,"""15000"""
"""null_count""","""0""","""0""",0.0,0.0,"""0""","""0""","""0""","""0""",0.0,"""0""","""0""",0.0,"""0""","""0""","""0""",0.0,0.0,"""0"""
"""mean""",,,115348.965133,99705.0,,,,,49.483333,,,6.2532,,,,1503.314733,7.504273,
"""std""",,,60260.940438,0.0,,,,,40.812712,,,5.545768,,,,576.127083,1.45087,
"""min""","""AI00001""","""AI Architect""",32519.0,99705.0,"""Entry Level""","""Contract""","""Australia""","""Large""",0.0,"""AWS, Azure, GCP, Docker""","""Associate""",0.0,"""Automotive""","""2024-01-01""","""2024-01-16""",500.0,5.0,"""AI Innovations"""
"""25%""",,,70180.0,99705.0,,,,,0.0,,,2.0,,,,1004.0,6.2,
"""50%""",,,99724.0,99705.0,,,,,50.0,,,5.0,,,,1512.0,7.5,
"""75%""",,,146407.0,99705.0,,,,,100.0,,,10.0,,,,2000.0,8.8,
"""max""","""AI15000""","""Robotics Engineer""",399095.0,99705.0,"""Senior""","""Part-time""","""United States""","""Small""",100.0,"""TensorFlow, Tableau, SQL, Hado…","""PhD""",19.0,"""Transportation""","""2025-04-30""","""2025-07-11""",2499.0,10.0,"""TechCorp Inc"""


In [61]:
ai_jobs.head()

job_id,job_title,salary_eur,salary_currency,experience_level,employment_type,job_location,company_size,remote_ratio,required_skills,education_required,years_experience,industry,posting_date,application_deadline,job_description_length,benefits_score,company_name
str,str,f64,f64,str,str,str,str,i64,str,str,i64,str,str,str,i64,f64,str
"""AI01110""","""AI Research Scientist""",53959.0,99705.0,"""Entry Level""","""Part-time""","""Canada""","""Small""",50,"""GCP, Linux, Python, TensorFlow""","""Master""",1,"""Gaming""","""2024-12-26""","""2025-01-17""",2248,8.5,"""Algorithmic Solutions"""
"""AI05885""","""Robotics Engineer""",64132.0,99705.0,"""Entry Level""","""Part-time""","""Netherlands""","""Large""",50,"""Mathematics, Spark, Tableau, P…","""PhD""",1,"""Automotive""","""2024-02-19""","""2024-03-07""",960,7.4,"""Machine Intelligence Group"""
"""AI12309""","""Research Scientist""",160145.0,99705.0,"""Executive""","""Full-time""","""South Korea""","""Small""",0,"""SQL, PyTorch, R""","""Bachelor""",11,"""Technology""","""2024-06-26""","""2024-08-27""",727,6.5,"""Cognitive Computing"""
"""AI00258""","""Head of AI""",59373.0,99705.0,"""Entry Level""","""Contract""","""Austria""","""Large""",0,"""Python, GCP, Mathematics, Scal…","""PhD""",0,"""Gaming""","""2024-07-17""","""2024-08-28""",1833,5.3,"""DataVision Ltd"""
"""AI03086""","""NLP Engineer""",128439.0,99705.0,"""Senior""","""Part-time""","""Singapore""","""Large""",0,"""Java, Azure, Tableau, Python""","""PhD""",8,"""Education""","""2025-03-09""","""2025-05-04""",985,7.6,"""AI Innovations"""


In [62]:
# Making sure that remote_ration is treated as categorical for analysis
if 'remote_ratio' in ai_jobs.columns:
  ai_jobs = ai_jobs.with_columns(
      pl.col('remote_ratio').cast(pl.Utf8)
  )
  print('remote_ratio was type casted')
else:
  print("Warning: 'remote_ratio' column not found. skipping type cast for it")

remote_ratio was type casted


In [63]:
if 'posting_date' in ai_jobs.columns and ai_jobs['posting_date'].dtype != pl.Datetime:
  ai_jobs = ai_jobs.with_columns(
      pl.col('posting_date').str.to_datetime(format='%Y-%m-%d', strict=False)
  )
  print("'posted_date' was converted to a Datetime Type")
else:
  print("posted_date is already a Datetime Type")

'posted_date' was converted to a Datetime Type


In [64]:
if 'application_deadline' in ai_jobs.columns:
  ai_jobs = ai_jobs.with_columns(
      pl.col('application_deadline').str.to_datetime(format='%Y-%m-%d',strict=False)
  )
  print("application_deadline was converted to a Datetime Type")
else:
  print("application_deadline is already a Datetime Type")

application_deadline was converted to a Datetime Type


In [65]:
print('Missing values count after cleaning')
display(ai_jobs.null_count())

Missing values count after cleaning


job_id,job_title,salary_eur,salary_currency,experience_level,employment_type,job_location,company_size,remote_ratio,required_skills,education_required,years_experience,industry,posting_date,application_deadline,job_description_length,benefits_score,company_name
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [66]:
display(ai_jobs.schema)

Schema([('job_id', String),
        ('job_title', String),
        ('salary_eur', Float64),
        ('salary_currency', Float64),
        ('experience_level', String),
        ('employment_type', String),
        ('job_location', String),
        ('company_size', String),
        ('remote_ratio', String),
        ('required_skills', String),
        ('education_required', String),
        ('years_experience', Int64),
        ('industry', String),
        ('posting_date', Datetime(time_unit='us', time_zone=None)),
        ('application_deadline', Datetime(time_unit='us', time_zone=None)),
        ('job_description_length', Int64),
        ('benefits_score', Float64),
        ('company_name', String)])

# **Data Analysis and Visualization**

In [None]:
# Distribution of Job Titles
job_title_counts = ai_jobs.group_by('job_title').len().sort('len', descending=True).head(10)
display(job_title_counts)

job_title,len
str,u32
"""Machine Learning Researcher""",808
"""AI Software Engineer""",784
"""Autonomous Systems Engineer""",777
"""Machine Learning Engineer""",772
"""AI Architect""",771
"""Head of AI""",765
"""NLP Engineer""",762
"""Data Analyst""",759
"""Robotics Engineer""",759
"""AI Research Scientist""",756


In [77]:
fig_job_titles = px.bar(
    job_title_counts,
    x='job_title',
    y='len',
    labels={'job_title':'Job Title','len':'Number of Positions'},
    color='job_title',
    text_auto= True
)
fig_job_titles.update_layout(
    title=dict(
        text="Top 10 Jobs",
        x=0.5,
        xanchor="center"
    ),
    xaxis=dict(title='Job Titles'),
    yaxis=dict(title='Number of Positions'),
    legend_title='Job Titles'
)
fig_job_titles.show()

In [None]:
# Salary Distribution
salaries = ai_jobs.select([
    pl.col('salary_eur').round(2).alias('salaries')
])

fig_salary= px.histogram(
    salaries,
    x='salaries',
    nbins=50,
    title='Salary Distribution (EUR)',
    labels={'salaries':'Salary in EUR'}
)

fig_salary.update_layout(
    title=dict(
        text='Salary Distribution (EUR)',
        x=0.5,
        xanchor="center")
    ),

fig_salary.show()

fig_salary_box= px.box(
    salaries,
    y='salaries',
    labels={'salaries':'Salary in EUR'}
)


fig_salary_box.show()

In [None]:
# Experience Level vs. Salary
avg_salary_by_exp = ai_jobs.group_by('experience_level').agg(
    pl.col('salary_eur').mean().round(0).alias('average_salary_eur')
).sort('average_salary_eur', descending=True)

display(avg_salary_by_exp)

experience_level,average_salary_eur
str,f64
"""Executive""",187724.0
"""Senior""",122188.0
"""Mid Level""",87955.0
"""Entry Level""",63133.0


In [None]:
fig_exp_salary = px.bar(
    avg_salary_by_exp,
    x='experience_level',
    y='average_salary_eur',
    title='Average Salary by Experience Level',
    labels={'experience_level':'Experience Level','average_salary_eur':'Average Salary (EUR)'},
    color='experience_level',
    text_auto=True
)

fig_exp_salary.update_layout(
    title=dict(
        text="Average Salary by Experience Level",
        x=0.5,
        xanchor="center")
)

fig_exp_salary.show()

In [None]:
# Company Location vs Salary (Top 5)
avg_salary_by_location= ai_jobs.group_by('job_location').agg(
    pl.col('salary_eur').mean().round(0).alias('average_salary_eur')).sort('average_salary_eur',descending=True).head(5)

In [None]:
fig_location_salary = px.bar(
    avg_salary_by_location,
    x='job_location',
    y='average_salary_eur',
    title='Top 5 Company Locations by Average Salary',
    labels={'job_location':'Country','average_salary_eur':'Average Salary (EUR)'},
    color = 'job_location',
    text_auto=True
)

fig_location_salary.show()

In [None]:
# Where does France Stand
if 'job_location' in ai_jobs.columns and 'salary_eur' in ai_jobs.columns:
  all_avg_salary_by_location = ai_jobs.group_by('job_location').agg(
    pl.col('salary_eur').mean().round(0).alias('average_salary_eur')
).sort('average_salary_eur',descending=True)

In [None]:
# Getting the top 5 locations
top_five_locations = all_avg_salary_by_location.head(5)

# Getting France's average Salary
france_salary_df = all_avg_salary_by_location.filter(
    pl.col('job_location')=='France'
)

In [None]:
# Combining the locations
combined_locations = top_five_locations.clone()
if 'France' not in top_five_locations['job_location'].to_list() and france_salary_df.shape[0] > 0:
  combined_locations = pl.concat([combined_locations,france_salary_df])
  combined_locations_final = combined_locations.sort('average_salary_eur',descending=True)

In [None]:
fig_salary_comparison = px.bar(
    combined_locations_final,
    x='job_location',
    y='average_salary_eur',
    title='France compared to the Top Five Highest paid',
    labels={'job_location':'Country','average_salary_eur':'Average Salary (EUR)'},
    text_auto=True,
    color='job_location',
    color_discrete_map={'France':'lightgreen',
                        "Switzerland":"blue",
                        "Denmark":"blue",
                        "Norway":"blue",
                        "United States":"blue",
                        "United Kingdom":"blue"}
)

fig_salary_comparison.show()

In [None]:
# Remote Ratio vs Salary
avg_salary_by_remote = ai_jobs.group_by('remote_ratio').agg(
    pl.col('salary_eur').mean().round(0).alias('salary_eur')
).sort('remote_ratio')

In [None]:
fig_remote_salary = px.bar(
    avg_salary_by_remote,
    x='remote_ratio',
    y='salary_eur',
    title='Average Salary by Remote Ratio',
    labels={'remote_ratio':'Remote Jobs Ratio (%)','salary_eur':'Averagey Salary (EUR)'},
    color='remote_ratio',
    text_auto= True
)


fig_remote_salary.show()

In [73]:
# Average salary by Contract
avg_salary_by_contract = ai_jobs.group_by('employment_type').agg(
    pl.col('salary_eur').mean().round(0).alias('salary_eur')
).sort('employment_type')

In [76]:
fig_contract_salary = px.bar(
    avg_salary_by_contract,
    x='employment_type',
    y='salary_eur',
    title='Average Salary by Contract Type',
    labels={'employment_type':'Employment Type Ratio (%)','salary_eur':'Averagey Salary (EUR)'},
    color='employment_type',
    text_auto= True
)


fig_contract_salary.show()

In [79]:
# Job Title by salary
avg_salary_by_job= ai_jobs.group_by('job_title').agg(
    pl.col('salary_eur').mean().round(0).alias('average_salary_eur')).sort('average_salary_eur',descending=True).head(5)

display(avg_salary_by_job)

job_title,average_salary_eur
str,f64
"""AI Specialist""",120571.0
"""Machine Learning Engineer""",118828.0
"""Head of AI""",118543.0
"""AI Research Scientist""",117898.0
"""AI Architect""",117437.0


In [82]:
fig_job_salary = px.bar(
    avg_salary_by_job,
    x='job_title',
    y='average_salary_eur',
    labels={'job_title':'Job Title','average_salary_eur':'Average Salary (EUR)'},
    color='job_title',
    text_auto= True
)
fig_job_salary.update_layout(
    title=dict(
        text="Top 5 Jobs by Salary",
        x=0.5,
        xanchor="center"
    ),
    xaxis=dict(title='Job Titles'),
    yaxis=dict(title='Salaries '),
    legend_title='Job Titles'
)
fig_job_salary.show()