# AI POWERED JOB INSIGHTS - SIMPLE CLEANING DATASET


Laksika Tharmalingam. (2024). AI-Powered Job Market Insights. Kaggle.com. https://www.kaggle.com/datasets/uom190346a/ai-powered-job-market-insights/data


In [65]:
#importing

import numpy as np
import pandas as pd


In [66]:
dataset = pd.read_csv("ai_job_market_insights.csv")

In [67]:
dataset.head(10)

Unnamed: 0,Job_Title,Industry,Company_Size,Location,AI_Adoption_Level,Automation_Risk,Required_Skills,Salary_USD,Remote_Friendly,Job_Growth_Projection
0,Cybersecurity Analyst,Entertainment,Small,Dubai,Medium,High,UX/UI Design,111392.165243,Yes,Growth
1,Marketing Specialist,Technology,Large,Singapore,Medium,High,Marketing,93792.562466,No,Decline
2,AI Researcher,Technology,Large,Singapore,Medium,High,UX/UI Design,107170.263069,Yes,Growth
3,Sales Manager,Retail,Small,Berlin,Low,High,Project Management,93027.953758,No,Growth
4,Cybersecurity Analyst,Entertainment,Small,Tokyo,Low,Low,JavaScript,87752.922171,Yes,Decline
5,UX Designer,Education,Large,San Francisco,Medium,Medium,Cybersecurity,102825.007867,No,Growth
6,HR Manager,Finance,Medium,Singapore,Low,High,Sales,102065.720673,Yes,Growth
7,Cybersecurity Analyst,Technology,Small,Dubai,Medium,Low,Machine Learning,86607.317618,Yes,Decline
8,AI Researcher,Retail,Large,London,High,Low,JavaScript,75015.860846,No,Stable
9,Sales Manager,Entertainment,Medium,Singapore,High,Low,Cybersecurity,96834.578295,Yes,Decline


In [68]:
dataset.columns

Index(['Job_Title', 'Industry', 'Company_Size', 'Location',
       'AI_Adoption_Level', 'Automation_Risk', 'Required_Skills', 'Salary_USD',
       'Remote_Friendly', 'Job_Growth_Projection'],
      dtype='object')

No columns seem misspelt 

In [69]:
dataset.shape

(500, 10)

500 rows, 10 columns

In [70]:
dataset.dtypes

Job_Title                 object
Industry                  object
Company_Size              object
Location                  object
AI_Adoption_Level         object
Automation_Risk           object
Required_Skills           object
Salary_USD               float64
Remote_Friendly           object
Job_Growth_Projection     object
dtype: object

In [71]:
#checking to see if anything is missing 
dataset.isna().sum()

Job_Title                0
Industry                 0
Company_Size             0
Location                 0
AI_Adoption_Level        0
Automation_Risk          0
Required_Skills          0
Salary_USD               0
Remote_Friendly          0
Job_Growth_Projection    0
dtype: int64

no data seems to be NA

In [72]:
#checking for duplicates
dataset.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
495    False
496    False
497    False
498    False
499    False
Length: 500, dtype: bool

no data seem to be duplicated

In [73]:
dataset["Industry"].unique()

array(['Entertainment', 'Technology', 'Retail', 'Education', 'Finance',
       'Transportation', 'Telecommunications', 'Manufacturing',
       'Healthcare', 'Energy'], dtype=object)

In [74]:
#getting a feel of the values in the dataset
for i in range(len(dataset.columns)):
    columns = dataset.columns[i]
    unique_values = dataset.iloc[:, i].unique()
    print(f"{columns}\n {unique_values} \n")

Job_Title
 ['Cybersecurity Analyst' 'Marketing Specialist' 'AI Researcher'
 'Sales Manager' 'UX Designer' 'HR Manager' 'Product Manager'
 'Software Engineer' 'Data Scientist' 'Operations Manager'] 

Industry
 ['Entertainment' 'Technology' 'Retail' 'Education' 'Finance'
 'Transportation' 'Telecommunications' 'Manufacturing' 'Healthcare'
 'Energy'] 

Company_Size
 ['Small' 'Large' 'Medium'] 

Location
 ['Dubai' 'Singapore' 'Berlin' 'Tokyo' 'San Francisco' 'London' 'Paris'
 'Sydney' 'New York' 'Toronto'] 

AI_Adoption_Level
 ['Medium' 'Low' 'High'] 

Automation_Risk
 ['High' 'Low' 'Medium'] 

Required_Skills
 ['UX/UI Design' 'Marketing' 'Project Management' 'JavaScript'
 'Cybersecurity' 'Sales' 'Machine Learning' 'Python' 'Data Analysis'
 'Communication'] 

Salary_USD
 [111392.16524316  93792.56246611 107170.26306895  93027.95375786
  87752.92217059 102825.00786657 102065.72067348  86607.3176181
  75015.86084571  96834.57829481  91566.97337925  78902.56574458
  73151.99034926  98209.51687

In [75]:
#updating the values to be rounding to the nearest number as often salary ranges aren't given based upon cents etc
dataset["Salary_USD"] = dataset["Salary_USD"].round(0)

In [76]:
dataset

Unnamed: 0,Job_Title,Industry,Company_Size,Location,AI_Adoption_Level,Automation_Risk,Required_Skills,Salary_USD,Remote_Friendly,Job_Growth_Projection
0,Cybersecurity Analyst,Entertainment,Small,Dubai,Medium,High,UX/UI Design,111392.0,Yes,Growth
1,Marketing Specialist,Technology,Large,Singapore,Medium,High,Marketing,93793.0,No,Decline
2,AI Researcher,Technology,Large,Singapore,Medium,High,UX/UI Design,107170.0,Yes,Growth
3,Sales Manager,Retail,Small,Berlin,Low,High,Project Management,93028.0,No,Growth
4,Cybersecurity Analyst,Entertainment,Small,Tokyo,Low,Low,JavaScript,87753.0,Yes,Decline
...,...,...,...,...,...,...,...,...,...,...
495,Data Scientist,Telecommunications,Medium,Berlin,Low,Medium,Machine Learning,105821.0,Yes,Stable
496,Cybersecurity Analyst,Telecommunications,Small,London,Low,High,UX/UI Design,119795.0,No,Decline
497,Cybersecurity Analyst,Energy,Large,Dubai,High,Low,UX/UI Design,79645.0,Yes,Stable
498,Operations Manager,Healthcare,Large,Paris,High,Low,Python,77642.0,Yes,Stable


In [77]:
#creating long and lat column
dataset.insert(10,"long",0)
dataset.insert(11,"lat",0)
dataset

Unnamed: 0,Job_Title,Industry,Company_Size,Location,AI_Adoption_Level,Automation_Risk,Required_Skills,Salary_USD,Remote_Friendly,Job_Growth_Projection,long,lat
0,Cybersecurity Analyst,Entertainment,Small,Dubai,Medium,High,UX/UI Design,111392.0,Yes,Growth,0,0
1,Marketing Specialist,Technology,Large,Singapore,Medium,High,Marketing,93793.0,No,Decline,0,0
2,AI Researcher,Technology,Large,Singapore,Medium,High,UX/UI Design,107170.0,Yes,Growth,0,0
3,Sales Manager,Retail,Small,Berlin,Low,High,Project Management,93028.0,No,Growth,0,0
4,Cybersecurity Analyst,Entertainment,Small,Tokyo,Low,Low,JavaScript,87753.0,Yes,Decline,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
495,Data Scientist,Telecommunications,Medium,Berlin,Low,Medium,Machine Learning,105821.0,Yes,Stable,0,0
496,Cybersecurity Analyst,Telecommunications,Small,London,Low,High,UX/UI Design,119795.0,No,Decline,0,0
497,Cybersecurity Analyst,Energy,Large,Dubai,High,Low,UX/UI Design,79645.0,Yes,Stable,0,0
498,Operations Manager,Healthcare,Large,Paris,High,Low,Python,77642.0,Yes,Stable,0,0


In [79]:
# create masks for each location and update the 'long' and 'lat' columns

# Dubai
dubai_mask = dataset["Location"] == "Dubai"
dataset.loc[dubai_mask, "long"] = 25.2048
dataset.loc[dubai_mask, "lat"] = 55.2708

# Singapore
singapore_mask = dataset["Location"] == "Singapore"
dataset.loc[singapore_mask, "long"] = 103.8198
dataset.loc[singapore_mask, "lat"] = 1.3521

# Berlin
berlin_mask = dataset["Location"] == "Berlin"
dataset.loc[berlin_mask, "long"] = 13.4050
dataset.loc[berlin_mask, "lat"] = 52.5200

# Tokyo
tokyo_mask = dataset["Location"] == "Tokyo"
dataset.loc[tokyo_mask, "long"] = 139.6917
dataset.loc[tokyo_mask, "lat"] = 35.6895

# San Francisco
sf_mask = dataset["Location"] == "San Francisco"
dataset.loc[sf_mask, "long"] = -122.4194
dataset.loc[sf_mask, "lat"] = 37.7749

# London
london_mask = dataset["Location"] == "London"
dataset.loc[london_mask, "long"] = -0.1276
dataset.loc[london_mask, "lat"] = 51.5074

# Paris
paris_mask = dataset["Location"] == "Paris"
dataset.loc[paris_mask, "long"] = 2.3522
dataset.loc[paris_mask, "lat"] = 48.8566

# Sydney
sydney_mask = dataset["Location"] == "Sydney"
dataset.loc[sydney_mask, "long"] = 151.2093
dataset.loc[sydney_mask, "lat"] = -33.8688

# New York
ny_mask = dataset["Location"] == "New York"
dataset.loc[ny_mask, "long"] = -74.0060
dataset.loc[ny_mask, "lat"] = 40.7128

# Toronto
toronto_mask = dataset["Location"] == "Toronto"
dataset.loc[toronto_mask, "long"] = -79.3832
dataset.loc[toronto_mask, "lat"] = 43.6532

In [80]:
dataset

Unnamed: 0,Job_Title,Industry,Company_Size,Location,AI_Adoption_Level,Automation_Risk,Required_Skills,Salary_USD,Remote_Friendly,Job_Growth_Projection,long,lat
0,Cybersecurity Analyst,Entertainment,Small,Dubai,Medium,High,UX/UI Design,111392.0,Yes,Growth,25.2048,55.2708
1,Marketing Specialist,Technology,Large,Singapore,Medium,High,Marketing,93793.0,No,Decline,103.8198,1.3521
2,AI Researcher,Technology,Large,Singapore,Medium,High,UX/UI Design,107170.0,Yes,Growth,103.8198,1.3521
3,Sales Manager,Retail,Small,Berlin,Low,High,Project Management,93028.0,No,Growth,13.4050,52.5200
4,Cybersecurity Analyst,Entertainment,Small,Tokyo,Low,Low,JavaScript,87753.0,Yes,Decline,139.6917,35.6895
...,...,...,...,...,...,...,...,...,...,...,...,...
495,Data Scientist,Telecommunications,Medium,Berlin,Low,Medium,Machine Learning,105821.0,Yes,Stable,13.4050,52.5200
496,Cybersecurity Analyst,Telecommunications,Small,London,Low,High,UX/UI Design,119795.0,No,Decline,-0.1276,51.5074
497,Cybersecurity Analyst,Energy,Large,Dubai,High,Low,UX/UI Design,79645.0,Yes,Stable,25.2048,55.2708
498,Operations Manager,Healthcare,Large,Paris,High,Low,Python,77642.0,Yes,Stable,2.3522,48.8566


In [82]:
dataset.to_csv("cleaned_dataset_ai_job_insights.csv")