In [2]:
import pandas as pd
import requests
import ast

#Loading Data_Jobs Dataset
#%pip install datasets
from datasets import load_dataset
def _parse_skill_list(raw):
    """Evaluate a stringified Python literal; missing values pass through unchanged."""
    return ast.literal_eval(raw) if pd.notna(raw) else raw


# Fetch the dataset; only its 'train' split is used below.
dataset = load_dataset('lukebarousse/data_jobs')

# Data cleanup, as one chain:
#   1. keep only rows that have a yearly salary,
#   2. parse the posting timestamp into datetime,
#   3. turn each non-missing job_skills string into the Python object it spells out.
df = (
    dataset['train']
    .to_pandas()
    .dropna(subset=['salary_year_avg'])
    .assign(
        job_posted_date=lambda frame: pd.to_datetime(frame['job_posted_date']),
        job_skills=lambda frame: frame['job_skills'].apply(_parse_skill_list),
    )
)

df.head(3)

Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,job_country,salary_rate,salary_year_avg,salary_hour_avg,company_name,job_skills,job_type_skills
28,Data Scientist,CRM Data Specialist,"San José Province, San José, Costa Rica",via Ai-Jobs.net,Full-time,False,Costa Rica,2023-08-01 13:37:57,False,False,Costa Rica,year,109500.0,,Netskope,"[gdpr, excel]","{'analyst_tools': ['excel'], 'libraries': ['gd..."
77,Data Engineer,Data Engineer,"Arlington, VA",via LinkedIn,Full-time,False,Sudan,2023-06-26 14:22:54,False,False,Sudan,year,140000.0,,Intelletec,"[mongodb, mongodb, python, r, sql, mysql, mari...","{'analyst_tools': ['tableau'], 'cloud': ['orac..."
92,Data Engineer,Remote - Data Engineer - Permanent - W2,Anywhere,via LinkedIn,Full-time,True,"Illinois, United States",2023-02-21 13:29:59,False,True,United States,year,120000.0,,Apex Systems,"[sql, python]","{'programming': ['sql', 'python']}"


In [3]:
#Loading USD -> EUR exchange rate
def get_usd_to_eur_rate_from_erapi(timeout=10):
    """Fetch the latest USD -> EUR exchange rate from open.er-api.com.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever on a stalled
            connection, which would hang the whole notebook run.

    Returns:
        The value stored under ``rates['EUR']`` when the JSON payload reports
        ``result == 'success'`` and contains an EUR entry; otherwise ``None``.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
        Any other error raised by ``requests.get`` or ``response.json()``
        propagates unchanged.
    """
    url = 'https://open.er-api.com/v6/latest/USD'
    response = requests.get(url, timeout=timeout)
    data = response.json()

    # Anything other than an explicit success is reported as "no rate".
    if data.get('result') != 'success':
        return None

    # A missing 'rates' mapping or a missing 'EUR' entry both yield None.
    return data.get('rates', {}).get('EUR')

x_rate = get_usd_to_eur_rate_from_erapi()

# The helper returns None when the API response has no successful EUR rate.
# Stop here in that case: otherwise an empty rate would be written to disk and
# carried into the frame that later cells merge on and multiply by.
if x_rate is None:
    raise RuntimeError(
        'USD -> EUR exchange rate is unavailable: the API response did not '
        'contain a successful EUR rate'
    )

# Create DataFrame and save to CSV
# NOTE(review): the rate fetched is the latest one at run time, yet it is
# labelled with year 2023 - confirm this is the intended rate for 2023 data.
df_ex_rate = pd.DataFrame([{'year': 2023, 'usd_to_eur': x_rate}]).round(4)
df_ex_rate.to_csv('/Users/kolesnikevgenia/Documents/Python_Projects/Job_Skills/Raw_Data/ex_rate.csv', index=False)

df_ex_rate

Unnamed: 0,year,usd_to_eur
0,2023,0.8815


In [4]:
#Loading EU countries dictionary
# The lookup file is semicolon-separated; `sep` is the canonical name of the
# `delimiter` argument.
df_EU = pd.read_csv(
    '/Users/kolesnikevgenia/Documents/Python_Projects/Job_Skills/Raw_Data/EU_countries_dict.csv',
    sep=';',
)

df_EU.head(3)

Unnamed: 0,country,is_eu
0,Austria,True
1,Belgium,True
2,Bulgaria,True


In [5]:
# Convert salary_year_avg from USD to EUR and derive the monthly salary in EUR.
# Steps, in order:
#   - rename the yearly salary column so its currency is explicit,
#   - extract the posting year, which is the join key for the exchange-rate table,
#   - left-merge the rate (years absent from df_ex_rate get a NaN rate),
#   - compute the yearly EUR salary, then the monthly one from it.
df_Final = (
    df.rename(columns={'salary_year_avg': 'salary_year_avg_usd'})
    .assign(year=lambda frame: pd.to_datetime(frame['job_posted_date']).dt.year)
    .merge(df_ex_rate, how='left', on='year')
    .assign(
        salary_year_avg_eur=lambda frame: (
            frame['salary_year_avg_usd'] * frame['usd_to_eur']
        ).round(2),
        # Evaluated after salary_year_avg_eur, so it can build on that column.
        salary_month_avg_eur=lambda frame: (frame['salary_year_avg_eur'] / 12).round(2),
    )
)

preview_columns = [
    'job_title_short',
    'usd_to_eur',
    'salary_year_avg_usd',
    'salary_year_avg_eur',
    'salary_month_avg_eur',
]
df_Final.loc[1:3, preview_columns]


Unnamed: 0,job_title_short,usd_to_eur,salary_year_avg_usd,salary_year_avg_eur,salary_month_avg_eur
1,Data Engineer,0.8815,140000.0,123410.0,10284.17
2,Data Engineer,0.8815,120000.0,105780.0,8815.0
3,Data Scientist,0.8815,228222.0,201177.69,16764.81


In [6]:
# Add EU column
# Opt in to pandas' future behaviour so the fillna() below does not silently
# downcast its result; the explicit astype(bool) does the conversion instead.
pd.set_option('future.no_silent_downcasting', True)

# Left-join the EU lookup on country name. Countries missing from the lookup
# get a missing is_eu value, which is then treated as "not EU".
df_Final = df_Final.merge(
    df_EU,
    how='left',
    left_on='job_country',
    right_on='country',
).assign(is_eu=lambda frame: frame['is_eu'].fillna(False).astype(bool))

df_Final.loc[1:10, ['job_title_short', 'job_country', 'is_eu']]

Unnamed: 0,job_title_short,job_country,is_eu
1,Data Engineer,Sudan,False
2,Data Engineer,United States,False
3,Data Scientist,United States,False
4,Data Analyst,United States,False
5,Data Scientist,United States,False
6,Data Engineer,United States,False
7,Data Analyst,United States,False
8,Data Scientist,Belgium,True
9,Data Scientist,United States,False
10,Data Engineer,United States,False


In [7]:
#Add Country grouping column
def group_country(row):
    """Map one posting to its region label.

    Args:
        row: Any mapping-like object supporting ``row['job_country']`` and
            ``row['is_eu']`` (e.g. a DataFrame row).

    Returns:
        'US' when ``job_country`` equals 'United States' (checked first, so
        ``is_eu`` is not consulted for US rows), 'EU' when ``is_eu`` compares
        equal to ``True``, and 'Other' in every remaining case.
    """
    if row['job_country'] == 'United States':
        return 'US'
    # Explicit comparison with True is kept on purpose: truthy non-boolean
    # values such as the string 'True' must not count as EU.
    return 'EU' if row['is_eu'] == True else 'Other'
    
# Label every posting with its region. axis='columns' (the same as axis=1)
# makes apply() pass group_country one row at a time.
df_Final['region_group'] = df_Final.apply(group_country, axis='columns')

df_Final.loc[1:10, ['region_group', 'job_country', 'salary_month_avg_eur']]

Unnamed: 0,region_group,job_country,salary_month_avg_eur
1,Other,Sudan,10284.17
2,US,United States,8815.0
3,US,United States,16764.81
4,US,United States,6537.79
5,US,United States,8374.25
6,US,United States,9512.85
7,US,United States,6629.62
8,EU,Belgium,11569.69
9,US,United States,7575.61
10,US,United States,13589.79


In [8]:
# Remove the helper and intermediate columns before exporting.
# The frame is rebound instead of mutated in place, and errors='ignore' skips
# columns that are already gone. This keeps the cell re-runnable: with
# inplace=True a second execution raised KeyError, because the columns had
# been dropped by the first one.
df_Final = df_Final.drop(
    columns=['usd_to_eur', 'year', 'salary_year_avg_usd', 'salary_hour_avg'],
    errors='ignore',
)
df_Final.to_csv('/Users/kolesnikevgenia/Documents/Python_Projects/Job_Skills/Raw_Data/df_Final.csv', index=False)

df_Final.head(3)

Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,job_country,salary_rate,company_name,job_skills,job_type_skills,salary_year_avg_eur,salary_month_avg_eur,country,is_eu,region_group
0,Data Scientist,CRM Data Specialist,"San José Province, San José, Costa Rica",via Ai-Jobs.net,Full-time,False,Costa Rica,2023-08-01 13:37:57,False,False,Costa Rica,year,Netskope,"[gdpr, excel]","{'analyst_tools': ['excel'], 'libraries': ['gd...",96524.25,8043.69,,False,Other
1,Data Engineer,Data Engineer,"Arlington, VA",via LinkedIn,Full-time,False,Sudan,2023-06-26 14:22:54,False,False,Sudan,year,Intelletec,"[mongodb, mongodb, python, r, sql, mysql, mari...","{'analyst_tools': ['tableau'], 'cloud': ['orac...",123410.0,10284.17,,False,Other
2,Data Engineer,Remote - Data Engineer - Permanent - W2,Anywhere,via LinkedIn,Full-time,True,"Illinois, United States",2023-02-21 13:29:59,False,True,United States,year,Apex Systems,"[sql, python]","{'programming': ['sql', 'python']}",105780.0,8815.0,,False,US
