## Data Cleaning and Merging

This section covers the steps taken to prepare the raw datasets for analysis:
- Load Raw Data
- Clean Data
- Merge Data
- Save Final Dataset

### Import Libraries

In [1]:
from pathlib import Path
import ast

import requests
import pandas as pd
from tqdm import tqdm 

### Load Raw Data

#### Import the job postings dataset

In [4]:
# %pip install datasets
'''from datasets import load_dataset
dataset = load_dataset('lukebarousse/data_jobs')
df = dataset['train'].to_pandas()

df.head(3)'''

raw_data_dir = Path.cwd().parents[1] / 'Raw_Data'

csv_file = raw_data_dir / 'data_jobs_24.csv' 
df = pd.read_csv(csv_file)

df.head(3)

Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,job_country,salary_rate,salary_year_avg,salary_hour_avg,company_name,job_skills,job_type_skills
0,Data Analyst,"Summer Internship -Data Analyst Intern, Risk M...","Marlborough, MA",via Boatingrevealed.com,"Full-time, Part-time, and Internship",False,"New York, United States",2024-01-01 00:00:01,False,True,United States,,,,BJ's Wholesale Club,['excel'],{'analyst_tools': ['excel']}
1,Data Analyst,"Staff Data Analyst Operations, Infrastructure ...","Fremont, CA",via ClimateTechList,Full-time,False,"California, United States",2024-01-01 00:00:11,True,False,United States,,,,Tesla,"['tableau', 'flow']","{'analyst_tools': ['tableau'], 'other': ['flow']}"
2,Data Analyst,Junior Data Analyst - Entry Level,"Waco, TX",via ZipRecruiter,Full-time and Part-time,False,"Texas, United States",2024-01-01 00:00:15,True,False,United States,,,,Next Recruiting,,


To make the salary data more relevant for European users, I extended the main dataset (which includes yearly salaries in USD) with two additional sources:
- **Exchange Rate** — to convert salaries from USD to EUR.
- **Country Dictionary** — to group EU countries for regional analysis.

#### Load USD→EUR exchange rate from a public API

In [7]:
'''def get_usd_to_eur_rate_from_erapi():
    url = 'https://open.er-api.com/v6/latest/USD'
    response = requests.get(url)
    data = response.json()
    return data['rates']['EUR'] if data.get('result') == 'success' and 'EUR' in data.get('rates', {}) else None

x_rate = get_usd_to_eur_rate_from_erapi()

# Create DataFrame and save to CSV
df_ex_rate = pd.DataFrame([{'year': 2023, 'usd_to_eur': x_rate}]).round(4)

raw_data_dir = Path.cwd().parents[1] / 'Raw_Data'
df_ex_rate.to_csv(raw_data_dir / 'ex_rate.csv', index=False)

df_ex_rate'''
df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])
def get_usd_to_eur_rate_frankfurter(date_str):
    
    url = f"https://api.frankfurter.app/{date_str}?from=USD&to=EUR"
    response = requests.get(url)
    data = response.json()
    if 'rates' in data and 'EUR' in data['rates']:
        return data['rates']['EUR']
    return None

# Get all unique job_posted_date values (ensure they are datetime.date)
unique_dates = sorted(df['job_posted_date'].dt.date.unique())

# Get exchange rates for each date
ex_rates = []
for date_obj in tqdm(unique_dates, desc="Fetching exchange rates"):
    date_str = date_obj.strftime("%Y-%m-%d")
    rate = get_usd_to_eur_rate_frankfurter(date_str)
    ex_rates.append({'job_posted_date': date_obj, 'usd_to_eur': rate})

df_ex_rate = pd.DataFrame(ex_rates).round(4)

df_ex_rate.to_csv(raw_data_dir / 'ex_rate_daily.csv', index=False)

df_ex_rate

Fetching exchange rates: 100%|██████████| 366/366 [07:08<00:00,  1.17s/it]


Unnamed: 0,job_posted_date,usd_to_eur
0,2024-01-01,0.9050
1,2024-01-02,0.9127
2,2024-01-03,0.9158
3,2024-01-04,0.9130
4,2024-01-05,0.9157
...,...,...
361,2024-12-27,0.9583
362,2024-12-28,0.9583
363,2024-12-29,0.9583
364,2024-12-30,0.9575


#### Load a countries dictionary

In [8]:
df_EU = pd.read_csv(raw_data_dir / 'EU_Countries_dict.csv', delimiter=';')

df_EU.head(3)

Unnamed: 0,country,is_eu,ISO
0,Austria,True,AUT
1,Belgium,True,BEL
2,Bulgaria,True,BGR


### Clean Data

In [21]:
df['job_skills'] = df['job_skills'].apply(lambda x: tuple(x) if isinstance(x, list) else x)
df = df.drop_duplicates()

df_ex_rate['job_posted_date'] = pd.to_datetime(df_ex_rate['job_posted_date'])

### Merge Data

#### Merge Exchange rate
In this section we are merging Exchange rate to the main dataset and convert annual salaries to EUR. 
Since salaries in Europe are commonly discussed monthly, we are calculating monthly equivalents for better local insight.

In [22]:
# Merge Exchange Rate. Convert job_year_avg in EUR
df_Final = df.rename(columns={'salary_year_avg': 'salary_year_avg_usd'})

df_Final = df_Final.merge(df_ex_rate, on='job_posted_date', how='left')
df_Final['salary_year_avg_eur'] = (df_Final['salary_year_avg_usd'] * df_Final['usd_to_eur']).round(2)

# Calculate monthly salaries
df_Final['salary_month_avg_eur'] = (df_Final['salary_year_avg_eur'] / 12).round(2)

df_Final.loc[1:3, ['job_title_short', 'usd_to_eur', 'salary_year_avg_usd', 'salary_year_avg_eur', 'salary_month_avg_eur']]


Unnamed: 0,job_title_short,usd_to_eur,salary_year_avg_usd,salary_year_avg_eur,salary_month_avg_eur
1,Data Analyst,,,,
2,Data Analyst,,,,
3,Data Analyst,,,,


#### Merge Countries dictionary

To analyze the European data jobs market add is_eu and ISO columns from the countries dictionary.
- Use is_eu column to group countries that are European Union members.
- Use ISO codes to build interactive map with job salaries in 3_Salary_Analysis notebook.

In [23]:
df_Final = df_Final.merge(df_EU, how='left', left_on='job_country', right_on='country')
pd.set_option('future.no_silent_downcasting', True)
df_Final['is_eu'] = df_Final['is_eu'].fillna(False).astype(bool)

df_Final.loc[1:10, ['job_title_short', 'job_country', 'is_eu', 'ISO']]

Unnamed: 0,job_title_short,job_country,is_eu,ISO
1,Data Analyst,United States,False,
2,Data Analyst,United States,False,
3,Data Analyst,United States,False,
4,Data Scientist,United States,False,
5,Senior Data Scientist,United States,False,
6,Machine Learning Engineer,United States,False,
7,Machine Learning Engineer,United States,False,
8,Data Analyst,United States,False,
9,Data Scientist,United States,False,
10,Senior Data Analyst,United States,False,


In [31]:
# Add Region grouping column
def group_country(row):
    if row['job_country'] == 'United States':
        return 'US'
    elif row['is_eu']:
        return 'EU'
    else:
        return 'Other'
    
df_Final['region_group'] = df_Final.apply(group_country, axis=1)

df_Final.loc[1:10, ['region_group', 'job_country', 'salary_month_avg_eur']]

Unnamed: 0,region_group,job_country,salary_month_avg_eur
1,US,United States,
2,US,United States,
3,US,United States,
4,US,United States,
5,US,United States,
6,US,United States,
7,US,United States,
8,US,United States,
9,US,United States,
10,US,United States,


### Save Final Dataset

In [33]:
df_Final.drop(columns=['usd_to_eur', 'salary_year_avg_usd', 'salary_hour_avg'], inplace=True)
df_Final.to_pickle(raw_data_dir / 'df_Final.pkl')

df_Final.head(3)

Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,...,salary_rate,company_name,job_skills,job_type_skills,salary_year_avg_eur,salary_month_avg_eur,country,is_eu,ISO,region_group
0,Data Analyst,"Summer Internship -Data Analyst Intern, Risk M...","Marlborough, MA",via Boatingrevealed.com,"Full-time, Part-time, and Internship",False,"New York, United States",2024-01-01 00:00:01,False,True,...,,BJ's Wholesale Club,"(excel,)",{'analyst_tools': ['excel']},,,,False,,US
1,Data Analyst,"Staff Data Analyst Operations, Infrastructure ...","Fremont, CA",via ClimateTechList,Full-time,False,"California, United States",2024-01-01 00:00:11,True,False,...,,Tesla,"(tableau, flow)","{'analyst_tools': ['tableau'], 'other': ['flow']}",,,,False,,US
2,Data Analyst,Junior Data Analyst - Entry Level,"Waco, TX",via ZipRecruiter,Full-time and Part-time,False,"Texas, United States",2024-01-01 00:00:15,True,False,...,,Next Recruiting,,,,,,False,,US
