# Data Cleaning and Merging

- Load Raw Data
- Clean Data
- Merge Data
- Save Final Dataset

In [7]:
import pandas as pd
import requests
import ast

### Load Raw Data

In [8]:
# Load the Data_Jobs dataset (`lukebarousse/data_jobs`) through the `datasets` package.
# If the package is missing, install it first with: %pip install datasets
from datasets import load_dataset

dataset = load_dataset('lukebarousse/data_jobs')
# Only the 'train' split is used; convert it to a pandas DataFrame for cleaning.
train_split = dataset['train']
df = train_split.to_pandas()

df.head(3)

Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,job_country,salary_rate,salary_year_avg,salary_hour_avg,company_name,job_skills,job_type_skills
0,Senior Data Engineer,Senior Clinical Data Engineer / Principal Clin...,"Watertown, CT",via Work Nearby,Full-time,False,"Texas, United States",2023-06-16 13:44:15,False,False,United States,,,,Boehringer Ingelheim,,
1,Data Analyst,Data Analyst,"Guadalajara, Jalisco, Mexico",via BeBee México,Full-time,False,Mexico,2023-01-14 13:18:07,False,False,Mexico,,,,Hewlett Packard Enterprise,"['r', 'python', 'sql', 'nosql', 'power bi', 't...","{'analyst_tools': ['power bi', 'tableau'], 'pr..."
2,Data Engineer,"Data Engineer/Scientist/Analyst, Mid or Senior...","Berlin, Germany",via LinkedIn,Full-time,False,Germany,2023-10-10 13:14:55,False,False,Germany,,,,ALPHA Augmented Services,"['python', 'sql', 'c#', 'azure', 'airflow', 'd...","{'analyst_tools': ['dax'], 'cloud': ['azure'],..."


In [9]:
#Loading USD -> EUR exchange rate
def get_usd_to_eur_rate_from_erapi(timeout=10):
    """Fetch the latest USD -> EUR exchange rate from open.er-api.com.

    Args:
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The value stored under ``rates['EUR']`` in the JSON response, or
        ``None`` when the payload does not report ``result == 'success'`` or
        contains no EUR entry.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    url = 'https://open.er-api.com/v6/latest/USD'
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, timeout=timeout)
    # Surface HTTP errors explicitly instead of parsing an error page as JSON.
    response.raise_for_status()
    data = response.json()
    if data.get('result') != 'success':
        return None
    # `or {}` also covers a payload where 'rates' is present but null.
    return (data.get('rates') or {}).get('EUR')

x_rate = get_usd_to_eur_rate_from_erapi()
if x_rate is None:
    # Stop here: writing an empty rate to disk would silently break every
    # EUR salary computed from this table later on.
    raise ValueError('Could not retrieve the USD -> EUR exchange rate from open.er-api.com')

# NOTE(review): get_usd_to_eur_rate_from_erapi() queries the "latest" endpoint,
# yet the value is stored under year 2023 -- confirm that using the current
# rate for 2023 postings is intended.
# Create DataFrame and save to CSV
df_ex_rate = pd.DataFrame([{'year': 2023, 'usd_to_eur': x_rate}]).round(4)
df_ex_rate.to_csv('/Users/kolesnikevgenia/Documents/Python_Projects/Job_Skills/Raw_Data/ex_rate.csv', index=False)

df_ex_rate

Unnamed: 0,year,usd_to_eur
0,2023,0.8707


In [12]:
# Read the EU countries lookup table (semicolon-separated CSV)
eu_dict_path = '/Users/kolesnikevgenia/Documents/Python_Projects/Job_Skills/Raw_Data/EU_countries_dict.csv'
df_EU = pd.read_csv(eu_dict_path, sep=';')

df_EU.head(3)

Unnamed: 0,country,is_eu,ISO
0,Austria,True,AUT
1,Belgium,True,BEL
2,Bulgaria,True,BGR


### Clean Data

In [13]:
#Jobs Dataset Cleaning
def _parse_skills(value):
    """Convert a stringified list such as "['sql', 'python']" into a real list.

    Anything that is not a string (a missing value, or a list already parsed
    by an earlier run of this cell) is returned unchanged, so re-running the
    cell does not fail.
    """
    return ast.literal_eval(value) if isinstance(value, str) else value

# Keep only postings that report a yearly salary. The explicit copy makes the
# column assignments below operate on an independent frame.
df = df.dropna(subset=['salary_year_avg']).copy()
df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])
df['job_skills'] = df['job_skills'].apply(_parse_skills)

### Merge Data

In [14]:
# Attach the yearly exchange rate to every posting, then derive the yearly and
# monthly average salary in EUR from the USD figure.
df_Final = (
    df.rename(columns={'salary_year_avg': 'salary_year_avg_usd'})
    # Posting year is the join key against the exchange-rate table
    .assign(year=lambda frame: pd.to_datetime(frame['job_posted_date']).dt.year)
    .merge(df_ex_rate, how='left', on='year')
    .assign(
        salary_year_avg_eur=lambda frame: (frame['salary_year_avg_usd'] * frame['usd_to_eur']).round(2),
        # Monthly figure is computed from the already-rounded yearly EUR value
        salary_month_avg_eur=lambda frame: (frame['salary_year_avg_eur'] / 12).round(2),
    )
)

preview_columns = ['job_title_short', 'usd_to_eur', 'salary_year_avg_usd', 'salary_year_avg_eur', 'salary_month_avg_eur']
df_Final.loc[1:3, preview_columns]


Unnamed: 0,job_title_short,usd_to_eur,salary_year_avg_usd,salary_year_avg_eur,salary_month_avg_eur
1,Data Engineer,0.8707,140000.0,121898.0,10158.17
2,Data Engineer,0.8707,120000.0,104484.0,8707.0
3,Data Scientist,0.8707,228222.0,198712.9,16559.41


In [15]:
# Flag EU postings by looking up each job_country in the EU countries table
eu_merge_options = {'how': 'left', 'left_on': 'job_country', 'right_on': 'country'}
df_Final = df_Final.merge(df_EU, **eu_merge_options)
# Opt in to pandas' future behaviour so fillna does not silently downcast
pd.set_option('future.no_silent_downcasting', True)
# Countries with no match in the lookup table come back as NaN -> treat as non-EU
df_Final = df_Final.assign(is_eu=lambda frame: frame['is_eu'].fillna(False).astype(bool))

df_Final.loc[1:10, ['job_title_short', 'job_country', 'is_eu', 'ISO']]

Unnamed: 0,job_title_short,job_country,is_eu,ISO
1,Data Engineer,Sudan,False,
2,Data Engineer,United States,False,
3,Data Scientist,United States,False,
4,Data Analyst,United States,False,
5,Data Scientist,United States,False,
6,Data Engineer,United States,False,
7,Data Analyst,United States,False,
8,Data Scientist,Belgium,True,BEL
9,Data Scientist,United States,False,
10,Data Engineer,United States,False,


In [16]:
# Region grouping: United States, EU member states, everything else
def group_country(row):
    """Return the region label for one posting.

    Args:
        row: Mapping-like record exposing 'job_country' and 'is_eu'.

    Returns:
        'US' when job_country is 'United States', 'EU' when is_eu equals
        True, otherwise 'Other'.
    """
    # The US check comes first, so 'is_eu' is never read for US rows
    if row['job_country'] == 'United States':
        return 'US'
    return 'EU' if row['is_eu'] == True else 'Other'
    
# Label every posting row by row, then store the labels as a new column
region_labels = df_Final.apply(group_country, axis=1)
df_Final['region_group'] = region_labels

df_Final.loc[1:10, ['region_group', 'job_country', 'salary_month_avg_eur']]

Unnamed: 0,region_group,job_country,salary_month_avg_eur
1,Other,Sudan,10158.17
2,US,United States,8707.0
3,US,United States,16559.41
4,US,United States,6457.69
5,US,United States,8271.65
6,US,United States,9396.3
7,US,United States,6548.39
8,EU,Belgium,11427.94
9,US,United States,7482.8
10,US,United States,13423.29


### Save Final Dataset

In [17]:
# Remove the helper and USD columns that are not part of the final dataset
columns_to_drop = ['usd_to_eur', 'year', 'salary_year_avg_usd', 'salary_hour_avg']
df_Final.drop(columns=columns_to_drop, inplace=True)

# Write the final dataset to CSV without the index column
final_csv_path = '/Users/kolesnikevgenia/Documents/Python_Projects/Job_Skills/Raw_Data/df_Final.csv'
df_Final.to_csv(final_csv_path, index=False)

df_Final.head(3)

Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,...,salary_rate,company_name,job_skills,job_type_skills,salary_year_avg_eur,salary_month_avg_eur,country,is_eu,ISO,region_group
0,Data Scientist,CRM Data Specialist,"San José Province, San José, Costa Rica",via Ai-Jobs.net,Full-time,False,Costa Rica,2023-08-01 13:37:57,False,False,...,year,Netskope,"[gdpr, excel]","{'analyst_tools': ['excel'], 'libraries': ['gd...",95341.65,7945.14,,False,,Other
1,Data Engineer,Data Engineer,"Arlington, VA",via LinkedIn,Full-time,False,Sudan,2023-06-26 14:22:54,False,False,...,year,Intelletec,"[mongodb, mongodb, python, r, sql, mysql, mari...","{'analyst_tools': ['tableau'], 'cloud': ['orac...",121898.0,10158.17,,False,,Other
2,Data Engineer,Remote - Data Engineer - Permanent - W2,Anywhere,via LinkedIn,Full-time,True,"Illinois, United States",2023-02-21 13:29:59,False,True,...,year,Apex Systems,"[sql, python]","{'programming': ['sql', 'python']}",104484.0,8707.0,,False,,US
