In [1]:
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt  

# Load the data: fetch the 'lukebarousse/data_jobs' dataset and turn its
# 'train' split into a pandas DataFrame. The posting date column is run
# through pd.to_datetime as part of the same chain.
dataset = load_dataset('lukebarousse/data_jobs')
df = (
    dataset['train']
    .to_pandas()
    .assign(job_posted_date=lambda jobs: pd.to_datetime(jobs['job_posted_date']))
)

# Snapshot of the cleaned frame. .copy() builds a separate DataFrame object,
# so later edits to df_original do not reach back into df.
df_original = df.copy()

In [2]:
# Plain assignment does NOT copy: df_altered becomes a second name for the
# very same DataFrame object that df_original refers to.
df_altered = df_original

# Preview the salary column. .loc slices by label and includes the end label,
# so this shows index labels 0 through 5.
df_altered['salary_year_avg'].loc[:5]

0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
5   NaN
Name: salary_year_avg, dtype: float64

In [3]:
# Median of the yearly salary column (missing values are skipped by default).
median_salary = df_altered['salary_year_avg'].median()

# Fill the missing salaries with that median, selecting the column via .loc.
# NOTE: df_altered was bound to df_original by plain assignment, so this
# write is visible through df_original as well.
df_altered['salary_year_avg'] = (
    df_altered.loc[:, 'salary_year_avg'].fillna(value=median_salary)
)


In [4]:

# Same median fill, written with plain column selection instead of .loc.
# Once the missing values have already been filled, running this again
# leaves the column's values unchanged.
df_altered['salary_year_avg'] = (
    df_altered['salary_year_avg'].fillna(value=median_salary)
)

In [5]:

# Check the result of the fill on df_altered (index labels 0 through 5).
df_altered['salary_year_avg'].loc[:5]

0    115000.0
1    115000.0
2    115000.0
3    115000.0
4    115000.0
5    115000.0
Name: salary_year_avg, dtype: float64

In [6]:
# Look at the same rows through the df_original name to see whether the
# fill applied via df_altered shows up here too.
df_original['salary_year_avg'].loc[:5]

0    115000.0
1    115000.0
2    115000.0
3    115000.0
4    115000.0
5    115000.0
Name: salary_year_avg, dtype: float64

In [7]:
# Show the identity of the object behind each name. `a is b` is true exactly
# when both names refer to one and the same object, i.e. when their id()
# values are equal.
print('ID of df_original:               ', id(df_original))
print('ID of df_altered:                ', id(df_altered))
print('Are the two dataframes the same? ', df_original is df_altered)

ID of df_original:                1859875118416
ID of df_altered:                 1859875118416
Are the two dataframes the same?  True


In [8]:
# Start over with real copies: df_original is rebuilt from df, and
# df_altered is created with .copy() so it is a separate DataFrame object
# rather than another name for df_original.
df_original = df.copy()
df_altered = df_original.copy()

# The two names should now report different identities.
print('ID of df_original:               ', id(df_original))
print('ID of df_altered:                ', id(df_altered))
print('Are the two dataframes the same? ', df_original is df_altered)

ID of df_original:                1860397927312
ID of df_altered:                 1859879160976
Are the two dataframes the same?  False


In [9]:
# Median of the yearly salary column (missing values are skipped by default).
median_salary = df_altered['salary_year_avg'].median()

# Fill the missing salaries with the median. df_altered was created with
# .copy(), so this assignment targets a separate object from df_original.
df_altered['salary_year_avg'] = (
    df_altered['salary_year_avg'].fillna(value=median_salary)
)

# Preview the filled column (index labels 0 through 5).
df_altered['salary_year_avg'].loc[:5]

0    115000.0
1    115000.0
2    115000.0
3    115000.0
4    115000.0
5    115000.0
Name: salary_year_avg, dtype: float64

In [10]:

# Inspect the same rows in df_original to confirm whether the fill applied
# to the copied df_altered affected it.
df_original['salary_year_avg'].loc[:5]

0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
5   NaN
Name: salary_year_avg, dtype: float64

In [11]:

# Draw 5 random rows to get a feel for the data. random_state pins the draw,
# so re-running the notebook from a fresh kernel shows the same rows instead
# of a different set each time.
df.sample(n=5, random_state=42)

Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,job_country,salary_rate,salary_year_avg,salary_hour_avg,company_name,job_skills,job_type_skills
63133,Data Analyst,Data Analyst,Spain,via Trabajo.org,Full-time,False,Spain,2023-09-09 06:26:22,True,False,Spain,,,,Repsol,"['sql', 'sas', 'sas', 'r', 'python']","{'analyst_tools': ['sas'], 'programming': ['sq..."
574844,Senior Data Engineer,"Senior Data Engineer, B2B Acceptance","Atlanta, GA",via Visa - Talentify,Full-time,False,"California, United States",2023-02-23 09:24:44,False,True,United States,,,,Visa,"['nosql', 'python', 'cassandra', 'mysql', 'db2...","{'cloud': ['oracle'], 'databases': ['cassandra..."
606886,Data Analyst,"Manager, Digital Analytics (L 09)",Anywhere,via LinkedIn,Full-time,True,India,2023-04-18 09:13:24,False,False,India,,,,Synchrony,"['sql', 'sas', 'sas', 'word']","{'analyst_tools': ['sas', 'word'], 'programmin..."
351755,Machine Learning Engineer,Machine Learning Specialist,"Lisbon, Portugal",via Jobrapido.com,Full-time,False,Portugal,2023-01-28 00:03:38,False,False,Portugal,,,,HUK-COBURG Versicherungsgruppe,"['python', 'azure', 'aws', 'pytorch', 'tensorf...","{'cloud': ['azure', 'aws'], 'libraries': ['pyt..."
233598,Data Scientist,Data Manager (H/F) | POEI,"Puteaux, France",via Welcome To The Jungle,Full-time,False,France,2023-07-11 07:54:54,False,False,France,,,,Datascientest,,


In [12]:
# Draw a random 10% of the rows. replace=False means sampling without
# replacement, so no row can be picked twice. random_state pins the draw so
# the sample is reproducible across runs.
df.sample(frac=0.1, replace=False, random_state=42)

Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,job_country,salary_rate,salary_year_avg,salary_hour_avg,company_name,job_skills,job_type_skills
349454,Senior Data Engineer,Senior Data Engineer (m/f/d) - Team Forecasting,"Berlin, Germany",via LinkedIn,Full-time,False,Germany,2023-09-21 23:26:24,True,False,Germany,,,,Flix,"['python', 'typescript', 'sql', 'scikit-learn'...","{'libraries': ['scikit-learn', 'pandas', 'nump..."
297244,Data Engineer,Data Engineer,"Liverpool, UK",via LinkedIn,Full-time,False,United Kingdom,2023-12-14 17:13:27,True,False,United Kingdom,,,,Matalan,"['go', 'gcp', 'aws']","{'cloud': ['gcp', 'aws'], 'programming': ['go']}"
659133,Data Analyst,Data Analyst Automotive Hochvoltspeicher (m/w/d),"Munich, Germany",via Stepstone,Full-time,False,Germany,2023-06-12 10:38:57,True,False,Germany,,,,Bertrandt AG,"['python', 'aws', 'azure']","{'cloud': ['aws', 'azure'], 'programming': ['p..."
578524,Senior Data Scientist,Senior Data Scientist (m/f/d),"Munich, Germany",via LinkedIn,Full-time,False,Germany,2023-12-22 09:18:19,False,False,Germany,,,,EGYM | DACH,"['python', 'java', 'go', 'r', 'sql', 'nosql']","{'programming': ['python', 'java', 'go', 'r', ..."
205985,Data Scientist,IT - Data Scientist 4,"Pittsburgh, PA",via ZipRecruiter,Full-time,False,"New York, United States",2023-07-28 22:02:36,False,False,United States,,,,V2Soft,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12255,Data Engineer,Data Engineer,"San Borja, Peru",via BeBee Perú,Full-time,False,Peru,2023-12-20 13:26:16,False,False,Peru,,,,Intercorp Retail,['excel'],{'analyst_tools': ['excel']}
10243,Data Engineer,Team Lead Data Engineer,"Moscow, Russia",via hh.ru,Full-time,False,Russia,2023-11-12 13:34:34,True,False,Russia,,,,"билайн: ИТ, Data, Digital","['scala', 'sql', 'postgresql', 'spark', 'hadoop']","{'databases': ['postgresql'], 'libraries': ['s..."
435896,Data Engineer,Jr. Intel Data Engineer - ETL | Cloud | Data W...,"Rosslyn, VA",via Clearance Jobs,Full-time,False,"Illinois, United States",2023-02-01 16:49:24,True,False,United States,,,,Deloitte,"['python', 'java', 'scala', 'r', 'sql', 'hadoo...","{'async': ['jira', 'confluence'], 'libraries':..."
256378,Business Analyst,Operations Analyst,New Zealand,via Trabajo.org,Full-time,False,New Zealand,2023-12-14 07:19:24,False,False,New Zealand,,,,Te Whatu Ora - Health New Zealand,['react'],{'libraries': ['react']}
