In [1]:
# Importing Libraries
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt 

# Load the dataset and turn its 'train' split into a pandas DataFrame
dataset = load_dataset('lukebarousse/data_jobs')
train_split = dataset['train']
df = train_split.to_pandas()

# Cleanup: run the posting-date column through pd.to_datetime and store it back
posted_dates = pd.to_datetime(df['job_posted_date'])
df['job_posted_date'] = posted_dates

We have a fictional colleague who wants access to the dataset.

They need the missing (`NaN`) values in the salary columns to be filled in with the median value.

In [5]:
# Median of the yearly salary column; used later as the fill value
# for that column's missing entries
yearly_salaries = df['salary_year_avg']
median_salary_year = yearly_salaries.median()

In [4]:
# Median of the hourly salary column; used later as the fill value
# for that column's missing entries
hourly_salaries = df['salary_hour_avg']
median_salary_hour = hourly_salaries.median()

In [10]:
# Let's keep our original dataframe intact.
# NOTE: a plain assignment (df_filled = df) would only create a second name
# for the same DataFrame, so the fill below would also overwrite the NaN
# values in `df`. `.copy()` gives us an independent DataFrame to modify.
df_filled = df.copy()

# Now fill in the missing yearly salaries in the copy with the yearly median
df_filled['salary_year_avg'] = df_filled['salary_year_avg'].fillna(median_salary_year)

In [11]:
# Fill the missing hourly salaries with the hourly median,
# then write the completed column back into df_filled
hourly_filled = df_filled['salary_hour_avg'].fillna(median_salary_hour)
df_filled['salary_hour_avg'] = hourly_filled

Now that the `NaN` values are filled in, we also need to drop duplicate rows.

In [15]:
# drop_duplicates() returns a new DataFrame, so df_filled itself is preserved.
# With no arguments, rows are compared across all columns.
df_unique = df_filled.drop_duplicates()

# Report how many rows the de-duplication removed
rows_before = len(df_filled)
rows_after = len(df_unique)
print('Length of original df:       ', rows_before)
print('Length of drop duplicates df:', rows_after)
print('Rows dropped:                ', rows_before - rows_after)

Length of original df:        785741
Length of drop duplicates df: 785640
Rows dropped:                 101


Let's go further: instead of dropping only fully identical rows, we select specific columns from the dataframe and treat rows that match on those columns as duplicates!

In [16]:
# Rows that share the same values in these columns count as duplicates
dedupe_columns = ['job_title', 'company_name']
df_unique = df_unique.drop_duplicates(subset=dedupe_columns)

# Report the row counts, measured against the filled dataframe
rows_before = len(df_filled)
rows_after = len(df_unique)
print('Length of original df:       ', rows_before)
print('Length of drop duplicates df:', rows_after)
print('Rows dropped:                ', rows_before - rows_after)

Length of original df:        785741
Length of drop duplicates df: 508042
Rows dropped:                 277699
