## Importing Dataset & Overview

In [1]:
# Importing Libraries
import pandas as pd
from datasets import load_dataset

# Load the job-postings dataset and materialise its 'train' split as a
# pandas DataFrame for the rest of the notebook.
DATASET_ID = 'lukebarousse/data_jobs'
dataset = load_dataset(DATASET_ID)
df = dataset['train'].to_pandas()

In [2]:
# Preview the first rows to get a feel for the columns and their values
df.head()

Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,job_country,salary_rate,salary_year_avg,salary_hour_avg,company_name,job_skills,job_type_skills
0,Senior Data Engineer,Senior Clinical Data Engineer / Principal Clin...,"Watertown, CT",via Work Nearby,Full-time,False,"Texas, United States",2023-06-16 13:44:15,False,False,United States,,,,Boehringer Ingelheim,,
1,Data Analyst,Data Analyst,"Guadalajara, Jalisco, Mexico",via BeBee México,Full-time,False,Mexico,2023-01-14 13:18:07,False,False,Mexico,,,,Hewlett Packard Enterprise,"['r', 'python', 'sql', 'nosql', 'power bi', 't...","{'analyst_tools': ['power bi', 'tableau'], 'pr..."
2,Data Engineer,"Data Engineer/Scientist/Analyst, Mid or Senior...","Berlin, Germany",via LinkedIn,Full-time,False,Germany,2023-10-10 13:14:55,False,False,Germany,,,,ALPHA Augmented Services,"['python', 'sql', 'c#', 'azure', 'airflow', 'd...","{'analyst_tools': ['dax'], 'cloud': ['azure'],..."
3,Data Engineer,LEAD ENGINEER - PRINCIPAL ANALYST - PRINCIPAL ...,"San Antonio, TX",via Diversity.com,Full-time,False,"Texas, United States",2023-07-04 13:01:41,True,False,United States,,,,Southwest Research Institute,"['python', 'c++', 'java', 'matlab', 'aws', 'te...","{'cloud': ['aws'], 'libraries': ['tensorflow',..."
4,Data Engineer,Data Engineer- Sr Jobs,"Washington, DC",via Clearance Jobs,Full-time,False,Sudan,2023-08-07 14:29:36,False,False,Sudan,,,,Kristina Daniel,"['bash', 'python', 'oracle', 'aws', 'ansible',...","{'cloud': ['oracle', 'aws'], 'other': ['ansibl..."


In [3]:
# Column names, non-null counts and dtypes of the raw frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 785741 entries, 0 to 785740
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   job_title_short        785741 non-null  object 
 1   job_title              785740 non-null  object 
 2   job_location           784696 non-null  object 
 3   job_via                785733 non-null  object 
 4   job_schedule_type      773074 non-null  object 
 5   job_work_from_home     785741 non-null  bool   
 6   search_location        785741 non-null  object 
 7   job_posted_date        785741 non-null  object 
 8   job_no_degree_mention  785741 non-null  bool   
 9   job_health_insurance   785741 non-null  bool   
 10  job_country            785692 non-null  object 
 11  salary_rate            33067 non-null   object 
 12  salary_year_avg        22003 non-null   float64
 13  salary_hour_avg        10662 non-null   float64
 14  company_name           785723 non-nu

In [4]:
# Summary statistics; with default arguments this covers the numeric columns
df.describe()

Unnamed: 0,salary_year_avg,salary_hour_avg
count,22003.0,10662.0
mean,123286.274072,47.016598
std,48312.449482,21.890738
min,15000.0,8.0
25%,90000.0,27.5
50%,115000.0,45.98
75%,150000.0,61.159996
max,960000.0,391.0


## Data Cleaning


In [5]:
# Count the missing entries in every column
df.isna().sum()

job_title_short               0
job_title                     1
job_location               1045
job_via                       8
job_schedule_type         12667
job_work_from_home            0
search_location               0
job_posted_date               0
job_no_degree_mention         0
job_health_insurance          0
job_country                  49
salary_rate              752674
salary_year_avg          763738
salary_hour_avg          775079
company_name                 18
job_skills               117037
job_type_skills          117037
dtype: int64

In [22]:
# Fill missing salary values, then drop rows that lack critical fields.
#
# The fills go through a single whole-frame DataFrame.fillna call keyed by
# column. The previous `df[col].fillna(..., inplace=True)` form is chained
# assignment on a column selection: it raises a FutureWarning in pandas 2.x
# and leaves `df` unchanged once Copy-on-Write is active (default in pandas 3.0).
#
# The means are computed before any rows are dropped, as before.
# NOTE(review): the missing-value counts above show salary_year_avg missing in
# 763,738 of 785,741 rows, so after this step most salary values are the same
# imputed constant. Confirm downstream salary analysis accounts for that (or
# keep NaN and filter instead).
salary_fill_values = {
    'salary_year_avg': df['salary_year_avg'].mean(),
    'salary_hour_avg': df['salary_hour_avg'].mean(),
    # Fill None or NaN values in 'salary_rate' with 'Unknown'
    'salary_rate': 'Unknown',
}
df.fillna(salary_fill_values, inplace=True)

# Drop rows with missing values in critical columns. This is an in-place call
# on the frame itself (not on a column selection), so it is not chained
# assignment and keeps the original row index.
critical_columns = [
    'job_title',
    'job_location',
    'job_via',
    'job_schedule_type',
    'job_country',
    'company_name',
]
df.dropna(subset=critical_columns, inplace=True)

In [7]:
# Parse the posting timestamps so 'job_posted_date' holds datetimes
posted_at = pd.to_datetime(df['job_posted_date'])
df['job_posted_date'] = posted_at

In [8]:
# Count rows that are exact repeats of an earlier row
duplicate_mask = df.duplicated()
print("Number of duplicate rows:", duplicate_mask.sum())

# Keep the first occurrence of each row and discard the repeats, in place
df.drop_duplicates(keep='first', inplace=True)

Number of duplicate rows: 101


In [16]:
# Impute missing skills with 'Unknown'.
#
# Done as one whole-frame DataFrame.fillna call keyed by column instead of
# `df[col].fillna(..., inplace=True)`: the latter is chained assignment, which
# raises a FutureWarning in pandas 2.x and does not update `df` under
# Copy-on-Write (default in pandas 3.0).
skills_fill_values = {
    'job_skills': 'Unknown',
    'job_type_skills': 'Unknown',
}
df.fillna(skills_fill_values, inplace=True)

In [17]:
# Re-inspect row count, non-null counts and dtypes after the cleaning steps
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 771865 entries, 0 to 785740
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   job_title_short        771865 non-null  object        
 1   job_title              771865 non-null  object        
 2   job_location           771865 non-null  object        
 3   job_via                771865 non-null  object        
 4   job_schedule_type      771865 non-null  object        
 5   job_work_from_home     771865 non-null  bool          
 6   search_location        771865 non-null  object        
 7   job_posted_date        771865 non-null  datetime64[ns]
 8   job_no_degree_mention  771865 non-null  bool          
 9   job_health_insurance   771865 non-null  bool          
 10  job_country            771865 non-null  object        
 11  salary_rate            32560 non-null   object        
 12  salary_year_avg        771865 non-null  float64  

In [23]:
# Final check for any remaining missing values.
# The assertion turns the check into an enforced invariant: a Restart & Run All
# that leaves nulls behind fails here and names the offending columns, instead
# of relying on someone reading the printed table.
remaining_nulls = df.isnull().sum()
assert remaining_nulls.sum() == 0, (
    f"Columns still containing missing values:\n{remaining_nulls[remaining_nulls > 0]}"
)

# Last expression of the cell: shown as the cell output (no print() needed)
remaining_nulls

job_title_short          0
job_title                0
job_location             0
job_via                  0
job_schedule_type        0
job_work_from_home       0
search_location          0
job_posted_date          0
job_no_degree_mention    0
job_health_insurance     0
job_country              0
salary_rate              0
salary_year_avg          0
salary_hour_avg          0
company_name             0
job_skills               0
job_type_skills          0
dtype: int64
