In [3]:
# P1: Data Cleaning
# Import libraries
import pandas as pd
import numpy as np

# Constants for regex patterns
REMOTE_MODE_PATTERN = r'(?i)(remote|hybrid)'
LEVEL_PATTERN = r'(?i)(associate|graduate|senior|junior|entry-level|manager|lead)'
SALARY_PATTERN = r'(\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?)'
POSTCODE_PATTERN = r'(\d+)'

def clean_data(file_path):
    """Main function to clean the job postings dataset."""
    df = pd.read_csv(file_path)
    pd.set_option('display.max_rows', None) 
    pd.set_option('display.max_columns', None)

    # Initial checks
    print(df.shape)
    print(df.info())  # Check for missing values

    # Data cleaning steps
    df = drop_duplicates(df)
    df = replace_blanks_with_nan(df)
    df = drop_unused_columns(df, ['Job URL'])
    df = capitalize_columns(df, ['Company', 'Title'])
    df = extract_working_mode_and_level(df)
    df = split_location(df)
    df = extract_and_clean_salary(df)
    df = filter_salaries(df)
    df = handle_city_and_working_mode(df)
    df['Date'] = pd.to_datetime(df['Date'])

    # Finalize the dataframe
    cleaned_df = df[['Date', 'Company', 'State', 'City', 'Level', 'Working Mode', 'Average_salary']]
    print(cleaned_df.isna().sum())  # Check for missing values
    return cleaned_df

def drop_duplicates(df):
    """Drop duplicate rows in the DataFrame."""
    return df.drop_duplicates()

def replace_blanks_with_nan(df):
    """Replace blank values with NaN."""
    return df.replace(' ', np.nan)

def drop_unused_columns(df, columns):
    """Drop specified unused columns from the DataFrame."""
    return df.drop(columns=columns)

def capitalize_columns(df, columns):
    """Capitalize text in specified columns."""
    for column in columns:
        df[column] = df[column].astype(str).apply(lambda x: ' '.join(word.capitalize() for word in x.split()))
    return df

def extract_working_mode_and_level(df):
    """Extract Working Mode and Level from the Title."""
    df['Working Mode'] = df['Title'].str.extract(REMOTE_MODE_PATTERN, expand=False)
    df['Level'] = df['Title'].str.extract(LEVEL_PATTERN, expand=False).fillna('Mid-level').str.capitalize()
    return df

def split_location(df):
    """Split 'Location' into 'City' and 'Post_Code'."""
    df[['City', 'Post_Code']] = df['Location'].str.split(',', n=1, expand=True)
    df['Post_Code'] = df['Post_Code'].str.extract(POSTCODE_PATTERN)
    return df

def extract_and_clean_salary(df):
    """Extract and clean salary information."""
    df['Salary'] = df['Salary'].apply(lambda x: ''.join(char for char in str(x) if char.isdigit() or char in ',.-').strip())
    df['Salary'] = df['Salary'].str.strip('-')
    df.replace('', np.nan, inplace=True)
    df['Salary1'] = df['Description'].str.extract(SALARY_PATTERN)
    df['Salary'] = df['Salary'].fillna(df['Salary1'])
    
    # Separate into Min and Max salary
    df[['Min_salary', 'Max_salary']] = df['Salary'].str.split('-', n=1, expand=True)
    df['Max_salary'] = df['Max_salary'].fillna(df['Min_salary'])
    
    # Clean salary columns
    for col in ['Min_salary', 'Max_salary']:
        df[col] = df[col].str.replace(r'[$,]', '', regex=True).astype(float)
    
    # Calculate average salary
    df['Average_salary'] = (df['Min_salary'] + df['Max_salary']) / 2
    return df

def filter_salaries(df):
    """Filter the DataFrame for salaries greater than or equal to 30,000."""
    return df[df['Average_salary'] >= 30000]

def handle_city_and_working_mode(df):
    """Handle City and Working Mode for remote jobs."""
    df['City'] = np.where(df['State'] == 'Remote', 'Remote', df['City'])
    df['Working Mode'].fillna('On-Site', inplace=True)
    df['City'] = df['City'].apply(lambda x: x.split(' in ', 1)[-1].strip() if 'in' in x else x)
    return df

# File path to be defined
file_path = '../indeed_kaggle.csv'

# Save cleaned dataset to CSV
cleaned_df = clean_data(file_path)
cleaned_df.to_csv('completed_file.csv', index=False)


(29184, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29184 entries, 0 to 29183
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Title        29184 non-null  object
 1   Company      29181 non-null  object
 2   Location     29182 non-null  object
 3   Salary       16283 non-null  object
 4   Description  22755 non-null  object
 5   Job URL      29181 non-null  object
 6   Date         29184 non-null  object
 7   State        29184 non-null  object
dtypes: object(8)
memory usage: 1.8+ MB
None
Date              0
Company           0
State             0
City              0
Level             0
Working Mode      0
Average_salary    0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Working Mode'].fillna('On-Site', inplace=True)


In [4]:
df_uncleaned = pd.read_csv(file_path)
df_uncleaned.head()

Unnamed: 0,Title,Company,Location,Salary,Description,Job URL,Date,State
0,Data Scientist,"DESE Research, Inc.","Huntsville, AL 35806",,"Familiarity with advanced machine learning, da...",https://www.indeed.com/rc/clk?jk=b31b63cb0d9fd...,2024-06-23,Alabama
1,Senior Data Analyst,PCI Government Services,"Hybrid work in Huntsville, AL 35808","From $85,000 a year",Must have strong technical skills in areas suc...,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,2024-06-23,Alabama
2,Data Scientist,"Interclypse, Inc.","Huntsville, AL",Full-time,Excellent interpersonal skills and ability to ...,https://www.indeed.com/rc/clk?jk=c34e8a98b5f9a...,2024-06-23,Alabama
3,"Data Scientist, Mid",Booz Allen,"Huntsville, AL","$75,600 - $172,000 a year","As a data scientist, you’re excited at the pro...",https://www.indeed.com/rc/clk?jk=908b996e5ba98...,2024-06-23,Alabama
4,Senior Data Analyst (U.S. remote eligible),"Eternal Word Television Network, Inc.",Remote in Alabama,,You are experienced using web analytics and Go...,https://www.indeed.com/rc/clk?jk=400cd0aab0d76...,2024-06-23,Alabama


In [5]:
cleaned_df.head()

Unnamed: 0,Date,Company,State,City,Level,Working Mode,Average_salary
1,2024-06-23,Pci Government Services,Alabama,Huntsville,Senior,On-Site,85000.0
3,2024-06-23,Booz Allen,Alabama,Huntsville,Mid-level,On-Site,123800.0
15,2024-06-23,Booz Allen,Alabama,Huntsville,Senior,On-Site,158300.0
29,2024-06-23,Leidos,Alabama,Huntsville,Mid-level,On-Site,114062.5
30,2024-06-23,Recruiting From Scratch,Alabama,Huntsville,Lead,On-Site,160000.0
