In [1]:
import pandas as pd


In [2]:
# Load the raw postings from the CSV that sits one directory above the notebook.
df = pd.read_csv('../fake_job_postings.csv')

# Work on a copy so the cleaning steps in this block do not modify `df`.
df_cleaning = df.copy()

# Clean columns by replacing nulls and unspecified values with 'unknown'.
# `unspecified_values` is also read by `clean_location` further down.
cols_to_clean = ['required_experience', 'required_education', 'industry', 'function']
unspecified_values = ['Not Applicable','NaN','not applicable', 'Unspecified', 'Other','Others','none', 'na', 'n/a', '', ' ', None]

for col in cols_to_clean:
    # Values are matched exactly as listed (note the separate 'Not Applicable' /
    # 'not applicable' entries); anything else is left as-is.
    df_cleaning[col] = df_cleaning[col].replace(unspecified_values, 'unknown')
    df_cleaning[col] = df_cleaning[col].fillna('unknown')

# Fill remaining nulls in EVERY column (not only `cols_to_clean`) with 'unknown'.
# NOTE(review): this would turn a numeric column that contains nulls into a mixed
# object column — confirm the numeric columns have no missing values.
for col in df_cleaning.columns:
    df_cleaning[col] = df_cleaning[col].fillna('unknown')

# Adding columns for location cleaning: [country, state, city]
def clean_location(loc):
    """Split a comma-separated location string into a 3-tuple of parts.

    The parts are returned in the order they appear in the string; the
    surrounding notebook stores them as country, state and city.

    Args:
        loc: Raw location value (string, or a null / unspecified marker).

    Returns:
        A tuple of three strings. A null or unspecified ``loc`` yields three
        "unknown" entries. Otherwise each comma-separated part is stripped,
        parts found in ``unspecified_values`` become "unknown", missing
        trailing parts are padded with "unknown", and parts beyond the third
        are dropped.
    """
    if pd.isna(loc) or loc in unspecified_values:
        return ("unknown",) * 3

    cleaned = []
    for raw_piece in loc.split(','):
        piece = raw_piece.strip()
        cleaned.append("unknown" if piece in unspecified_values else piece)

    # Pad short locations out to three entries (a negative count adds nothing).
    cleaned.extend(["unknown"] * (3 - len(cleaned)))
    first, second, third = cleaned[:3]
    return (first, second, third)
# Parse every raw location once, then fan the tuple positions out into columns.
df_cleaning_loc = df_cleaning['location'].apply(clean_location)
for position, column_name in enumerate(('location_country', 'location_state', 'location_city')):
    # Bind `position` as a default so each lambda keeps its own index.
    df_cleaning[column_name] = df_cleaning_loc.map(lambda parts, position=position: parts[position])

def simplify_employment_type(x):
    """Collapse a raw employment-type label into a coarse bucket.

    Args:
        x: Raw employment type; may be null.

    Returns:
        'full-time' or 'part-time' (kept separate), 'non-permanent' for
        contract / temporary roles, and 'unknown' for nulls and every other
        label. Matching ignores case and surrounding whitespace.
    """
    if pd.isna(x):
        return 'unknown'

    bucket_by_label = {
        'full-time': 'full-time',
        'part-time': 'part-time',
        'contract': 'non-permanent',
        'temporary': 'non-permanent',
    }
    # Anything not listed above (including 'other' and '') falls back to 'unknown'.
    return bucket_by_label.get(x.strip().lower(), 'unknown')
# Bucket every posting's employment type element by element.
df_cleaning['employment_type_clean'] = df_cleaning['employment_type'].map(simplify_employment_type)

# Define a new function to compare locations accounting for unknowns
def compare_locations(row1, row2):
    """Decide whether two rows could describe the same location.

    Args:
        row1, row2: Mappings (e.g. DataFrame rows) exposing
            'location_country', 'location_state' and 'location_city'.

    Returns:
        True when the locations are compatible, False when they conflict.
        A country of ``None`` on either side short-circuits to True. Otherwise
        the countries must be equal, and state / city only conflict when both
        sides hold a value other than 'unknown' and those values differ.

    Note:
        Only ``None`` is treated as an unknown country here; the string
        'unknown' is compared literally, unlike state and city.
    """
    def known_mismatch(level):
        # A level conflicts only if both sides are known and disagree.
        return (
            row1[level] != 'unknown'
            and row2[level] != 'unknown'
            and row1[level] != row2[level]
        )

    if row1['location_country'] is None or row2['location_country'] is None:
        return True
    if row1['location_country'] != row2['location_country']:
        return False
    # State is checked before city, stopping at the first conflict.
    return not any(known_mismatch(level) for level in ('location_state', 'location_city'))

# Create a new comparison key function that uses the location comparison
def comparison_key(row):
    """Build the tuple used to detect duplicate postings.

    Args:
        row: Mapping (e.g. a DataFrame row) with the location columns,
            'title', 'description', 'requirements' and 'employment_type_clean'.

    Returns:
        ``((country, state, city), title, description, requirements, employment_type)``
        where state, city and employment type are replaced by ``None`` when
        they equal 'unknown'. The country is passed through unchanged.
    """
    def known_or_none(field):
        value = row[field]
        return None if value == 'unknown' else value

    location_info = (
        row['location_country'],
        known_or_none('location_state'),
        known_or_none('location_city'),
    )
    emp = known_or_none('employment_type_clean')

    return (location_info, row['title'], row['description'], row['requirements'], emp)

# Build one hashable key per posting, then keep the first row seen for each key.
dedup_keys = df_cleaning.apply(comparison_key, axis=1)
df_cleaning['dedup_key'] = dedup_keys
# NOTE(review): the helper column 'dedup_key' is not dropped, so it stays in `df`.
df = df_cleaning.drop_duplicates(subset='dedup_key', keep='first')

In [3]:
# Via EDA
# Bucket columns by cardinality: exactly 2 distinct values -> binary,
# 3..149 -> categorical; remaining object columns (except job_id) -> text.
distinct_counts = {name: df[name].nunique() for name in df.columns}
binary_cols = [name for name, count in distinct_counts.items() if count == 2]
categorical_cols = [name for name, count in distinct_counts.items() if 2 < count < 150]
not_text = set(categorical_cols) | {'job_id'}
text_cols = [name for name in df.columns if df[name].dtype == 'object' and name not in not_text]


In [4]:
# Deduplicate based on all columns except 'job_id'
# Checked in EDA, okay to drop duplicates this way
# Rows count as duplicates when every column other than the identifier matches.
non_id_columns = [name for name in df.columns if name != 'job_id']
df_dedup = df.drop_duplicates(subset=non_id_columns)
print(f"Original shape: {df.shape}, Deduplicated shape: {df_dedup.shape}")

Original shape: (17479, 23), Deduplicated shape: (17479, 23)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF over unigrams and bigrams, capped at 3000 features;
# input is lowercased before tokenizing.
word_tfidf = TfidfVectorizer(ngram_range=(1,2), max_features=3000, lowercase=True)

# Character-level TF-IDF over 3- to 5-character n-grams, capped at 3000 features.
char_tfidf = TfidfVectorizer(analyzer='char', ngram_range=(3,5), max_features=3000)

# Fit & transform per field: word features from titles, char features from descriptions.
# Each vectorizer is fitted on the full column of `df` here.
title_word_features = word_tfidf.fit_transform(df['title'])
# NOTE(review): `desc_char_features` is reassigned by the pattern-count cell further
# down, which replaces this TF-IDF result with a DataFrame of regex counts.
desc_char_features = char_tfidf.fit_transform(df['description'])

In [None]:
# TODO: sentence embeddings for text columns (DistilBERT) — not implemented yet


In [5]:
# Count of text features (URLs, email, phone numbers, money symbols, other symbols, etc)
# Columns: 'description', 'company_profile', 'requirements', 'benefits'
import re
def count_patterns(text, patterns):
    """Count regex matches in ``text`` for each named pattern.

    Args:
        text: Text to scan. Any non-string value is treated as the empty string.
        patterns: Mapping of feature name -> regular expression.

    Returns:
        Dict mapping each name in ``patterns`` to the number of
        ``re.findall`` matches of its regex in ``text``.
    """
    haystack = text if isinstance(text, str) else ""
    return {label: len(re.findall(regex, haystack)) for label, regex in patterns.items()}

# Regexes counted in each text field; the keys become feature-name suffixes.
patterns = {
    'urls': r'http[s]?://\S+',
    # Inside a character class `|` is a literal pipe, not alternation, so the
    # top-level-domain class is written [A-Za-z] to avoid matching "a|b" as a TLD.
    'emails': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
    # Ten digits grouped 3-3-4 with an optional '-' or '.' between groups.
    'phone_numbers': r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',
    'money_symbols': r'[$€£¥]',
    'other_symbols': r'[©®™]',
}
# Build prefixed pattern-count features for each text field and join them onto df.
def _pattern_count_frame(frame, column):
    """Return integer regex-match counts for ``frame[column]``.

    One output column per entry in the module-level ``patterns`` dict, named
    ``<column>_<pattern name>``, sharing ``frame``'s index.
    """
    per_row_counts = list(frame[column].apply(lambda x: count_patterns(x, patterns)))
    return (
        pd.DataFrame(per_row_counts, index=frame.index)
        .add_prefix(f'{column}_')
        .fillna(0)
        .astype(int)
    )

desc_char_features = _pattern_count_frame(df, 'description')
company_profile_char_features = _pattern_count_frame(df, 'company_profile')
requirements_char_features = _pattern_count_frame(df, 'requirements')
benefits_char_features = _pattern_count_frame(df, 'benefits')

_pattern_feature_frames = [
    desc_char_features,
    company_profile_char_features,
    requirements_char_features,
    benefits_char_features,
]
# Drop copies left by an earlier run of this cell so re-running it does not
# produce duplicate column names; on a first run nothing is dropped.
_stale_columns = [name for features in _pattern_feature_frames for name in features.columns]
df = pd.concat(
    [df.drop(columns=_stale_columns, errors='ignore'), *_pattern_feature_frames],
    axis=1,
)

display(df.head())


Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,...,requirements_urls,requirements_emails,requirements_phone_numbers,requirements_money_symbols,requirements_other_symbols,benefits_urls,benefits_emails,benefits_phone_numbers,benefits_money_symbols,benefits_other_symbols
0,1,Marketing Intern,"US, NY, New York",Marketing,unknown,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,unknown,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,unknown,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,...,0,0,0,0,0,0,0,0,2,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",unknown,unknown,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,unknown,0,...,0,0,0,0,0,0,0,0,0,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,unknown,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Bill Review Manager,"US, FL, Fort Worth",unknown,unknown,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Suspicious word counts for job descriptions / company profiles (using heuristics).
# Hand-picked phrases, all lowercase; `count_suspicious_words` lowercases the text
# before counting, so matching is case-insensitive on the text side.
suspicious_keywords = [
    'quick money', 'easy money', 'high pay', 'earn fast', 'upfront fee',
    'no experience required', 'work from home', 'bonus', 'registration fee',
    'processing fee', 'pay before start', 'investment required', 'fee upfront',
    'deposit required', 'apply now', 'limited spots', 'immediate start',
    'act fast', 'urgent', 'deadline', 'be your own boss', 'flexible hours',
    'online opportunity', 'training provided', 'click here', 'get rich'
]
def count_suspicious_words(text, keywords):
    """Count occurrences of the given keywords in ``text``.

    Args:
        text: Text to scan. Non-string values (e.g. NaN or None) count as
            empty text, mirroring how ``count_patterns`` treats them.
        keywords: Iterable of substrings to look for. The text is lowercased
            before counting, so keywords should be given in lowercase.

    Returns:
        Total number of non-overlapping substring occurrences, summed over
        all keywords; 0 for non-string input.
    """
    if not isinstance(text, str):
        return 0
    text_lower = text.lower()
    return sum(text_lower.count(keyword) for keyword in keywords)
# Count keyword hits per text field first, then attach each result to df as
# '<field>_suspicious_word_count'.
description_suspicious_word_count = df['description'].apply(count_suspicious_words, keywords=suspicious_keywords)
company_profile_suspicious_word_count = df['company_profile'].apply(count_suspicious_words, keywords=suspicious_keywords)
benefits_suspicious_word_count = df['benefits'].apply(count_suspicious_words, keywords=suspicious_keywords)
requirements_suspicious_word_count = df['requirements'].apply(count_suspicious_words, keywords=suspicious_keywords)

suspicious_counts_by_field = {
    'description': description_suspicious_word_count,
    'company_profile': company_profile_suspicious_word_count,
    'benefits': benefits_suspicious_word_count,
    'requirements': requirements_suspicious_word_count,
}
for field_name, field_counts in suspicious_counts_by_field.items():
    df[f'{field_name}_suspicious_word_count'] = field_counts

In [None]:
# One hot encoding for categorical variables
# Encode on string copies of the categorical columns, with nulls labelled 'unknown',
# so every level (including 'unknown') gets its own '<column>__<value>' indicator.
categorical_as_text = df[categorical_cols].fillna('unknown').astype(str)
ohe = pd.get_dummies(
    categorical_as_text,
    prefix=categorical_cols,
    prefix_sep='__',
    dummy_na=False,
)
df = pd.concat([df, ohe], axis=1)
ohe.shape

# categorical_cols

['employment_type',
 'required_experience',
 'required_education',
 'industry',
 'function',
 'location_country',
 'employment_type_clean']

In [30]:
# Parse Salary into Min-Max (country + units?)
def parse_salary(salary_str):
    """Parse a 'min-max' salary range into a pair of floats.

    Args:
        salary_str: Range such as '20000-28000', or the placeholder 'unknown'.

    Returns:
        ``(min_salary, max_salary)`` as floats. ``(0.0, 0.0)`` is returned for
        'unknown' and for anything that cannot be parsed: non-string input,
        a value without a '-' separator, or non-numeric parts.
    """
    if salary_str == 'unknown':
        return (0.0, 0.0)
    try:
        parts = salary_str.split('-')
        min_salary = float(parts[0].strip())
        max_salary = float(parts[1].strip())
    # Catch only parsing failures; a bare `except:` would also swallow
    # KeyboardInterrupt / SystemExit and make a long apply() uninterruptible.
    except (AttributeError, IndexError, TypeError, ValueError):
        return (0.0, 0.0)
    return (min_salary, max_salary)
# Parse each range once and expand the (min, max) tuples in a single DataFrame
# construction, instead of building one pd.Series object per row.
salary_bounds = pd.DataFrame(
    df['salary_range'].map(parse_salary).tolist(),
    index=df.index,
    columns=['salary_min', 'salary_max'],
)
df[['salary_min', 'salary_max']] = salary_bounds
# Parse Location