In [25]:
import pandas as pd

In [26]:
# Cleaning Employment Type for deduplication

def cleanAndDeduplicate(df):
    """Normalise placeholder values, derive helper columns and drop duplicate postings.

    Works on a copy of ``df``; the caller's frame is not modified.

    Steps:
      1. In the four categorical columns listed in ``cols_to_clean``, replace
         placeholder spellings (e.g. 'Not Applicable', 'Other', '') with 'unknown'.
      2. Replace every remaining null in *every* column with the string 'unknown'.
      3. Split ``location`` on commas into ``location_country``,
         ``location_state`` and ``location_city`` (missing parts -> 'unknown';
         parts beyond the third are ignored).
      4. Collapse ``employment_type`` into ``employment_type_clean``:
         'full-time' / 'part-time' / 'non-permanent' / 'unknown'.
      5. Build ``dedup_key`` from (location, title, description, requirements,
         employment type) and keep the first row for each distinct key.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``required_experience``, ``required_education``,
        ``industry``, ``function``, ``location``, ``employment_type``, ``title``,
        ``description`` and ``requirements``.

    Returns
    -------
    pandas.DataFrame
        The cleaned, de-duplicated copy. The helper columns (including
        ``dedup_key``) are kept in the result; the original index labels of the
        surviving rows are preserved (the index is not reset).
    """
    df_cleaning = df.copy()

    # Clean columns by replacing nulls and unspecified values with 'unknown'
    cols_to_clean = ['required_experience', 'required_education', 'industry', 'function']
    unspecified_values = ['Not Applicable','NaN','not applicable', 'Unspecified', 'Other','Others','none', 'na', 'n/a', '', ' ', None]

    for col in cols_to_clean:
        df_cleaning[col] = df_cleaning[col].replace(unspecified_values, 'unknown')

    # Nulls in every column (the four above included) become 'unknown'.
    for col in df_cleaning.columns:
        df_cleaning[col] = df_cleaning[col].fillna('unknown')

    def clean_location(loc):
        """Return a (country, state, city) tuple for one raw location value."""
        if pd.isna(loc) or loc in unspecified_values:
            return ("unknown", "unknown", "unknown")
        parts = [part.strip() for part in loc.split(',')]
        parts = [part if part not in unspecified_values else "unknown" for part in parts]
        # Pad with "unknown" if we don't have all three parts
        parts += ["unknown"] * (3 - len(parts))
        return (parts[0], parts[1], parts[2])

    location_parts = [clean_location(loc) for loc in df_cleaning['location']]
    df_cleaning['location_country'] = [parts[0] for parts in location_parts]
    df_cleaning['location_state'] = [parts[1] for parts in location_parts]
    df_cleaning['location_city'] = [parts[2] for parts in location_parts]

    def simplify_employment_type(x):
        """Map a raw employment type onto the reduced category set."""
        if pd.isna(x):
            return 'unknown'
        x = x.strip().lower()
        if x in ('full-time', 'part-time'):
            return x  # keep these separate
        if x in ('contract', 'temporary'):
            return 'non-permanent'
        # 'other', 'unknown', '' and anything unrecognised
        return 'unknown'

    df_cleaning['employment_type_clean'] = df_cleaning['employment_type'].apply(simplify_employment_type)

    def unknown_to_none(value):
        """Represent the 'unknown' placeholder as None inside the key."""
        return None if value == 'unknown' else value

    # The key is compared for exact equality by drop_duplicates, so a row with an
    # unknown state/city/employment type is only merged with rows where that same
    # field is also unknown. Building the keys with zip (instead of a row-wise
    # DataFrame.apply) yields the same tuples and also works on an empty frame.
    dedup_keys = [
        ((country, unknown_to_none(state), unknown_to_none(city)),
         title, description, requirements, unknown_to_none(employment))
        for country, state, city, title, description, requirements, employment in zip(
            df_cleaning['location_country'],
            df_cleaning['location_state'],
            df_cleaning['location_city'],
            df_cleaning['title'],
            df_cleaning['description'],
            df_cleaning['requirements'],
            df_cleaning['employment_type_clean'],
        )
    ]
    df_cleaning['dedup_key'] = pd.Series(dedup_keys, index=df_cleaning.index, dtype=object)

    return df_cleaning.drop_duplicates(subset=['dedup_key'])

In [27]:
# Deduplicate based on all columns except 'job_id'
# Checked in EDA, okay to drop duplicates this way


#Unused
def deduplicate(df):
    """Drop rows that are identical in every column other than ``job_id``.

    Prints the shape before and after, and returns the de-duplicated frame
    (the input frame itself is not modified).
    """
    compare_cols = []
    for name in df.columns:
        if name != 'job_id':
            compare_cols.append(name)

    unique_rows = df.drop_duplicates(subset=compare_cols)
    print(f"Original shape: {df.shape}, Deduplicated shape: {unique_rows.shape}")
    return unique_rows

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

def wordTDIDFfunc(col, ngram_range, max_features):
    """Fit a word-level TF-IDF vectorizer on ``col`` and return the transformed matrix.

    ``ngram_range`` and ``max_features`` are passed straight to
    ``TfidfVectorizer``; text is lower-cased. The fitted vectorizer is
    discarded, so it cannot be reused to transform other data.
    """
    vectorizer_options = dict(ngram_range=ngram_range, max_features=max_features, lowercase=True)
    return TfidfVectorizer(**vectorizer_options).fit_transform(col)


def charTDIDFfunc(col, ngram_range, max_features):
    """Fit a character-level TF-IDF vectorizer on ``col`` and return the transformed matrix.

    ``ngram_range`` and ``max_features`` are passed straight to
    ``TfidfVectorizer`` with ``analyzer='char'``. The fitted vectorizer is
    discarded, so it cannot be reused to transform other data.
    """
    vectorizer_options = dict(analyzer='char', ngram_range=ngram_range, max_features=max_features)
    return TfidfVectorizer(**vectorizer_options).fit_transform(col)

def addAllWordTDIDF(df, ngram_range, max_features, text_cols=('title', 'company_profile', 'description', 'requirements', 'benefits')):
    """Build word-level TF-IDF features for several text columns and stack them side by side.

    Each column in ``text_cols`` gets its own vectorizer (via ``wordTDIDFfunc``),
    so ``max_features`` is a per-column limit and the result has up to
    ``len(text_cols) * max_features`` columns, ordered as in ``text_cols``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``text_cols``.
    ngram_range : tuple
        Forwarded to ``wordTDIDFfunc``.
    max_features : int
        Forwarded to ``wordTDIDFfunc``.
    text_cols : iterable of str
        Columns to vectorize. The default is an immutable tuple so the
        default value cannot be altered between calls.

    Returns
    -------
    The horizontally stacked matrix produced by ``scipy.sparse.hstack``.
    """
    per_column_matrices = [wordTDIDFfunc(df[col], ngram_range, max_features) for col in text_cols]
    return hstack(per_column_matrices)


def addAllCharTDIDF(df, ngram_range, max_features, text_cols=('title', 'company_profile', 'description', 'requirements', 'benefits')):
    """Build character-level TF-IDF features for several text columns and stack them side by side.

    Each column in ``text_cols`` gets its own vectorizer (via ``charTDIDFfunc``),
    so ``max_features`` is a per-column limit and the result has up to
    ``len(text_cols) * max_features`` columns, ordered as in ``text_cols``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``text_cols``.
    ngram_range : tuple
        Forwarded to ``charTDIDFfunc``.
    max_features : int
        Forwarded to ``charTDIDFfunc``.
    text_cols : iterable of str
        Columns to vectorize. The default is an immutable tuple so the
        default value cannot be altered between calls.

    Returns
    -------
    The horizontally stacked matrix produced by ``scipy.sparse.hstack``.
    """
    per_column_matrices = [charTDIDFfunc(df[col], ngram_range, max_features) for col in text_cols]
    return hstack(per_column_matrices)
    
"""# word n-grams (1-2)
word_tfidf = TfidfVectorizer(ngram_range=(1,2), max_features=1000, lowercase=True)

# char n-grams (3-5)
char_tfidf = TfidfVectorizer(analyzer='char', ngram_range=(3,5), max_features=1000)

# Fit & transform per field
title_word_tfidf = word_tfidf.fit_transform(df['title'])
company_profile_word_tfidf = word_tfidf.fit_transform(df['company_profile'])
derscription_word_tfidf = word_tfidf.fit_transform(df['description'])
requirements_word_tfidf = word_tfidf.fit_transform(df['requirements'])
benefits_word_tfidf = word_tfidf.fit_transform(df['benefits'])

title_char_tfidf = char_tfidf.fit_transform(df['title'])
company_profile_char_tfidf = char_tfidf.fit_transform(df['company_profile'])
derscription_char_tfidf = char_tfidf.fit_transform(df['description'])
requirements_char_tfidf = char_tfidf.fit_transform(df['requirements'])
benefits_char_tfidf = char_tfidf.fit_transform(df['benefits'])"""

"# word n-grams (1-2)\nword_tfidf = TfidfVectorizer(ngram_range=(1,2), max_features=1000, lowercase=True)\n\n# char n-grams (3-5)\nchar_tfidf = TfidfVectorizer(analyzer='char', ngram_range=(3,5), max_features=1000)\n\n# Fit & transform per field\ntitle_word_tfidf = word_tfidf.fit_transform(df['title'])\ncompany_profile_word_tfidf = word_tfidf.fit_transform(df['company_profile'])\nderscription_word_tfidf = word_tfidf.fit_transform(df['description'])\nrequirements_word_tfidf = word_tfidf.fit_transform(df['requirements'])\nbenefits_word_tfidf = word_tfidf.fit_transform(df['benefits'])\n\ntitle_char_tfidf = char_tfidf.fit_transform(df['title'])\ncompany_profile_char_tfidf = char_tfidf.fit_transform(df['company_profile'])\nderscription_char_tfidf = char_tfidf.fit_transform(df['description'])\nrequirements_char_tfidf = char_tfidf.fit_transform(df['requirements'])\nbenefits_char_tfidf = char_tfidf.fit_transform(df['benefits'])"

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
from tqdm import tqdm  # optional: shows progress bar for large DataFrames

def sentenceEmbedding(df, text_col='text_column', device=None):
    """Add mean-pooled DistilBERT embeddings of ``df[text_col]`` to ``df``.

    Two columns are written onto the frame that was passed in (it is modified
    in place and also returned):

    * ``text_embedding``      - one torch tensor per row
    * ``text_embedding_list`` - the same values as plain Python lists

    Rows whose text is null, not a string, or blank get an all-zero vector of
    the model's hidden size. Texts are encoded one at a time and truncated to
    512 tokens.

    Parameters
    ----------
    df : pandas.DataFrame
    text_col : str
        Name of the column holding the text to embed.
    device : torch.device or None
        Where to run the model; when None, CUDA is used if available, else CPU.
    """
    checkpoint = "distilbert-base-uncased"
    # Tokenizer and model are loaded once per call, not once per row.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModel.from_pretrained(checkpoint)

    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()  # inference mode

    def get_embedding(text):
        """Return the mean of the last hidden states for one text."""
        has_text = (not pd.isna(text)) and isinstance(text, str) and text.strip() != ""
        if not has_text:
            return torch.zeros(model.config.hidden_size, device=device)

        encoded = tokenizer(
            text, return_tensors="pt", truncation=True, padding=True, max_length=512
        ).to(device)

        with torch.no_grad():
            token_states = model(**encoded).last_hidden_state

        # Average over the token axis, then drop the size-1 batch axis.
        return token_states.mean(dim=1).squeeze()

    tqdm.pandas(desc="Generating embeddings")  # registers progress_apply
    row_embeddings = df[text_col].progress_apply(get_embedding)
    df["text_embedding"] = row_embeddings

    # List form is convenient for storage or export.
    df["text_embedding_list"] = df["text_embedding"].apply(lambda x: x.cpu().tolist())

    return df


In [30]:
# Counts of regex text patterns (URLs, emails, phone numbers, money symbols, other symbols)
# Columns: 'description', 'company_profile', 'requirements', 'benefits'
import re

def addWordPatterns(df, text_cols=('description', 'company_profile', 'requirements', 'benefits')):
    """Append per-column counts of URL / e-mail / phone / symbol patterns.

    For every column in ``text_cols`` five integer columns are added, named
    ``<column>_urls``, ``<column>_emails``, ``<column>_phone_numbers``,
    ``<column>_money_symbols`` and ``<column>_other_symbols``. Cells that are
    not strings (e.g. NaN) count as empty text, i.e. all zeros.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``text_cols``.
    text_cols : iterable of str
        Text columns to scan; the default matches the four columns this
        function has always processed, in the same order.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``df`` followed by the count columns. ``df`` itself is
        not modified.
    """
    patterns = {
        'urls': r'http[s]?://\S+',
        # TLD class is [A-Za-z]; the previous [A-Z|a-z] also accepted a literal '|'.
        'emails': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
        'phone_numbers': r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',
        'money_symbols': r'[$€£¥]',
        'other_symbols': r'[©®™]',
    }
    # Compile once instead of on every cell.
    compiled = {name: re.compile(pattern) for name, pattern in patterns.items()}

    def count_patterns(text):
        """Return {pattern name: number of matches} for one cell."""
        if not isinstance(text, str):
            text = ""
        return {name: len(regex.findall(text)) for name, regex in compiled.items()}

    count_frames = []
    for col in text_cols:
        # Explicit columns keep the schema stable even when df has no rows.
        counts = pd.DataFrame(
            [count_patterns(text) for text in df[col]],
            index=df.index,
            columns=list(patterns),
        ).astype(int).add_prefix(f'{col}_')
        count_frames.append(counts)

    return pd.concat([df] + count_frames, axis=1)


In [31]:
# Suspicious word counts for job descriptions/company (using heuristics)

def addSuspiciousWordCount(df):
    """Add ``<column>_suspicious_word_count`` for four text columns.

    The count is the total number of (case-insensitive, substring) occurrences
    of the phrases in ``suspicious_keywords`` within the cell. Cells that are
    not strings (e.g. NaN) count as 0 instead of raising.

    The columns are written onto the frame that was passed in (it is modified
    in place) and the same frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``description``, ``company_profile``, ``benefits`` and
        ``requirements``.
    """
    suspicious_keywords = [
        'quick money', 'easy money', 'high pay', 'earn fast', 'upfront fee',
        'no experience required', 'work from home', 'bonus', 'registration fee',
        'processing fee', 'pay before start', 'investment required', 'fee upfront',
        'deposit required', 'apply now', 'limited spots', 'immediate start',
        'act fast', 'urgent', 'deadline', 'be your own boss', 'flexible hours',
        'online opportunity', 'training provided', 'click here', 'get rich'
    ]

    def count_suspicious_words(text, keywords):
        """Total substring hits of ``keywords`` in ``text`` (0 for non-strings)."""
        if not isinstance(text, str):
            return 0
        text_lower = text.lower()
        return sum(text_lower.count(keyword) for keyword in keywords)

    # Same columns, in the same order, as the original four assignments.
    for col in ('description', 'company_profile', 'benefits', 'requirements'):
        df[f'{col}_suspicious_word_count'] = df[col].apply(
            lambda x: count_suspicious_words(x, suspicious_keywords)
        )

    return df

In [32]:
# One hot encoding for categorical variables

def oneHotEncoding(df, categorical_cols):
    """Append one-hot indicator columns for ``categorical_cols``.

    Nulls are treated as the category 'unknown' and all values are converted
    to strings before encoding. Indicator columns are named
    ``<column>__<value>``. The original columns are kept; a new frame is
    returned.
    """
    as_text = df[categorical_cols].fillna('unknown').astype(str)
    indicators = pd.get_dummies(
        as_text,
        prefix=categorical_cols,
        prefix_sep='__',
        dummy_na=False,
    )
    return pd.concat([df, indicators], axis=1)

In [33]:
# Parse Salary
def parse_salary_range(df):
    """Split ``df['salary_range']`` into ``salary_min`` and ``salary_max``.

    Accepted forms are a single integer ("40000" -> min = max = 40000) and
    "<low>-<high>" where each side is either all digits or a month
    abbreviation found in ``MONTH_MAP`` (e.g. "Oct-20" -> 10, 20). Anything
    else, including nulls and 'unknown', yields missing values for both
    columns.

    The two columns are written onto the frame that was passed in, which is
    also returned.
    """
    MONTH_MAP = {
        'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
        'jul': 7, 'aug': 8, 'sep': 9, 'sept': 9, 'oct': 10, 'nov': 11, 'dec': 12
    }
    NO_SALARY = (None, None)

    def to_number(token):
        """Digits -> int; month abbreviation -> month number; otherwise None."""
        if token.isdigit():
            return int(token)
        return MONTH_MAP.get(token.lower())

    def parse_salary(s):
        """Return a (min, max) tuple for one raw salary value."""
        if pd.isna(s) or str(s).lower() == 'unknown':
            return NO_SALARY

        s = str(s).strip()
        if '-' in s:
            # Split on the first dash only.
            low_text, _, high_text = s.partition('-')
            low = to_number(low_text.strip())
            high = to_number(high_text.strip())
            if low is None or high is None:
                return NO_SALARY
            return (low, high)

        # No dash: a single figure serves as both bounds.
        try:
            amount = int(s)
        except ValueError:
            return NO_SALARY
        return (amount, amount)

    bounds = df['salary_range'].apply(lambda x: pd.Series(parse_salary(x)))
    df[['salary_min', 'salary_max']] = bounds

    return df


In [35]:
# Load the raw postings; the path is relative to the notebook's working directory.
df = pd.read_csv('../fake_job_postings.csv')

# Column types via EDA
# All three lists are derived from the RAW frame (before cleaning), using the
# number of distinct values per column: exactly 2 -> binary, 3..149 -> categorical.
# binary_cols is not used again in this cell.
binary_cols = [col for col in df.columns if df[col].nunique() == 2]
categorical_cols = [col for col in df.columns if 2 < df[col].nunique() < 150]
# Remaining object-dtype columns are treated as free text.
text_cols = [col for col in df.columns if df[col].dtype == 'object' and col not in categorical_cols + ['job_id']]

# Feature pipeline. Each step takes the frame and returns it with extra columns;
# `df` is rebound at every step, so re-running this cell requires reloading from the CSV above.
df = cleanAndDeduplicate(df)
df = addWordPatterns(df)
df = addSuspiciousWordCount(df)
df = oneHotEncoding(df, categorical_cols)
df = parse_salary_range(df)

# TF-IDF settings
# MAX_FEATURES is passed to a separate vectorizer for every text column, so it is a
# per-column limit rather than a limit on the combined matrix.
WORD_NGRAM = (1, 2)
CHAR_NGRAM = (3, 5)
MAX_FEATURES = 1000

word_tfidf_matrix = addAllWordTDIDF(df, WORD_NGRAM, MAX_FEATURES)
char_tfidf_matrix = addAllCharTDIDF(df, CHAR_NGRAM, MAX_FEATURES)
# Word features first, then char features, side by side.
combined_tfidf = hstack([word_tfidf_matrix, char_tfidf_matrix])

# Sentence Embedding
# Slow step: the recorded run below took about 24 minutes for 17479 rows.
df = sentenceEmbedding(df, text_col='description')

Generating embeddings: 100%|██████████| 17479/17479 [23:54<00:00, 12.19it/s]


In [36]:
# Drop Raw Columns
# Remove the raw categorical and free-text columns now that features were derived from them.
# NOTE(review): helper columns created during cleaning (e.g. dedup_key, location_*,
# employment_type_clean) are not in to_drop and therefore remain in df_dropped —
# dedup_key is visible in the printed output below. Confirm this is intended.
to_drop = categorical_cols + text_cols
df_dropped = df.drop(columns=to_drop)

# Combine
# The TF-IDF columns get positional names (tfidf_0, tfidf_1, ...); the vocabulary terms are not kept.
tfidf_feature_names = [f"tfidf_{i}" for i in range(combined_tfidf.shape[1])]
# .toarray() densifies the whole matrix (rows x TF-IDF columns) in memory.
# Reusing df_dropped.index keeps rows aligned in the concat, since de-duplication did not reset the index.
tfidf_df = pd.DataFrame(combined_tfidf.toarray(), columns=tfidf_feature_names, index=df_dropped.index)
df_combined = pd.concat([df_dropped, tfidf_df], axis=1)

print(df_combined.head())
print("New df shape:", df_combined.shape)

   job_id  telecommuting  has_company_logo  has_questions  fraudulent  \
0       1              0                 1              0           0   
1       2              0                 1              0           0   
2       3              0                 1              0           0   
3       4              0                 1              0           0   
4       5              0                 1              1           0   

  location_country location_state location_city employment_type_clean  \
0               US             NY      New York               unknown   
1               NZ        unknown      Auckland             full-time   
2               US             IA         Wever               unknown   
3               US             DC    Washington             full-time   
4               US             FL    Fort Worth             full-time   

                                           dedup_key  ...  tfidf_9990  \
0  ((US, NY, New York), Marketing Intern, Food52,