**Feature Engineering Pipeline**
Steps:
1. Standardize & Clean Text
2. Stopword Removal
3. Lemmatization
4. Lexical Features
5. TF-IDF (Word + Char)
6. Embeddings (DistilBERT)
7. Structured & Pattern Features

In [22]:
import pandas as pd
import numpy as np
import re
import torch
from tqdm import tqdm
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from transformers import AutoTokenizer, AutoModel

# NLP preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")

lemmatizer = WordNetLemmatizer()
STOPWORDS = set(stopwords.words('english'))
LEMMATIZER = WordNetLemmatizer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tingw\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tingw\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tingw\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\tingw\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [23]:
# Cleaning Employment Type for deduplication

def cleanAndDeduplicate(df):

    df_cleaning = df.copy()

    # Clean columns by replacing nulls and unspecified values with 'unknown'
    cols_to_clean = ['required_experience', 'required_education', 'industry', 'function']
    unspecified_values = ['Not Applicable','NaN','not applicable', 'Unspecified', 'Other','Others','none', 'na', 'n/a', '', ' ', None]

    for col in cols_to_clean:
        df_cleaning[col] = df_cleaning[col].replace(unspecified_values, 'unknown')
        df_cleaning[col] = df_cleaning[col].fillna('unknown')

    for col in df_cleaning.columns:
        df_cleaning[col] = df_cleaning[col].fillna('unknown')

    # Adding columns for location cleaning: [country, state, city]
    def clean_location(loc):
        if pd.isna(loc) or loc in unspecified_values:
            return ("unknown", "unknown", "unknown")
        parts = loc.split(',')
        parts = [part.strip() if part.strip() not in unspecified_values else "unknown" for part in parts]
        # Pad with "unknown" if we don't have all three parts
        while len(parts) < 3:
            parts.append("unknown")
        return (parts[0], parts[1], parts[2])
    df_cleaning_loc = df_cleaning['location'].apply(clean_location)
    df_cleaning['location_country'] = df_cleaning_loc.apply(lambda x: x[0])
    df_cleaning['location_state'] = df_cleaning_loc.apply(lambda x: x[1])
    df_cleaning['location_city'] = df_cleaning_loc.apply(lambda x: x[2])

    def simplify_employment_type(x):
        if pd.isna(x):
            return 'unknown'
        x = x.strip().lower()
        if x in ['full-time', 'part-time']:
            return x  # keep these separate
        elif x in ['contract', 'temporary']:
            return 'non-permanent'
        elif x in ['other', 'unknown', '']:
            return 'unknown'
        else:
            return 'unknown'
    df_cleaning['employment_type_clean'] = df_cleaning['employment_type'].apply(simplify_employment_type)

    # Define a new function to compare locations accounting for unknowns
    def compare_locations(row1, row2):
        # Compare countries first
        if row1['location_country'] is None or row2['location_country'] is None:
            return True
        if row1['location_country'] != row2['location_country']:
            return False
        # If countries match, compare states (unless either is unknown)
        if (row1['location_state'] != 'unknown' and 
            row2['location_state'] != 'unknown' and 
            row1['location_state'] != row2['location_state']):
            return False
        # If states match or either is unknown, compare cities (unless either is unknown)
        if (row1['location_city'] != 'unknown' and 
            row2['location_city'] != 'unknown' and 
            row1['location_city'] != row2['location_city']):
            return False
        return True

    # Create a new comparison key function that uses the location comparison
    def comparison_key(row):
        # Get location info with handling for unknown values
        location_info = (
            row['location_country'],
            row['location_state'] if row['location_state'] != 'unknown' else None,
            row['location_city'] if row['location_city'] != 'unknown' else None
        )
        
        # Get employment type, None if unknown
        emp = None if row['employment_type_clean'] == 'unknown' else row['employment_type_clean']
        
        # Return tuple with all comparison fields
        return (location_info, row['title'], row['description'], row['requirements'], emp)

    df_cleaning['dedup_key'] = df_cleaning.apply(comparison_key, axis=1)
    df = df_cleaning.drop_duplicates(subset=['dedup_key'])
    return df

In [24]:
# Helpers

def check_corpus(df, text_cols, top_n=50):
    corpus_stats = {}
    
    for col in text_cols:
        tqdm.pandas(desc=f"Analyzing {col}")
        
        # Combine all text in the column
        all_text = df[col].fillna("").astype(str).progress_apply(lambda x: x.lower())
        
        # Tokenize
        tokens = [token for text in all_text for token in word_tokenize(text) if len(token) > 2]
        
        # Count
        counts = Counter(tokens)
        
        # Most common
        corpus_stats[col] = counts.most_common(top_n)
    
    print(f"Corpus size (number of unique tokens): {len(set(tokens))}")
    
    return corpus_stats

In [35]:
# 1. Standardize and Clean
def normalize_text(text: str) -> str:
    if not isinstance(text, str) or not text.strip():
        return ""

    text = text.lower().strip()
    text = re.sub(r"http\S+|www\S+", "", text)  # remove URLs
    text = re.sub(r"[^a-z\s']", " ", text)  # remove punctuation/numbers except apostrophes
    text = re.sub(r"\s+", " ", text).strip()

    return text

def apply_text_normalization(df, text_cols):
    
    for col in text_cols:
        tqdm.pandas(desc=f"Normalizing {col}")
        df[col] = df[col].progress_apply(normalize_text)
    return df

df = pd.read_csv('../fake_job_postings.csv')

raw_corpus = check_corpus(df, ['title', 'description', 'requirements', 'benefits', 'company_profile'], top_n=50)

df = apply_text_normalization(df, ['title', 'description', 'requirements', 'benefits', 'company_profile'])

Analyzing title: 100%|██████████| 17880/17880 [00:00<00:00, 1655025.17it/s]
Analyzing description: 100%|██████████| 17880/17880 [00:00<00:00, 155398.03it/s]
Analyzing requirements: 100%|██████████| 17880/17880 [00:00<00:00, 315402.32it/s]
Analyzing benefits: 100%|██████████| 17880/17880 [00:00<00:00, 587935.84it/s]
Analyzing company_profile: 100%|██████████| 17880/17880 [00:00<00:00, 283695.88it/s]


Corpus size (number of unique tokens): 18468


Normalizing title: 100%|██████████| 17880/17880 [00:00<00:00, 275735.10it/s]
Normalizing description: 100%|██████████| 17880/17880 [00:01<00:00, 15752.99it/s]
Normalizing requirements: 100%|██████████| 17880/17880 [00:00<00:00, 33603.84it/s]
Normalizing benefits: 100%|██████████| 17880/17880 [00:00<00:00, 82402.65it/s]
Normalizing company_profile: 100%|██████████| 17880/17880 [00:00<00:00, 30031.74it/s]


In [36]:
# 2. Remove stopwords
def remove_stopwords_df(df, text_cols):
    """
    Remove English stopwords from multiple text columns in a DataFrame.
    """
    def remove_stopwords_text(text):
        if not isinstance(text, str):
            return ""
        tokens = word_tokenize(text.lower())
        clean_tokens = [t for t in tokens if t not in STOPWORDS and len(t) > 2]
        return " ".join(clean_tokens)
    
    for col in text_cols:
        tqdm.pandas(desc=f"Removing stopwords in {col}")
        df[col] = df[col].progress_apply(remove_stopwords_text)
    
    return df

df = remove_stopwords_df(df, ['title', 'description', 'requirements', 'benefits', 'company_profile'])


def lemmatize_df(df, text_cols):
    """
    Lemmatize multiple text columns in a DataFrame.
    """
    def lemmatize_text(text):
        if not isinstance(text, str):
            return ""
        tokens = word_tokenize(text.lower())
        lemmas = [lemmatizer.lemmatize(t) for t in tokens if len(t) > 2]
        return " ".join(lemmas)
    
    for col in text_cols:
        tqdm.pandas(desc=f"Lemmatizing {col}")
        df[col] = df[col].progress_apply(lemmatize_text)
    
    return df

df = lemmatize_df(df, ['title', 'description', 'requirements', 'benefits', 'company_profile'])

cleaned_corpus = check_corpus(df, ['title', 'description', 'requirements', 'benefits', 'company_profile'], top_n=50)

Removing stopwords in title: 100%|██████████| 17880/17880 [00:00<00:00, 45628.37it/s]
Removing stopwords in description: 100%|██████████| 17880/17880 [00:06<00:00, 2624.27it/s]
Removing stopwords in requirements: 100%|██████████| 17880/17880 [00:03<00:00, 5342.96it/s]
Removing stopwords in benefits: 100%|██████████| 17880/17880 [00:01<00:00, 14042.50it/s]
Removing stopwords in company_profile: 100%|██████████| 17880/17880 [00:03<00:00, 4948.84it/s]
Lemmatizing title: 100%|██████████| 17880/17880 [00:00<00:00, 33611.47it/s]
Lemmatizing description: 100%|██████████| 17880/17880 [00:09<00:00, 1970.79it/s]
Lemmatizing requirements: 100%|██████████| 17880/17880 [00:04<00:00, 3866.94it/s]
Lemmatizing benefits: 100%|██████████| 17880/17880 [00:01<00:00, 10569.49it/s]
Lemmatizing company_profile: 100%|██████████| 17880/17880 [00:04<00:00, 3763.36it/s]
Analyzing title: 100%|██████████| 17880/17880 [00:00<00:00, 1730967.24it/s]
Analyzing description: 100%|██████████| 17880/17880 [00:00<00:00, 92

Corpus size (number of unique tokens): 12650


In [27]:
# PATTERN FEATURES

def addWordPatterns(df):    
    patterns = {
        'urls': r'http[s]?://\S+',
        'emails': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',
        'phone_numbers': r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',
        'money_symbols': r'[$€£¥]',
        'other_symbols': r'[©®™]',
    }
    
    def count_patterns(text, patterns):
        # Ensure text is a string
        if not isinstance(text, str):
            text = ""
        return {k: len(re.findall(pat, text)) for k, pat in patterns.items()}

    for col in ['description', 'company_profile', 'requirements', 'benefits']:
        feat_df = pd.DataFrame(list(df[col].apply(count_patterns)), index=df.index).add_prefix(f'{col}_')
        df = pd.concat([df, feat_df], axis=1)

    return df



In [28]:
# One-Hot Encoding Categorical Variables
def one_hot_encode(df, categorical_cols):
    df = df.copy()
    ohe = pd.get_dummies(df[categorical_cols].fillna('unknown'), prefix=categorical_cols)
    return pd.concat([df, ohe], axis=1)

In [29]:
# 6. TF-IDF Feature Extraction

def build_tfidf(df, text_cols, word_ngrams=(1, 2), char_ngrams=(3, 5), max_features=2000):
    tfidf_blocks = []
    word_vectorizer = TfidfVectorizer(ngram_range=word_ngrams, max_features=max_features)
    char_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=char_ngrams, max_features=max_features)

    for col in text_cols:
        word_vec = word_vectorizer.fit_transform(df[col].fillna(''))
        char_vec = char_vectorizer.fit_transform(df[col].fillna(''))
        tfidf_blocks.extend([word_vec, char_vec])

    combined = hstack(tfidf_blocks)
    return combined

In [30]:
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
from tqdm import tqdm  # optional: shows progress bar for large DataFrames

def sentenceEmbedding(df, text_col='text_column', device=None):
    # Load DistilBERT tokenizer & model once
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    model = AutoModel.from_pretrained("distilbert-base-uncased")

    # Use GPU if available, I have AMD GPU :(
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()  # set to inference mode

    def get_embedding(text):
        if pd.isna(text) or not isinstance(text, str) or text.strip() == "":
            return torch.zeros(model.config.hidden_size, device=device)
        
        inputs = tokenizer(
            text, return_tensors="pt", truncation=True, padding=True, max_length=512
        ).to(device)

        with torch.no_grad():
            outputs = model(**inputs)
            
        embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
        return embeddings

    tqdm.pandas(desc="Generating embeddings")  # progress bar
    df["text_embedding"] = df[text_col].progress_apply(get_embedding)

    # Optional: convert to list for storage or export
    df["text_embedding_list"] = df["text_embedding"].apply(lambda x: x.cpu().tolist())

    return df


In [31]:
# Suscpicious word counts for job descriptions/company (using heuristics)

def addSuspiciousWordCount(df):

    suspicious_keywords = [
        'quick money', 'easy money', 'high pay', 'earn fast', 'upfront fee',
        'no experience required', 'work from home', 'bonus', 'registration fee',
        'processing fee', 'pay before start', 'investment required', 'fee upfront',
        'deposit required', 'apply now', 'limited spots', 'immediate start',
        'act fast', 'urgent', 'deadline', 'be your own boss', 'flexible hours',
        'online opportunity', 'training provided', 'click here', 'get rich'
    ]
    def count_suspicious_words(text, keywords):
        text_lower = text.lower()
        return sum(text_lower.count(keyword) for keyword in keywords)  
    # df['description_suspicious_word_count'] = df['description'].apply(lambda x: count_suspicious_words(x, suspicious_keywords))
    # df['company_profile_suspicious_word_count'] = df['company_profile'].apply(lambda x: count_suspicious_words(x, suspicious_keywords))
    description_suspicious_word_count = df['description'].apply(lambda x: count_suspicious_words(x, suspicious_keywords))
    company_profile_suspicious_word_count = df['company_profile'].apply(lambda x: count_suspicious_words(x, suspicious_keywords))
    benefits_suspicious_word_count = df['benefits'].apply(lambda x: count_suspicious_words(x, suspicious_keywords))
    requirements_suspicious_word_count = df['requirements'].apply(lambda x: count_suspicious_words(x, suspicious_keywords))

    df['description_suspicious_word_count'] = description_suspicious_word_count
    df['company_profile_suspicious_word_count'] = company_profile_suspicious_word_count
    df['benefits_suspicious_word_count'] = benefits_suspicious_word_count
    df['requirements_suspicious_word_count'] = requirements_suspicious_word_count
    
    return df

In [32]:
# One hot encoding for categorical variables

def oneHotEncoding(df, categorical_cols):
    ohe = pd.get_dummies(df[categorical_cols].fillna('unknown').astype(str), prefix=categorical_cols, prefix_sep='__', dummy_na=False)
    df = pd.concat([df, ohe], axis=1)
    return df

In [33]:
# Parse Salary
def parse_salary_range(df):
    MONTH_MAP = {
    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
    'jul': 7, 'aug': 8, 'sep': 9, 'sept': 9, 'oct': 10, 'nov': 11, 'dec': 12
}   
    def parse_salary(s):
        if pd.isna(s) or str(s).lower() == 'unknown':
            return (None, None)

        s = str(s).strip()
        if '-' not in s:
            try: return (int(s), int(s))
            except ValueError: return (None, None)

        left, right = [v.strip() for v in s.split('-', 1)]

        def val(v):
            return int(v) if v.isdigit() else MONTH_MAP.get(v.lower())

        l_val, r_val = val(left), val(right)
        return (l_val, r_val) if l_val is not None and r_val is not None else (None, None)

    df[['salary_min', 'salary_max']] = df['salary_range'].apply(lambda x: pd.Series(parse_salary(x)))
    
    return df


In [34]:
df = pd.read_csv('../fake_job_postings.csv')

# Colum types via EDA
binary_cols = [col for col in df.columns if df[col].nunique() == 2]
categorical_cols = [col for col in df.columns if 2 < df[col].nunique() < 150]
text_cols = [col for col in df.columns if df[col].dtype == 'object' and col not in categorical_cols + ['job_id']]

df = cleanAndDeduplicate(df)
df = addWordPatterns(df)
df = addSuspiciousWordCount(df)
df = oneHotEncoding(df, categorical_cols)
df = parse_salary_range(df)

# TDIDF
WORD_NGRAM = (1, 2)
CHAR_NGRAM = (3, 5)
MAX_FEATURES = 1000

word_tfidf_matrix = addAllWordTDIDF(df, WORD_NGRAM, MAX_FEATURES)
char_tfidf_matrix = addAllCharTDIDF(df, CHAR_NGRAM, MAX_FEATURES)
combined_tfidf = hstack([word_tfidf_matrix, char_tfidf_matrix])

# Sentence Embedding
df = sentenceEmbedding(df, text_col='description')

TypeError: addWordPatterns.<locals>.count_patterns() missing 1 required positional argument: 'patterns'

In [None]:
# Drop Raw Columns
to_drop = categorical_cols + text_cols
df_dropped = df.drop(columns=to_drop)

# Combine
tfidf_feature_names = [f"tfidf_{i}" for i in range(combined_tfidf.shape[1])]
tfidf_df = pd.DataFrame(combined_tfidf.toarray(), columns=tfidf_feature_names, index=df_dropped.index)
df_combined = pd.concat([df_dropped, tfidf_df], axis=1)

print(df_combined.head())
print("New df shape:", df_combined.shape)

   job_id  telecommuting  has_company_logo  has_questions  fraudulent  \
0       1              0                 1              0           0   
1       2              0                 1              0           0   
2       3              0                 1              0           0   
3       4              0                 1              0           0   
4       5              0                 1              1           0   

  location_country location_state location_city employment_type_clean  \
0               US             NY      New York               unknown   
1               NZ        unknown      Auckland             full-time   
2               US             IA         Wever               unknown   
3               US             DC    Washington             full-time   
4               US             FL    Fort Worth             full-time   

                                           dedup_key  ...  tfidf_9990  \
0  ((US, NY, New York), Marketing Intern, Food52,