### Best Model and Imbalance Handling
#### From `notebooks\3_model_exploration.ipynb`
- SMOTE + XGBoost

### Steps
1. reproduce the feature engineering again in each K-fold
2. train and preprocess properly to ensure no data leakage
3. evaluation

In [1]:
import numpy as np
import pandas as pd

from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import hstack

# NLP preprocessing
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download("punkt")
nltk.download('punkt_tab')
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")
import wordninja

lemmatizer = WordNetLemmatizer()
STOPWORDS = set(stopwords.words('english'))
LEMMATIZER = WordNetLemmatizer()

TEXT_COLS = ['title', 'description', 'requirements', 'benefits', 'company_profile']
CATEGORICAL_COLS = ['employment_type', 'required_experience', 'required_education', 'industry', 'function', 'location', 'department']
BINARY_COLS = ['telecommuting', 'has_company_logo', 'has_questions', 'fraudulent']

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alden\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\alden\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alden\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\alden\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\alden\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
def handle_nulls_upfront(df):
    """
    Handle all null values upfront before any processing.
    This ensures consistent null handling throughout the pipeline.
    
    Strategy:
    - Text columns → empty string ""
    - Categorical columns → "unknown"
    - salary_range → "unknown"
    - Binary columns → keep as-is (will be handled during model training)
    """
    df_clean = df.copy()
    
    # Text columns: replace with empty string
    for col in TEXT_COLS:
        if col in df_clean.columns:
            df_clean[col] = df_clean[col].fillna('').astype(str)
            # Also handle explicit null-like strings
            df_clean[col] = df_clean[col].replace(['nan', 'NaN', 'None', 'none'], '')
    
    # Categorical columns: replace with 'unknown'
    for col in CATEGORICAL_COLS:
        if col in df_clean.columns:
            df_clean[col] = df_clean[col].fillna('unknown').astype(str)
            # Handle various null-like values
            null_like = ['Not Applicable', 'NaN', 'not applicable', 'Unspecified', 
                        'Other', 'Others', 'none', 'na', 'n/a', '', ' ', 'nan', 'None']
            df_clean[col] = df_clean[col].replace(null_like, 'unknown')
    
    # Salary range: replace with 'unknown'
    if 'salary_range' in df_clean.columns:
        df_clean['salary_range'] = df_clean['salary_range'].fillna('unknown').astype(str)
    
    return df_clean


def validate_nulls(df, stage_name=""):
    """
    Validate that no unexpected nulls exist after processing.
    Reports null counts and returns boolean indicating if nulls were found.
    """
    null_counts = df.isnull().sum()
    has_nulls = null_counts.any()
    
    if has_nulls:
        print(f"Nulls found after {stage_name}:")
        print(null_counts[null_counts > 0])
        return False
    else:
        print(f"No nulls found after {stage_name}")
        return True
    
def cleanAndDeduplicate(df):
    df_cleaning = df.copy()

    def simplify_employment_type(x):
        if not isinstance(x, str) or x == 'unknown':
            return 'unknown'
        
        x = x.strip().lower()
        if x in ['full-time', 'part-time']:
            return x
        elif x in ['contract', 'temporary']:
            return 'non-permanent'
        else:
            return 'unknown'
    
    df_cleaning['employment_type_clean'] = df_cleaning['employment_type'].apply(simplify_employment_type)

    def comparison_key(row):
        emp = None if row['employment_type_clean'] == 'unknown' else row['employment_type_clean']
        return (row['location'], row['title'], row['description'], row['requirements'], emp)

    df_cleaning['dedup_key'] = df_cleaning.apply(comparison_key, axis=1)
    df_deduped = df_cleaning.drop_duplicates(subset=['dedup_key'])
    
    print(f"Removed {len(df_cleaning) - len(df_deduped)} duplicate rows")
    return df_deduped

def check_corpus(df, text_cols):
    corpus_stats = {}

    for col in text_cols:
        texts = df[col].fillna("").astype(str).str.lower().tolist()

        tokens = []
        for t in texts:
            tokens.extend([w for w in word_tokenize(t) if len(w) > 2])

        corpus_stats[col] = len(set(tokens))

    print(corpus_stats)
    return corpus_stats

def apply_text_normalization(df, text_cols):
    """
    Normalize text: lowercase, remove URLs, punctuation, extra whitespace.
    
    Note: Assumes nulls have been handled - all text should be strings.
    Empty strings remain empty and are handled properly.
    """
    def normalize_text(text: str) -> str:
        # Handle empty strings (from nulls)
        if not text or not text.strip():
            return ""

        text = text.lower().strip()
        text = re.sub(r"http\S+|www\S+", " ", text)  # remove URLs
        text = re.sub(r"[^a-z\s']", " ", text)  # remove punctuation/numbers except apostrophes
        text = re.sub(r"\s+", " ", text).strip()

        return text
    
    for col in text_cols:
        tqdm.pandas(desc=f"Normalizing {col}")
        df[col] = df[col].progress_apply(normalize_text)
    
    print("Text normalization complete")
    return df


def apply_split_df(df, text_cols):
    """
    Split CamelCase and joined words using wordninja.
    Examples: 'SmartContract' -> 'Smart Contract', 'makemoney' -> 'make money'
    """
    def split_camel_case(token):
        """Splits CamelCase tokens: 'SmartContract' -> ['Smart', 'Contract']"""
        return re.sub('([a-z])([A-Z])', r'\1 \2', token).split()

    def split_joined_words(text, min_len=10):
        # Handle empty strings
        if not text or not text.strip():
            return ""
        
        tokens = text.split()
        new_tokens = []

        for token in tokens:
            # Skip short tokens
            if len(token) < min_len:
                new_tokens.append(token)
                continue

            # 1. Try CamelCase split
            camel_split = split_camel_case(token)

            if len(camel_split) > 1:
                # After splitting CamelCase, apply wordninja to each part
                final_parts = []
                for part in camel_split:
                    wn = wordninja.split(part)
                    final_parts.extend(wn)
                new_tokens.extend(final_parts)
                continue

            # 2. If no CamelCase, try wordninja directly
            wn = wordninja.split(token)
            if len(wn) > 1:
                new_tokens.extend(wn)
            else:
                new_tokens.append(token)

        return " ".join(new_tokens)

    for col in text_cols:
        tqdm.pandas(desc=f"Splitting joined words in {col}")
        df[col] = df[col].progress_apply(split_joined_words)
    
    print("Word splitting complete")
    return df

def remove_stopwords_df(df, text_cols):
    """
    Remove English stopwords from multiple text columns.
    Handles empty strings gracefully.
    """
    def remove_stopwords_text(text):
        # Handle empty strings
        if not text or not text.strip():
            return ""
        
        tokens = word_tokenize(text.lower())
        clean_tokens = [t for t in tokens if t not in STOPWORDS and len(t) > 2]
        return " ".join(clean_tokens)
    
    for col in text_cols:
        tqdm.pandas(desc=f"Removing stopwords in {col}")
        df[col] = df[col].progress_apply(remove_stopwords_text)
    
    print("Stopword removal complete")
    return df

def lemmatize_df(df, text_cols):
    """
    Lemmatize multiple text columns.
    Handles empty strings gracefully.
    """
    def lemmatize_text(text):
        # Handle empty strings
        if not text or not text.strip():
            return ""
        
        tokens = word_tokenize(text.lower())
        lemmas = [lemmatizer.lemmatize(t) for t in tokens if len(t) > 2]
        return " ".join(lemmas)
    
    for col in text_cols:
        tqdm.pandas(desc=f"Lemmatizing {col}")
        df[col] = df[col].progress_apply(lemmatize_text)
    
    print("Lemmatization complete")
    return df

def addWordPatterns(df):
    patterns = {
        'urls': r'http[s]?://\S+',
        'emails': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',
        'phone_numbers': r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',
        'money_symbols': r'[$€£¥]',
        'other_symbols': r'[©®™]',
    }
    
    def count_patterns(text, patterns):
        if not text or not text.strip():
            return {k: 0 for k in patterns.keys()}
        return {k: len(re.findall(pat, str(text))) for k, pat in patterns.items()}

    for col in ['description', 'company_profile', 'requirements', 'benefits']:
        if col in df.columns:
            feat_df = df[col].apply(lambda x: count_patterns(x, patterns))
            feat_df = pd.DataFrame(list(feat_df), index=df.index).add_prefix(f'{col}_')
            df = pd.concat([df, feat_df], axis=1)

    print("Pattern features added")
    return df

def parse_salary_range(df):
    MONTH_MAP = {
        'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
        'jul': 7, 'aug': 8, 'sep': 9, 'sept': 9, 'oct': 10, 'nov': 11, 'dec': 12
    }   
    
    def parse_salary(s):
        if not s or str(s).lower() == 'unknown':
            return (0, 0)

        s = str(s).strip()
        
        if '-' not in s:
            try: 
                val = int(s)
                return (val, val)
            except ValueError: 
                return (0, 0)

        left, right = [v.strip() for v in s.split('-', 1)]

        def val(v):
            return int(v) if v.isdigit() else MONTH_MAP.get(v.lower())

        l_val, r_val = val(left), val(right)
        
        if l_val is not None and r_val is not None:
            return (l_val, r_val)
        else:
            return (0, 0)

    df[['salary_min', 'salary_max']] = df['salary_range'].apply(
        lambda x: pd.Series(parse_salary(x))
    )
    
    print("Salary parsing complete")
    return df

from sentence_transformers import SentenceTransformer

def sentenceEmbedding(df, text_cols, model=None, model_name='all-MiniLM-L6-v2', device=None):
    if model is None:
        model = SentenceTransformer(model_name, device=device)
        print(f"Loaded SentenceTransformer model: {model_name}")

    for col in text_cols:
        tqdm.pandas(desc=f"Embedding {col}")
        embeddings = df[col].progress_apply(
            lambda x: model.encode(str(x)) if pd.notna(x) else np.zeros(model.get_sentence_embedding_dimension())
        )
        df[f"{col}_embedding"] = embeddings

    return df, model

def expand_embeddings(df, embedding_cols):
    """
    Expand embedding arrays (stored as numpy arrays in cells) into separate columns.
    """
    print("\n--- Expanding Embeddings ---")
    
    for col in embedding_cols:
        if col not in df.columns:
            print(f"Warning: {col} not found in dataframe")
            continue

        embedding_matrix = np.vstack(df[col].values)
        n_dims = embedding_matrix.shape[1]
        col_names = [f"{col}_dim_{i}" for i in range(n_dims)]
        
        # df from embeddings
        embedding_df = pd.DataFrame(embedding_matrix, columns=col_names, index=df.index)
        df = pd.concat([df, embedding_df], axis=1)
        df = df.drop(columns=[col])
        
        print(f"Expanded {col}: {n_dims} dimensions")
    
    print("Embedding expansion complete")
    return df

def build_tfidf(
    df, 
    text_cols, 
    word_ngrams=(1, 2), 
    char_ngrams=(3, 5),
    min_df=3,            # ignore terms appearing in <3 docs
    max_df=0.8,          # ignore terms appearing in >80% of docs
    vectorizers=None     # Pre-fitted vectorizers for test data
):
    """
    Build TF-IDF features for text columns.
    Creates both word-level and character-level n-grams.
    """
    tfidf_results = {}
    vectorizers_out = {} if vectorizers is None else vectorizers
    is_training = vectorizers is None

    for col in text_cols:
        print(f'Processing {col}...')
        
        # Ensure all values are strings (should already be from null handling)
        text_data = df[col].astype(str)

        if is_training:
            # Training: fit new vectorizers
            word_vectorizer = TfidfVectorizer(
                tokenizer=word_tokenize,
                token_pattern=None,
                ngram_range=word_ngrams,
                max_features=2000,
                min_df=min_df,
                max_df=max_df
            )
            word_vec = word_vectorizer.fit_transform(text_data)

            char_vectorizer = TfidfVectorizer(
                analyzer='char_wb',
                ngram_range=char_ngrams,
                max_features=3000,
                min_df=min_df,
                max_df=max_df
            )
            char_vec = char_vectorizer.fit_transform(text_data)
            
            # Store vectorizers for reuse
            vectorizers_out[f'{col}_word'] = word_vectorizer
            vectorizers_out[f'{col}_char'] = char_vectorizer
        else:
            # Test: use pre-fitted vectorizers
            word_vectorizer = vectorizers_out[f'{col}_word']
            word_vec = word_vectorizer.transform(text_data)
            
            char_vectorizer = vectorizers_out[f'{col}_char']
            char_vec = char_vectorizer.transform(text_data)

        tfidf_results[col] = {
            "word_tfidf": word_vec,
            "char_tfidf": char_vec,
            "word_features": word_vectorizer.get_feature_names_out(),
            "char_features": char_vectorizer.get_feature_names_out()
        }

    if is_training:
        print("TF-IDF feature extraction complete (fitted new vectorizers)")
    else:
        print("TF-IDF feature extraction complete (used pre-fitted vectorizers)")
    
    return tfidf_results, vectorizers_out

def merge_tfidf_results(tfidf_results):
    """
    Merge all TF-IDF matrices (word and char) into a single sparse matrix.
    Returns both the sparse matrix and a DataFrame representation.
    """
    all_matrices = []
    all_feature_names = []

    for col, result in tfidf_results.items():
        word_features = [f"{col}_word_{f}" for f in result["word_features"]]
        all_feature_names.extend(word_features)
        all_matrices.append(result["word_tfidf"])
        
        char_features = [f"{col}_char_{f}" for f in result["char_features"]]
        all_feature_names.extend(char_features)
        all_matrices.append(result["char_tfidf"])

    # Combine all sparse matrices horizontally
    combined_matrix = hstack(all_matrices).tocsr()

    # Create a sparse DataFrame (efficient for large feature sets)
    tfidf_df = pd.DataFrame.sparse.from_spmatrix(combined_matrix, columns=all_feature_names)

    print(f"TF-IDF merge complete: {combined_matrix.shape[0]} samples × {combined_matrix.shape[1]} features")
    return combined_matrix, tfidf_df, all_feature_names

def apply_svd_reduction(tfidf_matrix, n_components=500, random_state=42):
    """
    Apply TruncatedSVD to reduce TF-IDF dimensionality.
    """
    print(f"Applying TruncatedSVD: {tfidf_matrix.shape[1]} features -> {n_components} components")
    
    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    svd_matrix = svd.fit_transform(tfidf_matrix)
    return svd_matrix, svd

def parse_location(df):
    def parse_location_parts(loc):
        if not isinstance(loc, str) or loc == 'unknown':
            return ("unknown", "unknown", "unknown")
        
        # Split by comma and strip, replacing empty strings with "unknown"
        parts = [p.strip() if p.strip() else "unknown" for p in loc.split(',')]
        while len(parts) < 3:
            parts.append("unknown")
        return (parts[0], parts[1], parts[2])
    
    df_loc = df['location'].apply(parse_location_parts)
    df['location_country'] = df_loc.apply(lambda x: x[0])
    df['location_state'] = df_loc.apply(lambda x: x[1])
    df['location_city'] = df_loc.apply(lambda x: x[2])
    
    # Ensure no NaN values remain (extra safety check)
    df['location_country'] = df['location_country'].fillna('unknown')
    df['location_state'] = df['location_state'].fillna('unknown')
    df['location_city'] = df['location_city'].fillna('unknown')
    
    print("Location parsing complete")
    return df

def preprocess_df(initial_df, n_svd_components=500, svd_model=None, tfidf_vectorizers=None):
    df = initial_df.copy()
    
    # STEP 3: Add pattern features (on raw text)
    df = addWordPatterns(df)
    validate_nulls(df, "pattern features")
    
    # STEP 4: Normalize text
    print("\n--- Text Processing ---")
    df = apply_text_normalization(df, TEXT_COLS)
    
    # STEP 5: Split joined words
    df = apply_split_df(df, TEXT_COLS)
    
    # STEP 6: Generate embeddings (before stopword removal)
    print("\n--- Generating Embeddings ---")
    sentence_cols = ['description', 'requirements', 'benefits', 'company_profile']
    df = sentenceEmbedding(df, text_cols=sentence_cols)
    validate_nulls(df, "sentence embeddings")
    
    # STEP 7: Remove stopwords
    df = remove_stopwords_df(df, TEXT_COLS)
    
    # STEP 8: Lemmatize
    df = lemmatize_df(df, TEXT_COLS)
    
    # STEP 9: Build TF-IDF for all text columns
    print("\n--- Building TF-IDF Features ---")
    tfidf_results, tfidf_vectorizers = build_tfidf(df, TEXT_COLS, vectorizers=tfidf_vectorizers)
    tfidf_matrix, tfidf_df, tfidf_feature_names = merge_tfidf_results(tfidf_results)
    
    # STEP 10: Apply TruncatedSVD to reduce TF-IDF dimensions
    print("\n--- Applying TruncatedSVD Dimensionality Reduction ---")
    if svd_model is None:
        # Training data: fit new SVD model
        svd_matrix, svd_model = apply_svd_reduction(tfidf_matrix, n_components=n_svd_components)
        print("✓ Fitted new SVD model")
    else:
        # Test data: use pre-fitted SVD model
        svd_matrix = svd_model.transform(tfidf_matrix)
        explained_variance = svd_model.explained_variance_ratio_.sum()
        print(f"✓ Used pre-fitted SVD model")
        print(f"Explained variance: {explained_variance:.2%}")
        print(f"SVD output shape: {svd_matrix.shape}")
    
    # Convert SVD matrix to DataFrame
    svd_col_names = [f"tfidf_svd_{i}" for i in range(svd_matrix.shape[1])]
    svd_df = pd.DataFrame(svd_matrix, columns=svd_col_names, index=df.index)
    
    # STEP 11: Parse salary range
    df = parse_salary_range(df)
    validate_nulls(df, "salary parsing")
    
    # STEP 12: Parse location
    df = parse_location(df)
    validate_nulls(df, "location parsing")
    
    # STEP 13: Prepare final features
    categorical_cols = [
        'employment_type_clean', 
        'required_experience', 
        'required_education', 
        'industry', 
        'function',
        'department',
        'location_country', 
        'location_state', 
        'location_city'
    ]
    
    for col in categorical_cols:
        if col in df.columns:
            df[col] = df[col].astype(str)
    
    # Drop intermediate columns (processed text, original location/employment/salary)
    df_features = df.drop(
        columns=TEXT_COLS + ['dedup_key', 'location', 'employment_type', 'salary_range'], 
        errors='ignore'
    )
    
    # STEP 14: Expand embeddings into separate columns
    embedding_cols = [f'{col}_embedding' for col in sentence_cols]
    df_features = expand_embeddings(df_features, embedding_cols)
    
    # STEP 15: Add SVD-reduced TF-IDF features
    print("\n--- Adding TF-IDF SVD Features ---")
    df_features = pd.concat([df_features, svd_df], axis=1)
    print(f"Added {svd_matrix.shape[1]} TF-IDF SVD features")
    
    validate_nulls(df_features, "final processing")
    
    print(f"\n=== Final Feature Summary ===")
    print(f"Total features: {df_features.shape[1]}")
    print(f"Total samples: {df_features.shape[0]}")
    
    return df_features, svd_model, tfidf_vectorizers

In [3]:
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    roc_auc_score, precision_recall_curve, auc, 
    classification_report, confusion_matrix,
    f1_score, accuracy_score, precision_score, 
    recall_score, average_precision_score, roc_curve
)
import matplotlib.pyplot as plt
import seaborn as sns

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

  import pkg_resources


In [4]:
target_col = 'fraudulent'
id_col = 'job_id'

# load preprocessed data for hyperparameter tuning
train_df = pd.read_csv('../data/processed_train_features.csv')
test_df = pd.read_csv('../data/processed_test_features.csv')

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"\nTarget distribution in train:")
print(train_df[target_col].value_counts())

# load raw data for final k-fold training
raw_data = pd.read_csv('../data/fake_job_postings.csv')
raw_data = handle_nulls_upfront(raw_data)
raw_data = cleanAndDeduplicate(raw_data)
raw_data = raw_data.reset_index(drop=True)

print(f"\nRaw data shape: {raw_data.shape}")

Train shape: (14045, 2072)
Test shape: (3561, 2072)

Target distribution in train:
fraudulent
0    13383
1      662
Name: count, dtype: int64
Removed 399 duplicate rows

Raw data shape: (17481, 20)


In [5]:
# hp search space
search_space = {
    'max_depth': hp.choice('max_depth', range(3, 10)),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.3)),
    'n_estimators': hp.choice('n_estimators', range(100, 500, 50)),
    'min_child_weight': hp.choice('min_child_weight', range(1, 8)),
    'gamma': hp.uniform('gamma', 0, 0.5),
    'subsample': hp.uniform('subsample', 0.6, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0),
    'reg_alpha': hp.loguniform('reg_alpha', np.log(1e-5), np.log(10)),
    'reg_lambda': hp.loguniform('reg_lambda', np.log(1e-5), np.log(10))
}

In [6]:
def objective_preprocessed(params):
    """
    hyperopt objective using preprocessed train/test split
    """
    params_int = {
        'max_depth': int(params['max_depth']),
        'n_estimators': int(params['n_estimators']),
        'min_child_weight': int(params['min_child_weight'])
    }
    params_float = {k: v for k, v in params.items() if k not in params_int}
    params_combined = {**params_int, **params_float}
    
    # prepare features
    X_train = train_df.drop(columns=[id_col, target_col], errors='ignore')
    y_train = train_df[target_col]
    X_test = test_df.drop(columns=[id_col, target_col], errors='ignore')
    y_test = test_df[target_col]
    
    # categorical columns to 'category' dtype
    categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()
    for col in categorical_cols:
        X_train[col] = X_train[col].astype('category')
        X_test[col] = X_test[col].astype('category')
    
    # apply smote
    smote = SMOTE(random_state=RANDOM_STATE)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    
    X_train_resampled = pd.DataFrame(X_train_resampled, columns=X_train.columns)
    for col in categorical_cols:
        X_train_resampled[col] = X_train_resampled[col].astype('category')
    
    # train model
    model = XGBClassifier(
        **params_combined,
        enable_categorical=True,
        tree_method='hist',
        random_state=RANDOM_STATE,
        eval_metric='logloss',
        use_label_encoder=False
    )
    model.fit(X_train_resampled, y_train_resampled, verbose=False)
    
    # evaluate on test set
    y_test_pred_proba = model.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(y_test, y_test_pred_proba)
    
    return {'loss': -roc_auc, 'status': STATUS_OK, 'roc_auc': roc_auc}

In [7]:
# run hyperparameter optimization on preprocessed data
trials = Trials()
best_params = fmin(
    fn=objective_preprocessed,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=np.random.default_rng(RANDOM_STATE),
    verbose=1
)

print("\n" + "="*50)
print("Best hyperparameters found:")
print("="*50)
for k, v in best_params.items():
    print(f"  {k}: {v}")

# get best roc-auc from trials
best_trial = trials.best_trial
print(f"\nBest ROC-AUC: {best_trial['result']['roc_auc']:.4f}")

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

job exception: Input contains NaN



  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]


ValueError: Input contains NaN

In [None]:
best_params_final = {
    'max_depth': int(best_params['max_depth']),
    'learning_rate': float(best_params['learning_rate']),
    'n_estimators': int(best_params['n_estimators']),
    'min_child_weight': int(best_params['min_child_weight']),
    'gamma': float(best_params['gamma']),
    'subsample': float(best_params['subsample']),
    'colsample_bytree': float(best_params['colsample_bytree']),
    'reg_alpha': float(best_params['reg_alpha']),
    'reg_lambda': float(best_params['reg_lambda'])
}

print("Final parameters for K-fold training:")
for k, v in best_params_final.items():
    print(f"  {k}: {v}")

In [None]:
# train final model with k-fold cv on raw data using best hyperparameters
print("\n" + "="*60)
print("Training final model with K-fold CV on raw data")
print("="*60)

X = raw_data.drop(columns=[target_col])
y = raw_data[target_col]

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

fold_results = []
all_y_true = []
all_y_pred = []
all_y_pred_proba = []

for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
    print(f"\nFold {fold_idx}/5")
    print("-" * 40)
    
    X_train_raw = X.iloc[train_idx].reset_index(drop=True)
    X_val_raw = X.iloc[val_idx].reset_index(drop=True)
    y_train = y.iloc[train_idx].reset_index(drop=True)
    y_val = y.iloc[val_idx].reset_index(drop=True)
    
    # preprocess within fold
    X_train_processed, svd_model, tfidf_vectorizers = preprocess_df(X_train_raw)
    X_val_processed, _, _ = preprocess_df(
        X_val_raw, 
        svd_model=svd_model, 
        tfidf_vectorizers=tfidf_vectorizers
    )
    
    X_val_processed = X_val_processed.reindex(columns=X_train_processed.columns, fill_value=0)
    
    categorical_cols = X_train_processed.select_dtypes(include=['object']).columns.tolist()
    if target_col in categorical_cols:
        categorical_cols.remove(target_col)
    if id_col in categorical_cols:
        categorical_cols.remove(id_col)
    
    X_train_features = X_train_processed.drop(columns=[id_col, target_col], errors='ignore')
    X_val_features = X_val_processed.drop(columns=[id_col, target_col], errors='ignore')
    
    for col in categorical_cols:
        if col in X_train_features.columns:
            X_train_features[col] = X_train_features[col].astype('category')
        if col in X_val_features.columns:
            X_val_features[col] = X_val_features[col].astype('category')
    
    # apply smote
    smote = SMOTE(random_state=RANDOM_STATE)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_features, y_train)
    
    X_train_resampled = pd.DataFrame(X_train_resampled, columns=X_train_features.columns)
    for col in categorical_cols:
        if col in X_train_resampled.columns:
            X_train_resampled[col] = X_train_resampled[col].astype('category')
    
    # train with best hyperparameters
    model = XGBClassifier(
        **best_params_final,
        enable_categorical=True,
        tree_method='hist',
        random_state=RANDOM_STATE,
        eval_metric='logloss',
        use_label_encoder=False
    )
    model.fit(X_train_resampled, y_train_resampled, verbose=False)
    
    # predictions
    y_val_pred_proba = model.predict_proba(X_val_features)[:, 1]
    y_val_pred = model.predict(X_val_features)
    
    # metrics
    roc_auc = roc_auc_score(y_val, y_val_pred_proba)
    pr_auc = average_precision_score(y_val, y_val_pred_proba)
    f1 = f1_score(y_val, y_val_pred)
    precision = precision_score(y_val, y_val_pred)
    recall = recall_score(y_val, y_val_pred)
    accuracy = accuracy_score(y_val, y_val_pred)
    
    fold_results.append({
        'fold': fold_idx,
        'roc_auc': roc_auc,
        'pr_auc': pr_auc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'accuracy': accuracy
    })
    
    all_y_true.extend(y_val.tolist())
    all_y_pred.extend(y_val_pred.tolist())
    all_y_pred_proba.extend(y_val_pred_proba.tolist())
    
    print(f"ROC-AUC: {roc_auc:.4f} | PR-AUC: {pr_auc:.4f} | F1: {f1:.4f}")
    print(f"Precision: {precision:.4f} | Recall: {recall:.4f} | Accuracy: {accuracy:.4f}")

# summary
results_df = pd.DataFrame(fold_results)
print("\n" + "="*60)
print("K-Fold Cross-Validation Summary")
print("="*60)
print(results_df.to_string(index=False))
print("\nMean ± Std:")
for col in ['roc_auc', 'pr_auc', 'f1', 'precision', 'recall', 'accuracy']:
    print(f"  {col:12s}: {results_df[col].mean():.4f} ± {results_df[col].std():.4f}")

In [None]:
# confusion matrix
cm = confusion_matrix(all_y_true, all_y_pred)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=['legitimate', 'fraudulent'],
            yticklabels=['legitimate', 'fraudulent'])
plt.title('confusion matrix (aggregated across folds)')
plt.ylabel('true label')
plt.xlabel('predicted label')
plt.show()

print("\nclassification report:")
print(classification_report(all_y_true, all_y_pred, 
                          target_names=['legitimate', 'fraudulent']))

In [None]:
# roc curve
fpr, tpr, _ = roc_curve(all_y_true, all_y_pred_proba)
roc_auc_agg = roc_auc_score(all_y_true, all_y_pred_proba)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'roc curve (auc = {roc_auc_agg:.4f})', linewidth=2)
plt.plot([0, 1], [0, 1], 'k--', label='random classifier')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.title('roc curve (aggregated across folds)')
plt.legend(loc="lower right")
plt.grid(alpha=0.3)
plt.show()

In [None]:
# precision-recall curve
precision_vals, recall_vals, _ = precision_recall_curve(all_y_true, all_y_pred_proba)
pr_auc_agg = average_precision_score(all_y_true, all_y_pred_proba)

plt.figure(figsize=(8, 6))
plt.plot(recall_vals, precision_vals, label=f'pr curve (auc = {pr_auc_agg:.4f})', linewidth=2)
plt.xlabel('recall')
plt.ylabel('precision')
plt.title('precision-recall curve (aggregated across folds)')
plt.legend(loc="lower left")
plt.grid(alpha=0.3)
plt.show()