# **Fraud Detection** (Fake Jobs)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings
warnings.filterwarnings('ignore')
import time
import os

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, confusion_matrix, classification_report,
                             roc_curve, auc, precision_recall_curve)
from scipy.sparse import hstack, csr_matrix

# Global plotting defaults: seaborn "whitegrid" theme and a wide default figure size.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (14, 6)})

### 1. CARICAMENTO E ESPLORAZIONE DATASET + GRAFICI

In [None]:
def load_and_explore_data(filepath):
    """Load the job-postings CSV, print an exploratory summary and save two charts.

    The function reads the CSV, prints shape and dtypes, drops exact duplicate
    rows, prints the distribution of the ``fraudulent`` target and the
    percentage of missing values per column, and writes
    ``01_class_distribution.png`` and (when any value is missing)
    ``02_missing_values.png`` to the current directory.

    Args:
        filepath: Path of the CSV file; it must contain a ``fraudulent`` column.

    Returns:
        The loaded DataFrame with duplicate rows removed.
    """
    print("="*80)
    print("STEP 1: CARICAMENTO E ESPLORAZIONE DATASET")
    print("="*80)

    df = pd.read_csv(filepath)

    # 1. Structure and dtypes
    print(f"\nDimensioni dataset: {df.shape}")
    print("\nInfo Dataset (Tipi di dato):")
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would append a stray "None" line to the output.
    df.info()

    # 2. Duplicate check
    duplicates = df.duplicated().sum()
    if duplicates > 0:
        print(f"\n‚ö†Ô∏è ATTENZIONE: Trovati {duplicates} duplicati! Rimozione in corso...")
        df = df.drop_duplicates()
        print(f"Nuove dimensioni: {df.shape}")
    else:
        print("\n‚úì Nessun duplicato trovato.")

    # 3. Target analysis
    print(f"\nDistribuzione target:")
    print(df['fraudulent'].value_counts())
    print(f"Class Imbalance: {df['fraudulent'].mean()*100:.2f}% fraudulent (SBILANCIATO)")

    # 4. Missing values, as a percentage of rows, only for columns that have any
    print(f"\nValori nulli per colonna (%):")
    missing_pct = df.isnull().sum() / len(df) * 100
    missing_pct = missing_pct[missing_pct > 0].sort_values(ascending=False)
    print(missing_pct)

    # CHART 1: target distribution
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Count plot
    sns.countplot(x='fraudulent', data=df, ax=axes[0], palette=['green', 'red'])
    axes[0].set_title('Target Distribution (Count)', fontsize=14, fontweight='bold')
    # Pin the tick positions before relabelling them so the labels cannot drift.
    axes[0].set_xticks([0, 1])
    axes[0].set_xticklabels(['Real (0)', 'Fraudulent (1)'])

    # Pie chart. value_counts() orders by frequency, so sort by class value to
    # guarantee that the labels/colours below are attached to the right class.
    class_counts = df['fraudulent'].value_counts().sort_index()
    class_counts.plot(kind='pie', ax=axes[1],
                      labels=['Real', 'Fraudulent'],
                      autopct='%1.1f%%',
                      colors=['green', 'red'],
                      explode=(0, 0.1))  # pull out the fraud slice to make it visible
    axes[1].set_title('Target Distribution (%)', fontsize=14, fontweight='bold')
    axes[1].set_ylabel('')

    plt.tight_layout()
    plt.savefig('01_class_distribution.png', dpi=300, bbox_inches='tight')
    print("\n‚úì Grafico salvato: 01_class_distribution.png")
    plt.close()

    # CHART 2: missing values bar plot
    if not missing_pct.empty:
        fig, ax = plt.subplots(figsize=(12, 6))
        sns.barplot(x=missing_pct.values, y=missing_pct.index, palette='YlOrRd', ax=ax)
        ax.set_title('Percentuale di Missing Values per Colonna', fontsize=14, fontweight='bold')
        ax.set_xlabel('Percentuale (%)')
        plt.tight_layout()
        plt.savefig('02_missing_values.png', dpi=300, bbox_inches='tight')
        print("‚úì Grafico salvato: 02_missing_values.png")
        plt.close()
    else:
        print("‚úì Nessun missing value da graficare.")

    return df

### 2. GESTIONE MISSING VALUES E DATA CLEANING

In [None]:
def clean_data(df):
    """Clean the raw postings frame and derive country and salary features.

    Works on a copy of ``df`` (the input is not modified) and:

    * derives ``country`` from the first comma-separated field of ``location``
      ('Unknown' when the location is missing, has no comma, or its first
      field is blank) and drops ``location``;
    * fills missing text columns with '' and missing categorical columns with
      'Unknown';
    * normalises the text columns (lower-case, HTML tags, URLs and every
      non-letter character removed, whitespace collapsed);
    * parses ``salary_range`` into ``salary_min`` / ``salary_max`` (0.0 when no
      number is found) and a ``salary_range_flag`` presence indicator;
    * drops ``job_id`` and ``salary_range``.

    Args:
        df: Raw DataFrame containing the columns listed above.

    Returns:
        The cleaned DataFrame.
    """
    print("\n" + "="*80)
    print("STEP 2: GESTIONE MISSING VALUES E DATA CLEANING")
    print("="*80)

    df_clean = df.copy()

    # 1. LOCATION -> COUNTRY
    # Keep only the leading field (e.g. "US, NY, New York" -> "US") to reduce
    # cardinality.
    def extract_country(location):
        if not isinstance(location, str) or ',' not in location:
            return 'Unknown'
        # A blank first field (e.g. ", TX, Austin") must not become an
        # empty-string category: fillna() below would not catch it.
        return location.split(',')[0].strip() or 'Unknown'

    df_clean['country'] = df_clean['location'].apply(extract_country)
    # The raw location is too specific to be useful once the country is extracted.
    df_clean = df_clean.drop('location', axis=1)
    print("‚úì Feature 'country' estratta dalla location")

    # 2. FILL MISSING VALUES
    text_cols = ['company_profile', 'description', 'requirements', 'benefits']
    for col in text_cols:
        df_clean[col] = df_clean[col].fillna('')

    category_cols = ['department', 'employment_type', 'required_experience',
                     'required_education', 'industry', 'function', 'country']

    for col in category_cols:
        df_clean[col] = df_clean[col].fillna('Unknown')

    print("‚úì Missing values riempiti (Strategy: 'Unknown' placeholder)")

    # 3. TEXT NORMALISATION
    # Patterns are compiled once instead of being looked up for every cell.
    tag_re = re.compile(r'<.*?>')
    url_re = re.compile(r'http\S+')
    non_alpha_re = re.compile(r'[^a-zA-Z\s]')
    spaces_re = re.compile(r'\s+')

    def clean_text(text):
        if not isinstance(text, str):
            return ""
        text = text.lower()
        text = tag_re.sub('', text)        # strip HTML tags (<br>, <div>, ...)
        text = url_re.sub('', text)        # strip URLs
        text = non_alpha_re.sub('', text)  # strip digits and punctuation
        return spaces_re.sub(' ', text).strip()

    print("Cleaning testo in corso (rimozione HTML, URL, caratteri speciali)...")
    for col in text_cols:
        df_clean[col] = df_clean[col].apply(clean_text)
    print("‚úì Testo pulito")

    # 4. SALARY PARSING
    def extract_salary_info(salary_str):
        """Return (min, max) from the first two digit runs, or (0.0, 0.0)."""
        if pd.isna(salary_str) or salary_str == '':
            return 0.0, 0.0
        # findall on a str and float() on a digit run cannot raise, so no
        # exception handling is needed here.
        nums = re.findall(r'\d+', str(salary_str))
        if not nums:
            return 0.0, 0.0
        low = float(nums[0])
        high = float(nums[1]) if len(nums) >= 2 else low
        return low, high

    # Parse once per row and build both columns directly; this avoids creating
    # one pd.Series per row and gives a stable float dtype.
    salary_bounds = [extract_salary_info(value) for value in df_clean['salary_range']]
    df_clean['salary_min'] = pd.Series(
        [low for low, _ in salary_bounds], index=df_clean.index, dtype=float
    )
    df_clean['salary_max'] = pd.Series(
        [high for _, high in salary_bounds], index=df_clean.index, dtype=float
    )
    df_clean['salary_range_flag'] = (
        df_clean['salary_range'].notna() & (df_clean['salary_range'] != '')
    ).astype(int)

    # Drop identifier and the raw salary string now that features are extracted.
    df_clean = df_clean.drop(['job_id', 'salary_range'], axis=1)

    print("‚úì Features salary estratte")
    print(f"Dataset shape dopo cleaning: {df_clean.shape}")

    return df_clean

### 3. FEATURE ENGINEERING + GRAFICI OUTLIER

In [None]:
def feature_engineering(df_clean):
    """Add text meta-features, log-transformed numerics and label-encoded categoricals.

    New columns are written directly onto ``df_clean`` (the frame passed in is
    modified). Two charts are saved: ``03_outliers_management.png`` and
    ``04_features_distribution.png``.

    Args:
        df_clean: Cleaned DataFrame holding the text columns, ``salary_min``,
            ``salary_max``, ``fraudulent`` and the categorical columns
            encoded below.

    Returns:
        Tuple ``(df_clean, categorical_features, final_numeric_features)``:
        the enriched frame, the names of the original categorical columns and
        the names of the ``log_*`` numeric columns.
    """
    print("\n" + "="*80)
    print("STEP 3: FEATURE ENGINEERING")
    print("="*80)

    # 1. TEXT META-FEATURES
    # Single text field consumed later by the TF-IDF vectoriser.
    df_clean['combined_text'] = (
        df_clean['description'] + ' ' + df_clean['requirements'] + ' ' + df_clean['benefits']
    )

    # Character counts, then word counts.
    for text_col in ('description', 'requirements', 'benefits', 'company_profile'):
        df_clean[f'len_{text_col}'] = df_clean[text_col].str.len()
    for text_col in ('description', 'requirements'):
        df_clean[f'words_{text_col}'] = df_clean[text_col].str.split().str.len()

    print("‚úì Meta-features testuali create")

    # 2. OUTLIER HANDLING VIA log1p
    # Length and salary distributions are long-tailed; log(1 + x) compresses
    # the tail and is defined at zero.
    numeric_features = ['len_description', 'len_requirements', 'len_benefits',
                       'len_company_profile', 'words_description', 'words_requirements',
                       'salary_min', 'salary_max']

    print("\nApplicazione Log-Transformation per gestire gli Outliers...")
    final_numeric_features = []
    for raw_col in numeric_features:
        log_col = f'log_{raw_col}'
        df_clean[log_col] = np.log1p(df_clean[raw_col])
        final_numeric_features.append(log_col)

    # CHART 3: raw vs log box plots for two representative features.
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    boxplot_panels = [
        ((0, 0), 'len_description', 'skyblue', 'Original: Len Description (Many Outliers)'),
        ((0, 1), 'log_len_description', 'lightgreen', 'Log-Transformed: Len Description (Normalized)'),
        ((1, 0), 'salary_max', 'salmon', 'Original: Salary Max'),
        ((1, 1), 'log_salary_max', 'orange', 'Log-Transformed: Salary Max'),
    ]
    for (row, column), feature, shade, heading in boxplot_panels:
        panel = axes[row, column]
        sns.boxplot(x=df_clean[feature], ax=panel, color=shade)
        panel.set_title(heading, fontweight='bold')

    plt.tight_layout()
    plt.savefig('03_outliers_management.png', dpi=300, bbox_inches='tight')
    print("‚úì Grafico salvato: 03_outliers_management.png (Confronto Before/After)")
    plt.close()

    # CHART 4: histograms of the transformed features, split by target class.
    fig, axes = plt.subplots(2, 4, figsize=(16, 8))
    for panel, log_col in zip(axes.flatten(), final_numeric_features):
        sns.histplot(data=df_clean, x=log_col, hue='fraudulent', kde=True, ax=panel, element="step")
        panel.set_title(f'Dist: {log_col}', fontsize=10)

    plt.tight_layout()
    plt.savefig('04_features_distribution.png', dpi=300, bbox_inches='tight')
    print("‚úì Grafico salvato: 04_features_distribution.png")
    plt.close()

    # 3. CATEGORICAL ENCODING
    categorical_features = ['employment_type', 'required_experience',
                           'required_education', 'industry', 'function', 'country']

    print(f"\nEncoding Categorico ({len(categorical_features)} features)...")
    # Integer label encoding: chosen with tree-based models in mind; one-hot
    # encoding would suit linear models better but would add many columns.
    for cat_col in categorical_features:
        df_clean[f'{cat_col}_encoded'] = LabelEncoder().fit_transform(df_clean[cat_col].astype(str))

    print(f"‚úì Features categoriche encode completato")

    return df_clean, categorical_features, final_numeric_features

### 4. PREPARAZIONE DATI PER MODELLAZIONE

In [None]:
def prepare_features(df_clean, categorical_features, numeric_features, test_size=0.2):
    """Split the data and build the dense train/test matrices used by the models.

    Steps: stratified train/test split, TF-IDF on ``combined_text`` (fitted on
    the training part only), RobustScaler on the meta columns (fitted on the
    training part only), horizontal concatenation ``[TF-IDF | meta]`` and
    conversion to dense arrays. Two charts are saved:
    ``05_train_test_split.png`` and ``06_tfidf_features.png``.

    Args:
        df_clean: Engineered DataFrame with ``combined_text``, ``fraudulent``,
            the binary flag columns and the columns named by the next two
            arguments.
        categorical_features: Names of the original categorical columns; their
            ``<name>_encoded`` counterparts are used.
        numeric_features: Names of the numeric columns to include.
        test_size: Fraction of rows held out for testing.

    Returns:
        Tuple ``(X_train_dense, X_test_dense, y_train, y_test, vectorizer,
        scaler, X_text_train_tfidf, X_text_test_tfidf, tfidf_feature_names)``.
        Meta columns are ordered numeric, then binary, then encoded categorical.
    """
    print("\n" + "="*80)
    print("STEP 4: PREPARAZIONE DATI PER MODELLAZIONE")
    print("="*80)

    X_text = df_clean[['combined_text']]

    binary_features = ['telecommuting', 'has_company_logo', 'has_questions', 'salary_range_flag']
    meta_categorical_encoded = [f'{col}_encoded' for col in categorical_features]

    # Meta block: numeric + binary + encoded categorical, in this order.
    X_meta = df_clean[numeric_features + binary_features + meta_categorical_encoded]
    y = df_clean['fraudulent']

    # One stratified split over all three objects guarantees that text, meta
    # and target rows stay aligned, instead of relying on two separate calls
    # producing the same shuffle through a shared random_state.
    print("Esecuzione Stratified Split (mantiene la % di frodi uguale tra train e test)...")
    (X_text_train, X_text_test,
     X_meta_train, X_meta_test,
     y_train, y_test) = train_test_split(
        X_text, X_meta, y, test_size=test_size, random_state=42, stratify=y
    )

    print(f"Train set: {len(X_text_train)} samples ({(y_train==0).sum()} real, {(y_train==1).sum()} fraud)")
    print(f"Test set: {len(X_text_test)} samples ({(y_test==0).sum()} real, {(y_test==1).sum()} fraud)")

    # CHART 5: class distribution in train and test
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    train_dist = y_train.value_counts()
    test_dist = y_test.value_counts()

    x = ['Real', 'Fraudulent']
    train_vals = [train_dist[0], train_dist[1]]
    test_vals = [test_dist[0], test_dist[1]]

    x_pos = np.arange(len(x))
    width = 0.35

    axes[0].bar(x_pos - width/2, train_vals, width, label='Train', color='skyblue')
    # 'salmon' is the bar colour; the legend entry for these bars is 'Test'.
    axes[0].bar(x_pos + width/2, test_vals, width, label='Test', color='salmon')
    axes[0].set_ylabel('Count')
    axes[0].set_title('Train-Test Split Distribution (Absolute)', fontweight='bold')
    axes[0].set_xticks(x_pos)
    axes[0].set_xticklabels(x)
    axes[0].legend()

    # Same comparison in percentages, to show that stratification held.
    train_total = sum(train_vals)
    test_total = sum(test_vals)
    train_pct = [value / train_total * 100 for value in train_vals]
    test_pct = [value / test_total * 100 for value in test_vals]

    axes[1].bar(x_pos - width/2, train_pct, width, label='Train %', color='skyblue')
    axes[1].bar(x_pos + width/2, test_pct, width, label='Test %', color='salmon')
    axes[1].set_ylabel('Percentage (%)')
    axes[1].set_title('Validation of Stratification', fontweight='bold')
    axes[1].set_xticks(x_pos)
    axes[1].set_xticklabels(x)
    axes[1].legend()

    plt.tight_layout()
    plt.savefig('05_train_test_split.png', dpi=300, bbox_inches='tight')
    print("\n‚úì Grafico salvato: 05_train_test_split.png")
    plt.close()

    # TF-IDF: unigrams and bigrams (e.g. "wire transfer"), vocabulary capped at
    # 5000 terms; fitted on the training text only to avoid leakage.
    print("\nTF-IDF Vectorization...")
    vectorizer = TfidfVectorizer(
        max_features=5000,
        stop_words='english',
        ngram_range=(1, 2),
        min_df=5,
        max_df=0.7
    )

    X_text_train_tfidf = vectorizer.fit_transform(X_text_train['combined_text'])
    X_text_test_tfidf = vectorizer.transform(X_text_test['combined_text'])
    n_tfidf_features = X_text_train_tfidf.shape[1]
    print(f"‚úì TF-IDF features created: {n_tfidf_features} features")

    # CHART 6: the 20 terms with the highest mean TF-IDF weight on the train set
    tfidf_feature_names = vectorizer.get_feature_names_out()
    feature_importance_tfidf = np.asarray(X_text_train_tfidf.mean(axis=0)).ravel()
    idx = feature_importance_tfidf.argsort()[-20:][::-1]

    fig, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(x=feature_importance_tfidf[idx], y=tfidf_feature_names[idx],
                palette='viridis', ax=ax)
    ax.set_title('Top 20 Most Frequent Words (TF-IDF Importance)', fontweight='bold')
    ax.set_xlabel('Average TF-IDF Score')
    plt.tight_layout()
    plt.savefig('06_tfidf_features.png', dpi=300, bbox_inches='tight')
    print("‚úì Grafico salvato: 06_tfidf_features.png")
    plt.close()

    # SCALING: RobustScaler centres on the median and scales by the IQR, so it
    # is less affected by outliers than StandardScaler.
    print("Scaling meta-features (Using RobustScaler for Outlier Resistance)...")
    scaler = RobustScaler()

    # NOTE: the scaler is applied to ALL meta columns (numeric, binary and
    # encoded categorical), not only to the continuous ones.
    X_meta_train_scaled = scaler.fit_transform(X_meta_train)
    X_meta_test_scaled = scaler.transform(X_meta_test)

    print(f"‚úì Meta-features scaled: {X_meta_train_scaled.shape[1]} features")

    # Concatenate [TF-IDF | meta]; the column order matters for anything that
    # maps model coefficients back to feature names.
    print("\nConcatenating Text + Meta features...")
    X_train = hstack([X_text_train_tfidf, csr_matrix(X_meta_train_scaled)])
    X_test = hstack([X_text_test_tfidf, csr_matrix(X_meta_test_scaled)])

    # Dense conversion: memory grows with rows x columns, so keep an eye on it
    # if the vocabulary size is increased.
    X_train_dense = X_train.toarray()
    X_test_dense = X_test.toarray()

    print(f"‚úì Final Dataset Shape: {X_train_dense.shape}")
    # Report the real vocabulary size: min_df/max_df can leave fewer terms
    # than the max_features cap.
    print(f"  (Includes {n_tfidf_features} TF-IDF tokens + {X_meta_train_scaled.shape[1]} meta-features)")

    return (X_train_dense, X_test_dense, y_train, y_test, vectorizer, scaler,
            X_text_train_tfidf, X_text_test_tfidf, tfidf_feature_names)

### 5. TRAINING E VALUTAZIONE MODELLI

In [None]:
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Fit ``model`` on the training data and score it on the test data.

    Args:
        model: Estimator exposing ``fit`` and ``predict`` plus either
            ``predict_proba`` or ``decision_function``.
        X_train, X_test: Feature matrices.
        y_train, y_test: Binary targets (1 is the positive class).
        model_name: Label used in the log line and in the result.

    Returns:
        Dict with keys ``model_name``, ``accuracy``, ``precision``, ``recall``,
        ``f1``, ``roc_auc``, ``model`` (the fitted estimator), ``y_pred``,
        ``y_pred_proba``, ``confusion_matrix`` and ``classification_report``.
    """
    print(f"Training {model_name}...", end=" ", flush=True)

    model.fit(X_train, y_train)

    # Hard predictions at the estimator's default decision threshold.
    y_pred = model.predict(X_test)

    if hasattr(model, "predict_proba"):
        y_pred_proba = model.predict_proba(X_test)[:, 1]
    else:
        # No probabilities (e.g. hinge-loss SVM): min-max scale the decision
        # scores into [0, 1] so they can be thresholded like probabilities.
        raw_scores = np.asarray(model.decision_function(X_test), dtype=float)
        lowest = raw_scores.min()
        score_range = raw_scores.max() - lowest
        if score_range > 0:
            y_pred_proba = (raw_scores - lowest) / score_range
        else:
            # All scores equal: scaling would divide by zero. Use a neutral
            # constant instead of propagating NaN into the metrics.
            y_pred_proba = np.full_like(raw_scores, 0.5)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    print(f"‚úì F1: {f1:.4f} | Recall: {rec:.4f}")

    return {
        'model_name': model_name,
        'accuracy': acc,
        'precision': prec,
        'recall': rec,
        'f1': f1,
        'roc_auc': roc_auc,
        'model': model,
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba,
        'confusion_matrix': confusion_matrix(y_test, y_pred),
        'classification_report': classification_report(y_test, y_pred)
    }

def train_models(X_train_dense, X_test_dense, y_train, y_test):
    """Train the candidate classifiers, compare them and pick the best by F1.

    Three estimators are fitted through ``evaluate_model``; a comparison table
    is printed and ``07_models_comparison.png`` is saved.

    Args:
        X_train_dense, X_test_dense: Dense feature matrices.
        y_train, y_test: Binary targets.

    Returns:
        Tuple ``(results, best_result)``: the list of per-model result dicts
        (in training order) and the one with the highest test F1.
    """
    print("\n" + "="*80)
    print("STEP 5: ADDESTRAMENTO E VALUTAZIONE MODELLI")
    print("="*80)

    # class_weight='balanced' on every model compensates for the rare fraud class.
    candidates = {
        # 1. Linear baseline
        'Logistic Regression': LogisticRegression(
            max_iter=1000,
            random_state=42,
            class_weight='balanced',
            n_jobs=-1
        ),

        # 2. Ensemble of unbounded-depth trees
        'Random Forest': RandomForestClassifier(
            n_estimators=100,
            random_state=42,
            class_weight='balanced',
            n_jobs=-1,
            max_depth=None
        ),

        # 3. Linear model trained by SGD. With loss='log_loss' it optimises the
        #    logistic loss and exposes predict_proba.
        'Linear SVM (SGD)': SGDClassifier(
            loss='log_loss',
            random_state=42,
            class_weight='balanced',
            max_iter=1000,
            n_jobs=-1
        )
    }

    results = [
        evaluate_model(estimator, X_train_dense, X_test_dense, y_train, y_test, label)
        for label, estimator in candidates.items()
    ]

    # Summary table, best F1 first.
    print("\n" + "="*80)
    print("MODELLI COMPARISON TABLE")
    print("="*80)
    rows = []
    for outcome in results:
        rows.append({
            'Model': outcome['model_name'],
            'Accuracy': outcome['accuracy'],
            'Precision': outcome['precision'],
            'Recall': outcome['recall'],
            'F1-Score': outcome['f1'],
            'ROC-AUC': outcome['roc_auc']
        })
    summary = pd.DataFrame(rows).sort_values(by='F1-Score', ascending=False)

    print(summary.to_string(index=False, float_format="%.4f"))

    # CHART 7: one horizontal bar panel per metric.
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    metric_panels = [
        ('F1-Score', 'viridis', 'Model F1-Score (The Key Metric)', (0, 1)),
        ('Recall', 'magma', 'Model Recall (Sensitivity)', (0, 1)),
        # ROC-AUC axis starts at 0.5, the score of a random classifier.
        ('ROC-AUC', 'coolwarm', 'Model ROC-AUC', (0.5, 1)),
    ]
    for panel, (metric, colormap, heading, limits) in zip(axes, metric_panels):
        sns.barplot(x=metric, y='Model', data=summary, ax=panel, palette=colormap)
        panel.set_title(heading, fontweight='bold')
        panel.set_xlim(*limits)

    plt.tight_layout()
    plt.savefig('07_models_comparison.png', dpi=300, bbox_inches='tight')
    print("\n‚úì Grafico salvato: 07_models_comparison.png")
    plt.close()

    # Best model = highest F1 on the test set.
    best_result = max(results, key=lambda outcome: outcome['f1'])
    print(f"\nüèÜ BEST MODEL: {best_result['model_name']} con F1 = {best_result['f1']:.4f}")

    return results, best_result

### 6. ANALISI DETTAGLIATA + GRAFICI

In [None]:
def detailed_analysis(best_result, vectorizer, feature_names_meta, y_test):
    """Tune the decision threshold of the best model and produce its diagnostics.

    The threshold maximising F1 is searched on a 0.10-0.95 grid (step 0.05)
    using the scores stored in ``best_result``; the confusion matrix at that
    threshold, the ROC curve, the precision-recall curve and the top-20
    feature importances are printed and saved as ``08_`` to ``11_`` PNG files.

    NOTE: the threshold is selected on the same ``y_test`` that is then used
    to report the metrics, so the reported F1 is optimistic.

    Args:
        best_result: Result dict as returned by ``evaluate_model`` (uses
            ``model_name``, ``y_pred_proba``, ``f1`` and ``model``).
        vectorizer: Fitted TF-IDF vectoriser; provides the text feature names.
        feature_names_meta: Names of the meta columns, in the order in which
            they follow the TF-IDF columns in the training matrix.
        y_test: True binary labels of the test set.

    Returns:
        None.
    """
    print("\n" + "="*80)
    print("STEP 6: ANALISI DETTAGLIATA - " + best_result['model_name'].upper())
    print("="*80)

    # 1. THRESHOLD TUNING
    # With a rare positive class, 0.5 is often not the F1-optimal cut-off.
    print("\n--- THRESHOLD TUNING ---")
    y_proba = best_result['y_pred_proba']

    best_thresh = 0.5
    best_f1 = 0
    for t in np.arange(0.1, 1.0, 0.05):
        preds = (y_proba >= t).astype(int)
        # zero_division=0: a threshold that predicts no positives scores 0.
        f1 = f1_score(y_test, preds, zero_division=0)
        if f1 > best_f1:
            best_f1 = f1
            best_thresh = t

    print(f"Soglia Standard (0.50) -> F1: {best_result['f1']:.4f}")
    print(f"Soglia Ottimale ({best_thresh:.2f}) -> F1: {best_f1:.4f} (Miglioramento: {best_f1 - best_result['f1']:.4f})")

    # Predictions at the tuned threshold, used by the charts below.
    y_pred_opt = (y_proba >= best_thresh).astype(int)

    # 2. CONFUSION MATRIX
    # labels=[0, 1] forces a 2x2 matrix even if one class is absent, so the
    # four-way unpacking below cannot fail.
    cm = confusion_matrix(y_test, y_pred_opt, labels=[0, 1])
    TN, FP, FN, TP = cm.ravel()

    print(f"\nConfusion Matrix Analysis (Threshold {best_thresh:.2f}):")
    print(f"  True Positives (Fraud detected): {TP}")
    print(f"  True Negatives (Real identified): {TN}")
    print(f"  False Positives (Real -> Fraud): {FP} (Falsi Allarmi)")
    print(f"  False Negatives (Fraud missed): {FN} (Truffe perse - IL DATO PI√ô CRITICO)")

    # Every ratio is guarded against an empty denominator.
    specificity = TN / (TN + FP) if (TN + FP) > 0 else 0
    sensitivity = TP / (TP + FN) if (TP + FN) > 0 else 0
    precision_opt = TP / (TP + FP) if (TP + FP) > 0 else 0

    print(f"\nMetriche Finali:")
    print(f"  Specificity: {specificity:.4f}")
    print(f"  Recall (Sensitivity): {sensitivity:.4f}")
    print(f"  Precision: {precision_opt:.4f}")

    # CHART 8: confusion matrix
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
                xticklabels=['Real (0)', 'Fraudulent (1)'],
                yticklabels=['Real (0)', 'Fraudulent (1)'])
    ax.set_title(f'Confusion Matrix (Best F1 Threshold: {best_thresh:.2f})', fontweight='bold')
    plt.tight_layout()
    plt.savefig('08_confusion_matrix.png', dpi=300, bbox_inches='tight')
    print("‚úì Grafico salvato: 08_confusion_matrix.png")
    plt.close()

    # CHART 9: ROC curve
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    roc_auc = auc(fpr, tpr)

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.4f})')
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC Curve', fontweight='bold')
    ax.legend(loc="lower right")
    plt.tight_layout()
    plt.savefig('09_roc_curve.png', dpi=300, bbox_inches='tight')
    print("‚úì Grafico salvato: 09_roc_curve.png")
    plt.close()

    # CHART 10: precision-recall curve
    precision, recall, _ = precision_recall_curve(y_test, y_proba)

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.plot(recall, precision, color='purple', lw=2, label='PR Curve')
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_title('Precision-Recall Curve', fontweight='bold')
    ax.legend()
    plt.tight_layout()
    plt.savefig('10_precision_recall_curve.png', dpi=300, bbox_inches='tight')
    print("‚úì Grafico salvato: 10_precision_recall_curve.png")
    plt.close()

    # 3. FEATURE IMPORTANCE
    print(f"\nCalcolo Feature Importance...")
    model = best_result['model']
    # Names must follow the column order of the training matrix: TF-IDF terms
    # first, then the meta columns.
    all_features = list(vectorizer.get_feature_names_out()) + list(feature_names_meta)

    if hasattr(model, 'coef_'):
        importances = model.coef_[0]      # linear models: signed coefficients
        signed = True
    elif hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_  # tree ensembles
        signed = False
    else:
        print("Il modello non supporta feature importance diretta.")
        return

    # Check the alignment explicitly instead of guessing at it after a failure.
    if len(all_features) != len(importances):
        print(f"‚ö†Ô∏è Errore nel plot Feature Importance: "
              f"{len(all_features)} nomi vs {len(importances)} valori.")
        print("Probabile causa: mismatch tra numero features TF-IDF e lista nomi.")
        return

    # Plotting is best-effort: a failure here must not abort the pipeline.
    try:
        imp_df = pd.DataFrame({'Feature': all_features, 'Value': importances})
        if signed:
            imp_df['Abs_Value'] = imp_df['Value'].abs()
            imp_df = imp_df.sort_values('Abs_Value', ascending=False).head(20)
            # Positive coefficient pushes towards class 1 (fraud): red; else green.
            colors = ['red' if x > 0 else 'green' for x in imp_df['Value']]
            title = f"Top 20 Features - {best_result['model_name']} (Red=Fraud-like, Green=Real-like)"
        else:
            imp_df = imp_df.sort_values('Value', ascending=False).head(20)
            colors = 'skyblue'
            title = f"Top 20 Feature Importance - {best_result['model_name']}"

        print(imp_df[['Feature', 'Value']].to_string(index=False))

        # CHART 11
        fig, ax = plt.subplots(figsize=(12, 8))
        ax.barh(imp_df['Feature'], imp_df['Value'], color=colors)
        ax.set_title(title, fontweight='bold')
        ax.invert_yaxis()  # most important feature at the top
        plt.tight_layout()
        plt.savefig('11_feature_importance.png', dpi=300, bbox_inches='tight')
        print("‚úì Grafico salvato: 11_feature_importance.png")
        plt.close()

    except Exception as e:
        print(f"‚ö†Ô∏è Errore nel plot Feature Importance: {e}")

### MAIN PIPELINE

In [None]:
def main(filepath='fake_job_postings.csv'):
    """Run the whole pipeline: load, clean, engineer, prepare, train, analyse.

    Args:
        filepath: Path of the input CSV.

    Returns:
        ``None`` when ``filepath`` does not exist; otherwise a dict with keys
        ``dataset``, ``models``, ``best_model``, ``vectorizer`` and ``scaler``.
    """
    start_time = time.time()
    rule = "="*80

    print("\n" + rule)
    print("FRAUD JOB POSTINGS DETECTION - PIPELINE DI PROGETTO")
    print(rule + "\n")

    # Bail out early with a readable message when the dataset is missing.
    if not os.path.exists(filepath):
        print(f"‚ùå ERRORE: Il file '{filepath}' non √® stato trovato nella cartella corrente.")
        print("   Assicurati di aver scaricato il dataset e rinominato il file correttamente.")
        return None

    # Steps 1-2: load/explore, then clean.
    raw_frame = load_and_explore_data(filepath)
    df_clean = clean_data(raw_frame)

    # Step 3: returns the frame, the original categorical column names and the
    # log-transformed numeric column names.
    df_clean, cat_features, num_features = feature_engineering(df_clean)

    # Step 4: build the model matrices. The last three items are not used here.
    (X_train, X_test, y_train, y_test, vectorizer, scaler,
     _tfidf_train, _tfidf_test, _tfidf_names) = prepare_features(
        df_clean, cat_features, num_features
    )

    # Step 5: train and compare the models (threshold tuning happens in step 6).
    results, best_result = train_models(X_train, X_test, y_train, y_test)

    # Meta feature names for the report. The order must mirror the column order
    # built in step 4: numeric, then binary, then encoded categorical. The
    # TF-IDF names are prepended inside detailed_analysis.
    binary_features = ['telecommuting', 'has_company_logo', 'has_questions', 'salary_range_flag']
    encoded_features = [f'{col}_encoded' for col in cat_features]
    meta_features_names = [*num_features, *binary_features, *encoded_features]

    # Step 6: diagnostics and charts for the best model.
    detailed_analysis(best_result, vectorizer, meta_features_names, y_test)

    # Wrap-up
    elapsed_time = time.time() - start_time
    print("\n" + rule)
    print(f"PIPELINE COMPLETATA IN {elapsed_time:.2f} SECONDI")
    print(rule)
    print("\n‚úì File generati nella cartella:")
    chart_files = (
        "01_class_distribution.png", "02_missing_values.png", "03_outliers_management.png",
        "04_features_distribution.png", "05_train_test_split.png", "06_tfidf_features.png",
        "07_models_comparison.png", "08_confusion_matrix.png", "09_roc_curve.png",
        "10_precision_recall_curve.png", "11_feature_importance.png",
    )
    for chart in chart_files:
        print(f"  üìÑ {chart}")

    return {
        'dataset': df_clean,
        'models': results,
        'best_model': best_result,
        'vectorizer': vectorizer,
        'scaler': scaler
    }

if __name__ == "__main__":
    # Script entry point: run the full pipeline on the default dataset file.
    pipeline_results = main(filepath='fake_job_postings.csv')


FRAUD JOB POSTINGS DETECTION - PIPELINE DI PROGETTO

STEP 1: CARICAMENTO E ESPLORAZIONE DATASET

Dimensioni dataset: (17880, 18)

Info Dataset (Tipi di dato):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   job_id               17880 non-null  int64 
 1   title                17880 non-null  object
 2   location             17534 non-null  object
 3   department           6333 non-null   object
 4   salary_range         2868 non-null   object
 5   company_profile      14572 non-null  object
 6   description          17879 non-null  object
 7   requirements         15184 non-null  object
 8   benefits             10668 non-null  object
 9   telecommuting        17880 non-null  int64 
 10  has_company_logo     17880 non-null  int64 
 11  has_questions        17880 non-null  int64 
 12  employment_type      14409 non-null  obj