In [None]:
import numpy as np
import pandas as pd
import pickle

In [None]:
# Load the cleaned profile table. main() below hands this module-level
# DataFrame to XGBoostAdversarialTuning, which reads its "Label" column.
dataset = pd.read_csv('cleaned_profiles.csv')
# Bare expression: displayed as the cell output when run as a notebook cell.
dataset

XGBoost - Bayesian Optimization (BO)

In [None]:
import numpy as np
import pandas as pd
import optuna
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from xgboost import XGBClassifier
import warnings
from datetime import datetime
import pickle
import os

warnings.filterwarnings("ignore")

class XGBoostAdversarialTuning:
    def __init__(self, embedding_file, dataset, model_name='roberta'):
        """Load embeddings and labels and prepare one row mask per profile type.

        Args:
            embedding_file: Path of the CSV holding the embedding features.
            dataset: Table exposing a "Label" column; the masks derived from
                it are later used to select rows of the embedding table.
            model_name: Embedding model identifier, used when building the
                file names this class reads and writes.
        """
        self.model_name = model_name

        # Features are read from disk; labels come from the caller's table.
        self.emb = pd.read_csv(embedding_file)
        labels = dataset["Label"]
        self.y = labels

        # Boolean masks keyed on the label codes: 0, 1, {10, 11} and 12.
        self.genuine_mask = labels == 0
        self.fake_manual_mask = labels == 1
        self.gptv1_mask = labels.isin([10, 11])
        self.gptv2_mask = labels == 12

        # Scaler shared by the train/test preparation helpers.
        self.scaler = StandardScaler()

        # Saved artefacts are written under ./models, so make sure it exists.
        os.makedirs('models', exist_ok=True)

    def _get_data_splits(self, random_state=42):
        """Split every profile group 70/30 into train and test partitions.

        Genuine, manual-fake and GPTv1 rows are selected from the loaded
        embeddings with the masks built in ``__init__``; GPTv2 rows are read
        from the model-specific "generated" CSV under ``embeddings_output``.

        Args:
            random_state: Seed passed to every ``train_test_split`` call.

        Returns:
            ``{'train': {...}, 'test': {...}}`` where each inner dict maps
            'genuine', 'fake_manual', 'gptv1' and 'gptv2' to its partition.
        """
        def split_70_30(frame):
            # Same ratio and seed for every group.
            return train_test_split(frame, test_size=0.3, random_state=random_state)

        partitions = {'train': {}, 'test': {}}

        # Groups that live inside the already-loaded embedding table.
        masked_groups = (
            ('genuine', self.genuine_mask),
            ('fake_manual', self.fake_manual_mask),
            ('gptv1', self.gptv1_mask),
        )
        for group_name, row_mask in masked_groups:
            train_part, test_part = split_70_30(self.emb[row_mask])
            partitions['train'][group_name] = train_part
            partitions['test'][group_name] = test_part

        # GPTv2 profiles come from their own file rather than from a mask.
        generated_file = f'embeddings_output/{self.model_name}_PCA_150_components_generated.csv'
        gptv2_train, gptv2_test = split_70_30(pd.read_csv(generated_file))
        partitions['train']['gptv2'] = gptv2_train
        partitions['test']['gptv2'] = gptv2_test

        return partitions

    def _get_training_data(self, train_data, scenario):
        """Get training data for each scenario"""
        if scenario == 'baseline':
            X_train = pd.concat([
                train_data['genuine'],
                train_data['fake_manual']
            ])
            y_train = pd.Series([0]*len(train_data['genuine']) + [1]*len(train_data['fake_manual']))
            
        elif scenario == 'gptv1_assisted':
            X_train = pd.concat([
                train_data['genuine'],
                train_data['fake_manual'],
                train_data['gptv1']
            ])
            y_train = pd.Series([0]*len(train_data['genuine']) + 
                               [1]*(len(train_data['fake_manual']) + len(train_data['gptv1'])))
            
        elif scenario == 'gptv2_assisted':
            X_train = pd.concat([
                train_data['genuine'],
                train_data['fake_manual'],
                train_data['gptv2']
            ])
            y_train = pd.Series([0]*len(train_data['genuine']) + 
                               [1]*(len(train_data['fake_manual']) + len(train_data['gptv2'])))
            
        elif scenario == 'gptv1v2_assisted':
            X_train = pd.concat([
                train_data['genuine'],
                train_data['fake_manual'],
                train_data['gptv1'],
                train_data['gptv2']
            ])
            y_train = pd.Series([0]*len(train_data['genuine']) + 
                               [1]*(len(train_data['fake_manual']) + 
                                   len(train_data['gptv1']) + len(train_data['gptv2'])))
        
        X_train_scaled = self.scaler.fit_transform(X_train)
        return X_train_scaled, y_train

    def _get_test_data(self, test_data, scenario):
        """Get test data for each scenario"""
        if scenario == 'baseline':
            X_test = pd.concat([
                test_data['genuine'],
                test_data['fake_manual']
            ]).reset_index(drop=True)
            y_test = pd.Series([0]*len(test_data['genuine']) + [1]*len(test_data['fake_manual']))
            
        elif scenario == 'gptv1':
            X_test = pd.concat([
                test_data['genuine'],
                test_data['fake_manual'],
                test_data['gptv1']
            ]).reset_index(drop=True)
            y_test = pd.Series([0]*len(test_data['genuine']) + 
                              [1]*(len(test_data['fake_manual']) + len(test_data['gptv1'])))
            
        elif scenario == 'gptv2':
            X_test = pd.concat([
                test_data['genuine'],
                test_data['fake_manual'],
                test_data['gptv2']
            ]).reset_index(drop=True)
            y_test = pd.Series([0]*len(test_data['genuine']) + 
                              [1]*(len(test_data['fake_manual']) + len(test_data['gptv2'])))
            
        else:  # gptv1v2
            X_test = pd.concat([
                test_data['genuine'],
                test_data['fake_manual'],
                test_data['gptv1'],
                test_data['gptv2']
            ]).reset_index(drop=True)
            y_test = pd.Series([0]*len(test_data['genuine']) + 
                              [1]*(len(test_data['fake_manual']) + 
                                  len(test_data['gptv1']) + len(test_data['gptv2'])))
        
        X_test_scaled = self.scaler.transform(X_test)
        return X_test_scaled, y_test

    def _calculate_metrics(self, y_true, y_pred):
        """Compute weighted F1 plus the FAR and FRR error rates.

        FAR is computed as ``fp / (fp + tn)`` and FRR as ``fn / (fn + tp)``,
        taking label 1 as the positive class.

        Args:
            y_true: Ground-truth 0/1 labels.
            y_pred: Predicted 0/1 labels.

        Returns:
            Dict with keys 'f1_score', 'far' and 'frr'. A rate whose
            denominator is zero (no rows of that true class) is NaN.
        """
        f1 = f1_score(y_true, y_pred, average='weighted')

        # Pin the label order so the matrix is always 2x2; without it a split
        # containing a single class yields a 1x1 matrix and the unpack fails.
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

        negatives = fp + tn
        positives = fn + tp
        # Explicit NaN instead of relying on NumPy's divide-by-zero warning path.
        far = fp / negatives if negatives else float('nan')
        frr = fn / positives if positives else float('nan')
        return {'f1_score': f1, 'far': far, 'frr': frr}

    def bayesian_optimization_xgboost(self, X_train, y_train, n_quick_trials=30, n_cv_trials=20):
        """Tune XGBoost hyperparameters with a two-phase Optuna study.

        Phase 1 scores each trial on a single 80/20 hold-out split; Phase 2
        scores each trial with 5-fold cross-validation on the full training
        set. Both phases use weighted F1 so that every trial in the study is
        ranked on the same metric.

        Args:
            X_train: Training features (the caller passes them already scaled).
            y_train: Training labels.
            n_quick_trials: Number of Phase 1 (single split) trials.
            n_cv_trials: Number of Phase 2 (cross-validation) trials.

        Returns:
            Tuple ``(best_params, final_model)``: the tuned hyperparameters
            reported by the study, and an ``XGBClassifier`` built from them
            plus the fixed settings below, fitted on an 85% split of the
            training data.
        """
        start_time = datetime.now()
        print(f"\nStarting Two-Phase Bayesian Optimization at {start_time.strftime('%H:%M:%S')}")

        # Settings shared by every trial AND the final model, so the model that
        # is returned is trained the same way as the trials that selected it.
        # NOTE(review): 'gpu_hist' / 'gpu_id' / 'predictor' are the older
        # XGBoost GPU options; newer releases use device='cuda' with
        # tree_method='hist'. Confirm against the installed version.
        fixed_params = {
            'tree_method': 'gpu_hist',
            'gpu_id': 0,
            'predictor': 'gpu_predictor',
            'random_state': 42,
        }

        # Hold-out split used for the quick Phase 1 evaluation.
        X_train_quick, X_val_quick, y_train_quick, y_val_quick = train_test_split(
            X_train, y_train, test_size=0.2, random_state=42
        )

        # Separate validation split for the final fit (keeps test data out of it).
        X_train_final, X_val_final, y_train_final, y_val_final = train_test_split(
            X_train, y_train, test_size=0.15, random_state=42
        )

        best_score_so_far = 0
        trial_times = []

        def objective(trial):
            nonlocal best_score_so_far
            trial_start = datetime.now()

            # Trials [0, n_quick_trials) are Phase 1; the rest are Phase 2.
            is_phase_2 = trial.number >= n_quick_trials
            if is_phase_2:
                phase_name = "Phase 2 (CV)"
                trial_in_phase = trial.number - n_quick_trials + 1
                max_trials_in_phase = n_cv_trials
            else:
                phase_name = "Phase 1 (Quick)"
                trial_in_phase = trial.number + 1
                max_trials_in_phase = n_quick_trials

            print(f"\n{phase_name} - Trial {trial_in_phase}/{max_trials_in_phase} started at {trial_start.strftime('%H:%M:%S')}")

            # Search space; `step` is passed by keyword because positional use
            # is deprecated in recent Optuna releases.
            params = {
                'max_depth': trial.suggest_int('max_depth', 3, 10),
                'learning_rate': trial.suggest_categorical('learning_rate', [0.01, 0.05, 0.1, 0.2, 0.3]),
                'n_estimators': trial.suggest_int('n_estimators', 50, 550, step=50),
                'subsample': trial.suggest_categorical('subsample', [0.6, 0.7, 0.8, 0.9, 1.0]),
                'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.6, 0.7, 0.8, 0.9, 1.0]),
                **fixed_params,
            }

            # Best effort: a failing configuration is reported and scored 0 so
            # the study keeps going. Only the fit/score step is guarded.
            try:
                clf = XGBClassifier(**params)
                if not is_phase_2:
                    clf.fit(X_train_quick, y_train_quick,
                            eval_set=[(X_val_quick, y_val_quick)],
                            verbose=False)
                    score = f1_score(y_val_quick, clf.predict(X_val_quick), average='weighted')
                else:
                    # 'f1_weighted' matches the weighted F1 used in Phase 1.
                    score = cross_val_score(clf, X_train, y_train, cv=5, scoring='f1_weighted').mean()
            except Exception as e:
                print(f"Error in trial: {str(e)}")
                return 0

            trial_duration = datetime.now() - trial_start
            trial_times.append(trial_duration.total_seconds())
            # Mean over the last five trials (or all of them if fewer exist).
            avg_trial_time = np.mean(trial_times[-5:])

            best_score_so_far = max(best_score_so_far, score)
            print(f"Trial {trial_in_phase} results:")
            print(f"Score: {score:.4f}")
            print(f"Best score so far: {best_score_so_far:.4f}")
            print(f"Trial duration: {trial_duration}")
            print(f"Average trial duration: {avg_trial_time:.2f} seconds")

            remaining_in_phase = max_trials_in_phase - trial_in_phase
            remaining_time = avg_trial_time * remaining_in_phase
            print(f"Estimated time remaining in {phase_name}: {remaining_time:.2f} seconds")

            return score

        # Create and run the study.
        study = optuna.create_study(direction='maximize')
        print(f"\nPhase 1: Quick Evaluation with Single Validation Split ({n_quick_trials} trials)")
        print(f"Phase 2: Fine-tuning with 5-fold CV ({n_cv_trials} trials)")
        study.optimize(objective, n_trials=n_quick_trials + n_cv_trials, show_progress_bar=True)

        # study.best_params only holds the suggested values, so the fixed
        # settings (seed, GPU options) are added back for the final model.
        # NOTE(review): no early_stopping_rounds is set, so eval_set appears to
        # only record validation metrics here; confirm whether early stopping
        # was intended.
        final_model = XGBClassifier(**study.best_params, **fixed_params)
        final_model.fit(X_train_final, y_train_final,
                        eval_set=[(X_val_final, y_val_final)],
                        verbose=False)

        end_time = datetime.now()
        duration = end_time - start_time
        print(f"\nBayesian Optimization completed at {end_time.strftime('%H:%M:%S')}")
        print(f"Total duration: {duration}")
        print(f"Best parameters: {study.best_params}")

        return study.best_params, final_model
    
    def train_and_evaluate(self, embedding_file, load_existing=False, model_name=None):
        """Main method to train and evaluate across scenarios"""
        # Use model name in the saved file path if provided
        model_prefix = f"{self.model_name}_" if model_name is None else f"{model_name}_"
        model_path = f'models/{model_prefix}xgb_bo_models.pkl'
        
        if load_existing and os.path.exists(model_path):
            print("Loading existing models and results...")
            with open(model_path, 'rb') as f:
                saved_data = pickle.load(f)
                
            # Return the loaded data
            return saved_data['results']
        
        splits = self._get_data_splits()
        training_scenarios = ['baseline', 'gptv1_assisted', 'gptv2_assisted', 'gptv1v2_assisted']
        test_scenarios = ['baseline', 'gptv1', 'gptv2', 'gptv1v2']
        
        # Store models, parameters, scalers, and results
        all_models = {}
        all_params = {}
        all_scalers = {}
        all_results = {}
        
        for train_scenario in training_scenarios:
            print(f"\n{'='*60}")
            print(f"Training scenario: {train_scenario}")
            print(f"{'='*60}")
            
            X_train, y_train = self._get_training_data(splits['train'], train_scenario)
            
            # Store the scaler for this scenario
            all_scalers[train_scenario] = self.scaler
            
            # Optimize XGBoost for this training scenario
            best_params, model = self.bayesian_optimization_xgboost(X_train, y_train)
            
            # Store the model and parameters
            all_models[train_scenario] = model
            all_params[train_scenario] = best_params
            
            scenario_results = {}
            for test_scenario in test_scenarios:
                X_test, y_test = self._get_test_data(splits['test'], test_scenario)
                
                # Get predictions and metrics
                y_pred = model.predict(X_test)
                scenario_results[test_scenario] = self._calculate_metrics(y_test, y_pred)
                
            all_results[train_scenario] = scenario_results
            
        # Save all components
        save_data = {
            'models': all_models,
            'params': all_params,
            'scalers': all_scalers,
            'results': all_results
        }
        
        # Use model name in the saved file paths
        model_prefix = f"{self.model_name}_"
        model_path = f'models/{model_prefix}xgb_bo_models.pkl'
        results_path = f'models/{model_prefix}xgb_bo.pkl'
        
        # Save all to a single file
        with open(model_path, 'wb') as f:
            pickle.dump(save_data, f)
        
        # Also save only the results separately (for backward compatibility)
        with open(results_path, 'wb') as f:
            pickle.dump(all_results, f)
            
        print(f"\nModels, parameters, scalers, and results saved to '{model_path}'")
        print(f"Results also saved to '{results_path}' for backward compatibility")
        
        return all_results

    def print_results(self, results):
        """Print results in a formatted manner"""
        print("\nDetailed Results:")
        print("=" * 50)
        
        # Metrics to print
        metrics = ['f1_score', 'far', 'frr']
        
        # Get unique training scenarios from results dict
        train_scenarios = list(results.keys())
        test_scenarios = ['baseline', 'gptv1', 'gptv2', 'gptv1v2']
        
        for metric in metrics:
            print(f"\n{metric.upper()} Results:")
            print("-" * 50)
            print(f"{'Training Scenario':<20} {'Baseline':<15} {'GPTv1':<15} {'GPTv2':<15} {'GPTv1v2':<15}")
            print("-" * 50)
            
            for train_scenario in train_scenarios:
                metrics_row = [results[train_scenario][test][metric] 
                               for test in test_scenarios]
                print(f"{train_scenario:<20} " + " ".join(f"{m:.4f}{' '*8}" for m in metrics_row))
                
    def load_models(self, model_name=None):
        """Load saved models and data"""
        # Use model name in the file path
        model_to_load = model_name if model_name else self.model_name
        model_path = f'models/{model_to_load}_xgb_bo_models.pkl'
        
        if os.path.exists(model_path):
            with open(model_path, 'rb') as f:
                saved_data = pickle.load(f)
            print(f"Models for {model_to_load} loaded successfully.")
            return saved_data
        else:
            print(f"No saved models found for {model_to_load}.")
            return None

def main():
    """Train (or reload) the tuned XGBoost models for one embedding model and print the results.

    Relies on the module-level ``dataset`` DataFrame, which must provide a
    'Label' column.
    """
    # Which embedding model to run.
    model_name = 'roberta'  # Change this to 'deberta', 'modernbert', or 'flair' as needed

    # Set to True to reuse a previously saved bundle instead of retraining.
    load_existing = False

    # Input embeddings and saved-bundle location both derive from the model name.
    embedding_file = f'embeddings_output/{model_name}_PCA_150_components.csv'
    model_path = f'models/{model_name}_xgb_bo_models.pkl'

    tuner = XGBoostAdversarialTuning(embedding_file, dataset, model_name)

    if not (load_existing and os.path.exists(model_path)):
        # Train and evaluate new models, then report them.
        results = tuner.train_and_evaluate(embedding_file, load_existing=False, model_name=model_name)
        tuner.print_results(results)
        return

    # A saved bundle exists and reuse was requested: report its stored results.
    saved_data = tuner.load_models(model_name)
    tuner.print_results(saved_data['results'])


if __name__ == "__main__":
    main()

XGBOOST - GA