In [None]:
from pathlib import Path
import polars as pl
from farcaster_sybil_detection.config.defaults import Config
from farcaster_sybil_detection.services.detector import DetectorService
from farcaster_sybil_detection.features.registry import FeatureRegistry
from farcaster_sybil_detection.features.extractors.content_engagement_extractor import (
    ContentEngagementExtractor,
)
from farcaster_sybil_detection.features.extractors.network_analysis_extractor import (
    NetworkAnalysisExtractor,
)
from farcaster_sybil_detection.features.extractors.temporal_behavior_extractor import (
    TemporalBehaviorExtractor,
)
from farcaster_sybil_detection.features.extractors.user_identity_extractor import (
    UserIdentityExtractor,
)
# from farcaster_sybil_detection.features.extractors.reputation_meta_extractor import (
#     ReputationMetaExtractor,
# )

pl.Config.set_streaming_chunk_size(1_000_000)
pl.Config.set_fmt_str_lengths(50)

config = Config(
    data_path=Path("data"),
    checkpoint_dir=Path("checkpoints"),
    model_dir=Path("models"),
)

registry = FeatureRegistry()

# Register in any order - manager will figure out correct build order
registry.register("user_identity", UserIdentityExtractor)
registry.register("network_analysis", NetworkAnalysisExtractor)
registry.register("temporal_behavior", TemporalBehaviorExtractor)
registry.register("content_engagement", ContentEngagementExtractor)
# registry.register("reputation_meta", ReputationMetaExtractor)

detector = DetectorService(config, registry)

# Load Labels
labels_df = pl.read_csv("data/labels.csv").limit(1000)

# Validate labels_df
required_columns = {'fid', 'bot'}
if not required_columns.issubset(labels_df.columns):
    missing = required_columns - set(labels_df.columns)
    raise ValueError(f"Missing required columns in labels.csv: {missing}")

  from .autonotebook import tqdm as notebook_tqdm
2024-12-10 17:06:01,696 - DetectorService - DEBUG - No existing model found. Model will be trained when `train` is called.


In [2]:
metrics = detector.trainer.train(labels_df)
print("Training Metrics:")
for metric, value in metrics.items():
    print(f"{metric}: {value:.3f}")

2024-12-10 17:06:01,704 - Trainer - DEBUG - Building feature matrix...
2024-12-10 17:06:01,705 - Trainer - DEBUG - Preparing features for 1000 labeled fids
2024-12-10 17:06:01,706 - FeatureManager - DEBUG - Starting feature matrix build - Memory usage: 254.80 MB
2024-12-10 17:06:01,706 - FeatureManager - DEBUG - Base FIDs: 1000
  f"Feature matrix schema: {feature_matrix.schema} ({len(feature_matrix.columns)} columns)"
  f"Feature matrix schema: {feature_matrix.schema} ({len(feature_matrix.columns)} columns)"
2024-12-10 17:06:01,707 - FeatureManager - DEBUG - Feature matrix schema: Schema({'fid': Int64}) (1 columns)
2024-12-10 17:06:01,707 - FeatureManager - DEBUG - Feature matrix size: naive plan: (run LazyFrame.explain(optimized=True) to see the optimized plan)

 SELECT [col("fid").count()] FROM
  DF ["fid"]; PROJECT */1 COLUMNS; SELECTION: None
2024-12-10 17:06:01,708 - FeatureManager - DEBUG - Starting network_analysis - Memory usage: 255.17 MB
2024-12-10 17:06:01,710 - FeatureManag

[LightGBM] [Info] Number of positive: 467, number of negative: 333
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000785 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8099
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 58
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.583750 -> initscore=0.338187
[LightGBM] [Info] Start training from score 0.338187
[LightGBM] [Info] Number of positive: 373, number of negative: 267
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000801 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6699
[LightGBM] [Info] Number of data points in the train set: 640, number of used features: 58
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.582812 -> initscore=0.334330
[LightGBM] [Info] Start training from score 0.334330
[LightGBM] [Info] Number of 

2024-12-10 17:06:33,596 - BaseModel - INFO - lgbm best score: 0.9786


[LightGBM] [Info] Number of positive: 373, number of negative: 267
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000843 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6699
[LightGBM] [Info] Number of data points in the train set: 640, number of used features: 58
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.582812 -> initscore=0.334330
[LightGBM] [Info] Start training from score 0.334330
[LightGBM] [Info] Number of positive: 373, number of negative: 267
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000689 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6652
[LightGBM] [Info] Number of data points in the train set: 640, number of used features: 58
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.582812 -> initscore=0.334330
[LightGBM] [Info]

2024-12-10 17:06:38,167 - Trainer - DEBUG - 
Evaluation metrics:
2024-12-10 17:06:38,167 - Trainer - DEBUG - roc_auc: 0.990
2024-12-10 17:06:38,167 - Trainer - DEBUG - precision: 0.950
2024-12-10 17:06:38,167 - Trainer - DEBUG - recall: 0.974
2024-12-10 17:06:38,167 - Trainer - DEBUG - f1: 0.962
2024-12-10 17:06:38,168 - Trainer - DEBUG - mcc: 0.907
2024-12-10 17:06:38,168 - Trainer - DEBUG - kappa: 0.907
2024-12-10 17:06:38,168 - Trainer - DEBUG - tn: 77.000
2024-12-10 17:06:38,168 - Trainer - DEBUG - fp: 6.000
2024-12-10 17:06:38,168 - Trainer - DEBUG - fn: 3.000
2024-12-10 17:06:38,168 - Trainer - DEBUG - tp: 114.000


Training Metrics:
roc_auc: 0.990
precision: 0.950
recall: 0.974
f1: 0.962
mcc: 0.907
kappa: 0.907
tn: 77.000
fp: 6.000
fn: 3.000
tp: 114.000


In [3]:
result = [
    # detector.predict(identifier='rpunkt'),
    detector.predict(identifier='vitalik'),
    detector.predict(identifier='ipungkribo')
]

print(result)

2024-12-10 17:06:38,172 - DetectorService - DEBUG - Making prediction for identifier: vitalik
2024-12-10 17:06:38,172 - Predictor - DEBUG - Predicting for identifier: vitalik
2024-12-10 17:06:38,172 - Predictor - DEBUG - Loading ID mapping from profile data...
2024-12-10 17:06:38,173 - DatasetLoader - DEBUG - Loading profile_with_addresses with columns: ['fid', 'fname']
2024-12-10 17:06:38,183 - DatasetLoader - DEBUG - Filtered dataset: 894048 records, 893130 unique FIDs
2024-12-10 17:06:38,191 - DatasetLoader - DEBUG - Loaded farcaster-profile_with_addresses: 894048 records
2024-12-10 17:06:38,194 - FeatureManager - DEBUG - Starting feature matrix build - Memory usage: 386.23 MB
2024-12-10 17:06:38,195 - FeatureManager - DEBUG - Base FIDs: 1
  f"Feature matrix schema: {feature_matrix.schema} ({len(feature_matrix.columns)} columns)"
  f"Feature matrix schema: {feature_matrix.schema} ({len(feature_matrix.columns)} columns)"
2024-12-10 17:06:38,196 - FeatureManager - DEBUG - Feature matr

shape: (1, 86)
┌───────┬────────────┬────────────┬────────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ fid   ┆ follow_rat ┆ network_gr ┆ follow_vel ┆ … ┆ identity_ ┆ verificat ┆ profile_a ┆ resource_ │
│ ---   ┆ io         ┆ owth_rate  ┆ ocity      ┆   ┆ strength  ┆ ion_quali ┆ uthentici ┆ utilizati │
│ i64   ┆ ---        ┆ ---        ┆ ---        ┆   ┆ ---       ┆ ty        ┆ ty        ┆ on        │
│       ┆ f64        ┆ f64        ┆ f64        ┆   ┆ f64       ┆ ---       ┆ ---       ┆ ---       │
│       ┆            ┆            ┆            ┆   ┆           ┆ f64       ┆ f64       ┆ f64       │
╞═══════╪════════════╪════════════╪════════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 22032 ┆ 0.99422    ┆ 0.0        ┆ 22.114286  ┆ … ┆ 0.3       ┆ 0.0       ┆ 1.0       ┆ 0.7       │
└───────┴────────────┴────────────┴────────────┴───┴───────────┴───────────┴───────────┴───────────┘


2024-12-10 17:07:05,291 - DatasetLoader - DEBUG - Filtered dataset: 1437 records, 1 unique FIDs
2024-12-10 17:07:05,291 - FeatureExtractor - DEBUG - Loading dataset 'follow_counts' from source 'nindexer'
2024-12-10 17:07:05,292 - FeatureExtractor - DEBUG - Required columns: ['fid', 'follower_count', 'following_count', 'created_at']
2024-12-10 17:07:05,292 - FeatureExtractor - DEBUG - Filtering for 1 FIDs
2024-12-10 17:07:05,292 - DatasetLoader - DEBUG - Loading follow_counts with columns: ['fid', 'follower_count', 'following_count', 'created_at']
2024-12-10 17:07:05,304 - DatasetLoader - DEBUG - Filtered dataset: 1 records, 1 unique FIDs
2024-12-10 17:07:05,305 - FeatureExtractor - DEBUG - Loading dataset 'reactions' from source 'farcaster'
2024-12-10 17:07:05,305 - FeatureExtractor - DEBUG - Required columns: ['fid', 'target_fid', 'timestamp', 'deleted_at']
2024-12-10 17:07:05,305 - FeatureExtractor - DEBUG - Filtering for 1 FIDs
2024-12-10 17:07:05,305 - DatasetLoader - DEBUG - Loadi

shape: (1, 86)
┌────────┬────────────┬────────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ fid    ┆ follow_rat ┆ network_gr ┆ follow_ve ┆ … ┆ identity_ ┆ verificat ┆ profile_a ┆ resource_ │
│ ---    ┆ io         ┆ owth_rate  ┆ locity    ┆   ┆ strength  ┆ ion_quali ┆ uthentici ┆ utilizati │
│ i64    ┆ ---        ┆ ---        ┆ ---       ┆   ┆ ---       ┆ ty        ┆ ty        ┆ on        │
│        ┆ f64        ┆ f64        ┆ f64       ┆   ┆ f64       ┆ ---       ┆ ---       ┆ ---       │
│        ┆            ┆            ┆           ┆   ┆           ┆ f64       ┆ f64       ┆ f64       │
╞════════╪════════════╪════════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 452622 ┆ 0.999305   ┆ 0.0        ┆ 372.4787  ┆ … ┆ 0.3       ┆ 0.0       ┆ 1.0       ┆ 0.7       │
└────────┴────────────┴────────────┴───────────┴───┴───────────┴───────────┴───────────┴───────────┘
[{'prediction': 0, 'probability': 0.10836767303728052, 'confidence': 0.88141

In [4]:
import numpy as np
def analyze_population_distribution(detector: DetectorService, feature_matrix: pl.DataFrame):
    """Analyze bot probability distribution across the entire population"""
    print("\nAnalyzing full population distribution...")
    
    try:
        # Get all features (excluding fid)
        feature_cols = [col for col in detector.model.feature_names if col != 'fid']
        X = feature_matrix.select(feature_cols).to_numpy()

        # Replace inf values with nan
        X = np.where(np.isinf(X), np.nan, X)
        
        # Replace nan with 0 and clip extreme values
        X = np.nan_to_num(X, nan=0.0)
        X = np.clip(X, -1e9, 1e9)
        
        # Convert to float32 safely
        X = X.astype(np.float32)
        
        # Get predictions and probabilities
        y_prob = detector.model.predict_proba(X)
        # Take only the probability for class 1 (bot)
        bot_probabilities = y_prob[:, 1]
        y_pred = (bot_probabilities >= 0.5).astype(int)

        fids = feature_matrix['fid'].to_list()
        
        # Create results DataFrame - now using only bot probabilities
        results_df = pl.DataFrame({
            'fid': fids,
            'bot_probability': bot_probabilities,
            'prediction': y_pred
        })
        
        # Calculate distribution statistics
        stats = {
            'total_users': len(results_df),
            'predicted_bots': (y_pred == 1).sum(),
            'predicted_humans': (y_pred == 0).sum(),
            'bot_ratio': (y_pred == 1).mean(),
            'avg_probability': bot_probabilities.mean(),
            'median_probability': np.median(bot_probabilities),
            'std_probability': np.std(bot_probabilities)
        }
        
        # Calculate probability buckets
        bucket_edges = np.arange(0, 1.1, 0.1)
        hist, _ = np.histogram(bot_probabilities, bins=bucket_edges)
        bucket_stats = {f"{bucket_edges[i]:.1f}-{bucket_edges[i+1]:.1f}": count 
                       for i, count in enumerate(hist)}
        
        # Print distribution analysis
        print("\nPopulation Distribution Analysis")
        print("=" * 50)
        print(f"Total Users: {stats['total_users']:,}")
        print(f"Predicted Bots: {stats['predicted_bots']:,} ({stats['bot_ratio']:.1%})")
        print(f"Predicted Humans: {stats['predicted_humans']:,} ({1-stats['bot_ratio']:.1%})")
        print("\nProbability Statistics:")
        print(f"Mean Bot Probability: {stats['avg_probability']:.3f}")
        print(f"Median Bot Probability: {stats['median_probability']:.3f}")
        print(f"Std Dev: {stats['std_probability']:.3f}")
        
        print("\nProbability Distribution:")
        for bucket, count in bucket_stats.items():
            print(f"{bucket}: {count:,} users ({count/stats['total_users']:.1%})")
        
        # Plot distribution
        import matplotlib.pyplot as plt
        plt.figure(figsize=(12, 6))
        plt.hist(bot_probabilities, bins=50, density=True, alpha=0.7)
        plt.axvline(x=0.5, color='r', linestyle='--', label='Decision Boundary')
        plt.xlabel('Bot Probability')
        plt.ylabel('Density')
        plt.title('Distribution of Bot Probabilities')
        plt.grid(True, alpha=0.3)
        plt.legend()
        plt.show()

        # Save results
        results_df = results_df.sort('bot_probability', descending=True)
        results_df.write_csv("population_analysis.csv")
        print("\nResults saved to population_analysis.csv")
        
        return {
            'results_df': results_df,
            'stats': stats,
            'bucket_stats': bucket_stats
        }
        
    except Exception as e:
        print(f"Error in population analysis: {str(e)}")
        raise

In [5]:
# First, get all available FIDs from the profile dataset
profiles_df = feature_manager.data_loader.load_dataset(
    'profile_with_addresses', 
    columns=['fid', 'fname']
)
all_fids = profiles_df['fid'].unique().sort()
print(f"Total population size: {len(all_fids)} FIDs")

NameError: name 'feature_manager' is not defined

In [None]:
# Build feature matrix for all FIDs
print("Building feature matrix for full population...")
full_matrix = feature_manager.build_feature_matrix()
print(f"Feature matrix shape: {full_matrix.shape}")

In [None]:
full_matrix.describe()

In [None]:
# Run the analysis
population_analysis = analyze_population_distribution(detector, full_matrix)

# If you want to examine specific probability ranges:
results_df = population_analysis['results_df']

# High confidence bots (e.g., >90% probability)
high_conf_bots = results_df.filter(pl.col('bot_probability') > 0.8)
print(f"\nHigh Confidence Bots (>80%): {len(high_conf_bots)}")

# High confidence humans (e.g., <10% probability)
high_conf_humans = results_df.filter(pl.col('bot_probability') < 0.2)
print(f"High Confidence Humans (<20%): {len(high_conf_humans)}")

# Uncertain predictions (e.g., 40-60% probability)
uncertain = results_df.filter(
    (pl.col('bot_probability') >= 0.4) & 
    (pl.col('bot_probability') <= 0.6)
)
print(f"Uncertain Predictions (40-60%): {len(uncertain)}")

In [None]:
full_results = results_df.join(profiles_df, on='fid').join(full_matrix, on='fid')
full_results.head()

In [None]:
# get examples of high confidence bots
high_conf_bots_examples = profiles_df.filter(pl.col('fid').is_in(high_conf_bots['fid'])).sort('fid')
high_conf_bots_examples.head(10)

In [None]:
# get examples of high confidence humans
high_conf_humans_examples = profiles_df.filter(pl.col('fid').is_in(high_conf_humans['fid'])).sort('fid')
high_conf_humans_examples.head(10)

In [None]:
population_analysis

In [None]:
# Function to analyze a prediction in detail
def analyze_prediction(detector, identifier):
    result = detector.predict(identifier)
    
    if result['status'] != 'success':
        print(f"Error: {result['error']}")
        return
    
    print(f"\nAnalysis for {result['fname']} (FID: {result['fid']})")
    print("=" * 50)
    print(f"Prediction: {result['prediction_label']}")
    print(f"Probability of being a bot: {result['probability']:.3f}")
    print(f"Model confidence: {result['confidence']:.3f}")
    
    # Get feature importance
    features = detector.feature_manager.get_features_for_fid(result['fid'])
    feature_importance = detector.model.get_feature_importance()
    
    print("\nTop contributing features:")
    for feature, importance in sorted(feature_importance.items(), 
                                    key=lambda x: abs(x[1]), 
                                    reverse=True)[:10]:
        print(f"{feature}: {importance:.3f}")

In [None]:
def identify_model_problems(detector, identifier):
    result = detector.predict(identifier)
    
    problems = []
    
    # Check probability threshold
    if 0.4 <= result['probability'] <= 0.6:
        problems.append("Uncertain prediction (probability near decision boundary)")
    
    # Check confidence
    if result['confidence'] < 0.7:
        problems.append("Low confidence prediction")
    
    # Check feature completeness
    missing_features = set(detector.model.feature_names) - set(result['features_used'])
    if missing_features:
        problems.append(f"Missing features: {missing_features}")
    
    # Check for extreme feature values
    features = detector.feature_manager.get_features_for_fid(result['fid'])
    for col in features.columns:
        if col != 'fid':
            value = features[col][0]
            if value and abs(value) > 1e6:
                problems.append(f"Extreme value in feature {col}: {value}")
    
    return problems

In [None]:
def evaluate_model_robustness(detector, test_cases):
    results = []
    for case in test_cases:
        pred = detector.predict(case)
        results.append({
            'identifier': case,
            'prediction': pred['prediction_label'],
            'probability': pred['probability'],
            'confidence': pred['confidence'],
            'problems': identify_model_problems(detector, case)
        })
    
    return pl.DataFrame(results)

In [None]:
# Analyze a single prediction
analyze_prediction(detector, 'vitalik')

# Check for problems
problems = identify_model_problems(detector, 'vitalik')
if problems:
    print("\nPotential problems identified:")
    for problem in problems:
        print(f"- {problem}")

# Evaluate multiple cases
test_cases = ['vitalik', 'rpunkt', 'ipungkribo']
evaluation = evaluate_model_robustness(detector, test_cases)

In [None]:
evaluation

In [None]:
from sklearn.preprocessing import StandardScaler

def analyze_high_confidence_predictions(detector, min_confidence=0.9, limit=20):
    """
    Find and analyze predictions with highest confidence scores.
    
    Args:
        detector: DetectorService instance
        min_confidence: Minimum confidence threshold (default 0.9)
        limit: Number of results to return (default 20)
    """
    # Get all profiles
    profiles_df = detector.feature_manager.data_loader.load_dataset(
        'profile_with_addresses', 
        columns=['fid', 'fname']
    )
    
    results = []
    print(f"Analyzing predictions for {len(profiles_df)} profiles...")
    
    # Build feature matrix for all FIDs
    feature_matrix = detector.feature_manager.build_feature_matrix()
    
    # Get predictions for all profiles
    feature_cols = [col for col in detector.model.feature_names if col != 'fid']
    X = feature_matrix.select(feature_cols).to_numpy()
    
    # Replace inf/nan values
    X = np.nan_to_num(X, nan=0.0)
    X = np.clip(X, -1e9, 1e9)
    
    # Get predictions and probabilities
    y_prob = detector.model.predict_proba(X)
    y_pred = (y_prob[:, 1] >= 0.5).astype(int)
    confidences = detector.model.get_prediction_confidence(X)
    
    # Create results DataFrame
    results_df = pl.DataFrame({
        'fid': feature_matrix['fid'],
        'probability': y_prob[:, 1],
        'prediction': y_pred,
        'confidence': confidences
    })
    
    # Join with profiles to get fnames
    results_df = results_df.join(profiles_df, on='fid')
    
    # Join with feature matrix to include all features
    results_df = results_df.join(feature_matrix, on='fid')
    
    # Filter for high confidence predictions
    high_conf_df = results_df.filter(pl.col('confidence') >= min_confidence)
    
    # Sort by confidence
    high_conf_df = high_conf_df.sort('confidence', descending=True)
    
    print("\nHigh Confidence Predictions:")
    print("=" * 80)
    print(f"Found {len(high_conf_df)} predictions with confidence >= {min_confidence}")
    
    # Analyze top results
    print("\nTop High-Confidence Predictions:")
    print("-" * 80)
    print(f"{'FID':<10} {'Username':<20} {'Prediction':<12} {'Probability':<12} {'Confidence':<12}")
    print("-" * 80)
    
    for row in high_conf_df.head(limit).iter_rows(named=True):
        pred_label = "Bot" if row['prediction'] == 1 else "Human"
        print(f"{row['fid']:<10} {row['fname']:<20} {pred_label:<12} {row['probability']:.3f}{'':>4} {row['confidence']:.3f}{'':>4}")
    
    return high_conf_df

def analyze_specific_prediction(detector, fid_or_fname, feature_matrix=None):
    """
    Detailed analysis of a specific prediction with feature importance
    
    Args:
        detector: DetectorService instance
        fid_or_fname: FID (int) or fname (str) to analyze
        feature_matrix: Optional pre-computed feature matrix
    """
    # Get prediction
    result = detector.predict(fid_or_fname)
    
    if result['status'] != 'success':
        print(f"Error: {result.get('error', 'Unknown error')}")
        return
    
    print("\nDetailed Prediction Analysis")
    print("=" * 80)
    print(f"User: {result['fname']} (FID: {result['fid']})")
    print(f"Prediction: {result['prediction_label']}")
    print(f"Probability: {result['probability']:.3f}")
    print(f"Confidence: {result['confidence']:.3f}")
    
    # Get feature values
    if feature_matrix is None:
        features = detector.feature_manager.get_features_for_fid(result['fid'])
    else:
        features = feature_matrix.filter(pl.col('fid') == result['fid'])
    
    # Get feature importance
    feature_importance = detector.model.get_feature_importance()
    
    print("\nTop Contributing Features:")
    print("-" * 80)
    print(f"{'Feature':<30} {'Importance':<15} {'Value':<15}")
    print("-" * 80)
    
    # Sort features by absolute importance
    sorted_features = sorted(feature_importance.items(), 
                           key=lambda x: abs(x[1]), 
                           reverse=True)
    
    for feature, importance in sorted_features[:15]:  # Show top 15 features
        value = features[feature][0] if feature in features.columns else 'N/A'
        print(f"{feature:<30} {importance:>15.3f} {value:>15.3f}")
    
    return result, feature_importance

def analyze_confidence_clusters(high_conf_df, n_clusters=3):
    """
    Analyze patterns in high confidence predictions using clustering
    
    Args:
        high_conf_df: DataFrame with high confidence predictions
        n_clusters: Number of clusters to analyze
    """
    from sklearn.cluster import KMeans
    
    # Select numeric features for clustering
    feature_cols = [col for col in high_conf_df.columns 
                   if col not in ['fid', 'fname', 'prediction', 'probability', 'confidence']]
    
    # Prepare data for clustering
    # Replace inf values with nan

    X = high_conf_df.select(feature_cols).to_numpy()
    X = np.where(np.isinf(X), np.nan, X)

    # Replace nan with 0 and clip extreme values
    X = np.nan_to_num(X, nan=0.0)
    X = np.clip(X, -1e9, 1e9)

    # Convert to float32 safely
    X = X.astype(np.float32)
    X = StandardScaler().fit_transform(X)
    
    # Perform clustering
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    clusters = kmeans.fit_predict(X)
    
    # Add cluster assignments to DataFrame
    high_conf_df = high_conf_df.with_columns([
        pl.Series(name='cluster', values=clusters)
    ])
    
    print("\nCluster Analysis:")
    print("=" * 80)
    
    for cluster in range(n_clusters):
        cluster_df = high_conf_df.filter(pl.col('cluster') == cluster)
        print(f"\nCluster {cluster}:")
        print(f"Size: {len(cluster_df)}")
        print(f"Average confidence: {cluster_df['confidence'].mean():.3f}")
        print(f"Bot ratio: {(cluster_df['prediction'] == 1).sum() / len(cluster_df):.2%}")
        
        # Get top features for this cluster
        cluster_center = kmeans.cluster_centers_[cluster]
        feature_importance = list(zip(feature_cols, cluster_center))
        feature_importance.sort(key=lambda x: abs(x[1]), reverse=True)
        
        print("\nTop distinguishing features:")
        for feature, value in feature_importance[:5]:
            print(f"{feature}: {value:.3f}")
    
    return high_conf_df

# Usage example:
high_conf_predictions = analyze_high_confidence_predictions(detector, min_confidence=0.70)

In [None]:
high_conf_predictions

In [None]:
# Analyze a specific high confidence prediction
result, importance = analyze_specific_prediction(detector, high_conf_predictions['fid'][0])

In [None]:
# Analyze patterns in high confidence predictions
clustered_df = analyze_confidence_clusters(high_conf_predictions, n_clusters=5)
clustered_df.head()