In [1]:
from pathlib import Path
import polars as pl
from farcaster_sybil_detection.config.defaults import Config
from farcaster_sybil_detection.features.manager import FeatureManager
from farcaster_sybil_detection.services.detector import DetectorService

# Polars session settings: streaming chunk size and the maximum string
# length shown when frames are printed.
pl.Config.set_streaming_chunk_size(1_000_000)
pl.Config.set_fmt_str_lengths(50)

# Project configuration shared by the feature manager and the detector.
config = Config(
    data_path=Path("data"),
    checkpoint_dir=Path("checkpoints"),
    model_dir=Path("models"),
    debug_mode=True,
    cache_enabled=True
)

# The detector service is built on top of the feature manager.
feature_manager = FeatureManager(config)
detector = DetectorService(config, feature_manager)

# Labeled FIDs used for training and evaluation.
labels_df = pl.read_csv("data/labels.csv")

# Fail fast when the label file lacks either of the columns used below.
required_columns = {'fid', 'bot'}
if not required_columns <= set(labels_df.columns):
    missing = required_columns - set(labels_df.columns)
    raise ValueError(f"Missing required columns in labels.csv: {missing}")

  from .autonotebook import tqdm as notebook_tqdm
2024-12-09 09:23:52,124 - DetectorService - INFO - No existing model found. Model will be trained when `train` is called.


In [None]:
# Train on the labeled FIDs; the returned mapping holds one numeric value
# per metric name, printed below with three decimals.
metrics = detector.trainer.train(labels_df)

print("Training Metrics:")
for metric, value in metrics.items():
    print("{}: {:.3f}".format(metric, value))

In [3]:
# Score a single account looked up by identifier.
# ('ipungkribo' is another identifier that was tried here.)
result = detector.predict(identifier='vitalik')
print("\nPrediction Result:", result, sep="\n")

2024-12-09 09:23:52,132 - DetectorService - INFO - Making prediction for identifier: vitalik
2024-12-09 09:23:52,133 - Predictor - INFO - Predicting for identifier: vitalik
2024-12-09 09:23:52,133 - Predictor - INFO - Loading ID mapping from profile data...
2024-12-09 09:23:52,134 - DatasetLoader - INFO - Loading profile_with_addresses with columns: ['fid', 'fname']
2024-12-09 09:23:52,143 - DatasetLoader - INFO - Filtered dataset: 894048 records, 893130 unique FIDs
2024-12-09 09:23:52,148 - DatasetLoader - INFO - Loaded farcaster-profile_with_addresses: 894048 records
2024-12-09 09:23:52,150 - FeatureManager - INFO - Starting feature matrix build - Memory usage: 346.83 MB
2024-12-09 09:23:52,150 - FeatureManager - INFO - Base FIDs: 1
  f"Feature matrix schema: {feature_matrix.schema} ({len(feature_matrix.columns)} columns)"
  f"Feature matrix schema: {feature_matrix.schema} ({len(feature_matrix.columns)} columns)"
2024-12-09 09:23:52,151 - FeatureManager - INFO - Feature matrix schema

shape: (1, 49)
┌───────┬─────────┬─────────┬────────────┬───┬────────────┬─────────────┬─────────────┬────────────┐
│ fid   ┆ has_ens ┆ has_bio ┆ has_avatar ┆ … ┆ cast_count ┆ reply_count ┆ mention_cou ┆ avg_cast_l │
│ ---   ┆ ---     ┆ ---     ┆ ---        ┆   ┆ ---        ┆ ---         ┆ nt          ┆ ength      │
│ i64   ┆ i64     ┆ i64     ┆ i64        ┆   ┆ u32        ┆ i64         ┆ ---         ┆ ---        │
│       ┆         ┆         ┆            ┆   ┆            ┆             ┆ i64         ┆ f64        │
╞═══════╪═════════╪═════════╪════════════╪═══╪════════════╪═════════════╪═════════════╪════════════╡
│ 22032 ┆ 0       ┆ 1       ┆ 1          ┆ … ┆ 12         ┆ 3           ┆ 0           ┆ 26.083333  │
└───────┴─────────┴─────────┴────────────┴───┴────────────┴─────────────┴─────────────┴────────────┘


ValueError: Model has no defined feature set. Please ensure model was properly trained and saved with feature names.

In [None]:
import numpy as np
import polars as pl
from framework.evaluation.metrics import EvaluationMetrics
from framework.evaluation.segmentation import UserSegmentation
from framework.evaluation.sampling import LabelingSampler
from framework.evaluation.reporting import EvaluationReport

def evaluate_model(detector, labels_df: pl.DataFrame, feature_matrix: pl.DataFrame):
    """Run a comprehensive evaluation of the model on the labeled FIDs.

    Features for the labeled FIDs are fetched through
    ``detector.feature_manager.get_features_for_fids`` and scored with
    ``detector.model.predict_proba``. Labels and feature rows are paired by
    joining on ``fid`` (not by row position), so the result stays correct even
    if the feature frame comes back in a different order or without some of
    the requested FIDs.

    Args:
        detector: Object exposing ``feature_manager`` and ``model``.
        labels_df: Frame with at least the columns ``fid`` and ``bot``.
        feature_matrix: Frame passed to ``UserSegmentation.segment_users`` to
            derive user segments; each segment frame must expose ``fid``.

    Returns:
        dict with ``overall_metrics``, ``segment_results`` (per segment:
        ``metrics``, ``size``, ``bot_ratio``, ``sample_size``) and
        ``predictions`` (``y_true``, ``y_pred``, ``y_prob``), all restricted
        to labeled FIDs that have a feature row.

    Raises:
        ValueError: If none of the labeled FIDs has a feature row.
        Exception: Any other error is printed and re-raised unchanged.
    """
    try:
        print("Starting model evaluation...")

        # 1. Get predictions for all labeled instances
        fids = labels_df['fid'].to_list()
        features = detector.feature_manager.get_features_for_fids(fids)

        # Feature columns are everything the feature manager returned except the key
        feature_cols = [col for col in features.columns if col != 'fid']

        # Align labels with feature rows on fid. The label column is renamed
        # so it cannot collide with (or leak into) a feature column.
        label_col = '__label__'
        labels = labels_df.select([
            pl.col('fid').cast(pl.Int64),
            pl.col('bot').alias(label_col),
        ])
        aligned = features.with_columns(
            pl.col('fid').cast(pl.Int64)
        ).join(labels, on='fid', how='inner')

        if aligned.height == 0:
            raise ValueError("None of the labeled FIDs has a feature row; nothing to evaluate")

        skipped = labels_df.height - aligned.height
        if skipped > 0:
            print(f"Warning: {skipped} labeled rows have no feature row and were skipped")

        X = aligned.select(feature_cols).to_numpy()
        y_true = aligned[label_col].to_numpy()
        eval_fids = aligned['fid']

        # Get predictions and probabilities
        y_prob = detector.model.predict_proba(X)
        y_pred = (y_prob[:, 1] >= 0.5).astype(int)

        print(f"\nDataset Statistics:")
        print(f"Total samples: {len(y_true)}")
        print(f"Positive samples (bots): {int((y_true == 1).sum())}")
        print(f"Negative samples (humans): {int((y_true == 0).sum())}")

        # 2. Initialize evaluation components
        metrics = EvaluationMetrics()
        segmentation = UserSegmentation()
        report = EvaluationReport()

        # 3. Compute overall metrics
        overall_metrics = metrics.compute_all_metrics(
            y_true=y_true,
            y_pred=y_pred,
            y_prob=y_prob[:, 1]
        )

        # 4. Get user segments and compute segment-specific metrics
        segments = segmentation.segment_users(feature_matrix)
        segment_results = {}

        for name, segment_df in segments.items():
            print(f"\nEvaluating {name} segment...")
            segment_fids = segment_df['fid'].to_list()
            # Mask over the aligned rows, so it indexes y_true / y_pred / y_prob consistently
            segment_mask = eval_fids.is_in(segment_fids).to_numpy()

            if not segment_mask.any():
                print(f"No labeled data for segment {name}")
                continue

            # Best effort per segment: a failing segment is reported and skipped
            try:
                segment_metrics = metrics.compute_all_metrics(
                    y_true=y_true[segment_mask],
                    y_pred=y_pred[segment_mask],
                    y_prob=y_prob[segment_mask, 1]
                )

                segment_results[name] = {
                    'metrics': segment_metrics,
                    'size': len(segment_df),
                    # Share of labeled users in the segment that the model predicts as bots
                    'bot_ratio': (y_pred[segment_mask] == 1).mean(),
                    'sample_size': int(segment_mask.sum())
                }
            except Exception as e:
                print(f"Error computing metrics for segment {name}: {str(e)}")
                continue

        # 5. Generate comprehensive report
        report.add_metrics(overall_metrics)
        report.add_segment_results(segment_results)

        # 6. Visualize results
        print("\nGenerating visualizations...")
        metrics.plot_confusion_matrix(y_true, y_pred)
        report.plot_probability_distribution(y_prob[:, 1])

        # 7. Print detailed report
        print("\n" + "="*50)
        print("EVALUATION RESULTS")
        print("="*50)
        print("\nOverall Metrics:")
        for metric, value in overall_metrics.items():
            print(f"{metric}: {value:.3f}")

        print("\nSegment Performance:")
        for name, data in segment_results.items():
            print(f"\n{name}:")
            print(f"Size: {data['size']} users ({data['sample_size']} labeled)")
            print(f"Bot Ratio: {data['bot_ratio']:.3f}")
            print("Metrics:")
            for metric, value in data['metrics'].items():
                print(f"  {metric}: {value:.3f}")

        return {
            'overall_metrics': overall_metrics,
            'segment_results': segment_results,
            'predictions': {
                'y_true': y_true,
                'y_pred': y_pred,
                'y_prob': y_prob
            }
        }

    except Exception as e:
        print(f"Error in evaluation: {str(e)}")
        raise

In [None]:
# Build the feature matrix that evaluate_model uses for user segmentation,
# then evaluate the detector against the labeled FIDs.
matrix = feature_manager.build_feature_matrix()

results = evaluate_model(
    detector=detector,
    labels_df=labels_df,
    feature_matrix=matrix,
)


In [None]:
def analyze_population_distribution(
    detector: DetectorService,
    feature_matrix: pl.DataFrame,
    threshold: float = 0.5,
    output_path="population_analysis.csv",
):
    """Analyze the bot-probability distribution over every row of ``feature_matrix``.

    Scores all rows with ``detector.model.predict_proba``, prints summary
    statistics and a 10-bucket histogram, plots the probability density and
    optionally writes the per-FID results to CSV.

    Args:
        detector: Object exposing ``model.predict_proba``.
        feature_matrix: Frame with a ``fid`` column; every other column is
            passed to the model as a feature.
        threshold: Probability at or above which a user is predicted a bot.
        output_path: Where to write the results CSV; ``None`` skips saving.

    Returns:
        dict with ``results_df`` (columns ``fid``, ``bot_probability``,
        ``prediction``; sorted by descending probability), ``stats`` and
        ``bucket_stats`` (bucket label -> user count).

    Raises:
        ValueError: If ``feature_matrix`` has no rows.
        Exception: Any other error is printed and re-raised unchanged.
    """
    print("\nAnalyzing full population distribution...")

    try:
        # Ratios and percentages below are undefined for an empty population
        if feature_matrix.height == 0:
            raise ValueError("feature_matrix is empty; nothing to analyze")

        # Get all features (excluding fid)
        feature_cols = [col for col in feature_matrix.columns if col != 'fid']
        X = feature_matrix.select(feature_cols).to_numpy()
        fids = feature_matrix['fid'].to_list()

        # Column 1 of predict_proba is used as the bot probability
        y_prob = detector.model.predict_proba(X)[:, 1]
        y_pred = (y_prob >= threshold).astype(int)

        # Create results DataFrame
        results_df = pl.DataFrame({
            'fid': fids,
            'bot_probability': y_prob,
            'prediction': y_pred
        })

        # Distribution statistics, stored as plain Python numbers
        stats = {
            'total_users': len(results_df),
            'predicted_bots': int((y_pred == 1).sum()),
            'predicted_humans': int((y_pred == 0).sum()),
            'bot_ratio': float((y_pred == 1).mean()),
            'avg_probability': float(y_prob.mean()),
            'median_probability': float(np.median(y_prob)),
            'std_probability': float(np.std(y_prob))
        }

        # Ten equal-width buckets over [0, 1]; linspace gives an exact edge
        # count, unlike arange with a fractional step
        bucket_edges = np.linspace(0.0, 1.0, 11)
        hist, _ = np.histogram(y_prob, bins=bucket_edges)
        bucket_stats = {
            f"{low:.1f}-{high:.1f}": int(count)
            for low, high, count in zip(bucket_edges[:-1], bucket_edges[1:], hist)
        }

        # Print distribution analysis
        print("\nPopulation Distribution Analysis")
        print("=" * 50)
        print(f"Total Users: {stats['total_users']:,}")
        print(f"Predicted Bots: {stats['predicted_bots']:,} ({stats['bot_ratio']:.1%})")
        print(f"Predicted Humans: {stats['predicted_humans']:,} ({1-stats['bot_ratio']:.1%})")
        print("\nProbability Statistics:")
        print(f"Mean Bot Probability: {stats['avg_probability']:.3f}")
        print(f"Median Bot Probability: {stats['median_probability']:.3f}")
        print(f"Std Dev: {stats['std_probability']:.3f}")

        print("\nProbability Distribution:")
        for bucket, count in bucket_stats.items():
            print(f"{bucket}: {count:,} users ({count/stats['total_users']:.1%})")

        # Plot distribution
        import matplotlib.pyplot as plt
        plt.figure(figsize=(12, 6))
        plt.hist(y_prob, bins=50, density=True, alpha=0.7)
        plt.axvline(x=threshold, color='r', linestyle='--', label='Decision Boundary')
        plt.xlabel('Bot Probability')
        plt.ylabel('Density')
        plt.title('Distribution of Bot Probabilities')
        plt.grid(True, alpha=0.3)
        plt.legend()
        plt.show()

        # Most bot-like users first; this sorted frame is what gets saved and returned
        results_df = results_df.sort('bot_probability', descending=True)
        if output_path is not None:
            results_df.write_csv(output_path)
            print(f"\nResults saved to {output_path}")

        return {
            'results_df': results_df,
            'stats': stats,
            'bucket_stats': bucket_stats
        }

    except Exception as e:
        print(f"Error in population analysis: {str(e)}")
        raise


In [None]:
# First, get all available FIDs from the profile dataset
# Load only the fid column of the profile dataset and derive the sorted
# series of distinct FIDs.
profiles_df = feature_manager.data_loader.load_dataset('profile_with_addresses', columns=['fid'])
all_fids = profiles_df.get_column('fid').unique().sort()
print("Total population size: {} FIDs".format(len(all_fids)))

# Reset the loader: clear its cache and remove the base-FID filter.
# NOTE(review): this reset runs after the load above; if a base-FID filter is
# still active at that point the printed population size may be filtered -
# confirm against the data loader.
feature_manager.data_loader.clear_cache()
feature_manager.data_loader.set_base_fids(None)

In [None]:
# Build feature matrix for all FIDs
print("Building feature matrix for full population...")
full_matrix = feature_manager.build_feature_matrix()
print(f"Feature matrix shape: {full_matrix.shape}")

In [None]:
# Summary statistics of the full-population feature matrix (shown as the cell output).
full_matrix.describe()

In [None]:
# Run the analysis
population_analysis = analyze_population_distribution(detector, matrix)

# If you want to examine specific probability ranges:
results_df = population_analysis['results_df']

# High confidence bots (e.g., >90% probability)
high_conf_bots = results_df.filter(pl.col('bot_probability') > 0.9)
print(f"\nHigh Confidence Bots (>90%): {len(high_conf_bots)}")

# High confidence humans (e.g., <10% probability)
high_conf_humans = results_df.filter(pl.col('bot_probability') < 0.1)
print(f"High Confidence Humans (<10%): {len(high_conf_humans)}")

# Uncertain predictions (e.g., 40-60% probability)
uncertain = results_df.filter(
    (pl.col('bot_probability') >= 0.4) & 
    (pl.col('bot_probability') <= 0.6)
)
print(f"Uncertain Predictions (40-60%): {len(uncertain)}")

In [None]:
# Spot check: does the account with fname 'vitalik' appear among the
# high-confidence (>0.9) bot predictions? The fid key is cast to Int64 on
# every frame before joining so the join keys share one dtype.
(
    feature_manager.data_loader
    .load_dataset('profile_with_addresses', columns=['fid', 'fname'])
    .filter(pl.col('fname').is_not_null())
    .with_columns(pl.col('fid').cast(pl.Int64))
    # Keep only profiles that have a row in the full feature matrix
    .join(full_matrix.with_columns(pl.col('fid').cast(pl.Int64)), on='fid')
    # Attach bot_probability / prediction from the population analysis
    .join(results_df.with_columns(pl.col('fid').cast(pl.Int64)), on='fid')
    .filter(pl.col('bot_probability') > 0.9)
    .select(['fid', 'fname', 'bot_probability', 'prediction'])
    .sort('bot_probability')
    .filter(pl.col('fname') == 'vitalik')
)