In [None]:
# %% Setup
import os
import sys
from pathlib import Path
import logging
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Logging: obtain this module's logger, then configure the root logger so
# INFO-level records are emitted.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Project setup
def get_project_root() -> Path:
    """Locate the project root.

    Returns the parent of the current working directory when that directory
    is named ``notebooks``; otherwise returns the working directory itself.
    """
    cwd = Path.cwd()
    return cwd.parent if cwd.name == "notebooks" else cwd

try:
    # Resolve the project root and the src/ directory beneath it.
    project_root = get_project_root()
    src_path = project_root.joinpath("src")

    # Put src/ at the front of the import path unless it is already listed.
    if sys.path.count(str(src_path)) == 0:
        sys.path.insert(0, str(src_path))
        logger.info(f"Added {src_path} to Python path")

    # Imported here, after the sys.path adjustment above.
    from core.preprocessor import LegalTextPreprocessor
except Exception as e:
    # Log the failure for the notebook output, then let it propagate.
    logger.error(f"Setup failed: {str(e)}")
    raise
else:
    logger.info("Successfully imported preprocessor")

# %% Load Data
# Read the raw case table exactly once. The previous version contained this
# cell twice, so the CSV was parsed and logged twice and the second copy had
# no error handling. Failures are logged and re-raised.
try:
    data_file = project_root / "data/raw/legal_cases.csv"
    df = pd.read_csv(data_file)
    logger.info(f"Loaded {len(df):,} cases")
except Exception as e:
    logger.error(f"Failed to load data: {str(e)}")
    raise

# %% Process Features
def extract_features(texts, batch_size=32):
    """Build a per-document feature table from preprocessor results.

    Args:
        texts: Documents to process; passed unchanged to
            ``LegalTextPreprocessor.process_batch``.
        batch_size: Batch size forwarded to ``process_batch``.

    Returns:
        pandas.DataFrame with one row per result returned by
        ``process_batch`` and the columns ``text_length``, ``word_count``,
        ``citation_count`` and ``processing_time``, plus ``gpu_memory`` and
        ``gpu_util`` when the processor reports ``gpu_available``. The
        columns are present even when there are no results, so downstream
        column lookups do not fail on empty input.
    """
    processor = LegalTextPreprocessor(logger)

    # Process in batches
    results = processor.process_batch(texts, batch_size)

    # Read the GPU flag once; it decides whether the GPU columns exist.
    include_gpu = processor.gpu_available

    columns = ['text_length', 'word_count', 'citation_count', 'processing_time']
    if include_gpu:
        columns += ['gpu_memory', 'gpu_util']

    rows = []
    for result in results:
        stats = result['stats']
        performance = result['performance']

        row = {
            'text_length': stats['text_length'],
            'word_count': stats['word_count'],
            'citation_count': stats['citation_count'],
            'processing_time': performance['processing_time'],
        }

        if include_gpu:
            row['gpu_memory'] = performance['memory_usage_mb']
            row['gpu_util'] = performance['gpu_utilization']

        rows.append(row)

    # Explicit columns keep the schema stable when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

# Run feature extraction over every case text; failures are logged and
# re-raised, success is reported with the number of rows produced.
try:
    features_df = extract_features(df['case_text'].tolist())
except Exception as e:
    logger.error(f"Feature extraction failed: {str(e)}")
    raise
else:
    logger.info(f"Successfully extracted features from {len(features_df)} documents")

# %% Performance Analysis
def plot_performance_metrics(features_df):
    """Plot and print processing-performance metrics.

    Draws up to four panels on one 15x10 figure: a histogram of
    ``processing_time``, scatter plots of ``text_length`` and
    ``citation_count`` against ``processing_time``, and, when a
    ``gpu_memory`` column exists, a histogram of it. Then prints the mean
    processing time and, when the columns exist, the mean GPU memory and
    GPU utilization.

    Args:
        features_df: DataFrame with ``processing_time``, ``text_length`` and
            ``citation_count`` columns; ``gpu_memory`` and ``gpu_util`` are
            optional.

    Returns:
        None.
    """
    # The figure becomes pyplot's current figure; no handle is needed here.
    plt.figure(figsize=(15, 10))

    # Processing Time
    plt.subplot(221)
    sns.histplot(data=features_df, x='processing_time')
    plt.title('Processing Time Distribution')

    # Text Length vs Time
    plt.subplot(222)
    sns.scatterplot(data=features_df, x='text_length', y='processing_time')
    plt.title('Text Length vs Processing Time')

    # Citations vs Time
    plt.subplot(223)
    sns.scatterplot(data=features_df, x='citation_count', y='processing_time')
    plt.title('Citations vs Processing Time')

    # GPU Memory if available
    if 'gpu_memory' in features_df:
        plt.subplot(224)
        sns.histplot(data=features_df, x='gpu_memory')
        plt.title('GPU Memory Usage')

    plt.tight_layout()

    # Print performance stats (header emoji restored from mis-decoded bytes)
    print("\n📊 Performance Metrics")
    print("-" * 50)
    print(f"Average Processing Time: {features_df['processing_time'].mean():.2f}s")
    if 'gpu_memory' in features_df:
        print(f"Average GPU Memory: {features_df['gpu_memory'].mean():.1f}MB")
    # Checked separately so a frame with only one GPU column does not raise.
    if 'gpu_util' in features_df:
        print(f"Average GPU Utilization: {features_df['gpu_util'].mean():.1f}%")

# Plot metrics
plot_performance_metrics(features_df)

# %% Save Results
output_path = project_root / "data/processed/features.csv"
# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before writing.
output_path.parent.mkdir(parents=True, exist_ok=True)
features_df.to_csv(output_path, index=False)
logger.info(f"Saved features to {output_path}")

INFO:__main__:Successfully imported preprocessor
INFO:__main__:Loaded 100 cases
INFO:__main__:Loaded 100 cases
INFO:__main__:Using GPU: NVIDIA GeForce RTX 3060 Laptop GPU
ERROR:__main__:Error processing text: NVML Shared Library Not Found
ERROR:__main__:Batch processing error: NVML Shared Library Not Found


NVMLError_LibraryNotFound: NVML Shared Library Not Found