# If the model has issues, use the code below to validate it


In [None]:
# This will analyze the corruption, try recovery methods, and create a working model
from model_recovery import comprehensive_model_fix

# Run the combined analysis/repair routine and keep its report.
# NOTE(review): the code below indexes the result like a mapping with the keys
# 'initial_status', 'issues_found', 'recovery_attempted', 'recovery_successful',
# 'recovery_notes' and 'final_status' - confirm against model_recovery.
result = comprehensive_model_fix()

# Summary header, then the status reported for the model before any repair
print("🔍 Model Analysis and Recovery Results:")
print(f"Initial Status: {result['initial_status']}")

# The issue list is printed only when it is non-empty
if result['issues_found']:
    print("\nIssues Found:")
    for issue in result['issues_found']:
        print(f"  - {issue}")

# The recovery section is printed only when a recovery was attempted
if result['recovery_attempted']:
    print("\n🔧 Recovery Attempt:")
    outcome_mark = '✅' if result['recovery_successful'] else '❌'
    print(f"Success: {outcome_mark}")
    for note in result['recovery_notes']:
        print(f"  - {note}")

print(f"\nFinal Status: {result['final_status']}")

# Closing verdict: anything other than the literal 'valid' counts as a problem
if result['final_status'] != 'valid':
    print("\n⚠️ Model still has issues. Please check the remaining issues:")
else:
    print("\n✅ Model is now valid and ready to use!")

In [1]:
"""
Diagnostic tools to identify and fix ML model loading issues
"""

import os
import pickle
import json
import pandas as pd

def diagnose_model_file(file_path="bantai_model.pkl"):
    """
    Diagnose what type of file we're dealing with and suggest fixes.

    The checks run in this order: existence, size, binary pickle header,
    JSON, CSV/TSV heuristic. JSON is tested before the CSV/TSV heuristic
    because nearly every JSON document contains commas and would otherwise
    be misreported as CSV.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        Whatever ``test_pickle_loading(file_path)`` returns when the file
        starts with a binary pickle header (protocol 2 or newer); ``False``
        in every other case (missing, empty, JSON, CSV/TSV, unknown format,
        or any error while inspecting the file).
    """
    import codecs

    print(f"Diagnosing file: {file_path}")
    print("=" * 50)

    if not os.path.exists(file_path):
        print("❌ File does not exist")
        return False

    try:
        # Check file size
        file_size = os.path.getsize(file_path)
        print(f"📁 File size: {file_size} bytes")

        if file_size == 0:
            print("❌ File is empty")
            return False

        # Read first few bytes to identify file type
        with open(file_path, 'rb') as f:
            first_bytes = f.read(100)

        print(f"🔍 First 20 bytes (hex): {first_bytes[:20].hex()}")
        print(f"🔍 First 50 chars (as text): {repr(first_bytes[:50])}")

        # Binary pickles (protocol >= 2) start with the PROTO opcode 0x80
        # followed by the protocol number. Protocols 0 and 1 have no header
        # and therefore cannot be recognised here.
        if (
            len(first_bytes) >= 2
            and first_bytes[0] == 0x80
            and 2 <= first_bytes[1] <= pickle.HIGHEST_PROTOCOL
        ):
            print(f"✅ This appears to be a pickle file (protocol {first_bytes[1]})")
            return test_pickle_loading(file_path)

        # Check if it's JSON (before the comma/tab heuristic, see docstring)
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                json.load(f)
        except (ValueError, OSError, RecursionError):
            # ValueError covers both JSONDecodeError and UnicodeDecodeError
            pass
        else:
            print("📋 This appears to be a JSON file")
            return False

        # Check if it's a text file. An incremental decoder is used so that
        # a multi-byte character cut off at the 100-byte boundary does not
        # make otherwise valid UTF-8 text look like binary data.
        try:
            decoder = codecs.getincrementaldecoder('utf-8')()
            text_content = decoder.decode(first_bytes, final=False)
        except UnicodeDecodeError:
            text_content = ""

        if '\t' in text_content or ',' in text_content:
            print("📄 This appears to be a CSV/TSV text file")
            print("💡 You may need to retrain and save the model as a pickle file")
            return False

        print("❓ Unknown file format")
        return False

    except Exception as e:
        # Diagnostic boundary: report the problem instead of raising
        print(f"❌ Error reading file: {e}")
        return False

def test_pickle_loading(file_path):
    """Test if pickle file can be loaded"""
    try:
        with open(file_path, 'rb') as f:
            data = pickle.load(f)
        print("✅ Pickle file loads successfully")
        print(f"📊 Data type: {type(data)}")
        
        if isinstance(data, dict):
            print(f"🗝️ Dictionary keys: {list(data.keys())}")
            
            # Check for expected model components
            expected_keys = ['model', 'scaler', 'feature_columns']
            missing_keys = [key for key in expected_keys if key not in data]
            if missing_keys:
                print(f"⚠️ Missing expected keys: {missing_keys}")
            else:
                print("✅ All expected model components present")
        
        return True
        
    except Exception as e:
        print(f"❌ Error loading pickle: {e}")
        return False

def create_dummy_model_file(file_path="bantai_model.pkl"):
    """
    Write a placeholder model file for testing and verify it can be read back.

    The pickled dict carries no real model or scaler (both are None), only
    the six feature names and a small 'model_info' record. Any existing file
    at *file_path* is overwritten.

    Returns True when the file was written and test_pickle_loading() accepts
    it, False when verification fails or any exception is raised.
    """
    print(f"Creating dummy model file: {file_path}")

    feature_names = ['time_diff', 'distance', 'device_type', 'is_attack_ip', 'login_successful', 'latency']

    try:
        # Same key layout as a real saved model, with empty model/scaler slots
        placeholder = {
            'model': None,
            'scaler': None,
            'feature_columns': feature_names,
            'model_info': {'type': 'dummy', 'created': '2024-01-01', 'features': 6},
        }

        with open(file_path, 'wb') as out:
            pickle.dump(placeholder, out)

        print("✅ Dummy model file created successfully")

        # Read the file back to confirm it is a loadable pickle
        if not test_pickle_loading(file_path):
            print("❌ Dummy model file verification failed")
            return False

        print("✅ Dummy model file verified")
        return True

    except Exception as err:
        print(f"❌ Error creating dummy model: {err}")
        return False

def enhanced_model_loader(file_path="bantai_model.pkl"):
    """
    Build a model loader with diagnosis, dummy-file fallback and rule-based
    prediction fallback.

    Args:
        file_path: Default model path for the returned loader. It is used by
            ``load_model()`` when that method is called without a path.

    Returns:
        A ``ModelLoader`` instance. Nothing is read from disk until
        ``load_model`` is called.
    """
    default_path = file_path

    class ModelLoader:
        """Holds the loaded model parts and produces risk predictions."""

        def __init__(self):
            self.file_path = default_path  # path used when load_model() gets none
            self.model = None
            self.scaler = None
            self.feature_columns = None
            self.model_info = {}

        def load_model(self, file_path=None):
            """Load model with comprehensive error handling.

            When diagnosis of the file fails, a dummy model file is written
            in its place (after preserving any existing file as
            ``<path>.bak``) and loaded instead.

            Args:
                file_path: File to load; defaults to the path given to
                    ``enhanced_model_loader``.

            Returns:
                True when a dict payload was loaded, False otherwise.
            """
            path = self.file_path if file_path is None else file_path
            try:
                # First diagnose the file
                if not diagnose_model_file(path):
                    print("🔧 File diagnosis failed, creating dummy model for testing")
                    # The dummy overwrites `path`; keep whatever was there
                    self._backup_existing(path)
                    if create_dummy_model_file(path):
                        return self._load_pickle(path)
                    return False

                return self._load_pickle(path)

            except Exception as e:
                print(f"❌ Model loading failed: {e}")
                return False

        @staticmethod
        def _backup_existing(file_path):
            """Copy an existing file to ``<file_path>.bak`` unless a backup already exists."""
            import shutil

            backup_path = f"{file_path}.bak"
            # An earlier backup is kept: it is the most likely to be the real model
            if os.path.isfile(file_path) and not os.path.exists(backup_path):
                shutil.copy2(file_path, backup_path)
                print(f"💾 Existing file preserved as: {backup_path}")

        def _load_pickle(self, file_path):
            """Internal method to load pickle file.

            Returns True when the file unpickles to a dict (its 'model',
            'scaler', 'feature_columns' and 'model_info' entries are copied
            onto the loader), False for any other payload or on error.
            """
            try:
                # NOTE: unpickling executes code embedded in the file - only use on trusted files
                with open(file_path, 'rb') as f:
                    model_data = pickle.load(f)

                if isinstance(model_data, dict):
                    self.model = model_data.get('model')
                    self.scaler = model_data.get('scaler')
                    self.feature_columns = model_data.get('feature_columns')
                    self.model_info = model_data.get('model_info', {})

                    print("✅ Model components loaded:")
                    print(f"   - Model: {type(self.model)}")
                    print(f"   - Scaler: {type(self.scaler)}")
                    print(f"   - Features: {self.feature_columns}")

                    return True
                else:
                    print(f"❌ Expected dict, got {type(model_data)}")
                    return False

            except Exception as e:
                print(f"❌ Pickle loading error: {e}")
                return False

        def predict_risk(self, features):
            """Make risk prediction with fallback.

            Uses the loaded model's ``predict_proba`` (after the scaler's
            ``transform`` when a scaler is present). Falls back to the
            rule-based score when no model is loaded or prediction raises.
            """
            if self.model is None:
                # Fallback to rule-based risk
                print("🔄 Using fallback rule-based risk calculation")
                return self._rule_based_risk(features)

            try:
                # Identity check, not truthiness: an object defining
                # __len__/__bool__ must not be mistaken for "no scaler"
                if self.scaler is not None:
                    features_scaled = self.scaler.transform([features])
                else:
                    features_scaled = [features]

                # Second column of the first row, i.e. index [0][1]
                risk_prob = self.model.predict_proba(features_scaled)[0][1]
                return risk_prob

            except Exception as e:
                print(f"⚠️ Model prediction failed, using fallback: {e}")
                return self._rule_based_risk(features)

        def _rule_based_risk(self, features):
            """Simple rule-based risk calculation as fallback.

            Starts from a base score of 0.3 and adds fixed increments per
            triggered rule; the result is capped at 1.0. With fewer than six
            features only the base score is returned.
            """
            risk_score = 0.3  # Base risk

            # Assume features are: [time_diff, distance, device_type, is_attack_ip, login_successful, latency]
            if len(features) >= 6:
                time_diff, distance, device_type, is_attack_ip, login_successful, latency = features[:6]

                # Distance-based risk
                if distance > 1000:  # Long distance
                    risk_score += 0.2

                # Time-based risk (very quick travel)
                if time_diff < 2 and distance > 500:  # Impossible travel
                    risk_score += 0.4

                # Attack IP
                if is_attack_ip:
                    risk_score += 0.3

                # Failed login
                if not login_successful:
                    risk_score += 0.2

                # High latency
                if latency > 200:
                    risk_score += 0.1

            return min(1.0, risk_score)

    return ModelLoader()

# Example usage and testing
def test_model_system():
    """Smoke-test the loader: load 'bantai_model.pkl' and run one prediction.

    Returns the loader object in every case, whether or not loading worked.
    """
    print("🧪 Testing Enhanced Model Loading System")
    print("=" * 50)

    loader = enhanced_model_loader()

    # Stop early when the model cannot be loaded
    if not loader.load_model("bantai_model.pkl"):
        print("❌ Model system failed to initialize")
        return loader

    print("✅ Model system ready")

    # One sample in the order [time_diff, distance, device_type, is_attack_ip, login_successful, latency]
    sample = [24, 1500, 0, False, True, 100]
    risk = loader.predict_risk(sample)
    print(f"🎯 Test prediction: {risk:.3f}")

    return loader

if __name__ == "__main__":
    # Entry point when the code is executed directly (not on import).
    # Run diagnostics on the default model file; the return value is not used here
    diagnose_model_file("bantai_model.pkl")
    print("\n")  # blank lines separating the diagnosis from the system test
    
    # Test the enhanced system (diagnoses the same file again, loads it, runs one prediction)
    test_model_system()

Diagnosing file: bantai_model.pkl
📁 File size: 1204 bytes
🔍 First 20 bytes (hex): 800495a9040000000000007d94288c056d6f6465
🔍 First 50 chars (as text): b'\x80\x04\x95\xa9\x04\x00\x00\x00\x00\x00\x00}\x94(\x8c\x05model\x94\x8c\x18sklearn.ensemble._forest\x94\x8c'
✅ This appears to be a pickle file (protocol 3, 4, or 5)
✅ Pickle file loads successfully
📊 Data type: <class 'dict'>
🗝️ Dictionary keys: ['model', 'scaler', 'feature_columns', 'model_info', 'features']
✅ All expected model components present


🧪 Testing Enhanced Model Loading System
Diagnosing file: bantai_model.pkl
📁 File size: 1204 bytes
🔍 First 20 bytes (hex): 800495a9040000000000007d94288c056d6f6465
🔍 First 50 chars (as text): b'\x80\x04\x95\xa9\x04\x00\x00\x00\x00\x00\x00}\x94(\x8c\x05model\x94\x8c\x18sklearn.ensemble._forest\x94\x8c'
✅ This appears to be a pickle file (protocol 3, 4, or 5)
✅ Pickle file loads successfully
📊 Data type: <class 'dict'>
🗝️ Dictionary keys: ['model', 'scaler', 'feature_columns', 'model_info', 

In [2]:
"""
Advanced pickle file recovery and alternative loading methods
"""

import os
import pickle
import traceback
from io import BytesIO

def attempt_partial_recovery(file_path="bantai_model.pkl"):
    """
    Attempt to recover what we can from a corrupted pickle file.

    The unpickler stops at the first STOP opcode, so loading the complete
    byte stream already recovers any object that is intact at the start of
    the file, even when junk follows it. Loading truncated prefixes cannot
    succeed where the full stream fails, and a valid pickle larger than the
    first prefix would be wrongly rejected, so no prefix scan is done.

    When loading fails, the opcode stream is walked with ``pickletools``
    (which does not execute anything) to report roughly where the stream
    stops being decodable.

    Args:
        file_path: Path of the pickle file to recover.

    Returns:
        The recovered object, or ``None`` when nothing could be loaded or
        the file could not be read.
    """
    import pickletools

    print(f"Attempting partial recovery of: {file_path}")
    print("=" * 50)

    try:
        with open(file_path, 'rb') as f:
            data = f.read()

        print(f"Total file size: {len(data)} bytes")

        # NOTE: unpickling executes code embedded in the file - only use on trusted files
        try:
            with BytesIO(data) as bio:
                recovered_obj = pickle.load(bio)
                consumed = bio.tell()  # position right after the pickle's STOP opcode
        except Exception as e:
            print(f"Failed to load recovered portion: {e}")
        else:
            print(f"Successfully loaded up to byte {consumed}")
            if consumed < len(data):
                print(f"Ignored {len(data) - consumed} trailing bytes after the pickle")
            print(f"Recovered object type: {type(recovered_obj)}")
            return recovered_obj

        # Locate the damage: remember the offset of the last opcode that decoded
        last_good_position = 0
        try:
            for _opcode, _arg, position in pickletools.genops(data):
                last_good_position = position
        except Exception as e:
            print(f"Corruption detected around byte {last_good_position}: {type(e).__name__}")
        else:
            # Every opcode decoded, so the bytes are well-formed; the load
            # failed for another reason (e.g. a class that cannot be imported)
            print("Opcode stream is intact; the failure is not byte-level corruption")

        return None

    except Exception as e:
        print(f"Recovery attempt failed: {e}")
        return None

def try_alternative_pickle_methods(file_path="bantai_model.pkl"):
    """
    Try different pickle loading methods that might work around load errors.

    Each attempt passes different, real ``pickle.load`` options. The
    alternative encodings help with pickles written by Python 2, whose 8-bit
    strings fail to decode under the default ASCII setting. None of them can
    repair damaged bytes.

    Args:
        file_path: Path of the pickle file to load.

    Returns:
        The object from the first attempt that succeeds, or ``None`` when
        every attempt fails.
    """
    print("Trying alternative pickle loading methods...")
    print("=" * 50)

    # (label, keyword arguments forwarded to pickle.load)
    methods = [
        ("Standard pickle.load", {}),
        ("pickle.load with encoding='latin1'", {"encoding": "latin1"}),
        ("pickle.load with encoding='bytes'", {"encoding": "bytes"}),
    ]

    for method_name, load_kwargs in methods:
        try:
            print(f"Trying: {method_name}")
            # NOTE: unpickling executes code embedded in the file - only use on trusted files
            with open(file_path, 'rb') as f:
                result = pickle.load(f, **load_kwargs)
            print(f"✅ Success with {method_name}")
            return result

        except Exception as e:
            # Truncate long messages so one failure stays on one line
            print(f"❌ {method_name} failed: {type(e).__name__}: {str(e)[:100]}")

    return None

def analyze_corruption_pattern(file_path="bantai_model.pkl"):
    """
    Print byte-level statistics of a file to help spot corruption.

    Reports the positions (with surrounding bytes) of up to ten tab bytes,
    the number of null bytes, and the most frequent 4-, 8- and 16-byte
    windows among the first 1000 start offsets. Nothing is returned; any
    error is printed rather than raised.
    """
    print("Analyzing corruption pattern...")
    print("=" * 50)

    try:
        with open(file_path, 'rb') as handle:
            raw = handle.read()

        # Offsets of every tab byte (0x09) in the file
        tab_offsets = [offset for offset, value in enumerate(raw) if value == 0x09]

        if not tab_offsets:
            print("No tab characters found in file")
        else:
            print(f"Found {len(tab_offsets)} tab characters at positions:")
            # Show the first ten hits with 10 bytes of context on each side
            for offset in tab_offsets[:10]:
                print(f"  Position {offset}: context = {raw[max(0,offset-10):offset+10]}")
            if len(tab_offsets) > 10:
                print(f"  ... and {len(tab_offsets)-10} more")

        # Check for other common corruption patterns
        print(f"Null bytes in file: {raw.count(0x00)}")

        # Count fixed-width byte windows; all widths share one tally
        window_counts = {}
        for width in (4, 8, 16):
            for start in range(min(1000, len(raw) - width)):
                window = raw[start:start+width]
                window_counts[window] = window_counts.get(window, 0) + 1

        # Five most frequent windows; only those seen more than 5 times are printed
        top_five = sorted(window_counts.items(), key=lambda entry: entry[1], reverse=True)[:5]
        if top_five:
            print("Most common byte patterns:")
            for window, hits in top_five:
                if hits > 5:
                    print(f"  {window.hex()}: {hits} occurrences")

    except Exception as err:
        print(f"Analysis failed: {err}")

def create_minimal_working_model():
    """
    Train a small random-forest model on synthetic data and save it.

    The model is fitted on 1000 randomly generated rows (fixed seed 42)
    labelled by a simple rule, pickled to 'bantai_model_working.pkl' in the
    current directory together with its scaler, feature names and metadata.

    Returns the saved dict, or None when sklearn/numpy cannot be imported or
    any other error occurs. Note that this reseeds numpy's global RNG.
    """
    print("Creating minimal working model...")

    try:
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.preprocessing import StandardScaler
        import numpy as np

        feature_names = ['time_diff', 'distance', 'device_type', 'is_attack_ip', 'login_successful', 'latency']
        sample_count = 1000

        # Synthetic training matrix: uniform [0, 1) values, then the three
        # categorical/binary columns are overwritten with integer draws
        np.random.seed(42)
        features = np.random.rand(sample_count, 6)
        features[:, 2] = np.random.randint(0, 3, sample_count)  # device_type (0,1,2)
        features[:, 3] = np.random.randint(0, 2, sample_count)  # is_attack_ip (0,1)
        features[:, 4] = np.random.randint(0, 2, sample_count)  # login_successful (0,1)

        # Label a row high risk when distance > 0.8, the IP is flagged, or the login failed
        high_risk = (features[:, 1] > 0.8) | (features[:, 3] == 1) | (features[:, 4] == 0)
        labels = np.zeros(sample_count)
        labels[high_risk] = 1

        # Fit the scaler and the classifier on the scaled data
        forest = RandomForestClassifier(n_estimators=10, random_state=42)
        standardizer = StandardScaler()
        forest.fit(standardizer.fit_transform(features), labels)

        # Sanity check: one prediction through the same scaler
        probe = np.array([[24, 1500, 0, 0, 1, 100]])
        probe_risk = forest.predict_proba(standardizer.transform(probe))[0][1]

        print(f"Model trained successfully. Test prediction: {probe_risk:.3f}")

        # Same layout the loaders in this file read ('model', 'scaler', 'feature_columns')
        bundle = {
            'model': forest,
            'scaler': standardizer,
            'feature_columns': feature_names,
            'model_info': {
                'type': 'RandomForestClassifier',
                'n_estimators': 10,
                'features': 6,
                'training_samples': sample_count,
                'created': '2024-01-01'
            }
        }

        with open('bantai_model_working.pkl', 'wb') as out:
            pickle.dump(bundle, out)

        print("✅ Working model saved as 'bantai_model_working.pkl'")
        return bundle

    except ImportError:
        print("❌ sklearn not available, cannot create working model")
        return None
    except Exception as err:
        print(f"❌ Error creating working model: {err}")
        return None

def comprehensive_model_fix():
    """
    Run the recovery steps in order and return the first usable result.

    Step 1 only prints an analysis. Steps 2-4 (alternative loading, partial
    recovery, replacement model) are tried in turn, each against its own
    default file; the first truthy result is returned. Returns None when
    every step comes back empty.
    """
    print("🔧 Comprehensive Model Fix Procedure")
    print("=" * 60)

    # Step 1: diagnostic output only, nothing to return
    print("\n1. Analyzing corruption pattern...")
    analyze_corruption_pattern()

    # Steps 2-4 as (banner, recovery function, success message)
    stages = (
        (
            "\n2. Trying alternative loading methods...",
            try_alternative_pickle_methods,
            "✅ Successfully recovered model using alternative method!",
        ),
        (
            "\n3. Attempting partial recovery...",
            attempt_partial_recovery,
            "✅ Partially recovered model data!",
        ),
        (
            "\n4. Creating working replacement model...",
            create_minimal_working_model,
            "✅ Created working replacement model!",
        ),
    )

    for banner, stage, success_message in stages:
        print(banner)
        outcome = stage()
        # A falsy outcome (None, empty container) moves on to the next stage
        if outcome:
            print(success_message)
            return outcome

    print("❌ All recovery methods failed")
    return None

# Updated BantAI class that can use the working model
class BantAI_TravelAware_Fixed:
    """
    Fixed version that handles model loading issues gracefully.

    On construction it tries, in order: the configured model file, the
    replacement file 'bantai_model_working.pkl', and finally building that
    replacement via create_minimal_working_model(). When all fail the
    instance keeps ``model = None`` and predict_risk() returns 0.5.
    """

    def __init__(self, cache_file="geocache.json", ml_model_path="bantai_model.pkl"):
        """Store the paths and load a model immediately.

        Args:
            cache_file: Stored on the instance; not used by the code in this class.
            ml_model_path: First file tried when loading the model.
        """
        self.cache_file = cache_file
        self.ml_model_path = ml_model_path
        self.model = None
        self.scaler = None
        self.feature_columns = None

        # Load model with comprehensive error handling
        self.load_model_robust()

    def load_model_robust(self):
        """
        Robust model loading with fallback options.

        Returns True as soon as one source yields a usable model, False when
        none does.
        """
        # Try original file first
        if self._try_load_model(self.ml_model_path):
            print(f"✅ Loaded model from {self.ml_model_path}")
            return True

        # Try working model file
        working_path = "bantai_model_working.pkl"
        if self._try_load_model(working_path):
            print(f"✅ Loaded working model from {working_path}")
            return True

        # Try to fix and create working model
        print("🔧 Attempting to create working model...")
        working_model = create_minimal_working_model()
        if working_model and self._try_load_model(working_path):
            print("✅ Created and loaded working model")
            return True

        print("⚠️ Running without ML model - using rule-based assessment")
        return False

    def _try_load_model(self, file_path):
        """Try to load model from specific file.

        A file counts as loaded only when it unpickles to a dict whose
        'model' entry is not None. A placeholder file with no model must
        not stop the caller from trying its remaining fallbacks. Instance
        attributes are changed only on success.

        Returns True on success, False otherwise.
        """
        try:
            if not os.path.exists(file_path):
                return False

            # NOTE: unpickling executes code embedded in the file - only use on trusted files
            with open(file_path, 'rb') as f:
                model_data = pickle.load(f)

        except Exception as e:
            print(f"Failed to load {file_path}: {type(e).__name__}")
            return False

        if not isinstance(model_data, dict):
            return False

        model = model_data.get('model')
        if model is None:
            print(f"No usable model in {file_path}")
            return False

        self.model = model
        self.scaler = model_data.get('scaler')
        self.feature_columns = model_data.get('feature_columns')
        return True

    def predict_risk(self, features):
        """Make risk prediction with the loaded model.

        Returns the value at index [0][1] of the model's ``predict_proba``
        output, or the default 0.5 when no model is loaded or prediction
        raises.
        """
        if self.model is None:
            return 0.5  # Default risk

        try:
            # Identity check, not truthiness: an object defining
            # __len__/__bool__ must not be mistaken for "no scaler"
            if self.scaler is not None:
                features_scaled = self.scaler.transform([features])
            else:
                features_scaled = [features]

            risk_prob = self.model.predict_proba(features_scaled)[0][1]
            return risk_prob

        except Exception as e:
            print(f"Prediction error: {e}")
            return 0.5

if __name__ == "__main__":
    # Entry point when the code is executed directly (not on import).
    # Run comprehensive fix
    result = comprehensive_model_fix()
    
    # comprehensive_model_fix() returns None when every recovery step failed;
    # any falsy result skips the test below
    if result:
        print("\n🎯 Testing fixed model system...")
        # Constructing the class loads a model right away (see its __init__)
        bantai_fixed = BantAI_TravelAware_Fixed()
        
        # Test prediction
        # Six values, matching the 'feature_columns' order used elsewhere in this file
        test_features = [24, 1500, 0, 0, 1, 100]
        risk = bantai_fixed.predict_risk(test_features)
        print(f"Risk prediction: {risk:.3f}")

🔧 Comprehensive Model Fix Procedure

1. Analyzing corruption pattern...
Analyzing corruption pattern...
Found 10 tab characters at positions:
  Position 83: context = b'\x94\x93\x94)\x81\x94}\x94(\x8c\testimator'
  Position 152: context = b'\x94\x93\x94)\x81\x94}\x94(\x8c\tcriterion'
  Position 189: context = b'r\x94\x8c\x04best\x94\x8c\tmax_depth'
  Position 380: context = b'_weight\x94N\x8c\tccp_alpha'
  Position 508: context = b'\x18h\x17h\x1ah\x1bt\x94\x8c\tbootstrap'
  Position 521: context = b'otstrap\x94\x88\x8c\toob_score'
  Position 725: context = b'\x94\x93\x94)\x81\x94}\x94(\x8c\twith_mean'
  Position 785: context = b'lumns\x94]\x94(\x8c\ttime_diff'
  Position 1051: context = b'ss_hours\x94\x8c\thour_norm'
  Position 1106: context = b'e_change\x94\x8c\tattack_ip'
Null bytes in file: 55
Most common byte patterns:
  00000000: 33 occurrences
  47000000: 6 occurrences
  4700000000000000: 6 occurrences
  0000000000000000: 6 occurrences

2. Trying alternative loading methods...
Tr