In [5]:

# Improved Spam Detection Model with Enhanced Features - FIXED
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
import joblib
import re
import string
from collections import Counter

# Try to import NLTK components with fallback
try:
    import nltk
    # Try to download NLTK data
    try:
        nltk.download('punkt', quiet=True)
        nltk.download('stopwords', quiet=True)
        from nltk.corpus import stopwords
        from nltk.tokenize import word_tokenize
        NLTK_AVAILABLE = True
    except Exception as e:
        print(f"Warning: NLTK download failed: {e}")
        print("Falling back to basic text processing...")
        NLTK_AVAILABLE = False
except ImportError:
    print("Warning: NLTK not available. Using basic text processing...")
    NLTK_AVAILABLE = False

class ImprovedSpamDetector:
    def __init__(self):
        self.english_model = None
        self.sinhala_model = None
        self.english_vectorizer = None
        self.sinhala_vectorizer = None
        self.feature_scaler = StandardScaler()
        
    def extract_features(self, text, language='en'):
        """Extract additional features from text"""
        # Convert to string if not already
        text = str(text) if text is not None else ""
        
        features = {}
        
        # Basic text statistics
        features['length'] = len(text)
        features['word_count'] = len(text.split())
        features['char_count'] = len([c for c in text if c.isalpha()])
        features['digit_count'] = len([c for c in text if c.isdigit()])
        features['upper_case_count'] = len([c for c in text if c.isupper()])
        features['punctuation_count'] = len([c for c in text if c in string.punctuation])
        
        # Ratios (avoid division by zero)
        if len(text) > 0:
            features['digit_ratio'] = features['digit_count'] / len(text)
            features['upper_ratio'] = features['upper_case_count'] / len(text)
            features['punct_ratio'] = features['punctuation_count'] / len(text)
        else:
            features['digit_ratio'] = 0
            features['upper_ratio'] = 0
            features['punct_ratio'] = 0
            
        # Special characters
        features['exclamation_count'] = text.count('!')
        features['question_count'] = text.count('?')
        features['dollar_count'] = text.count('$')
        features['url_count'] = len(re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text))
        features['email_count'] = len(re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text))
        features['phone_count'] = len(re.findall(r'\b\d{3}-\d{3}-\d{4}\b|\b\d{10}\b', text))
        
        # Spam keywords (expanded for better detection)
        spam_keywords_en = ['free', 'win', 'winner', 'cash', 'prize', 'urgent', 'congratulations', 
                           'offer', 'deal', 'discount', 'limited', 'act now', 'call now', 'click here',
                           'guarantee', 'amazing', 'incredible', 'fantastic', 'bonus', 'reward']
        
        spam_keywords_si = ['නොමිලේ', 'ජයග්‍රහණය', 'ත්‍යාගය', 'මුදල්', 'හදිසි', 'සුභ පැතුම්', 
                           'දිනන', 'අවස්ථාවක්', 'ලක්ෂ', 'වාසියෙන්', 'තෝරාගෙන', 'බැංකු', 
                           'සත්‍යාපනය', 'ලියාපදිංචි', 'බාගන්න', 'දැන්ම', 'පිවිසෙන්න']
        
        if language == 'en':
            spam_words = spam_keywords_en
        else:
            spam_words = spam_keywords_si
            
        text_lower = text.lower()
        features['spam_keywords_count'] = sum(1 for word in spam_words if word in text_lower)
        
        # Average word length
        words = text.split()
        if words:
            features['avg_word_length'] = sum(len(word) for word in words) / len(words)
        else:
            features['avg_word_length'] = 0
            
        return list(features.values())
    
    def basic_tokenize(self, text):
        """Basic tokenization fallback when NLTK is not available"""
        # Simple word tokenization using regex
        words = re.findall(r'\b\w+\b', text.lower())
        return words
    
    def get_english_stopwords(self):
        """Get English stopwords with fallback"""
        if NLTK_AVAILABLE:
            try:
                return set(stopwords.words('english'))
            except:
                pass
        
        # Fallback stopwords list
        return {
            'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 
            'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 
            'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 
            'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 
            'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 
            'while', 'of', 'at', 'by', 'for', 'with', 'through', 'during', 'before', 'after', 
            'above', 'below', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 
            'further', 'then', 'once'
        }
    
    def preprocess_text(self, text, language='en'):
        """Enhanced text preprocessing with NLTK fallback"""
        # Convert to string if not already
        text = str(text) if text is not None else ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        
        # For English, remove stopwords but keep some important ones for spam detection
        if language == 'en':
            stop_words = self.get_english_stopwords()
            # Keep these words as they might be important for spam detection
            keep_words = {'free', 'win', 'money', 'cash', 'offer', 'deal', 'urgent', 'now'}
            stop_words = stop_words - keep_words
            
            if NLTK_AVAILABLE:
                try:
                    words = word_tokenize(text)
                except:
                    words = self.basic_tokenize(text)
            else:
                words = self.basic_tokenize(text)
                
            text = ' '.join([word for word in words if word not in stop_words])
        
        return text
    
    def create_realistic_sinhala_dataset(self, original_df):
        """Create a more realistic Sinhala dataset by parsing the raw text"""
        # Parse the raw text you provided
        raw_text = """spam දැන්ම පිවිසෙන්න දිනන අවස්ථාවක් නැවත නොලැබේ ඔබට ලක්ෂ 100ක් නව ජංගම දුරකථනයක් දිනාගන්න ඔබ ඉතා වාසියෙන් තෝරාගෙන ඇත ඔබගේ බැංකු ගිණුම සත්‍යාපනය කරන්න spam දුරකථන අංකය භාවිතා කරන්න ඔබට පණිවුඩයක් ඇත ඔබට පණිවුඩයක් ඇත දැන්ම ලියාපදිංචි වන්න ඔබගේ බැංකු ගිණුම සත්‍යාපනය කරන්න ඔබ ඉතා වාසියෙන් තෝරාගෙන ඇත මෙම ඇප් එක බාගන්න ham මට නව පොතක් කියවන්න තියෙනවා අපි සෙරිනට යමුද ඔබට සුබ දවසක් වේවා කාලෙකින් හමුවුණා අද කඳවුරට යමු ham මම පාසලට යනවා ඊයේ චිත්‍රපටය ගොඩක් හොඳයි දැන්ම එන්න අපි එකට යමු අම්මා ආපහු ආවා"""
        
        # Split by spam/ham markers
        segments = re.split(r'\b(spam|ham)\b', raw_text)
        
        Messages = []
        current_Label = None
        
        for segment in segments:
            segment = segment.strip()
            if segment == 'spam':
                current_Label = 1
            elif segment == 'ham':
                current_Label = 0
            elif segment and current_Label is not None:
                # Split long segments into individual Messages
                sentences = re.split(r'[.!?]|\s{2,}', segment)
                for sentence in sentences:
                    sentence = sentence.strip()
                    if len(sentence) > 10:  # Only keep meaningful sentences
                        Messages.append({'Message': sentence, 'Label': current_Label})
        
        # Create additional spam and ham examples
        spam_templates = [
            "දැන්ම ලියාপදිංචි වන්න ඔබට ලක්ෂ {} දිනාගත හැකිය",
            "නොමිලේ {} ලබාගන්න දැන්ම කාර්ඩ් අංකය ඇතුළත් කරන්න",
            "ඔබ ජයග්‍රහණය කර ඇත! {} ත්‍යාගය ලබාගන්න",
            "වාසියෙන් තෝරාගත් ඔබට විශේෂ {} අවස්ථාව",
            "බැංකු ගිණුම සත්‍යාපනය කරන්න {} ලබාගන්න",
            "හදිසි! {} දිනන අවස්ථාව අවසන් වීමට කාලය සීමිතයි"
        ]
        
        ham_templates = [
            "අද පාසලට යන්න ඕනේ {} ගෙන්න අමතක කරන්න එපා",
            "අම්මා {} ගෙනාවා අපි එකට කමු",
            "හමුවීම {} ට නියමිතයි කාලයට එන්න",
            "චිත්‍රපටය හොඳයි {} එකට බලමු",
            "පොත් කියවන්න {} ගැන කියන්න",
            "කොහොමද {} ගැන පරණ කතා කරමු"
        ]
        
        # Generate additional examples
        items = ['ජංගම දුරකථනයක්', 'මුදල්', 'ත්‍යාගයක්', 'තෑග්ගක්', 'දිනුම්', 'ප්‍රමාණයක්']
        
        for template in spam_templates:
            for item in items[:3]:  # Use first 3 items
                Messages.append({
                    'Message': template.format(item),
                    'Label': 1
                })
        
        for template in ham_templates:
            for item in items[3:]:  # Use last 3 items
                Messages.append({
                    'Message': template.format(item),
                    'Label': 0
                })
        
        # Convert to DataFrame
        enhanced_df = pd.DataFrame(Messages)
        
        # Combine with original data if it exists and has both classes
        if len(original_df) > 0 and len(original_df['Label'].unique()) > 1:
            enhanced_df = pd.concat([original_df, enhanced_df], ignore_index=True)
        
        # Remove duplicates and ensure balanced classes
        enhanced_df = enhanced_df.drop_duplicates(subset=['Message'])
        
        # Balance the dataset
        spam_count = len(enhanced_df[enhanced_df['Label'] == 1])
        ham_count = len(enhanced_df[enhanced_df['Label'] == 0])
        
        if spam_count < ham_count:
            # Add more spam examples
            spam_df = enhanced_df[enhanced_df['Label'] == 1]
            additional_spam = spam_df.sample(n=min(ham_count - spam_count, len(spam_df)), replace=True)
            enhanced_df = pd.concat([enhanced_df, additional_spam], ignore_index=True)
        elif ham_count < spam_count:
            # Add more ham examples
            ham_df = enhanced_df[enhanced_df['Label'] == 0]
            additional_ham = ham_df.sample(n=min(spam_count - ham_count, len(ham_df)), replace=True)
            enhanced_df = pd.concat([enhanced_df, additional_ham], ignore_index=True)
        
        return enhanced_df
    
    def train_english_model(self, df):
        """Train improved English spam detection model"""
        print("Training English model with enhanced features...")
        
        # Handle missing values
        df = df.copy()
        df['Message'] = df['Message'].fillna('')
        df = df[df['Message'].str.strip() != '']
        
        # Preprocess text
        df['processed_Message'] = df['Message'].apply(lambda x: self.preprocess_text(x, 'en'))
        
        # Extract additional features
        additional_features = np.array([self.extract_features(text, 'en') for text in df['Message']])
        
        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            df['processed_Message'], df['Label'], test_size=0.2, random_state=42, stratify=df['Label']
        )
        
        X_train_features, X_test_features = train_test_split(
            additional_features, test_size=0.2, random_state=42, stratify=df['Label']
        )
        
        # Create and train TF-IDF vectorizer with optimized parameters
        self.english_vectorizer = TfidfVectorizer(
            max_features=5000,
            ngram_range=(1, 3),
            stop_words='english',
            min_df=2,
            max_df=0.95,
            sublinear_tf=True
        )
        
        X_train_tfidf = self.english_vectorizer.fit_transform(X_train)
        X_test_tfidf = self.english_vectorizer.transform(X_test)
        
        # Scale additional features using MinMaxScaler to ensure non-negative values
        from sklearn.preprocessing import MinMaxScaler
        feature_scaler = MinMaxScaler()
        X_train_features_scaled = feature_scaler.fit_transform(X_train_features)
        X_test_features_scaled = feature_scaler.transform(X_test_features)
        self.feature_scaler = feature_scaler
        
        # Combine TF-IDF features with additional features
        from scipy.sparse import hstack
        X_train_combined = hstack([X_train_tfidf, X_train_features_scaled])
        X_test_combined = hstack([X_test_tfidf, X_test_features_scaled])
        
        # Create ensemble model - Use only algorithms that work with sparse matrices
        rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
        lr_model = LogisticRegression(random_state=42, class_weight='balanced', max_iter=1000)
        
        # For MultinomialNB, use only TF-IDF features (non-negative)
        nb_model = MultinomialNB(alpha=0.1)
        nb_model.fit(X_train_tfidf, y_train)
        
        # Create voting classifier with compatible models
        self.english_model = VotingClassifier(
            estimators=[
                ('rf', rf_model),
                ('lr', lr_model)
            ],
            voting='soft'
        )
        
        # Train the ensemble model
        self.english_model.fit(X_train_combined, y_train)
        
        # Store the separate NB model for prediction combination
        self.english_nb_model = nb_model
        
        # Evaluate model
        y_pred_ensemble = self.english_model.predict(X_test_combined)
        y_pred_nb = nb_model.predict(X_test_tfidf)
        
        # Combine predictions (simple averaging)
        y_pred_ensemble_proba = self.english_model.predict_proba(X_test_combined)
        y_pred_nb_proba = nb_model.predict_proba(X_test_tfidf)
        y_pred_combined_proba = (y_pred_ensemble_proba + y_pred_nb_proba) / 2
        y_pred_combined = (y_pred_combined_proba[:, 1] > 0.5).astype(int)
        
        print("English Model Performance:")
        print(f"Ensemble Accuracy: {accuracy_score(y_test, y_pred_ensemble):.4f}")
        print(f"Combined Accuracy: {accuracy_score(y_test, y_pred_combined):.4f}")
        print("\nClassification Report (Combined):")
        print(classification_report(y_test, y_pred_combined))
        
        return self.english_model

    def train_sinhala_model(self, df):
        """Train improved Sinhala spam detection model"""
        print("Training Sinhala model with enhanced features...")
        
        # Handle missing values
        df = df.copy()
        df['Message'] = df['Message'].fillna('')
        df = df[df['Message'].str.strip() != '']
        
        # Create enhanced dataset
        df = self.create_realistic_sinhala_dataset(df)
        
        print(f"Enhanced Sinhala dataset: {len(df)} samples")
        print(f"Label distribution: {df['Label'].value_counts().to_dict()}")
        
        # Extract additional features
        additional_features = np.array([self.extract_features(text, 'si') for text in df['Message']])
        
        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            df['Message'], df['Label'], test_size=0.2, random_state=42, stratify=df['Label']
        )
        
        X_train_features, X_test_features = train_test_split(
            additional_features, test_size=0.2, random_state=42, stratify=df['Label']
        )
        
        # Create TF-IDF vectorizer optimized for Sinhala
        self.sinhala_vectorizer = TfidfVectorizer(
            analyzer='char',
            ngram_range=(1, 4),
            max_features=3000,
            min_df=2,
            max_df=0.95,
            sublinear_tf=True
        )
        
        X_train_tfidf = self.sinhala_vectorizer.fit_transform(X_train)
        X_test_tfidf = self.sinhala_vectorizer.transform(X_test)
        
        # Scale additional features using MinMaxScaler
        from sklearn.preprocessing import MinMaxScaler
        feature_scaler = MinMaxScaler()
        X_train_features_scaled = feature_scaler.fit_transform(X_train_features)
        X_test_features_scaled = feature_scaler.transform(X_test_features)
        
        # Combine features
        from scipy.sparse import hstack
        X_train_combined = hstack([X_train_tfidf, X_train_features_scaled])
        X_test_combined = hstack([X_test_tfidf, X_test_features_scaled])
        
        # Use ensemble model
        rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
        lr_model = LogisticRegression(random_state=42, class_weight='balanced', max_iter=1000)
        
        # Separate MultinomialNB for TF-IDF only
        nb_model = MultinomialNB(alpha=0.1)
        nb_model.fit(X_train_tfidf, y_train)
        
        self.sinhala_model = VotingClassifier(
            estimators=[
                ('rf', rf_model),
                ('lr', lr_model)
            ],
            voting='soft'
        )
        
        # Train the model
        self.sinhala_model.fit(X_train_combined, y_train)
        
        # Store the separate NB model
        self.sinhala_nb_model = nb_model
        
        # Evaluate model
        y_pred_ensemble = self.sinhala_model.predict(X_test_combined)
        y_pred_nb = nb_model.predict(X_test_tfidf)
        
        # Combine predictions
        y_pred_ensemble_proba = self.sinhala_model.predict_proba(X_test_combined)
        y_pred_nb_proba = nb_model.predict_proba(X_test_tfidf)
        y_pred_combined_proba = (y_pred_ensemble_proba + y_pred_nb_proba) / 2
        y_pred_combined = (y_pred_combined_proba[:, 1] > 0.5).astype(int)
        
        print("Sinhala Model Performance:")
        print(f"Ensemble Accuracy: {accuracy_score(y_test, y_pred_ensemble):.4f}")
        print(f"Combined Accuracy: {accuracy_score(y_test, y_pred_combined):.4f}")
        print("\nClassification Report:")
        print(classification_report(y_test, y_pred_combined))
        
        return self.sinhala_model

    def predict(self, text, language='en'):
        """Make prediction with confidence score"""
        # Convert to string if not already
        text = str(text) if text is not None else ""
        
        if language == 'en':
            processed_text = self.preprocess_text(text, 'en')
            text_features = self.english_vectorizer.transform([processed_text])
            additional_features = np.array([self.extract_features(text, 'en')])
            additional_features_scaled = self.feature_scaler.transform(additional_features)
            
            from scipy.sparse import hstack
            combined_features = hstack([text_features, additional_features_scaled])
            
            # Get predictions from both models
            ensemble_pred_proba = self.english_model.predict_proba(combined_features)[0]
            nb_pred_proba = self.english_nb_model.predict_proba(text_features)[0]
            
            # Combine predictions
            combined_proba = (ensemble_pred_proba + nb_pred_proba) / 2
            prediction = (combined_proba[1] > 0.5).astype(int)
            confidence = max(combined_proba)
            
        else:  # Sinhala
            text_features = self.sinhala_vectorizer.transform([text])
            additional_features = np.array([self.extract_features(text, 'si')])
            additional_features_scaled = self.feature_scaler.transform(additional_features)
            
            from scipy.sparse import hstack
            combined_features = hstack([text_features, additional_features_scaled])
            
            # Get predictions from both models
            ensemble_pred_proba = self.sinhala_model.predict_proba(combined_features)[0]
            nb_pred_proba = self.sinhala_nb_model.predict_proba(text_features)[0]
            
            # Combine predictions
            combined_proba = (ensemble_pred_proba + nb_pred_proba) / 2
            prediction = (combined_proba[1] > 0.5).astype(int)
            confidence = max(combined_proba)
        
        return prediction, confidence

    def save_models(self):
        """Save all models and vectorizers"""
        try:
            joblib.dump(self.english_model, 'improved_english_spam_model.pkl')
            joblib.dump(self.sinhala_model, 'improved_sinhala_spam_model.pkl')
            joblib.dump(self.english_vectorizer, 'improved_english_vectorizer.pkl')
            joblib.dump(self.sinhala_vectorizer, 'improved_sinhala_vectorizer.pkl')
            joblib.dump(self.feature_scaler, 'feature_scaler.pkl')
            
            # Save the separate NB models
            if hasattr(self, 'english_nb_model'):
                joblib.dump(self.english_nb_model, 'english_nb_model.pkl')
            if hasattr(self, 'sinhala_nb_model'):
                joblib.dump(self.sinhala_nb_model, 'sinhala_nb_model.pkl')
            
            print("All models saved successfully!")
        except Exception as e:
            print(f"Error saving models: {e}")

    def load_models(self):
        """Load pre-trained models"""
        try:
            self.english_model = joblib.load('improved_english_spam_model.pkl')
            self.sinhala_model = joblib.load('improved_sinhala_spam_model.pkl')
            self.english_vectorizer = joblib.load('improved_english_vectorizer.pkl')
            self.sinhala_vectorizer = joblib.load('improved_sinhala_vectorizer.pkl')
            self.feature_scaler = joblib.load('feature_scaler.pkl')
            
            # Load the separate NB models
            try:
                self.english_nb_model = joblib.load('english_nb_model.pkl')
                self.sinhala_nb_model = joblib.load('sinhala_nb_model.pkl')
            except:
                print("Warning: Separate NB models not found. Using ensemble only.")
                
            print("All models loaded successfully!")
        except Exception as e:
            print(f"Error loading models: {e}")

# Training script
def main():
    # Initialize detector
    detector = ImprovedSpamDetector()
    
    try:
        # Load and prepare English dataset
        print("Loading English dataset...")
        eng_df = pd.read_csv("English_spam.csv", encoding="latin-1")
        eng_df = eng_df[['v1', 'v2']]
        eng_df.columns = ['Label', 'Message']
        eng_df['Label'] = eng_df['Label'].map({'ham': 0, 'spam': 1})
        
        # Remove any rows with missing Labels
        eng_df = eng_df.dropna(subset=['Label'])
        
        print(f"English dataset loaded: {len(eng_df)} samples")
        print(f"Label distribution: {eng_df['Label'].value_counts().to_dict()}")
        
        # Train English model
        detector.train_english_model(eng_df)
        
    except Exception as e:
        print(f"Error with English dataset: {e}")
    
    try:
        # Load and prepare Sinhala dataset
        print("\nLoading Sinhala dataset...")
        sin_df = pd.read_excel("sinhala spam.xlsx")
        sin_df.columns = ['Label', 'Message']
        sin_df.dropna(subset=['Label', 'Message'], inplace=True)
        sin_df['Label'] = sin_df['Label'].map({'ham': 0, 'spam': 1})
        sin_df.dropna(subset=['Label'], inplace=True)
        sin_df['Label'] = sin_df['Label'].astype(int)
        
        print(f"Sinhala dataset loaded: {len(sin_df)} samples")
        print(f"Label distribution: {sin_df['Label'].value_counts().to_dict()}")
        
        # Train Sinhala model
        detector.train_sinhala_model(sin_df)
        
    except Exception as e:
        print(f"Error with Sinhala dataset: {e}")
    
    # Save models
    detector.save_models()
    
    print("\n" + "="*50)
    print("Training completed! Models saved.")
    print("="*50)

# Example usage function
def test_detector():
    """Test the detector with sample Messages"""
    detector = ImprovedSpamDetector()
    
    try:
        detector.load_models()
        
        # Test Messages
        test_Messages = [
            ("Congratulations! You've won $1000! Click here now!", 'en'),
            ("Hi, how are you doing today?", 'en'),
            ("FREE OFFER! Limited time deal!", 'en'),
            ("Meeting at 3pm tomorrow", 'en')
        ]
        
        print("Testing detector:")
        for Message, lang in test_Messages:
            prediction, confidence = detector.predict(Message, lang)
            result = "SPAM" if prediction == 1 else "HAM"
            print(f"Message: '{Message}'")
            print(f"Prediction: {result} (Confidence: {confidence:.4f})")
            print("-" * 40)
            
    except Exception as e:
        print(f"Error testing detector: {e}")

if __name__ == "__main__":
    main()
    # Uncomment the line below to test the detector after training
    # test_detector()

Loading English dataset...
English dataset loaded: 5572 samples
Label distribution: {0: 4825, 1: 747}
Training English model with enhanced features...
English Model Performance:
Ensemble Accuracy: 0.9865
Combined Accuracy: 0.9892

Classification Report (Combined):
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       966
           1       1.00      0.92      0.96       149

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.98      1115
weighted avg       0.99      0.99      0.99      1115


Loading Sinhala dataset...
Sinhala dataset loaded: 653 samples
Label distribution: {1: 525, 0: 128}
Training Sinhala model with enhanced features...
Enhanced Sinhala dataset: 793 samples
Label distribution: {1: 543, 0: 250}
Sinhala Model Performance:
Ensemble Accuracy: 1.0000
Combined Accuracy: 1.0000

Classification Report:
              precision    recall  f1-score   support

           0       1.00  