# In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources used by TopicClassifier (quiet: no progress output).
#   stopwords -> stopwords.words('english')
#   wordnet   -> WordNetLemmatizer
#   punkt / punkt_tab -> nltk.word_tokenize; newer NLTK releases load the
#   'punkt_tab' tables instead of the pickled 'punkt' models, so both are
#   fetched to keep tokenization working across NLTK versions.
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

class TopicClassifier:
    """TF-IDF based text classifier for the topics in ``self.classes``.

    The text is normalised with :meth:`preprocess_text`, vectorised with
    TF-IDF (unigrams and bigrams) and fed to the estimator selected by
    ``model_type``: ``'naive_bayes'``, ``'logistic_regression'`` or ``'svm'``.
    """

    # Compiled once and shared by every call to preprocess_text.
    _URL_RE = re.compile(r'https?://\S+|www\.\S+')
    _DIGITS_RE = re.compile(r'\d+')
    _PUNCT_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self, model_type='naive_bayes'):
        """Store the configuration; the pipeline itself is built by train().

        Args:
            model_type: one of 'naive_bayes', 'logistic_regression', 'svm'.
                It is validated when the pipeline is built, not here.
        """
        self.model_type = model_type
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        self.pipeline = None
        self.classes = ['news', 'cooking', 'football']

    def preprocess_text(self, text):
        """Normalise *text* into a space-separated string of lemmas.

        Steps: lowercase, strip URLs, strip punctuation, strip digits,
        tokenize, drop English stopwords, lemmatize.

        Missing values (``None``/NaN, e.g. empty cells of a DataFrame) yield
        ``''``; any other non-string value is converted with ``str()``.
        """
        if not isinstance(text, str):
            if pd.api.types.is_scalar(text) and pd.isna(text):
                return ''
            text = str(text)

        text = text.lower()
        text = self._URL_RE.sub('', text)
        text = text.translate(self._PUNCT_TABLE)
        text = self._DIGITS_RE.sub('', text)

        # Nothing left to tokenize; the tokenizer would return no tokens anyway.
        if not text.strip():
            return ''

        tokens = nltk.word_tokenize(text)
        lemmas = [
            self.lemmatizer.lemmatize(word)
            for word in tokens
            if word not in self.stop_words
        ]
        return ' '.join(lemmas)

    def prepare_data(self, df):
        """Validate *df* and return a copy with a 'processed_text' column.

        The caller's DataFrame is left untouched.

        Raises:
            ValueError: if the 'text' or 'label' column is missing, or if a
                label is not listed in ``self.classes``.
        """
        if 'text' not in df.columns or 'label' not in df.columns:
            raise ValueError("DataFrame must contain 'text' and 'label' columns")

        if not all(label in self.classes for label in df['label'].unique()):
            raise ValueError(f"All labels must be one of: {self.classes}")

        # assign() returns a new frame instead of adding a column in place.
        return df.assign(processed_text=df['text'].apply(self.preprocess_text))

    def build_pipeline(self):
        """Create ``self.pipeline`` (TF-IDF + the estimator for model_type).

        Raises:
            ValueError: if ``self.model_type`` is not a supported name.
        """
        # Factories are lazy so only the selected estimator is constructed.
        factories = {
            'naive_bayes': lambda: MultinomialNB(),
            'logistic_regression': lambda: LogisticRegression(max_iter=1000, C=1.0),
            'svm': lambda: LinearSVC(C=1.0),
        }
        factory = factories.get(self.model_type) if isinstance(self.model_type, str) else None
        if factory is None:
            raise ValueError("model_type must be one of: 'naive_bayes', 'logistic_regression', 'svm'")

        self.pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),
            ('classifier', factory()),
        ])

    def train(self, df, test_size=0.2, random_state=42):
        """Fit the pipeline on a stratified split of *df* and evaluate it.

        Args:
            df: DataFrame with 'text' and 'label' columns.
            test_size: forwarded to ``train_test_split``.
            random_state: forwarded to ``train_test_split``.

        Returns:
            dict with keys 'accuracy', 'report', 'X_test', 'y_test' and
            'predictions' computed on the held-out split.

        Raises:
            ValueError: from prepare_data()/build_pipeline(), or from
                ``train_test_split`` when the stratified split is impossible
                (e.g. the test split would hold fewer rows than classes).
        """
        df = self.prepare_data(df)

        X_train, X_test, y_train, y_test = train_test_split(
            df['processed_text'],
            df['label'],
            test_size=test_size,
            random_state=random_state,
            stratify=df['label'],
        )

        self.build_pipeline()
        self.pipeline.fit(X_train, y_train)

        predictions = self.pipeline.predict(X_test)
        return {
            'accuracy': accuracy_score(y_test, predictions),
            'report': classification_report(y_test, predictions),
            'X_test': X_test,
            'y_test': y_test,
            'predictions': predictions,
        }

    def predict(self, query):
        """Predict the class of a user query.

        Returns:
            ``{'class': label}`` and, when the pipeline exposes
            ``predict_proba``, also ``'probabilities'``: a dict mapping each
            trained class label to its probability.

        Raises:
            ValueError: if train() has not been called yet.
        """
        if self.pipeline is None:
            raise ValueError("Model has not been trained yet. Call train() first.")

        processed_query = self.preprocess_text(query)
        predicted_class = self.pipeline.predict([processed_query])[0]

        if hasattr(self.pipeline, 'predict_proba'):
            probabilities = self.pipeline.predict_proba([processed_query])[0]
            # Probability columns follow the fitted estimator's classes_
            # order, which is not the order of self.classes, so the labels
            # must be taken from the pipeline itself.
            proba_dict = dict(zip(self.pipeline.classes_, probabilities))
            return {'class': predicted_class, 'probabilities': proba_dict}

        return {'class': predicted_class}

# Example usage:
def example_usage():
    """Train a naive Bayes TopicClassifier on a toy dataset and print results.

    Prints the hold-out accuracy and classification report, then the
    prediction (and class probabilities, when available) for a few queries.
    """
    # Sample data (replace with your actual labeled dataset)
    data = {
        'text': [
            "Latest updates on the presidential election",
            "Breaking news: Stock market hits record high",
            "How to make a perfect spaghetti carbonara",
            "Best chocolate chip cookie recipe",
            "Manchester United wins against Chelsea",
            "NFL draft picks for the upcoming season",
            "Government announces new economic policy",
            "5 ways to prepare quick breakfast",
            "Champions League final match results"
        ],
        'label': [
            "news", "news", "cooking", "cooking", "football",
            "football", "news", "cooking", "football"
        ]
    }

    # Create DataFrame
    df = pd.DataFrame(data)

    # Initialize and train the classifier.
    # train() splits with stratification, which needs at least one test row
    # per class. The default test_size=0.2 gives only 2 test rows for these
    # 9 samples / 3 classes and makes train_test_split raise ValueError, so
    # hold out exactly 3 rows (one per class) instead.
    classifier = TopicClassifier(model_type='naive_bayes')
    results = classifier.train(df, test_size=3)

    print(f"Model Accuracy: {results['accuracy']:.2f}")
    print("\nClassification Report:")
    print(results['report'])

    # Test with a new query
    test_queries = [
        "What's the latest on the coronavirus vaccine?",
        "How do I bake a chocolate cake?",
        "When is the next World Cup match?"
    ]

    print("\nPredictions for test queries:")
    for query in test_queries:
        prediction = classifier.predict(query)
        print(f"Query: '{query}'")
        print(f"Predicted class: {prediction['class']}")
        if 'probabilities' in prediction:
            print("Class probabilities:")
            for label, prob in prediction['probabilities'].items():
                print(f"  {label}: {prob:.2f}")
        print()

# Run the demo only when this file is executed as a script, not when imported.
# The helper functions defined further down are not used by the demo.
if __name__ == "__main__":
    example_usage()


def load_and_train_with_real_data(file_path, model_type='logistic_regression'):
    """Train a TopicClassifier on a CSV file and print its evaluation.

    Args:
        file_path: path (or anything ``pandas.read_csv`` accepts) of a CSV
            with 'text' and 'label' columns, as required by
            ``TopicClassifier.train``.
        model_type: estimator to use: 'naive_bayes', 'logistic_regression'
            (default, as before) or 'svm'.

    Returns:
        The trained TopicClassifier.
    """
    df = pd.read_csv(file_path)

    classifier = TopicClassifier(model_type=model_type)

    # train() holds out part of the data and reports metrics on that split.
    results = classifier.train(df)

    print(f"Model Accuracy: {results['accuracy']:.2f}")
    print("\nClassification Report:")
    print(results['report'])

    return classifier

# Function to classify user queries in your application
def classify_user_query(classifier, query):
    """Return only the predicted label for *query*.

    Calls ``classifier.predict(query)`` and extracts its 'class' entry,
    discarding anything else in the result.
    """
    return classifier.predict(query)['class']